diff --git a/.circleci/README.md b/.circleci/README.md
deleted file mode 100644
index 31e80ebd3450c..0000000000000
--- a/.circleci/README.md
+++ /dev/null
@@ -1,89 +0,0 @@
-CircleCI integration is controlled by the `.circleci/config.yml` file. Our
-config currently contains two workflows. One is triggered on every pull request update.
-The other workflow runs nightly to verify our compatibility with the prestodb internal protocol.
-
-The PR workflow is named `dist-compile` and has 4 jobs, 2 to build and run unit tests on Linux and macOS
-and 2 to check code formatting and license headers:
-* linux-build
-* macos-build
-* format-check
-* header-check
-
-## Running locally
-
-The Linux container-based jobs can be run locally using the `circleci` CLI:
-
-```
- circleci local execute --job JOB_NAME
-```
-
-For example, to run unit tests use:
-
-```
- circleci local execute --job linux-build
-```
-
-A nightly build that syncs with prestodb/master checks that the presto_protocol library
-remains in sync with Presto Java.
-
-Run the nightly sync job locally:
-```
- circleci local execute --job presto-sync
-```
-
-## Install the CircleCI CLI
-```
- curl -fLSs https://circle.ci/cli | bash
-```
-
-To use containers, Docker must be installed. Here are instructions to [Install
-Docker on macOS](https://docs.docker.com/docker-for-mac/install/). The Docker daemon
-must be running before issuing the `circleci` commands.
-
-### macOS testing
-
-macOS testing is done using the CircleCI macOS executor and installing
-dependencies each time the job is run. This executor cannot be run locally.
-The script `scripts/setup-macos.sh` contains the commands that are run as part of
-this job to install these dependencies.
-
-### Linux testing
-
-Linux testing uses a Docker container. The container build depends on the Velox CircleCI container. Check
-velox/.circleci/config.yml to verify that circleci-container.dockfile is using the latest base container.
-The container build uses Docker and should be run on your macOS or Linux laptop with Docker installed and
-running.
-
-#### Build the base container:
-
-* In an up-to-date clone of velox (maybe you have one?)
-
-```
-git clone git@github.com:facebookincubator/velox.git
-cd velox
-make base-container
-```
-* Wait - this step takes quite a long time; it builds clang-format v8 to be compatible with fbcode.
-* When the base container is finished, the new container name will be printed on the console.
-* Push the container to DockerHub:
-```
-docker push prestocpp/base-container:$USER-YYYYMMDD
-```
-* After the push, update `scripts/velox-container.dockfile` with the newly built base container name.
-
-#### Build the dependencies container
-
-* If you have a new base container, update scripts/velox-container.dockfile to refer to it.
-* Build the velox container:
-```
-make velox-container.dockfile
-```
-* Wait - this takes a few minutes, but not nearly as long as the base container.
-* When the velox container is finished, the new container name will be printed on the console.
-* Push the container to DockerHub:
-```
-docker push prestocpp/velox-container:$USER-YYYYMMDD
-```
-* Update `.circleci/config.yml` with the newly built CircleCI container name.
-  There are two places in the config.yml file that refer to the container; update
-  both.
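The container-refresh procedure described in the deleted README boils down to a single shell session. This is a sketch assembled from the steps above, not a tested script; `$USER-YYYYMMDD` is the tag placeholder the README uses (substitute a real date), and the `make` targets are the ones it names:

```
# Build and publish the base container (slow: it builds clang-format v8).
git clone git@github.com:facebookincubator/velox.git
cd velox
make base-container
docker push prestocpp/base-container:$USER-YYYYMMDD

# Point scripts/velox-container.dockfile at the new base container, then
# build and publish the dependencies container (takes a few minutes).
make velox-container.dockfile
docker push prestocpp/velox-container:$USER-YYYYMMDD

# Finally, update both references to the velox container in .circleci/config.yml.
```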
diff --git a/.circleci/config.yml b/.circleci/config.yml index 303f1356521e7..b121fa7dfd36b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -17,28 +17,17 @@ version: 2.1 # This allows us to use CircleCI's dynamic configuration feature setup: true - -# Path-filtering orb is required to continue a pipeline based on -# the path of an updated fileset -orbs: - path-filtering: circleci/path-filtering@0.1.1 +jobs: + noop-build: + docker: + - image: cimg/base:2024.02 + steps: + - run: circleci-agent step halt workflows: version: 2 path-filtering-workflow: jobs: + - noop-build - - path-filtering/filter: - name: check-sensitive-paths - - # Format is: - # Regex below will filter out paths with test in them. - mapping: | - velox/expression/((?!.*test).*).* run-longer-expression-fuzzer true - velox/exec/((?!.*test).*).* run-longer-expression-fuzzer true - velox/common/((?!.*test).*).* run-longer-expression-fuzzer true - velox/core/((?!.*test).*).* run-longer-expression-fuzzer true - velox/vector/((?!.*test).*).* run-longer-expression-fuzzer true - - config-path: .circleci/dist_compile.yml diff --git a/.circleci/dist_compile.yml b/.circleci/dist_compile.yml deleted file mode 100644 index a853ed985e11f..0000000000000 --- a/.circleci/dist_compile.yml +++ /dev/null @@ -1,703 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -version: 2.1 - - -# Default pipeline parameters, which will be updated according to -# the results of the path-filtering orb -parameters: - run-longer-expression-fuzzer: - type: boolean - default: false - -commands: - update-submodules: - steps: - - run: - name: "Update Submodules" - command: | - git submodule sync --recursive - git submodule update --init --recursive - - setup-environment: - steps: - - run: - name: "Setup Environment" - command: | - # Calculate ccache key. - git show -s --format=%cd --date="format:%Y%m%d" $(git merge-base origin/main HEAD) | tee merge-base-date - - # Set up xml gtest output. - mkdir -p /tmp/test_xml_output/ - echo "export XML_OUTPUT_FILE=\"/tmp/test_xml_output/\"" >> $BASH_ENV - - # Set up ccache configs. 
- mkdir -p .ccache - echo "export CCACHE_DIR=$(realpath .ccache)" >> $BASH_ENV - ccache -sz -M 5Gi - if [ -e /opt/rh/gcc-toolset-9/enable ]; then - source /opt/rh/gcc-toolset-9/enable - fi - - restore_cache: - name: "Restore CCache Cache" - keys: - - velox-ccache-debug-{{ arch }}-{{ checksum "merge-base-date" }} - - pre-steps: - steps: - - checkout - - update-submodules - - setup-environment - - post-steps: - steps: - - save_cache: - name: "Save CCache Cache" - key: velox-ccache-debug-{{ arch }}-{{ checksum "merge-base-date" }} - paths: - - .ccache/ - - store_artifacts: - path: '_build/debug/.ninja_log' - - store_test_results: - path: '/tmp/test_xml_output/' - - build-benchmarks: - parameters: - binary_output: - type: string - benchmark_class: - type: string - steps: - - run: - name: "Build Benchmarks - << parameters.benchmark_class >>" - command: | - make benchmarks-basic-build NUM_THREADS=16 MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=8 - ccache -s - mkdir -p << parameters.binary_output >> - cp -r --verbose _build/release/velox/benchmarks/basic/* << parameters.binary_output >> - - fuzzer-run: - parameters: - fuzzer_repro: - type: string - fuzzer_output: - type: string - fuzzer_name: - type: string - fuzzer_exe: - type: string - fuzzer_args: - type: string - steps: - - pre-steps - - run: - name: Build - command: | - make debug NUM_THREADS=16 MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=4 - ccache -s - no_output_timeout: 1h - - run: - name: "Run << parameters.fuzzer_name >> Fuzzer" - command: | - eval ' << parameters.fuzzer_exe >> << parameters.fuzzer_args >> ' \ - 2>&1 | tee "<< parameters.fuzzer_output >>" || ( \ - tail -n 1000 "<< parameters.fuzzer_output >>" ; \ - echo "FAIL: << parameters.fuzzer_name >> run failed"; \ - exit 1; \ - ) - echo -e "\n << parameters.fuzzer_name >> run finished successfully." - no_output_timeout: 120m - - store_artifacts: - path: << parameters.fuzzer_output >> - - store_artifacts: - path: << parameters.fuzzer_repro >> - - post-steps - -executors: - build: - docker: - - image : ghcr.io/facebookincubator/velox-dev:circleci-avx - resource_class: 2xlarge - environment: - CC: /opt/rh/gcc-toolset-9/root/bin/gcc - CXX: /opt/rh/gcc-toolset-9/root/bin/g++ - VELOX_DEPENDENCY_SOURCE: BUNDLED - simdjson_SOURCE: BUNDLED - check: - docker: - - image : ghcr.io/facebookincubator/velox-dev:check-avx - macos-intel: - macos: - xcode: "14.3.0" - resource_class: macos.x86.medium.gen2 - macos-m1: - macos: - xcode: "14.2.0" - resource_class: macos.m1.large.gen1 - -jobs: - macos-build: - parameters: - os: - type: executor - executor: << parameters.os >> - environment: - ICU_SOURCE: BUNDLED - simdjson_SOURCE: BUNDLED - steps: - - checkout - - update-submodules - - restore_cache: - name: "Restore Dependency Cache" - # The version number in the key can be incremented - # to manually avoid the case where bad dependencies - # are cached, and has no other meaning. - # If you update it, be sure to update save_cache too. - key: velox-circleci-macos-{{ arch }}-deps-v1-{{ checksum ".circleci/config.yml" }}-{{ checksum "scripts/setup-macos.sh" }} - - run: - name: "Install dependencies" - command: | - set -xu - mkdir -p ~/deps ~/deps-src - curl -L https://github.com/Homebrew/brew/tarball/master | tar xz --strip 1 -C ~/deps - PATH=~/deps/bin:${PATH} DEPENDENCY_DIR=~/deps-src INSTALL_PREFIX=~/deps PROMPT_ALWAYS_RESPOND=n ./scripts/setup-macos.sh - rm -rf ~/deps/.git ~/deps/Library/Taps/ # Reduce cache size by 70%. 
- no_output_timeout: 20m - - save_cache: - name: "Save Dependency Cache" - # The version number in the key can be incremented - # to manually avoid the case where bad dependencies - # are cached, and has no other meaning. - # If you update it, be sure to update restore_cache too. - key: velox-circleci-macos-{{ arch }}-deps-v1-{{ checksum ".circleci/config.yml" }}-{{ checksum "scripts/setup-macos.sh" }} - paths: - - ~/deps - - run: - name: "Calculate merge-base date for CCache" - command: git show -s --format=%cd --date="format:%Y%m%d" $(git merge-base origin/main HEAD) | tee merge-base-date - - restore_cache: - name: "Restore CCache cache" - keys: - - velox-ccache-debug-{{ arch }}-{{ checksum "merge-base-date" }} - - run: - name: "Build on MacOS" - command: | - export PATH=~/deps/bin:~/deps/opt/bison/bin:~/deps/opt/flex/bin:${PATH} - mkdir -p .ccache - export CCACHE_DIR=$(pwd)/.ccache - ccache -sz -M 5Gi - brew install openssl@1.1 - brew link --overwrite --force openssl@1.1 - export PATH="/Users/distiller/deps/opt/openssl@1.1/bin:$PATH" - export OPENSSL_ROOT_DIR=$(brew --prefix openssl@1.1) - cmake \ - -B _build/debug \ - -GNinja \ - -DTREAT_WARNINGS_AS_ERRORS=1 \ - -DENABLE_ALL_WARNINGS=1 \ - -DVELOX_ENABLE_PARQUET=ON \ - -DCMAKE_BUILD_TYPE=Debug \ - -DCMAKE_PREFIX_PATH=~/deps \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DFLEX_INCLUDE_DIR=~/deps/opt/flex/include - ninja -C _build/debug - ccache -s - no_output_timeout: 1h - - save_cache: - name: "Save CCache cache" - key: velox-ccache-debug-{{ arch }}-{{ checksum "merge-base-date" }} - paths: - - .ccache/ - - linux-build: - executor: build - steps: - - pre-steps - - run: - name: "Build" - command: | - make debug NUM_THREADS=16 MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=5 EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON" - ccache -s - no_output_timeout: 1h - - run: - name: "Run Unit Tests" - command: | - cd _build/debug && ctest -j 16 -VV --output-on-failure - no_output_timeout: 1h - - store_test_results: - path: /tmp/test_xml_output/ - - run: - name: "Run Fuzzer Tests" - # Run fuzzer using the built executable - we do this instead of make - # since currently make fuzzertest tends to rebuild the project. - command: | - mkdir -p /tmp/fuzzer_repro/ - chmod -R 777 /tmp/fuzzer_repro - _build/debug/velox/expression/tests/velox_expression_fuzzer_test \ - --seed ${RANDOM} \ - --enable_variadic_signatures \ - --velox_fuzzer_enable_complex_types \ - --lazy_vector_generation_ratio 0.2 \ - --velox_fuzzer_enable_column_reuse \ - --velox_fuzzer_enable_expression_reuse \ - --max_expression_trees_per_step 2 \ - --retry_with_try \ - --enable_dereference \ - --duration_sec 60 \ - --logtostderr=1 \ - --minloglevel=0 \ - --repro_persist_path=/tmp/fuzzer_repro \ - && echo -e "\n\nFuzzer run finished successfully." - no_output_timeout: 5m - - store_artifacts: - path: '/tmp/fuzzer_repro' - - run: - name: "Run Spark Fuzzer Tests" - command: | - mkdir -p /tmp/spark_fuzzer_repro/ - chmod -R 777 /tmp/spark_fuzzer_repro - _build/debug/velox/expression/tests/spark_expression_fuzzer_test \ - --seed ${RANDOM} \ - --duration_sec 60 \ - --enable_variadic_signatures \ - --lazy_vector_generation_ratio 0.2 \ - --velox_fuzzer_enable_column_reuse \ - --velox_fuzzer_enable_expression_reuse \ - --max_expression_trees_per_step 2 \ - --retry_with_try \ - --logtostderr=1 \ - --minloglevel=0 \ - --repro_persist_path=/tmp/spark_fuzzer_repro \ - && echo -e "\n\nSpark Fuzzer run finished successfully." 
- no_output_timeout: 5m - - store_artifacts: - path: '/tmp/spark_fuzzer_repro' - - run: - name: "Run Spark Aggregate Fuzzer Tests" - command: | - mkdir -p /tmp/spark_aggregate_fuzzer_repro/ - chmod -R 777 /tmp/spark_aggregate_fuzzer_repro - _build/debug/velox/exec/tests/spark_aggregation_fuzzer_test \ - --seed ${RANDOM} \ - --duration_sec 60 \ - --logtostderr=1 \ - --minloglevel=0 \ - --repro_persist_path=/tmp/spark_aggregate_fuzzer_repro \ - && echo -e "\n\nSpark Aggregation Fuzzer run finished successfully." - no_output_timeout: 5m - - store_artifacts: - path: '/tmp/spark_aggregate_fuzzer_repro' - - run: - name: "Run Aggregate Fuzzer Tests" - # Run aggregation fuzzer using the built executable. - command: | - mkdir -p /tmp/aggregate_fuzzer_repro/ - rm -rfv /tmp/aggregate_fuzzer_repro/* - chmod -R 777 /tmp/aggregate_fuzzer_repro - _build/debug/velox/exec/tests/velox_aggregation_fuzzer_test \ - --seed ${RANDOM} \ - --duration_sec 60 \ - --logtostderr=1 \ - --minloglevel=0 \ - --repro_persist_path=/tmp/aggregate_fuzzer_repro \ - && echo -e "\n\nAggregation fuzzer run finished successfully." - no_output_timeout: 5m - - store_artifacts: - path: '/tmp/aggregate_fuzzer_repro' - - run: - name: "Run Example Binaries" - command: | - find _build/debug/velox/examples/ -maxdepth 1 -type f -executable -exec "{}" \; - - post-steps - - linux-build-release: - executor: build - steps: - - pre-steps - - run: - name: Build - command: | - make release NUM_THREADS=16 MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=8 - ccache -s - no_output_timeout: 1h - - run: - name: "Run Unit Tests" - command: | - cd _build/release && ctest -j 16 -VV --output-on-failure - no_output_timeout: 1h - - post-steps - - # Build with different options - linux-build-options: - executor: build - steps: - - pre-steps - - run: - name: "Build Velox Minimal" - command: | - make min_debug NUM_THREADS=16 MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=16 - ccache -s - no_output_timeout: 1h - - run: - name: "Build Velox Without Testing" - command: | - make clean - make debug EXTRA_CMAKE_FLAGS="-DVELOX_BUILD_TESTING=OFF" NUM_THREADS=16 MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=16 - ccache -s - no_output_timeout: 1h - - post-steps - - linux-adapters: - executor: build - environment: - VELOX_DEPENDENCY_SOURCE: SYSTEM - ICU_SOURCE: BUNDLED - simdjson_SOURCE: BUNDLED - steps: - - pre-steps - - run: - name: "Install Adapter Dependencies" - command: | - mkdir -p ~/adapter-deps/install/bin - source /opt/rh/gcc-toolset-9/enable - set -xu - DEPENDENCY_DIR=~/adapter-deps PROMPT_ALWAYS_RESPOND=n ./scripts/setup-adapters.sh - - run: - name: "Install Minio Server" - command: | - set -xu - cd ~/adapter-deps/install/bin/ - wget https://dl.min.io/server/minio/release/linux-amd64/archive/minio-20220526054841.0.0.x86_64.rpm - rpm -i minio-20220526054841.0.0.x86_64.rpm - rm minio-20220526054841.0.0.x86_64.rpm - - run: - name: "Install Hadoop Dependency" - command: | - set -xu - yum -y install java-1.8.0-openjdk - - run: - name: Build including all Benchmarks - command: | - EXTRA_CMAKE_FLAGS=( - "-DVELOX_ENABLE_BENCHMARKS=ON" - "-DVELOX_ENABLE_ARROW=ON" - "-DVELOX_ENABLE_PARQUET=ON" - "-DVELOX_ENABLE_HDFS=ON" - "-DVELOX_ENABLE_S3=ON" - "-DVELOX_ENABLE_GCS=ON" - "-DVELOX_ENABLE_SUBSTRAIT=ON" - "-DVELOX_ENABLE_REMOTE_FUNCTIONS=ON" - ) - make release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS[*]}" AWSSDK_ROOT_DIR=~/adapter-deps/install GCSSDK_ROOT_DIR=~/adapter-deps/install NUM_THREADS=16 MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=8 - ccache -s - no_output_timeout: 1h - - run: - name: "Run Unit Tests" - command: 
| - conda init bash - source ~/.bashrc - conda create -y --name testbench python=3.7 - conda activate testbench - pip install https://github.com/googleapis/storage-testbench/archive/refs/tags/v0.36.0.tar.gz - export LC_ALL=C - export JAVA_HOME=/usr/lib/jvm/jre-1.8.0-openjdk - export HADOOP_ROOT_LOGGER="WARN,DRFA" - export LIBHDFS3_CONF=$(pwd)/.circleci/hdfs-client.xml - export HADOOP_HOME='/usr/local/hadoop' - export PATH=~/adapter-deps/install/bin:/usr/local/hadoop/bin:${PATH} - cd _build/release && ctest -j 16 -VV --output-on-failure - no_output_timeout: 1h - - post-steps - - linux-presto-fuzzer-run: - executor: build - environment: - VELOX_DEPENDENCY_SOURCE: SYSTEM - simdjson_SOURCE: BUNDLED - steps: - - fuzzer-run: - fuzzer_output: "/tmp/fuzzer.log" - fuzzer_repro: "/tmp/fuzzer_repro" - fuzzer_name: "Expression" - fuzzer_exe: "_build/debug/velox/expression/tests/velox_expression_fuzzer_test" - fuzzer_args: " --seed ${RANDOM} --lazy_vector_generation_ratio 0.2 \ - --duration_sec 1800 --enable_variadic_signatures \ - --velox_fuzzer_enable_complex_types \ - --velox_fuzzer_enable_column_reuse \ - --velox_fuzzer_enable_expression_reuse \ - --max_expression_trees_per_step 2 \ - --retry_with_try \ - --enable_dereference \ - --logtostderr=1 --minloglevel=0 \ - --repro_persist_path=/tmp/fuzzer_repro" - - linux-spark-fuzzer-run: - executor: build - environment: - VELOX_DEPENDENCY_SOURCE: SYSTEM - simdjson_SOURCE: BUNDLED - steps: - - fuzzer-run: - fuzzer_output: "/tmp/spark_fuzzer.log" - fuzzer_repro: "/tmp/spark_fuzzer_repro" - fuzzer_name: "Spark" - fuzzer_exe: "_build/debug/velox/expression/tests/spark_expression_fuzzer_test" - fuzzer_args: " --seed ${RANDOM} --duration_sec 600 --logtostderr=1 --minloglevel=0 \ - --repro_persist_path=/tmp/spark_fuzzer_repro" - - linux-spark-aggregate-fuzzer-run: - executor: build - environment: - VELOX_DEPENDENCY_SOURCE: SYSTEM - simdjson_SOURCE: BUNDLED - steps: - - fuzzer-run: - fuzzer_output: "/tmp/spark_aggregate_fuzzer.log" - fuzzer_repro: "/tmp/spark_aggregate_fuzzer_repro" - fuzzer_name: "SparkAggregate" - fuzzer_exe: "_build/debug/velox/exec/tests/spark_aggregation_fuzzer_test" - fuzzer_args: " --seed ${RANDOM} --duration_sec 600 --logtostderr=1 --minloglevel=0 \ - --repro_persist_path=/tmp/spark_aggregate_fuzzer_repro" - - - linux-aggregate-fuzzer-run: - executor: build - environment: - VELOX_DEPENDENCY_SOURCE: SYSTEM - simdjson_SOURCE: BUNDLED - steps: - - fuzzer-run: - fuzzer_output: "/tmp/aggregate_fuzzer.log" - fuzzer_repro: "/tmp/aggregate_fuzzer_repro" - fuzzer_name: "Aggregate" - fuzzer_exe: "_build/debug/velox/exec/tests/velox_aggregation_fuzzer_test" - fuzzer_args: " --seed ${RANDOM} --duration_sec 3600 --logtostderr=1 --minloglevel=0 \ - --repro_persist_path=/tmp/aggregate_fuzzer_repro" - - linux-join-fuzzer-run: - executor: build - environment: - VELOX_DEPENDENCY_SOURCE: SYSTEM - simdjson_SOURCE: BUNDLED - steps: - - fuzzer-run: - fuzzer_output: "/tmp/join_fuzzer.log" - fuzzer_repro: "/tmp/join_fuzzer_repro" - fuzzer_name: "Join" - fuzzer_exe: "_build/debug/velox/exec/tests/velox_join_fuzzer_test" - fuzzer_args: " --seed ${RANDOM} --duration_sec 3600 --logtostderr=1 --minloglevel=0 --v=1" - - format-check: - executor: check - steps: - - checkout - - run: - name: Check formatting - command: | - if ! 
make format-check; then - make format-fix - echo -e "\n==== Apply using:" - echo "patch -p1 \<> - jobs: - - linux-build - - linux-pr-fuzzer-run - - linux-build-options - - linux-adapters - - linux-presto-fuzzer-run - - macos-build: - matrix: - parameters: - os: [macos-intel] - - format-check - - header-check - - doc-gen-job: - filters: - branches: - only: - - main - - macos-build: - matrix: - parameters: - os: [ macos-m1 ] - filters: - branches: - only: - - main - - shorter-fuzzer: - unless: << pipeline.parameters.run-longer-expression-fuzzer >> - jobs: - - linux-build - - linux-pr-fuzzer-run - - linux-build-options - - linux-adapters - - macos-build: - matrix: - parameters: - os: [ macos-intel ] - - format-check - - header-check - - doc-gen-job: - filters: - branches: - only: - - main - - macos-build: - matrix: - parameters: - os: [ macos-m1 ] - filters: - branches: - only: - - main diff --git a/.clang-format b/.clang-format index eab4576fe09a5..2211eaceb3164 100644 --- a/.clang-format +++ b/.clang-format @@ -1,20 +1,26 @@ --- AccessModifierOffset: -1 AlignAfterOpenBracket: AlwaysBreak +AlignConsecutiveMacros: false AlignConsecutiveAssignments: false +AlignConsecutiveBitFields: false AlignConsecutiveDeclarations: false -AlignEscapedNewlinesLeft: true -AlignOperands: false +AlignEscapedNewlines: Left +AlignOperands: DontAlign AlignTrailingComments: false +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: false -AllowShortBlocksOnASingleLine: false +AllowShortEnumsOnASingleLine: true +AllowShortBlocksOnASingleLine: Never AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: Empty -AllowShortIfStatementsOnASingleLine: false +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never AllowShortLoopsOnASingleLine: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: true +AlwaysBreakTemplateDeclarations: Yes BinPackArguments: false BinPackParameters: false BraceWrapping: @@ -31,19 +37,27 @@ BraceWrapping: IndentBraces: false BreakBeforeBinaryOperators: None BreakBeforeBraces: Attach +BreakInheritanceList: BeforeColon BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon BreakAfterJavaFieldAnnotations: false BreakStringLiterals: false -ColumnLimit: 80 -CommentPragmas: '^ IWYU pragma:' +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: true ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true +DeriveLineEnding: true DerivePointerAlignment: false -DisableFormat: false -ForEachMacros: [ FOR_EACH, FOR_EACH_R, FOR_EACH_RANGE, ] +DisableFormat: false +FixNamespaceComments: true +ForEachMacros: + - FOR_EACH + - FOR_EACH_R + - FOR_EACH_RANGE +IncludeBlocks: Preserve IncludeCategories: - Regex: '^<.*\.h(pp)?>' Priority: 1 @@ -52,36 +66,59 @@ IncludeCategories: - Regex: '.*' Priority: 3 IndentCaseLabels: true -IndentWidth: 2 +IndentCaseBlocks: false +IndentGotoLabels: true +IndentPPDirectives: None +IndentExternBlock: AfterExternBlock +IndentWidth: 2 IndentWrappedFunctionNames: false +InsertNewlineAtEOF: true +InsertTrailingCommas: None +JavaScriptQuotes: Leave +JavaScriptWrapImports: true KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' -MacroBlockEnd: '' +MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None 
+ObjCBinPackProtocolList: Auto ObjCBlockIndentWidth: 2 +ObjCBreakBeforeNestedBlockParam: true ObjCSpaceAfterProperty: false ObjCSpaceBeforeProtocolList: false +PenaltyBreakAssignment: 2 PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Left -ReflowComments: true -SortIncludes: true +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 -SpacesInAngles: false +SpacesInAngles: false +SpacesInConditionalStatement: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false -Standard: Cpp11 -TabWidth: 8 -UseTab: Never +SpaceBeforeSquareBrackets: false +Standard: Cpp11 +TabWidth: 8 +UseCRLF: false +UseTab: Never ... diff --git a/.cmake-format.yaml b/.cmake-format.yaml new file mode 100644 index 0000000000000..91c373bf6b3b4 --- /dev/null +++ b/.cmake-format.yaml @@ -0,0 +1,84 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Currently this config mostly mirrors the default with the addition of custom functions +format: + line_width: 80 + tab_size: 2 + use_tabchars: false + max_pargs_hwrap: 4 + max_subgroups_hwrap: 2 + min_prefix_chars: 4 + max_prefix_chars: 6 + separate_ctrl_name_with_space: false + separate_fn_name_with_space: false + dangle_parens: false + command_case: "canonical" + keyword_case: "unchanged" + always_wrap: + - set_target_properties + - target_sources + - target_link_libraries + +parse: + # We define these for our custom + # functions so they get formatted correctly + additional_commands: + velox_add_library: + pargs: + nargs: 1+ + flags: + - OBJECT + - STATIC + - SHARED + - INTERFACE + kwargs: {} + + velox_base_add_library: + pargs: + nargs: 1+ + flags: + - OBJECT + - STATIC + - SHARED + - INTERFACE + kwargs: {} + + velox_compile_definitions: + pargs: 1 + kwargs: + PRIVATE: '*' + PUBLIC: '*' + INTERFACE: '*' + + velox_include_directories: + pargs: '1+' + flags: + - SYSTEM + - BEFORE + - AFTER + kwargs: + PRIVATE: '*' + PUBLIC: '*' + INTERFACE: '*' + + velox_link_libraries: + pargs: '1+' + kwargs: + PRIVATE: '*' + PUBLIC: '*' + INTERFACE: '*' + +markup: + first_comment_is_literal: true diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000000..62d6bf44b67c1 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,21 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 4bbd3d0c66db8..6b9c96d0426f9 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -22,6 +22,8 @@ on: - 'third_party/**' - 'pyvelox/**' - '.github/workflows/benchmark.yml' + - 'scripts/benchmark-requirements.txt' + push: branches: [main] @@ -35,11 +37,11 @@ concurrency: defaults: run: shell: bash -#TODO concurrency groups? + jobs: benchmark: if: github.repository == 'facebookincubator/velox' - runs-on: 8-core + runs-on: 8-core-ubuntu env: CCACHE_DIR: "${{ github.workspace }}/.ccache/" CCACHE_BASEDIR: "${{ github.workspace }}" @@ -69,10 +71,6 @@ jobs: fetch-depth: 0 submodules: 'recursive' - - name: "Install dependencies" - if: ${{ github.event_name == 'pull_request' }} - run: source velox/scripts/setup-ubuntu.sh - - name: "Checkout Merge Base" if: ${{ github.event_name == 'pull_request' }} working-directory: velox @@ -90,6 +88,10 @@ jobs: git submodule update --init --recursive echo $(git log -n 1) + - name: "Install dependencies" + if: ${{ github.event_name == 'pull_request' }} + run: source velox/scripts/setup-ubuntu.sh && install_apt_deps && install_duckdb + - name: Build Baseline Benchmarks if: ${{ github.event_name == 'pull_request' }} working-directory: velox @@ -115,7 +117,7 @@ jobs: submodules: 'recursive' - name: "Install dependencies" - run: source velox/scripts/setup-ubuntu.sh + run: source velox/scripts/setup-ubuntu.sh && install_apt_deps - name: Build Contender Benchmarks working-directory: velox @@ -198,7 +200,7 @@ jobs: echo "::endgroup::" - name: "Save PR number" - run: echo "${{ github.event.pull_request.number }}" > pr_number.txt + run: echo "${{ github.event.pull_request.number || 0 }}" > pr_number.txt - name: "Upload PR number" uses: actions/upload-artifact@v3 diff --git a/.github/workflows/build-metrics.yml b/.github/workflows/build-metrics.yml new file mode 100644 index 0000000000000..2a3285e2f6ac2 --- /dev/null +++ b/.github/workflows/build-metrics.yml @@ -0,0 +1,173 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Collect Build Metrics + +on: + pull_request: + paths: + - ".github/workflows/build-metrics.yml" + - "script/bm-report/**" + + workflow_dispatch: + inputs: + ref: + description: "ref to check" + required: true + + schedule: + # Run every day at 04:05 + - cron: "5 4 * * *" + +permissions: + contents: read + +jobs: + metrics: + name: Linux ${{ matrix.type }} with adapters + if: ${{ github.repository == 'facebookincubator/velox' }} + runs-on: ${{ matrix.runner }} + container: ghcr.io/facebookincubator/velox-dev:adapters + strategy: + fail-fast: false + matrix: + runner: ["16-core-ubuntu"] + type: ["debug", "release"] + defaults: + run: + shell: bash + env: + VELOX_DEPENDENCY_SOURCE: SYSTEM + simdjson_SOURCE: BUNDLED + xsimd_SOURCE: BUNDLED + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.sha }} + + - name: Fix git permissions + # Usually actions/checkout does this but as we run in a container + # it doesn't work + run: git config --global --add safe.directory ${GITHUB_WORKSPACE} + + - name: Make ${{ matrix.type }} Build + env: + MAKEFLAGS: 'MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=4' + run: | + EXTRA_CMAKE_FLAGS=( + "-DVELOX_ENABLE_BENCHMARKS=ON" + "-DVELOX_ENABLE_ARROW=ON" + "-DVELOX_ENABLE_PARQUET=ON" + "-DVELOX_ENABLE_HDFS=ON" + "-DVELOX_ENABLE_S3=ON" + "-DVELOX_ENABLE_GCS=ON" + "-DVELOX_ENABLE_ABFS=ON" + "-DVELOX_ENABLE_REMOTE_FUNCTIONS=ON" + ) + make '${{ matrix.type }}' + + - name: Log binary sizes + run: | + mkdir -p /tmp/metrics + sizes_file=/tmp/metrics/object_sizes + pushd '_build/${{ matrix.type }}' + + find velox -type f -name '*.so' -o -name '*.a' -exec ls -l -BB {} \; | + awk '{print $5, $9; total += $5} END {print total," total_lib_size"}' > $sizes_file + + find velox -type f -name '*.o' -exec ls -l -BB {} \; | + awk '{print $5, $9; total += $5} END {print total," total_obj_size"}' >> $sizes_file + + find velox -type f -name 'velox_*' -exec ls -l -BB {} \; | + awk '{print $5, $9; total += $5} END {print total," total_exec_size"}' >> $sizes_file + + - name: Copy ninja_log + run: cp _build/${{ matrix.type }}/.ninja_log /tmp/metrics/.ninja_log + + - name: "Install dependencies" + run: | + python3 -m pip install setuptools + python3 -m pip install -r scripts/benchmark-requirements.txt + + - name: "Upload Metrics" + # This disables the upload and report generation on fork PRs but allows it for forks from within the main repo. + if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == 'facebookincubator/velox' }} + env: + CONBENCH_URL: "https://velox-conbench.voltrondata.run/" + CONBENCH_MACHINE_INFO_NAME: "GitHub-runner-${{ matrix.runner }}" + CONBENCH_EMAIL: "${{ secrets.CONBENCH_EMAIL }}" + CONBENCH_PASSWORD: "${{ secrets.CONBENCH_PASSWORD }}" + # These don't actually work https://github.com/conbench/conbench/issues/1484 + # but have to be there to work regardless?? 
+ CONBENCH_PROJECT_REPOSITORY: "${{ github.repository }}" + CONBENCH_PROJECT_COMMIT: "${{ inputs.ref || github.sha }}" + run: | + ./scripts/build-metrics.py upload \ + --build_type "${{ matrix.type }}" \ + --run_id "BM-${{ matrix.type }}-${{ github.run_id }}-${{ github.run_attempt }}" \ + --pr_number "${{ github.event.number }}" \ + --sha "${{ inputs.ref || github.sha }}" \ + "/tmp/metrics" + + upload-report: + if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == 'facebookincubator/velox' }} + permissions: + contents: write + runs-on: ubuntu-latest + name: Generate and Upload Build Metric Report + needs: metrics + steps: + - name: Checkout + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + fetch-depth: 0 + + - name: Setup Git User + run: | + git config --global user.email "velox@users.noreply.github.com" + git config --global user.name "velox" + + - uses: cachix/install-nix-action@8887e596b4ee1134dae06b98d573bd674693f47c # v26 + with: + nix_path: nixpkgs=channel:nixos-unstable + + - name: Build Environment + run: | + cd scripts/bm-report + nix-build + + - name: Build Documentation + env: + CONBENCH_URL: "https://velox-conbench.voltrondata.run/" + CONBENCH_EMAIL: "${{ secrets.CONBENCH_EMAIL }}" + CONBENCH_PASSWORD: "${{ secrets.CONBENCH_PASSWORD }}" + run: | + cd scripts/bm-report + nix-shell --run "quarto render report.qmd" + + - name: Push Report + # The report only uses conbench data from 'main' + # so any data generated in a PR won't be included + if: ${{ github.event_name != 'pull_request' && github.repository == 'facebookincubator/velox'}} + run: | + git checkout gh-pages + mkdir -p docs/bm-report + cp -R scripts/bm-report/report.html docs/bm-report/index.html + git add docs + + if [ -n "$(git status --porcelain --untracked-files=no)" ] + then + git commit -m "Update build metrics" + git push + fi diff --git a/.github/workflows/build_pyvelox.yml b/.github/workflows/build_pyvelox.yml index bb01d8e804ce4..4ebd5dd17319f 100644 --- a/.github/workflows/build_pyvelox.yml +++ b/.github/workflows/build_pyvelox.yml @@ -46,13 +46,15 @@ concurrency: cancel-in-progress: true jobs: + # TODO: https://github.com/facebookincubator/velox/issues/9014 + if: false build_wheels: name: Build wheels on ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - os: [ubuntu-22.04, macos-11] + os: [ubuntu-22.04] steps: - uses: actions/checkout@v3 with: diff --git a/.github/workflows/conbench_upload.yml b/.github/workflows/conbench_upload.yml index b59a30c142cd1..bc1fc42f03f11 100644 --- a/.github/workflows/conbench_upload.yml +++ b/.github/workflows/conbench_upload.yml @@ -94,7 +94,18 @@ jobs: run: | unzip benchmark-results.zip -d benchmark-results unzip pr_number.zip - echo "pr_number=$(cat pr_number.txt)" >> $GITHUB_OUTPUT + + pr_number=$(grep -ox '[[:digit:]]*' pr_number.txt | head -1) + + if [ "$pr_number" -ge 0 ]; then + echo "Found PR number: $pr_number" + else + echo '::error :: Malformed input, aborting!' 
+ exit 1 + fi + + echo "pr_number=$pr_number" >> $GITHUB_OUTPUT + - uses: actions/checkout@v3 with: path: velox @@ -134,7 +145,7 @@ jobs: - name: "Create a GitHub Status on the contender commit (whether the upload was successful)" uses: actions/github-script@v6 - if: always() + if: ${{ !cancelled() && steps.extract.conclusion != 'failure' }} with: script: | let url = 'https://github.com/${{github.repository}}/actions/runs/${{ github.run_id }}' diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 9a9988910e495..932fd41d13399 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -20,6 +20,7 @@ on: - scripts/*.dockerfile - scripts/setup-*.sh - .github/workflows/docker.yml + - docker-compose.yml push: branches: [main] paths: @@ -38,7 +39,23 @@ permissions: jobs: linux: + name: "Build and Push ${{ matrix.name }}" runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - name: Check + file: "scripts/check-container.dockfile" + tags: "ghcr.io/facebookincubator/velox-dev:check" + - name: Centos 9 + file: "scripts/centos.dockerfile" + tags: "ghcr.io/facebookincubator/velox-dev:centos9" + - name: Dev + file: "scripts/ubuntu-22.04-cpp.dockerfile" + args: "" + tags: "ghcr.io/facebookincubator/velox-dev:ubuntu-22.04" + steps: - name: Login to GitHub Container Registry uses: docker/login-action@v2 @@ -53,38 +70,52 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 - - uses: actions/checkout@v3 - - - name: Build and Push check + - name: Build and Push uses: docker/build-push-action@v3 with: - context: scripts - file: scripts/check-container.dockfile - build-args: cpu_target=avx + file: "${{ matrix.file }}" + build-args: "${{ matrix.args }}" push: ${{ github.repository == 'facebookincubator/velox' && github.event_name != 'pull_request'}} - tags: ghcr.io/facebookincubator/velox-dev:check-avx + tags: "${{ matrix.tags }}" - - name: Build and Push circle-ci - uses: docker/build-push-action@v3 - with: - context: scripts - file: scripts/circleci-container.dockfile - build-args: cpu_target=avx - push: ${{ github.repository == 'facebookincubator/velox' && github.event_name != 'pull_request'}} - tags: ghcr.io/facebookincubator/velox-dev:circleci-avx + linux-needs: + name: "Build and Push ${{ matrix.name }}" + needs: linux + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - name: Adapters + file: "scripts/adapters.dockerfile" + tags: "ghcr.io/facebookincubator/velox-dev:adapters" + - name: Presto Java + file: "scripts/prestojava-container.dockerfile" + args: "PRESTO_VERSION=0.288" + tags: "ghcr.io/facebookincubator/velox-dev:presto-java" + - name: Spark server + file: "scripts/spark-container.dockerfile" + args: "SPARK_VERSION=3.5.1" + tags: "ghcr.io/facebookincubator/velox-dev:spark-server" - - name: Build and Push velox-torcharrow - uses: docker/build-push-action@v3 + steps: + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 with: - context: scripts - file: scripts/velox-torcharrow-container.dockfile - build-args: cpu_target=avx - push: ${{ github.repository == 'facebookincubator/velox' && github.event_name != 'pull_request'}} - tags: ghcr.io/facebookincubator/velox-dev:torcharrow-avx + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 - - name: Build and Push dev-image + - name: Build and Push 
uses: docker/build-push-action@v3 with: - file: scripts/ubuntu-22.04-cpp.dockerfile + file: "${{ matrix.file }}" + build-args: "${{ matrix.args }}" push: ${{ github.repository == 'facebookincubator/velox' && github.event_name != 'pull_request'}} - tags: ghcr.io/facebookincubator/velox-dev:amd64-ubuntu-22.04-avx + tags: "${{ matrix.tags }}" diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000000000..471cdaebc9285 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,85 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Update Documentation + +on: + push: + paths: + - "velox/docs/**" + - ".github/workflows/docs.yml" + + pull_request: + paths: + - "velox/docs/**" + - ".github/workflows/docs.yml" + +permissions: + contents: write + +concurrency: + group: ${{ github.workflow }}-${{ github.repository }}-${{ github.head_ref || github.sha }} + cancel-in-progress: true + +jobs: + build_docs: + name: Build and Push + runs-on: ubuntu-latest + steps: + + - name: Checkout + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + fetch-depth: 0 + + - name: Setup Git User + run: | + git config --global user.email "velox@users.noreply.github.com" + git config --global user.name "velox" + + - name: Install Dependencies + run: | + sudo apt update + sudo apt install -y pandoc + pip install sphinx sphinx-tabs breathe sphinx_rtd_theme chardet + + - name: Build Documentation + run: | + cd velox/docs + make clean + # pyvelox + mkdir -p bindings/python + pandoc ../../pyvelox/README.md --from markdown --to rst -s -o bindings/python/README_generated_pyvelox.rst + # velox + make html + + - name: Push Documentation + if: ${{ github.event_name == 'push' && github.repository == 'facebookincubator/velox'}} + run: | + git checkout gh-pages + cp -R velox/docs/_build/html/* docs + git add docs + + if [ -n "$(git status --porcelain --untracked-files=no)" ] + then + git commit -m "Update documentation" + git push + fi + + - name: Upload Documentation + if: github.event_name == 'pull_request' + uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + with: + path: velox/docs/_build/html + retention-days: 3 diff --git a/.github/workflows/experimental.yml b/.github/workflows/experimental.yml new file mode 100644 index 0000000000000..9c55c338f678d --- /dev/null +++ b/.github/workflows/experimental.yml @@ -0,0 +1,251 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: "Experimental Fuzzer Jobs"
+
+on:
+  pull_request:
+    paths:
+      - ".github/workflows/experimental.yml"
+
+
+  workflow_dispatch:
+    inputs:
+      ref:
+        description: 'Ref to check out'
+        default: 'main'
+      numThreads:
+        description: 'Number of threads'
+        default: 16
+      maxHighMemJobs:
+        description: 'Number of high memory jobs'
+        default: 8
+      maxLinkJobs:
+        description: 'Maximum number of link jobs'
+        default: 4
+      extraCMakeFlags:
+        description: 'Additional CMake flags'
+        default: ''
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  contents: read
+
+jobs:
+  compile:
+    runs-on: 8-core-ubuntu
+    timeout-minutes: 120
+    env:
+      CCACHE_DIR: "${{ github.workspace }}/.ccache/"
+      LINUX_DISTRO: "ubuntu"
+    steps:
+
+      - name: "Restore ccache"
+        uses: actions/cache@v3
+        with:
+          path: "${{ env.CCACHE_DIR }}"
+          # We are using the benchmark ccache as it has all
+          # required features enabled, so no need to create a new one
+          key: ccache-benchmark-${{ github.sha }}
+          restore-keys: |
+            ccache-benchmark-
+      - name: "Checkout Repo"
+        uses: actions/checkout@v3
+        with:
+          path: velox
+          submodules: 'recursive'
+          ref: "${{ inputs.ref || 'main' }}"
+
+      - name: "Install dependencies"
+        run: cd velox && source ./scripts/setup-ubuntu.sh && install_apt_deps
+
+      - name: "Build"
+        run: |
+          cd velox
+          make debug NUM_THREADS="${{ inputs.numThreads || 8 }}" MAX_HIGH_MEM_JOBS="${{ inputs.maxHighMemJobs || 8 }}" MAX_LINK_JOBS="${{ inputs.maxLinkJobs || 4 }}" EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON ${{ inputs.extraCMakeFlags }}"
+          ccache -s
+
+      - name: Upload aggregation fuzzer
+        uses: actions/upload-artifact@v3
+        with:
+          name: aggregation
+          path: velox/_build/debug/velox/functions/prestosql/fuzzer/velox_aggregation_fuzzer_test
+
+      - name: Upload spark aggregation fuzzer
+        uses: actions/upload-artifact@v3
+        with:
+          name: spark_aggregation_fuzzer
+          path: velox/_build/debug/velox/functions/sparksql/fuzzer/spark_aggregation_fuzzer_test
+
+      - name: Upload join fuzzer
+        uses: actions/upload-artifact@v3
+        with:
+          name: join
+          path: velox/_build/debug/velox/exec/tests/velox_join_fuzzer_test
+
+  presto-java-aggregation-fuzzer-run:
+    runs-on: 8-core-ubuntu
+    container: ghcr.io/facebookincubator/velox-dev:presto-java
+    timeout-minutes: 120
+    env:
+      CCACHE_DIR: "${{ github.workspace }}/.ccache/"
+      LINUX_DISTRO: "centos"
+    steps:
+
+      - name: "Restore ccache"
+        uses: actions/cache@v3
+        with:
+          path: "${{ env.CCACHE_DIR }}"
+          # We are using the benchmark ccache as it has all
+          # required features enabled, so no need to create a new one
+          key: ccache-presto-${{ github.sha }}
+          restore-keys: |
+            ccache-presto-
+
+      - name: "Checkout Repo"
+        uses: actions/checkout@v3
+        with:
+          path: velox
+          submodules: 'recursive'
+          ref: "${{ inputs.ref || 'main' }}"
+
+
+      - name: "Build"
+        run: |
+          cd velox
+          source /opt/rh/gcc-toolset-12/enable
+          make debug NUM_THREADS="${{ inputs.numThreads || 8 }}" MAX_HIGH_MEM_JOBS="${{ inputs.maxHighMemJobs || 8 }}" MAX_LINK_JOBS="${{ inputs.maxLinkJobs || 4 }}" EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON ${{ inputs.extraCMakeFlags }}"
+          ccache -s
+
+      - name: "Run Aggregate Fuzzer"
+        run: |
+          cd velox
+          cp ./scripts/presto/etc/hive.properties $PRESTO_HOME/etc/catalog
+          ls -lR $PRESTO_HOME/etc
+          $PRESTO_HOME/bin/launcher run -v > /tmp/server.log 2>&1 &
+          # Sleep for 60 seconds to allow Presto server to start.
+          sleep 60
+          /opt/presto-cli --server 127.0.0.1:8080 --execute 'CREATE SCHEMA hive.tpch;'
+          mkdir -p /tmp/aggregate_fuzzer_repro/
+          rm -rfv /tmp/aggregate_fuzzer_repro/*
+          chmod -R 777 /tmp/aggregate_fuzzer_repro
+          _build/debug/velox/functions/prestosql/fuzzer/velox_aggregation_fuzzer_test \
+            --seed ${RANDOM} \
+            --duration_sec 3600 \
+            --logtostderr=1 \
+            --minloglevel=0 \
+            --repro_persist_path=/tmp/aggregate_fuzzer_repro \
+            --enable_sorted_aggregations=true \
+            --presto_url=http://127.0.0.1:8080 \
+          && echo -e "\n\nAggregation fuzzer run finished successfully."
+
+      - name: Archive aggregate production artifacts
+        if: always()
+        uses: actions/upload-artifact@v3
+        with:
+          name: aggregate-fuzzer-failure-artifacts
+          path: |
+            /tmp/aggregate_fuzzer_repro
+            /tmp/server.log
+
+  linux-spark-fuzzer-run:
+    runs-on: ubuntu-latest
+    needs: compile
+    timeout-minutes: 120
+    steps:
+
+      - name: "Checkout Repo"
+        uses: actions/checkout@v3
+        with:
+          ref: "${{ inputs.ref || 'main' }}"
+
+      - name: "Install dependencies"
+        run: source ./scripts/setup-ubuntu.sh && install_apt_deps
+
+      - name: Download spark aggregation fuzzer
+        uses: actions/download-artifact@v3
+        with:
+          name: spark_aggregation_fuzzer
+
+      - name: "Run Spark Aggregate Fuzzer"
+        run: |
+          mkdir -p /tmp/spark_aggregate_fuzzer_repro/
+          chmod -R 777 /tmp/spark_aggregate_fuzzer_repro
+          chmod +x spark_aggregation_fuzzer_test
+          ./spark_aggregation_fuzzer_test \
+            --seed ${RANDOM} \
+            --duration_sec 1800 \
+            --logtostderr=1 \
+            --minloglevel=0 \
+            --repro_persist_path=/tmp/spark_aggregate_fuzzer_repro \
+            --enable_sorted_aggregations=true \
+          && echo -e "\n\nSpark Aggregation Fuzzer run finished successfully."
+
+      - name: Archive Spark aggregate production artifacts
+        if: always()
+        uses: actions/upload-artifact@v3
+        with:
+          name: spark-agg-fuzzer-failure-artifacts
+          path: |
+            /tmp/spark_aggregate_fuzzer_repro
+
+  linux-join-fuzzer-run:
+    runs-on: ubuntu-latest
+    needs: compile
+    timeout-minutes: 120
+    steps:
+
+      - name: "Checkout Repo"
+        uses: actions/checkout@v3
+        with:
+          ref: "${{ inputs.ref || 'main' }}"
+
+      - name: "Install dependencies"
+        run: source ./scripts/setup-ubuntu.sh && install_apt_deps
+
+      - name: Download join fuzzer
+        uses: actions/download-artifact@v3
+        with:
+          name: join
+
+      - name: "Run Join Fuzzer"
+        run: |
+          ls /lib64
+          mkdir -p /tmp/join_fuzzer_repro/
+          rm -rfv /tmp/join_fuzzer_repro/*
+          chmod -R 777 /tmp/join_fuzzer_repro
+          chmod +x velox_join_fuzzer_test
+          ./velox_join_fuzzer_test \
+            --seed ${RANDOM} \
+            --duration_sec 1800 \
+            --logtostderr=1 \
+            --minloglevel=0 \
+          && echo -e "\n\nJoin fuzzer run finished successfully."
+
+      - name: Archive join production artifacts
+        if: always()
+        uses: actions/upload-artifact@v3
+        with:
+          name: join-fuzzer-failure-artifacts
+          path: |
+            /tmp/join_fuzzer_repro
diff --git a/.github/workflows/linux-build.yml b/.github/workflows/linux-build.yml
new file mode 100644
index 0000000000000..1ca5cce9c7ecf
--- /dev/null
+++ b/.github/workflows/linux-build.yml
@@ -0,0 +1,194 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Linux Build + +on: + push: + branches: + - "main" + paths: + - "velox/**" + - "!velox/docs/**" + - "CMakeLists.txt" + - "CMake/**" + - "third_party/**" + - "scripts/setup-ubuntu.sh" + - "scripts/setup-helper-functions.sh" + - ".github/workflows/linux-build.yml" + + pull_request: + paths: + - "velox/**" + - "!velox/docs/**" + - "CMakeLists.txt" + - "CMake/**" + - "third_party/**" + - "scripts/setup-ubuntu.sh" + - "scripts/setup-helper-functions.sh" + - ".github/workflows/linux-build.yml" + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.repository }}-${{ github.head_ref || github.sha }} + cancel-in-progress: true + +jobs: + adapters: + name: Linux release with adapters + # prevent errors when forks ff their main branch + if: ${{ github.repository == 'facebookincubator/velox' }} + runs-on: 8-core-ubuntu + container: ghcr.io/facebookincubator/velox-dev:adapters + defaults: + run: + shell: bash + env: + CCACHE_DIR: "${{ github.workspace }}/.ccache" + VELOX_DEPENDENCY_SOURCE: SYSTEM + simdjson_SOURCE: BUNDLED + xsimd_SOURCE: BUNDLED + CUDA_VERSION: "12.4" + steps: + - uses: actions/checkout@v4 + + - name: Fix git permissions + # Usually actions/checkout does this but as we run in a container + # it doesn't work + run: git config --global --add safe.directory ${GITHUB_WORKSPACE} + + - name: Install Dependencies + run: | + # Allows to install arbitrary cuda-version whithout needing to update + # docker container before. It simplifies testing new/different versions + if ! yum list installed cuda-nvcc-$(echo ${CUDA_VERSION} | tr '.' '-') 1>/dev/null; then + source scripts/setup-centos9.sh + install_cuda ${CUDA_VERSION} + fi + + - name: Install Minio + run: | + MINIO_BINARY="minio-2022-05-26" + if [ ! 
-f /usr/local/bin/${MINIO_BINARY} ]; then + wget https://dl.min.io/server/minio/release/linux-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z -O ${MINIO_BINARY} + chmod +x ./${MINIO_BINARY} + mv ./${MINIO_BINARY} /usr/local/bin/ + fi + + - uses: assignUser/stash/restore@v1 + with: + path: '${{ env.CCACHE_DIR }}' + key: ccache-linux-adapters + + - name: "Zero Ccache Statistics" + run: | + ccache -sz + + - name: Make Release Build + env: + MAKEFLAGS: 'NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=4' + CUDA_ARCHITECTURES: 70 + CUDA_COMPILER: /usr/local/cuda-${CUDA_VERSION}/bin/nvcc + # Set compiler to GCC 12 + CUDA_FLAGS: "-ccbin /opt/rh/gcc-toolset-12/root/usr/bin" + run: | + EXTRA_CMAKE_FLAGS=( + "-DVELOX_ENABLE_BENCHMARKS=ON" + "-DVELOX_ENABLE_ARROW=ON" + "-DVELOX_ENABLE_PARQUET=ON" + "-DVELOX_ENABLE_HDFS=ON" + "-DVELOX_ENABLE_S3=ON" + "-DVELOX_ENABLE_GCS=ON" + "-DVELOX_ENABLE_ABFS=ON" + "-DVELOX_ENABLE_REMOTE_FUNCTIONS=ON" + "-DVELOX_ENABLE_GPU=ON" + "-DVELOX_MONO_LIBRARY=ON" + ) + make release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS[*]}" + + - name: Ccache after + run: ccache -s + + - uses: assignUser/stash/save@v1 + with: + path: '${{ env.CCACHE_DIR }}' + key: ccache-linux-adapters + + - name: Run Tests + # Some of the adapters dependencies are in the 'adapters' conda env + shell: mamba run --no-capture-output -n adapters /usr/bin/bash -e {0} + env: + LIBHDFS3_CONF: "${{ github.workspace }}/scripts/hdfs-client.xml" + working-directory: _build/release + run: | + ctest -j 8 --output-on-failure --no-tests=error + + ubuntu-debug: + runs-on: 8-core-ubuntu + # prevent errors when forks ff their main branch + if: ${{ github.repository == 'facebookincubator/velox' }} + name: "Ubuntu debug with resolve_dependency" + env: + CCACHE_DIR: "${{ github.workspace }}/.ccache" + defaults: + run: + shell: bash + working-directory: velox + steps: + + - name: Get Ccache Stash + uses: assignUser/stash/restore@v1 + with: + path: '${{ env.CCACHE_DIR }}' + key: ccache-ubuntu-debug-default + + - name: Ensure Stash Dirs Exists + working-directory: ${{ github.workspace }} + run: | + mkdir -p '${{ env.CCACHE_DIR }}' + + - uses: actions/checkout@v4 + with: + path: velox + + - name: Install Dependencies + run: | + source scripts/setup-ubuntu.sh && install_apt_deps + + - name: Clear CCache Statistics + run: | + ccache -sz + + - name: Make Debug Build + env: + VELOX_DEPENDENCY_SOURCE: BUNDLED + MAKEFLAGS: "NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=3" + EXTRA_CMAKE_FLAGS: "-DVELOX_ENABLE_ARROW=ON -DVELOX_ENABLE_PARQUET=ON" + run: | + make debug + + - name: CCache after + run: | + ccache -vs + + - uses: assignUser/stash/save@v1 + with: + path: '${{ env.CCACHE_DIR }}' + key: ccache-ubuntu-debug-default + + - name: Run Tests + run: | + cd _build/debug && ctest -j 8 --output-on-failure --no-tests=error diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml new file mode 100644 index 0000000000000..63672ec7aa87a --- /dev/null +++ b/.github/workflows/macos.yml @@ -0,0 +1,109 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +name: macOS Build + +on: + push: + paths: + - "velox/**" + - "!velox/docs/**" + - "CMakeLists.txt" + - "CMake/**" + - "third_party/**" + - ".github/workflows/macos.yml" + + pull_request: + paths: + - "velox/**" + - "!velox/docs/**" + - "CMakeLists.txt" + - "CMake/**" + - "third_party/**" + - ".github/workflows/macos.yml" + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.repository }}-${{ github.head_ref || github.sha }} + cancel-in-progress: true + +jobs: + macos-build: + name: "${{ matrix.os }}" + strategy: + fail-fast: false + matrix: + # macos-13 = x86_64 Mac + # macos-14 = arm64 Mac + os: [macos-13, macos-14] + runs-on: ${{ matrix.os }} + env: + CCACHE_DIR: '${{ github.workspace }}/.ccache' + # The arm runners have only 7GB RAM + BUILD_TYPE: "${{ matrix.os == 'macos-14' && 'Release' || 'Debug' }}" + INSTALL_PREFIX: "/tmp/deps-install" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Install Dependencies + env: + HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK: "TRUE" + run: | + source scripts/setup-macos.sh + install_build_prerequisites + install_velox_deps_from_brew + install_double_conversion + + echo "NJOBS=`sysctl -n hw.ncpu`" >> $GITHUB_ENV + brew unlink protobuf || echo "protobuf not installed" + brew link --force protobuf@21 + + - name: Cache ccache + uses: assignUser/stash/restore@v1 + with: + path: '${{ env.CCACHE_DIR }}' + key: ccache-macos-1-${{ matrix.os }} + + - name: Configure Build + env: + folly_SOURCE: BUNDLED #brew folly does not have int128 + fmt_SOURCE: BUNDLED #brew fmt11 is not supported + run: | + ccache -sz -M 5Gi + cmake \ + -B _build/$BUILD_TYPE \ + -GNinja \ + -DTREAT_WARNINGS_AS_ERRORS=1 \ + -DENABLE_ALL_WARNINGS=1 \ + -DVELOX_ENABLE_PARQUET=ON \ + -DVELOX_MONO_LIBRARY=ON \ + -DCMAKE_BUILD_TYPE=$BUILD_TYPE + + - name: Build + run: | + cmake --build _build/$BUILD_TYPE -j $NJOBS + ccache -s + + - uses: assignUser/stash/save@v1 + with: + path: '${{ env.CCACHE_DIR }}' + key: ccache-macos-1-${{ matrix.os }} + + - name: Run Tests + if: false + run: ctest -j $NJOBS --test-dir _build/debug --output-on-failure diff --git a/.github/workflows/preliminary_checks.yml b/.github/workflows/preliminary_checks.yml new file mode 100644 index 0000000000000..cba762d02cb0e --- /dev/null +++ b/.github/workflows/preliminary_checks.yml @@ -0,0 +1,72 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+name: Run Checks
+
+on:
+  pull_request:
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.repository }}-${{ github.head_ref || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  check-matrix:
+    name: ${{ matrix.config.name }}
+    runs-on: ubuntu-latest
+    container: ghcr.io/facebookincubator/velox-dev:check
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - { name: "License Header",
+              command: "header-fix",
+              message: "Found missing License Header(s)",
+            }
+          - { name: "Code Format",
+              command: "format-fix",
+              message: "Found format issues"
+            }
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Fix git permissions
+        # Usually actions/checkout does this but as we run in a container
+        # it doesn't work
+        run: git config --global --add safe.directory ${GITHUB_WORKSPACE}
+
+      - name: Check ${{ matrix.config.name }}
+        run: |
+          make ${{ matrix.config.command }}
+
+          if ! git diff --quiet; then
+            diff=`git --no-pager diff`
+            echo "${{ matrix.config.message }} in the following files:"
+            git --no-pager diff --name-only
+            echo "Check the Job summary for a copy-pasteable patch."
+
+            echo "> [!IMPORTANT]" >> $GITHUB_STEP_SUMMARY
+            echo "${{ matrix.config.message }}" >> $GITHUB_STEP_SUMMARY
+            echo "> Please apply fix using:" >> $GITHUB_STEP_SUMMARY
+            echo "\`\`\`sh" >> $GITHUB_STEP_SUMMARY
+            echo "patch -p1 <<EOF" >> $GITHUB_STEP_SUMMARY
+            echo "$diff" >> $GITHUB_STEP_SUMMARY
+            echo "EOF" >> $GITHUB_STEP_SUMMARY
+            echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
+            exit 1
+          fi
diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml
index a514f7e6fc61e..a487dd4bbb4c6 100644
--- a/.github/workflows/scheduled.yml
+++ b/.github/workflows/scheduled.yml
@@ -12,44 +12,369 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-name: "Scheduled Fuzzer Jobs"
+name: "Fuzzer Jobs"

 on:
+  pull_request:
+    paths:
+      - "velox/**"
+      - "!velox/docs/**"
+      - "CMakeLists.txt"
+      - "CMake/**"
+      - "third_party/**"
+      - "scripts/setup-ubuntu.sh"
+      - "scripts/setup-helper-functions.sh"
+      - ".github/workflows/scheduled.yml"
+
+  push:
+    branches:
+      - "main"
+    paths:
+      - "velox/**"
+      - "!velox/docs/**"
+      - "CMakeLists.txt"
+      - "CMake/**"
+      - "third_party/**"
+      - "scripts/setup-ubuntu.sh"
+      - "scripts/setup-helper-functions.sh"
+      - ".github/workflows/scheduled.yml"
+
   schedule:
     - cron: '0 3 * * *'

+  workflow_dispatch:
+    inputs:
+      ref:
+        description: 'Ref to checkout'
+        default: 'main'
+      numThreads:
+        description: 'Number of threads'
+        default: 16
+      maxHighMemJobs:
+        description: 'Number of high memory jobs'
+        default: 8
+      maxLinkJobs:
+        description: 'Maximum number of link jobs'
+        default: 4
+      extraCMakeFlags:
+        description: 'Additional CMake flags'
+        default: ''
+      duration:
+        description: 'Duration of fuzzer run in seconds'
+        default: 1800
+
 defaults:
   run:
     shell: bash

+permissions:
+  contents: read
+
+concurrency:
+  # This will not cancel fuzzer runs on main (regardless of which trigger)
+  # by making the commit sha part of the group, but will use the branch
+  # name in PRs to cancel ongoing runs on a new commit.
+ group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event_name == 'pull_request' && github.head_ref || github.sha }} + cancel-in-progress: true + +env: + # Run for 15 minute on PRs + DURATION: "${{ inputs.duration || ( github.event_name != 'schedule' && 900 || 1800 )}}" + # minimize artifact duration for PRs, keep them a bit longer for nightly runs + RETENTION: "${{ github.event_name == 'pull_request' && 1 || 3 }}" + jobs: - linux-presto-fuzzer-run: - runs-on: 8-core + compile: + name: Build + # prevent errors when forks ff their main branch + if: ${{ github.repository == 'facebookincubator/velox' }} + runs-on: 16-core-ubuntu + container: ghcr.io/facebookincubator/velox-dev:centos9 timeout-minutes: 120 env: - CCACHE_DIR: "${{ github.workspace }}/.ccache/" - CCACHE_BASEDIR: "${{ github.workspace }}" + CCACHE_DIR: "${{ github.workspace }}/.ccache" LINUX_DISTRO: "ubuntu" + MAKEFLAGS: "NUM_THREADS=${{ inputs.numThreads || 16 }} MAX_HIGH_MEM_JOBS=${{ inputs.maxHighMemJobs || 8 }} MAX_LINK_JOBS=${{ inputs.maxLinkJobs || 4 }}" + + defaults: + run: + shell: bash + working-directory: velox + outputs: + presto_bias: ${{ steps.sig-check.outputs.presto_functions }} + presto_error: ${{ steps.sig-check.outputs.presto_error }} + spark_bias: ${{ steps.sig-check.outputs.spark_functions }} + spark_error: ${{ steps.sig-check.outputs.spark_error }} + presto_aggregate_bias: ${{ steps.sig-check.outputs.presto_aggregate_functions }} + presto_aggregate_error: ${{ steps.sig-check.outputs.presto_aggregate_error }} + steps: - - name: "Checkout Repo" - uses: actions/checkout@v3 + - name: Get latest commit from main + if: ${{ github.event_name != 'schedule' }} + working-directory: ${{ github.workspace }} + env: + GH_TOKEN: ${{ github.token }} + id: get-head + run: | + if [ '${{ github.event_name == 'push' }}' == "true" ]; then + # get the parent commit of the current one to get the relevant function signatures + head_main=$(gh api -q '.parents.[0].sha' '/repos/facebookincubator/velox/commits/${{ github.sha }}') + else + head_main=$(gh api -H "Accept: application/vnd.github.sha" /repos/facebookincubator/velox/commits/heads/main) + fi + echo "head_main=$head_main" >> $GITHUB_OUTPUT + + - name: Get Function Signature Stash + uses: assignUser/stash/restore@v1 + id: get-sig + with: + path: /tmp/signatures + key: function-signatures-${{ steps.get-head.outputs.head_main || github.sha }} + + - name: Restore ccache + uses: assignUser/stash/restore@v1 + with: + path: "${{ env.CCACHE_DIR }}" + key: ccache-fuzzer-centos + + - name: Fix git permissions + working-directory: ${{ github.workspace }} + # Usually actions/checkout does this but as we run in a container + # it doesn't work + run: | + git config --global --add safe.directory ${GITHUB_WORKSPACE}/velox + git config --global --add safe.directory ${GITHUB_WORKSPACE}/velox_main + + - name: Ensure Stash Dirs Exists + working-directory: ${{ github.workspace }} + run: | + mkdir -p '${{ env.CCACHE_DIR }}' + mkdir -p /tmp/signatures + + - name: Checkout Main + if: ${{ github.event_name != 'schedule' && steps.get-sig.outputs.stash-hit != 'true' }} + uses: actions/checkout@v4 with: + ref: ${{ steps.get-head.outputs.head_main || 'main' }} + path: velox_main + + - name: Build PyVelox + if: ${{ github.event_name != 'schedule' && steps.get-sig.outputs.stash-hit != 'true' }} + working-directory: velox_main + run: | + python3 -m venv .venv + source .venv/bin/activate + + make python-build + + - name: Create Baseline Signatures + if: ${{ github.event_name != 'schedule' && 
steps.get-sig.outputs.stash-hit != 'true' }} + working-directory: velox_main + run: | + source .venv/bin/activate + python3 -m pip install deepdiff + python3 scripts/signature.py export --spark /tmp/signatures/spark_signatures_main.json + python3 scripts/signature.py export --presto /tmp/signatures/presto_signatures_main.json + python3 scripts/signature.py export_aggregates --presto /tmp/signatures/presto_aggregate_signatures_main.json + + - name: Save Function Signature Stash + if: ${{ github.event_name == 'pull_request' && steps.get-sig.outputs.stash-hit != 'true' }} + uses: assignUser/stash/save@v1 + with: + path: /tmp/signatures + key: function-signatures-${{ steps.get-head.outputs.head_main }} + + - name: Checkout Contender + uses: actions/checkout@v4 + with: + path: velox submodules: 'recursive' + ref: "${{ inputs.ref }}" - - name: "Install dependencies" - run: source ./scripts/setup-ubuntu.sh + - name: Zero Ccache Statistics + run: | + ccache -sz - - name: "Build" + - name: Build + env: + EXTRA_CMAKE_FLAGS: "-DVELOX_ENABLE_ARROW=ON -DVELOX_BUILD_PYTHON_PACKAGE=ON ${{ inputs.extraCMakeFlags }}" run: | - make debug NUM_THREADS=16 MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=6 EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON" - ccache -s + EXTRA_CMAKE_FLAGS="-DPYTHON_EXECUTABLE=$(which python3) $EXTRA_CMAKE_FLAGS" + make debug - - name: "Run Presto Fuzzer" + - name: Ccache after + run: ccache -s + + - name: Save ccache + # see https://github.com/actions/upload-artifact/issues/543 + continue-on-error: true + if: ${{ github.event_name != 'schedule' }} + uses: assignUser/stash/save@v1 + with: + path: "${{ env.CCACHE_DIR }}" + key: ccache-fuzzer-centos + + - name: Build PyVelox + if: ${{ github.event_name != 'schedule' }} + env: + VELOX_BUILD_DIR: "_build/debug" + run: | + python3 -m venv .venv + source .venv/bin/activate + python3 -m pip install -e . 
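+      # (Sketch, for orientation only.) The two PyVelox builds in this job exist
+      # to export function signatures for comparison; a rough local equivalent,
+      # with illustrative paths, is:
+      #   python3 scripts/signature.py export --presto /tmp/signatures/presto_signatures_main.json       # on main
+      #   python3 scripts/signature.py export --presto /tmp/signatures/presto_signatures_contender.json  # on the PR branch
+      # The step below then diffs the two exports to flag new or changed
+      # functions for biased fuzzing.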
+ + - name: Create and test new function signatures + if: ${{ github.event_name != 'schedule' }} + id: sig-check run: | - mkdir -p /tmp/fuzzer_repro/ + source .venv/bin/activate + python3 -m pip install deepdiff + python3 scripts/signature.py gh_bias_check presto spark + python3 scripts/signature.py export_aggregates --presto /tmp/signatures/presto_aggregate_signatures_contender.json + python3 scripts/signature.py bias_aggregates /tmp/signatures/presto_aggregate_signatures_main.json \ + /tmp/signatures/presto_aggregate_signatures_contender.json /tmp/signatures/presto_aggregate_bias_functions \ + /tmp/signatures/presto_aggregate_errors + + - name: Upload Signature Artifacts + if: ${{ github.event_name != 'schedule' }} + uses: actions/upload-artifact@v4 + with: + name: signatures + path: /tmp/signatures + retention-days: "${{ env.RETENTION }}" + + - name: Prepare signatures + working-directory: /tmp/signatures + if: ${{ github.event_name == 'push' }} + run: | + # Remove irrelevant artifacts + rm -f *_bias_functions + rm -f *_signatures_main.json + # Rename signature files as 'main' files + for f in *_signatures_contender.json; do + mv "$f" "${f/_contender.json/_main.json}" + done + + - name: Save Function Signature Stash + if: ${{ github.event_name == 'push' }} + uses: assignUser/stash/save@v1 + with: + path: /tmp/signatures + key: function-signatures-${{ github.sha }} + + - name: Upload presto fuzzer + uses: actions/upload-artifact@v4 + with: + name: presto + path: velox/_build/debug/velox/expression/fuzzer/velox_expression_fuzzer_test + retention-days: "${{ env.RETENTION }}" + + - name: Upload spark expression fuzzer + uses: actions/upload-artifact@v4 + with: + name: spark_expression_fuzzer + path: velox/_build/debug/velox/expression/fuzzer/spark_expression_fuzzer_test + retention-days: "${{ env.RETENTION }}" + + - name: Upload spark aggregation fuzzer + uses: actions/upload-artifact@v4 + with: + name: spark_aggregation_fuzzer + path: velox/_build/debug/velox/functions/sparksql/fuzzer/spark_aggregation_fuzzer_test + retention-days: "${{ env.RETENTION }}" + + - name: Upload aggregation fuzzer + uses: actions/upload-artifact@v4 + with: + name: aggregation + path: velox/_build/debug/velox/functions/prestosql/fuzzer/velox_aggregation_fuzzer_test + retention-days: "${{ env.RETENTION }}" + + - name: Upload join fuzzer + uses: actions/upload-artifact@v4 + with: + name: join + path: velox/_build/debug/velox/exec/tests/velox_join_fuzzer_test + retention-days: "${{ env.RETENTION }}" + + - name: Upload exchange fuzzer + uses: actions/upload-artifact@v4 + with: + name: exchange + path: velox/_build/debug//velox/exec/tests/velox_exchange_fuzzer_test + retention-days: "${{ env.RETENTION }}" + + - name: Upload window fuzzer + uses: actions/upload-artifact@v4 + with: + name: window + path: velox/_build/debug/velox/functions/prestosql/fuzzer/velox_window_fuzzer_test + retention-days: "${{ env.RETENTION }}" + + - name: Upload row number fuzzer + uses: actions/upload-artifact@v4 + with: + name: row_number + path: velox/_build/debug//velox/exec/tests/velox_row_number_fuzzer_test + retention-days: "${{ env.RETENTION }}" + + - name: Upload writer fuzzer + uses: actions/upload-artifact@v4 + with: + name: writer + path: velox/_build/debug/velox/functions/prestosql/fuzzer/velox_writer_fuzzer_test + retention-days: "${{ env.RETENTION }}" + + presto-fuzzer-run: + name: Presto Fuzzer + if: ${{ needs.compile.outputs.presto_bias != 'true' }} + runs-on: ubuntu-latest + container: 
ghcr.io/facebookincubator/velox-dev:centos9 + needs: compile + timeout-minutes: 120 + steps: + + - uses: dorny/paths-filter@v3 + if: github.event_name == 'pull_request' + id: changes + with: + filters: | + presto: + - 'velox/expression/!(test)**' + - 'velox/exec/!(test)**' + - 'velox/common/!(test)**' + - 'velox/core/!(test)**' + - 'velox/vector/!(test)**' + + - name: Set presto specific fuzzer duration + env: + # Run for 30 minutes instead of 15, when files relevant to presto are touched + pr_duration: "${{ steps.changes.outputs.presto == 'true' && 1800 || 900 }}" + # Run for 60 minutes if its a scheduled run + other_duration: "${{ inputs.duration || (github.event_name == 'push' && 1800 || 3600) }}" + is_pr: "${{ github.event_name == 'pull_request' }}" + run: | + + if [ "$is_pr" == "true" ]; then + duration=$pr_duration + else + duration=$other_duration + fi + + echo "DURATION=$duration" >> $GITHUB_ENV + + - name: Download presto fuzzer + uses: actions/download-artifact@v4 + with: + name: presto + + - name: Run Presto Fuzzer + run: | + mkdir -p /tmp/fuzzer_repro/logs/ chmod -R 777 /tmp/fuzzer_repro - _build/debug/velox/expression/tests/velox_expression_fuzzer_test \ + chmod +x velox_expression_fuzzer_test + ./velox_expression_fuzzer_test \ --seed ${RANDOM} \ --enable_variadic_signatures \ --velox_fuzzer_enable_complex_types \ @@ -59,49 +384,172 @@ jobs: --max_expression_trees_per_step 2 \ --retry_with_try \ --enable_dereference \ - --duration_sec 3600 \ - --logtostderr=1 \ + --duration_sec $DURATION \ --minloglevel=0 \ + --stderrthreshold=2 \ + --log_dir=/tmp/fuzzer_repro/logs \ --repro_persist_path=/tmp/fuzzer_repro \ && echo -e "\n\nFuzzer run finished successfully." - name: Archive production artifacts - if: always() - uses: actions/upload-artifact@v3 + if: ${{ !cancelled() }} + uses: actions/upload-artifact@v4 with: name: presto-fuzzer-failure-artifacts path: | /tmp/fuzzer_repro - - - linux-spark-fuzzer-run: - runs-on: 8-core + presto-bias-fuzzer: + name: Presto Bias Fuzzer + runs-on: ubuntu-latest + container: ghcr.io/facebookincubator/velox-dev:centos9 + needs: compile + if: ${{ needs.compile.outputs.presto_bias == 'true' }} timeout-minutes: 120 - env: - CCACHE_DIR: "${{ github.workspace }}/.ccache/" - CCACHE_BASEDIR: "${{ github.workspace }}" - LINUX_DISTRO: "ubuntu" steps: - - name: "Checkout Repo" - uses: actions/checkout@v3 + - name: Download presto expression fuzzer + uses: actions/download-artifact@v4 with: - submodules: 'recursive' + name: presto + + - name: Download Signatures + uses: actions/download-artifact@v4 + with: + name: signatures + path: /tmp/signatures + + - name: Run Presto Expression Fuzzer + run: | + ls /tmp/signatures + mkdir -p /tmp/presto_bias_fuzzer_repro/logs/ + chmod -R 777 /tmp/presto_bias_fuzzer_repro + chmod +x velox_expression_fuzzer_test + ./velox_expression_fuzzer_test \ + --seed ${RANDOM} \ + --lazy_vector_generation_ratio 0.2 \ + --assign_function_tickets $(cat /tmp/signatures/presto_bias_functions) \ + --duration_sec 3600 \ + --enable_variadic_signatures \ + --velox_fuzzer_enable_complex_types \ + --velox_fuzzer_enable_column_reuse \ + --velox_fuzzer_enable_expression_reuse \ + --max_expression_trees_per_step 2 \ + --retry_with_try \ + --enable_dereference \ + --minloglevel=0 \ + --stderrthreshold=2 \ + --log_dir=/tmp/presto_bias_fuzzer_repro/logs \ + --repro_persist_path=/tmp/presto_bias_fuzzer_repro \ + && echo -e "\n\nPresto Fuzzer run finished successfully." 
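+      # Assumed format: /tmp/signatures/presto_bias_functions is produced by the
+      # signature check in the compile job and passed verbatim to
+      # --assign_function_tickets, which is understood to take comma-separated
+      # "<function>=<tickets>" pairs, e.g. (names and weights illustrative):
+      #   array_sort=10,map_keys=10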
-      - name: "Install dependencies"
-        run: source ./scripts/setup-ubuntu.sh
+      - name: Archive Presto bias production artifacts
+        if: ${{ !cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: presto-bias-fuzzer-failure-artifacts
+          path: |
+            /tmp/presto_bias_fuzzer_repro

-      - name: "Build"
+  spark-aggregate-fuzzer-run:
+    name: Spark Aggregate Fuzzer
+    runs-on: ubuntu-latest
+    container: ghcr.io/facebookincubator/velox-dev:centos9
+    needs: compile
+    timeout-minutes: 60
+    steps:
+
+      - name: Download spark aggregation fuzzer
+        uses: actions/download-artifact@v4
+        with:
+          name: spark_aggregation_fuzzer
+
+      - name: Run Spark Aggregate Fuzzer
         run: |
-          make debug NUM_THREADS=16 MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=6 EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON"
-          ccache -s
+          mkdir -p /tmp/spark_aggregate_fuzzer_repro/logs/
+          chmod -R 777 /tmp/spark_aggregate_fuzzer_repro
+          chmod +x spark_aggregation_fuzzer_test
+          ./spark_aggregation_fuzzer_test \
+            --seed ${RANDOM} \
+            --duration_sec $DURATION \
+            --minloglevel=0 \
+            --stderrthreshold=2 \
+            --log_dir=/tmp/spark_aggregate_fuzzer_repro/logs \
+            --repro_persist_path=/tmp/spark_aggregate_fuzzer_repro \
+            && echo -e "\n\nSpark Aggregation Fuzzer run finished successfully."
+
+      - name: Archive Spark aggregate production artifacts
+        if: ${{ !cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: spark-agg-fuzzer-failure-artifacts
+          path: |
+            /tmp/spark_aggregate_fuzzer_repro
+
+  spark-bias-fuzzer:
+    name: Spark Bias Fuzzer
+    runs-on: ubuntu-latest
+    container: ghcr.io/facebookincubator/velox-dev:centos9
+    needs: compile
+    if: ${{ needs.compile.outputs.spark_bias == 'true' }}
+    timeout-minutes: 120
+    steps:
+
+      - name: Download spark expression fuzzer
+        uses: actions/download-artifact@v4
+        with:
+          name: spark_expression_fuzzer
+
+      - name: Download Signatures
+        uses: actions/download-artifact@v4
+        with:
+          name: signatures
+          path: /tmp/signatures

-      - name: "Run Spark Fuzzer"
+      - name: Run Spark Expression Fuzzer
         run: |
-          mkdir -p /tmp/spark_fuzzer_repro/
+          ls /tmp/signatures
+          mkdir -p /tmp/spark_bias_fuzzer_repro/logs/
+          chmod -R 777 /tmp/spark_bias_fuzzer_repro
+          chmod +x spark_expression_fuzzer_test
+          ./spark_expression_fuzzer_test \
+            --seed ${RANDOM} \
+            --duration_sec $DURATION \
+            --minloglevel=0 \
+            --stderrthreshold=2 \
+            --log_dir=/tmp/spark_bias_fuzzer_repro/logs \
+            --assign_function_tickets $(cat /tmp/signatures/spark_bias_functions) \
+            --repro_persist_path=/tmp/spark_bias_fuzzer_repro \
+            && echo -e "\n\nSpark Fuzzer run finished successfully."
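+      # Reproducibility sketch: ${RANDOM} picks a fresh seed on every run; to
+      # replay a failure locally, reuse the seed recorded in the fuzzer logs,
+      # e.g. (seed and duration illustrative):
+      #   ./spark_expression_fuzzer_test --seed 12345 --duration_sec 60 \
+      #     --repro_persist_path=/tmp/spark_bias_fuzzer_repro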
+ + - name: Archive Spark expression production artifacts + if: ${{ !cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: spark-fuzzer-failure-artifacts + path: | + /tmp/spark_bias_fuzzer_repro + + spark-fuzzer: + name: Spark Fuzzer + if: ${{ needs.compile.outputs.spark_bias != 'true' }} + runs-on: ubuntu-latest + container: ghcr.io/facebookincubator/velox-dev:centos9 + needs: compile + timeout-minutes: 120 + steps: + + - name: Download spark expression fuzzer + uses: actions/download-artifact@v4 + with: + name: spark_expression_fuzzer + + - name: Run Spark Expression Fuzzer + run: | + mkdir -p /tmp/spark_fuzzer_repro/logs/ chmod -R 777 /tmp/spark_fuzzer_repro - _build/debug/velox/expression/tests/spark_expression_fuzzer_test \ + chmod +x spark_expression_fuzzer_test + ./spark_expression_fuzzer_test \ --seed ${RANDOM} \ --enable_variadic_signatures \ --lazy_vector_generation_ratio 0.2 \ @@ -109,122 +557,436 @@ jobs: --velox_fuzzer_enable_expression_reuse \ --max_expression_trees_per_step 2 \ --retry_with_try \ - --duration_sec 1800 \ - --logtostderr=1 \ + --enable_dereference \ + --duration_sec $DURATION \ --minloglevel=0 \ + --stderrthreshold=2 \ + --log_dir=/tmp/spark_fuzzer_repro/logs \ --repro_persist_path=/tmp/spark_fuzzer_repro \ && echo -e "\n\nSpark Fuzzer run finished successfully." - - name: Archive Spark production artifacts - if: always() - uses: actions/upload-artifact@v3 + - name: Archive Spark expression production artifacts + if: ${{ !cancelled() }} + uses: actions/upload-artifact@v4 with: name: spark-fuzzer-failure-artifacts path: | /tmp/spark_fuzzer_repro - - name: "Run Spark Aggregate Fuzzer" + presto-java-join-fuzzer-run: + name: Join Fuzzer + runs-on: ubuntu-latest + container: ghcr.io/facebookincubator/velox-dev:presto-java + needs: compile + timeout-minutes: 120 + env: + CCACHE_DIR: "${{ github.workspace }}/.ccache/" + LINUX_DISTRO: "centos" + steps: + + - name: Download join fuzzer + uses: actions/download-artifact@v4 + with: + name: join + + - name: "Checkout Repo" + uses: actions/checkout@v4 + with: + path: velox + submodules: 'recursive' + ref: "${{ inputs.ref }}" + + - name: Fix git permissions + # Usually actions/checkout does this but as we run in a container + # it doesn't work + run: git config --global --add safe.directory ${GITHUB_WORKSPACE}/velox + + - name: Run Join Fuzzer run: | - mkdir -p /tmp/spark_aggregate_fuzzer_repro/ - chmod -R 777 /tmp/spark_aggregate_fuzzer_repro - _build/debug/velox/exec/tests/spark_aggregation_fuzzer_test \ + cd velox + cp ./scripts/presto/etc/hive.properties $PRESTO_HOME/etc/catalog + ls -lR $PRESTO_HOME/etc + $PRESTO_HOME/bin/launcher run -v > /tmp/server.log 2>&1 & + # Sleep for 60 seconds to allow Presto server to start. + sleep 60 + /opt/presto-cli --server 127.0.0.1:8080 --execute 'CREATE SCHEMA hive.tpch;' + cd - + mkdir -p /tmp/join_fuzzer_repro/logs/ + chmod -R 777 /tmp/join_fuzzer_repro + chmod +x velox_join_fuzzer_test + ./velox_join_fuzzer_test \ --seed ${RANDOM} \ - --duration_sec 1800 \ - --logtostderr=1 \ + --duration_sec $DURATION \ --minloglevel=0 \ - --repro_persist_path=/tmp/spark_aggregate_fuzzer_repro \ - && echo -e "\n\nSpark Aggregation Fuzzer run finished successfully." - - - name: Archive Spark aggregate production artifacts - if: always() - uses: actions/upload-artifact@v3 + --stderrthreshold=2 \ + --log_dir=/tmp/join_fuzzer_repro/logs \ + --presto_url=http://127.0.0.1:8080 \ + --req_timeout_ms=2000 \ + && echo -e "\n\nJoin fuzzer run finished successfully." 
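+      # Sketch (not part of the upstream step): the fixed 60s sleep could be
+      # replaced by polling Presto's standard HTTP status endpoint until the
+      # server answers, e.g. (retry bounds illustrative):
+      #   for i in $(seq 1 60); do
+      #     curl -fsS http://127.0.0.1:8080/v1/info && break
+      #     sleep 5
+      #   done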
+
+      - name: Archive join production artifacts
+        if: ${{ !cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
-          name: spark-agg-fuzzer-failure-artifacts
+          name: presto-sot-join-fuzzer-failure-artifacts
           path: |
-            /tmp/spark_aggregate_fuzzer_repro
+            /tmp/join_fuzzer_repro
+            /tmp/server.log
+
+  exchange-fuzzer-run:
+    name: Exchange Fuzzer
+    runs-on: ubuntu-latest
+    container: ghcr.io/facebookincubator/velox-dev:centos9
+    needs: compile
+    timeout-minutes: 120
+    steps:
+
+      - name: Download exchange fuzzer
+        uses: actions/download-artifact@v4
+        with:
+          name: exchange
+
+      - name: Run Exchange Fuzzer
+        run: |
+          cat /proc/sys/vm/max_map_count
+          mkdir -p /tmp/exchange_fuzzer_repro/logs/
+          chmod -R 777 /tmp/exchange_fuzzer_repro
+          chmod +x velox_exchange_fuzzer_test
+          ./velox_exchange_fuzzer_test \
+            --seed ${RANDOM} \
+            --duration_sec $DURATION \
+            --minloglevel=0 \
+            --stderrthreshold=2 \
+            --log_dir=/tmp/exchange_fuzzer_repro/logs \
+            --repro_path=/tmp/exchange_fuzzer_repro \
+            && echo -e "\n\nExchange fuzzer run finished successfully."

-  linux-aggregate-fuzzer-run:
-    runs-on: 8-core
+      - name: Archive Exchange production artifacts
+        if: ${{ !cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: exchange-fuzzer-failure-artifacts
+          path: |
+            /tmp/exchange_fuzzer_repro
+
+  row-number-fuzzer-run:
+    name: RowNumber Fuzzer
+    runs-on: ubuntu-latest
+    container: ghcr.io/facebookincubator/velox-dev:centos9
+    needs: compile
+    timeout-minutes: 120
+    steps:
+
+      - name: Download row number fuzzer
+        uses: actions/download-artifact@v4
+        with:
+          name: row_number
+
+      - name: Run RowNumber Fuzzer
+        run: |
+          cat /proc/sys/vm/max_map_count
+          mkdir -p /tmp/row_fuzzer_repro/logs/
+          chmod -R 777 /tmp/row_fuzzer_repro
+          chmod +x velox_row_number_fuzzer_test
+          ./velox_row_number_fuzzer_test \
+            --seed ${RANDOM} \
+            --duration_sec $DURATION \
+            --minloglevel=0 \
+            --stderrthreshold=2 \
+            --log_dir=/tmp/row_fuzzer_repro/logs \
+            && echo -e "\n\nRow number fuzzer run finished successfully."
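+      # The max_map_count printout above is diagnostic: long fuzzer runs can
+      # exhaust memory-map regions. On a privileged host it could be raised
+      # with, e.g. (value illustrative):
+      #   sysctl -w vm.max_map_count=262144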
+ + - name: Archive row number production artifacts + if: ${{ !cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: row-fuzzer-failure-artifacts + path: | + /tmp/row_fuzzer_repro + presto-java-aggregation-fuzzer-run: + name: Aggregation Fuzzer with Presto as source of truth + needs: compile + runs-on: ubuntu-latest + container: ghcr.io/facebookincubator/velox-dev:presto-java timeout-minutes: 120 env: CCACHE_DIR: "${{ github.workspace }}/.ccache/" - CCACHE_BASEDIR: "${{ github.workspace }}" - LINUX_DISTRO: "ubuntu" + LINUX_DISTRO: "centos" steps: + - name: Download aggregation fuzzer + uses: actions/download-artifact@v4 + with: + name: aggregation + - name: "Checkout Repo" - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: + path: velox submodules: 'recursive' + ref: "${{ inputs.ref }}" - - name: "Install dependencies" - run: source ./scripts/setup-ubuntu.sh + - name: Fix git permissions + # Usually actions/checkout does this but as we run in a container + # it doesn't work + run: git config --global --add safe.directory ${GITHUB_WORKSPACE}/velox - - name: "Build" - run: | - make debug NUM_THREADS=16 MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=6 EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON" - ccache -s - name: "Run Aggregate Fuzzer" run: | - mkdir -p /tmp/aggregate_fuzzer_repro/ - rm -rfv /tmp/aggregate_fuzzer_repro/* - chmod -R 777 /tmp/aggregate_fuzzer_repro - _build/debug/velox/exec/tests/velox_aggregation_fuzzer_test \ + cd velox + cp ./scripts/presto/etc/hive.properties $PRESTO_HOME/etc/catalog + ls -lR $PRESTO_HOME/etc + $PRESTO_HOME/bin/launcher run -v > /tmp/server.log 2>&1 & + # Sleep for 60 seconds to allow Presto server to start. + sleep 60 + /opt/presto-cli --server 127.0.0.1:8080 --execute 'CREATE SCHEMA hive.tpch;' + cd - + mkdir -p /tmp/aggregate_fuzzer_repro/logs/ + chmod -R 777 /tmp/aggregate_fuzzer_repro + chmod +x velox_aggregation_fuzzer_test + ./velox_aggregation_fuzzer_test \ --seed ${RANDOM} \ - --duration_sec 1800 \ - --logtostderr=1 \ + --duration_sec $DURATION \ --minloglevel=0 \ + --stderrthreshold=2 \ + --log_dir=/tmp/aggregate_fuzzer_repro/logs \ --repro_persist_path=/tmp/aggregate_fuzzer_repro \ - && echo -e "\n\nAggregation fuzzer run finished successfully." + --enable_sorted_aggregations=true \ + --presto_url=http://127.0.0.1:8080 \ + && echo -e "\n\nAggregation fuzzer run finished successfully." 
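+      # Optional sanity check (sketch): before fuzzing, the catalog created
+      # above can be verified against the server with standard Presto SQL:
+      #   /opt/presto-cli --server 127.0.0.1:8080 --execute 'SHOW SCHEMAS FROM hive;'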
- name: Archive aggregate production artifacts - if: always() - uses: actions/upload-artifact@v3 + if: ${{ !cancelled() }} + uses: actions/upload-artifact@v4 with: - name: aggregate-fuzzer-failure-artifacts + name: presto-sot-aggregate-fuzzer-failure-artifacts path: | /tmp/aggregate_fuzzer_repro + /tmp/server.log - linux-join-fuzzer-run: - runs-on: 8-core + + presto-bias-java-aggregation-fuzzer-run: + name: Biased Aggregation Fuzzer with Presto as source of truth + needs: compile + runs-on: ubuntu-latest + container: ghcr.io/facebookincubator/velox-dev:presto-java timeout-minutes: 120 + if: ${{ needs.compile.outputs.presto_aggregate_bias == 'true' }} env: CCACHE_DIR: "${{ github.workspace }}/.ccache/" - CCACHE_BASEDIR: "${{ github.workspace }}" - LINUX_DISTRO: "ubuntu" + LINUX_DISTRO: "centos" steps: + - name: Download aggregation fuzzer + uses: actions/download-artifact@v4 + with: + name: aggregation + - name: "Checkout Repo" - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: + path: velox submodules: 'recursive' + ref: "${{ inputs.ref }}" - - name: "Install dependencies" - run: source ./scripts/setup-ubuntu.sh + - name: Fix git permissions + # Usually actions/checkout does this but as we run in a container + # it doesn't work + run: git config --global --add safe.directory ${GITHUB_WORKSPACE}/velox + + - name: Download Signatures + uses: actions/download-artifact@v4 + with: + name: signatures + path: /tmp/signatures - - name: "Build" + - name: "Run Bias Aggregate Fuzzer" run: | - make debug NUM_THREADS=16 MAX_HIGH_MEM_JOBS=8 MAX_LINK_JOBS=6 EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON" - ccache -s + cd velox + cp ./scripts/presto/etc/hive.properties $PRESTO_HOME/etc/catalog + ls -lR $PRESTO_HOME/etc + $PRESTO_HOME/bin/launcher run -v > /tmp/server.log 2>&1 & + # Sleep for 60 seconds to allow Presto server to start. + sleep 60 + /opt/presto-cli --server 127.0.0.1:8080 --execute 'CREATE SCHEMA hive.tpch;' + cd - + mkdir -p /tmp/aggregate_fuzzer_repro/logs/ + chmod -R 777 /tmp/aggregate_fuzzer_repro + chmod +x velox_aggregation_fuzzer_test + echo "signatures folder" + ls /tmp/signatures/ + echo "Biased functions:" + cat /tmp/signatures/presto_aggregate_bias_functions + echo "Running Fuzzer for $DURATION" + ./velox_aggregation_fuzzer_test \ + --seed ${RANDOM} \ + --duration_sec $DURATION \ + --minloglevel=0 \ + --stderrthreshold=2 \ + --log_dir=/tmp/aggregate_fuzzer_repro/logs \ + --repro_persist_path=/tmp/aggregate_fuzzer_repro \ + --enable_sorted_aggregations=true \ + --only=$(cat /tmp/signatures/presto_aggregate_bias_functions) \ + --presto_url=http://127.0.0.1:8080 \ + && echo -e "\n\nAggregation fuzzer run finished successfully." 
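+      # Note: --only above narrows the run to the biased functions; the file is
+      # assumed to hold a comma-separated list of aggregate function names,
+      # e.g. (names illustrative):
+      #   approx_percentile,min_by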
- - name: "Run Aggregate Fuzzer" + - name: Archive bias aggregate production artifacts + if: ${{ !cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: presto-bias-sot-aggregate-fuzzer-failure-artifacts + path: | + /tmp/aggregate_fuzzer_repro + /tmp/server.log + + surface-signature-errors: + name: Signature Changes + if: ${{ github.event_name != 'schedule' }} + needs: compile + runs-on: ubuntu-latest + steps: + - name: Download Signatures + uses: actions/download-artifact@v4 + with: + name: signatures + path: /tmp/signatures + + - name: Surface Presto function signature errors + if: ${{ needs.compile.outputs.presto_error == 'true' }} run: | - mkdir -p /tmp/join_fuzzer_repro/ - rm -rfv /tmp/join_fuzzer_repro/* - chmod -R 777 /tmp/join_fuzzer_repro - _build/debug/velox/exec/tests/velox_join_fuzzer_test \ + cat /tmp/signatures/presto_errors + exit 1 + + - name: Surface Aggregate function signature errors + if: ${{ needs.compile.outputs.presto_aggregate_error == 'true' }} + run: | + cat /tmp/signatures/presto_aggregate_errors + exit 1 + + presto-java-window-fuzzer-run: + name: Window Fuzzer with Presto as source of truth + needs: compile + runs-on: ubuntu-latest + container: ghcr.io/facebookincubator/velox-dev:presto-java + timeout-minutes: 120 + env: + CCACHE_DIR: "${{ github.workspace }}/.ccache/" + LINUX_DISTRO: "centos" + steps: + + - name: Download window fuzzer + uses: actions/download-artifact@v4 + with: + name: window + + - name: "Checkout Repo" + uses: actions/checkout@v4 + with: + path: velox + submodules: 'recursive' + ref: "${{ inputs.ref }}" + + - name: Fix git permissions + # Usually actions/checkout does this but as we run in a container + # it doesn't work + run: git config --global --add safe.directory ${GITHUB_WORKSPACE}/velox + + + - name: "Run Window Fuzzer" + run: | + cd velox + cp ./scripts/presto/etc/hive.properties $PRESTO_HOME/etc/catalog + ls -lR $PRESTO_HOME/etc + $PRESTO_HOME/bin/launcher run -v > /tmp/server.log 2>&1 & + # Sleep for 60 seconds to allow Presto server to start. + sleep 60 + /opt/presto-cli --server 127.0.0.1:8080 --execute 'CREATE SCHEMA hive.tpch;' + cd - + mkdir -p /tmp/window_fuzzer_repro/logs/ + chmod -R 777 /tmp/window_fuzzer_repro + chmod +x velox_window_fuzzer_test + ./velox_window_fuzzer_test \ --seed ${RANDOM} \ - --duration_sec 1800 \ - --logtostderr=1 \ + --duration_sec $DURATION \ --minloglevel=0 \ - && echo -e "\n\nAggregation fuzzer run finished successfully." + --stderrthreshold=2 \ + --log_dir=/tmp/window_fuzzer_repro/logs \ + --repro_persist_path=/tmp/window_fuzzer_repro \ + --enable_window_reference_verification \ + --presto_url=http://127.0.0.1:8080 \ + && echo -e "\n\nWindow fuzzer run finished successfully." 
-      - name: Archive aggregate production artifacts
-        if: always()
-        uses: actions/upload-artifact@v3
+      - name: Archive window production artifacts
+        if: ${{ !cancelled() }}
+        uses: actions/upload-artifact@v4
         with:
-          name: join-fuzzer-failure-artifacts
+          name: presto-sot-window-fuzzer-failure-artifacts
           path: |
-            /tmp/join_fuzzer_repro
+            /tmp/window_fuzzer_repro
+            /tmp/server.log
+
+  presto-java-writer-fuzzer-run:
+    name: Writer Fuzzer with Presto as source of truth
+    needs: compile
+    runs-on: ubuntu-latest
+    container: ghcr.io/facebookincubator/velox-dev:presto-java
+    timeout-minutes: 120
+    env:
+      CCACHE_DIR: "${{ github.workspace }}/.ccache/"
+      LINUX_DISTRO: "centos"
+    steps:
+
+      - name: Download writer fuzzer
+        uses: actions/download-artifact@v4
+        with:
+          name: writer
+
+      - name: "Checkout Repo"
+        uses: actions/checkout@v4
+        with:
+          path: velox
+          submodules: 'recursive'
+          ref: "${{ inputs.ref }}"
+
+      - name: Fix git permissions
+        # Usually actions/checkout does this but as we run in a container
+        # it doesn't work
+        run: git config --global --add safe.directory ${GITHUB_WORKSPACE}/velox
+
+      - name: "Run Writer Fuzzer"
+        run: |
+          cd velox
+          cp ./scripts/presto/etc/hive.properties $PRESTO_HOME/etc/catalog
+          ls -lR $PRESTO_HOME/etc
+          echo "jvm config content:"
+          cat $PRESTO_HOME/etc/jvm.config
+          $PRESTO_HOME/bin/launcher run -v > /tmp/server.log 2>&1 &
+          ls -lR /var/log
+          # Sleep for 60 seconds to allow Presto server to start.
+          sleep 60
+          /opt/presto-cli --version
+          /opt/presto-cli --server 127.0.0.1:8080 --execute 'CREATE SCHEMA hive.tpch;'
+          cd -
+          mkdir -p /tmp/writer_fuzzer_repro/logs/
+          chmod -R 777 /tmp/writer_fuzzer_repro
+          chmod +x velox_writer_fuzzer_test
+          ./velox_writer_fuzzer_test \
+            --seed ${RANDOM} \
+            --duration_sec $DURATION \
+            --minloglevel=0 \
+            --stderrthreshold=2 \
+            --req_timeout_ms 60000 \
+            --log_dir=/tmp/writer_fuzzer_repro/logs \
+            --presto_url=http://127.0.0.1:8080 \
+            && echo -e "\n\nWriter fuzzer run finished successfully."
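+      # Debugging sketch: when this step fails, /tmp/server.log (uploaded
+      # below) usually shows whether the Presto server came up at all, e.g.:
+      #   tail -n 50 /tmp/server.log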
+ + - name: Archive writer production artifacts + if: ${{ !cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: presto-sot-writer-fuzzer-failure-artifacts + path: | + /tmp/writer_fuzzer_repro + /tmp/server.log + /var/log diff --git a/.gitignore b/.gitignore index 6c37c57b563b5..f2b64bee7a554 100644 --- a/.gitignore +++ b/.gitignore @@ -48,7 +48,9 @@ autoconf/autom4te.cache projects/* !projects/*.* !projects/Makefile - +.venv +deps-install +deps-download #==============================================================================# # Autotools artifacts @@ -88,7 +90,8 @@ _build/ *.pdf *.swp a.out -CMake/resolve_dependency_module/boost/FindBoost.cmake +CMake/resolve_dependency_modules/boost/FindBoost.cmake +__cmake_systeminformation/ #==============================================================================# # Kate Swap Files @@ -278,6 +281,7 @@ settings.json # User's build configuration Makefile.config +CMakeUserPresets.json # build, distribute, and bins (+ python proto bindings) build @@ -319,3 +323,4 @@ src/amalgamation/ #docs velox/docs/sphinx/source/README_generated_* velox/docs/bindings/python/_generate/* +scripts/bm-report/report.html diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index c4d716cbc4f56..0000000000000 --- a/.gitmodules +++ /dev/null @@ -1,6 +0,0 @@ -[submodule "third_party/googletest"] - path = third_party/googletest - url = https://github.com/google/googletest.git -[submodule "third_party/xsimd"] - path = third_party/xsimd - url = https://github.com/xtensor-stack/xsimd.git diff --git a/CMake/FindArrow.cmake b/CMake/FindArrow.cmake new file mode 100644 index 0000000000000..8c73a29e39137 --- /dev/null +++ b/CMake/FindArrow.cmake @@ -0,0 +1,48 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +find_library(ARROW_LIB libarrow.a) +find_library(PARQUET_LIB libparquet.a) +find_library(ARROW_TESTING_LIB libarrow_testing.a) +if("${ARROW_LIB}" STREQUAL "ARROW_LIB-NOTFOUND" + OR "${ARROW_TESTING_LIB}" STREQUAL "ARROW_TESTING_LIB-NOTFOUND") + set(Arrow_FOUND false) + return() +endif() +find_package(Thrift) +if(NOT Thrift_FOUND) + # Requires building arrow from source with thrift bundled. + set(Arrow_FOUND false) + return() +endif() +add_library(thrift ALIAS thrift::thrift) + +set(Arrow_FOUND true) + +# Only add the libraries once. 
+if(NOT TARGET arrow) + add_library(arrow STATIC IMPORTED GLOBAL) + add_library(parquet STATIC IMPORTED GLOBAL) + add_library(arrow_testing STATIC IMPORTED GLOBAL) + + find_path(ARROW_INCLUDE_PATH arrow/api.h) + set_target_properties( + arrow arrow_testing parquet PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + ${ARROW_INCLUDE_PATH}) + set_target_properties(arrow PROPERTIES IMPORTED_LOCATION ${ARROW_LIB} + INTERFACE_LINK_LIBRARIES thrift) + set_target_properties(parquet PROPERTIES IMPORTED_LOCATION ${PARQUET_LIB}) + set_target_properties(arrow_testing PROPERTIES IMPORTED_LOCATION + ${ARROW_TESTING_LIB}) +endif() diff --git a/CMake/FindThrift.cmake b/CMake/FindThrift.cmake new file mode 100644 index 0000000000000..273500a6ae369 --- /dev/null +++ b/CMake/FindThrift.cmake @@ -0,0 +1,167 @@ +# Copyright 2012 Cloudera Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# * Find Thrift (a cross platform RPC lib/tool) +# +# Variables used by this module, they can change the default behaviour and need +# to be set before calling find_package: +# +# Thrift_ROOT - When set, this path is inspected instead of standard library +# locations as the root of the Thrift installation. The environment variable +# THRIFT_HOME overrides this variable. +# +# This module defines Thrift_FOUND, whether Thrift is found or not +# Thrift_COMPILER_FOUND, whether Thrift compiler is found or not +# +# thrift::thrift, a library target to use Thrift thrift::compiler, a executable +# target to use Thrift compiler + +function(EXTRACT_THRIFT_VERSION) + if(THRIFT_INCLUDE_DIR) + file(READ "${THRIFT_INCLUDE_DIR}/thrift/config.h" THRIFT_CONFIG_H_CONTENT) + string(REGEX MATCH "#define PACKAGE_VERSION \"[0-9.]+\"" + THRIFT_VERSION_DEFINITION "${THRIFT_CONFIG_H_CONTENT}") + string(REGEX MATCH "[0-9.]+" Thrift_VERSION "${THRIFT_VERSION_DEFINITION}") + set(Thrift_VERSION + "${Thrift_VERSION}" + PARENT_SCOPE) + else() + set(Thrift_VERSION + "" + PARENT_SCOPE) + endif() +endfunction(EXTRACT_THRIFT_VERSION) + +if(MSVC_TOOLCHAIN AND NOT DEFINED THRIFT_MSVC_LIB_SUFFIX) + if(NOT ARROW_THRIFT_USE_SHARED) + if(ARROW_USE_STATIC_CRT) + if("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") + set(THRIFT_MSVC_LIB_SUFFIX "mtd") + else() + set(THRIFT_MSVC_LIB_SUFFIX "mt") + endif() + else() + if("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") + set(THRIFT_MSVC_LIB_SUFFIX "mdd") + else() + set(THRIFT_MSVC_LIB_SUFFIX "md") + endif() + endif() + endif() +endif() +set(THRIFT_LIB_NAME_BASE "thrift${THRIFT_MSVC_LIB_SUFFIX}") + +if(ARROW_THRIFT_USE_SHARED) + set(THRIFT_LIB_NAMES thrift) + if(CMAKE_IMPORT_LIBRARY_SUFFIX) + list( + APPEND + THRIFT_LIB_NAMES + "${CMAKE_IMPORT_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_IMPORT_LIBRARY_SUFFIX}" + ) + endif() + list( + APPEND + THRIFT_LIB_NAMES + "${CMAKE_SHARED_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_SHARED_LIBRARY_SUFFIX}" + ) +else() + set(THRIFT_LIB_NAMES + "${CMAKE_STATIC_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) +endif() + +if(Thrift_ROOT) + find_library( + THRIFT_LIB + NAMES 
${THRIFT_LIB_NAMES} + PATHS ${Thrift_ROOT} + PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib") + find_path( + THRIFT_INCLUDE_DIR thrift/Thrift.h + PATHS ${Thrift_ROOT} + PATH_SUFFIXES "include") + find_program( + THRIFT_COMPILER thrift + PATHS ${Thrift_ROOT} + PATH_SUFFIXES "bin") + extract_thrift_version() +else() + # THRIFT-4760: The pkgconfig files are currently only installed when using + # autotools. Starting with 0.13, they are also installed for the CMake-based + # installations of Thrift. + find_package(PkgConfig QUIET) + pkg_check_modules(THRIFT_PC thrift) + if(THRIFT_PC_FOUND) + set(THRIFT_INCLUDE_DIR "${THRIFT_PC_INCLUDEDIR}") + + list(APPEND THRIFT_PC_LIBRARY_DIRS "${THRIFT_PC_LIBDIR}") + + find_library( + THRIFT_LIB + NAMES ${THRIFT_LIB_NAMES} + PATHS ${THRIFT_PC_LIBRARY_DIRS} + NO_DEFAULT_PATH) + find_program( + THRIFT_COMPILER thrift + HINTS ${THRIFT_PC_PREFIX} + NO_DEFAULT_PATH + PATH_SUFFIXES "bin") + set(Thrift_VERSION ${THRIFT_PC_VERSION}) + else() + find_library( + THRIFT_LIB + NAMES ${THRIFT_LIB_NAMES} + PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib") + find_path(THRIFT_INCLUDE_DIR thrift/Thrift.h PATH_SUFFIXES "include") + find_program(THRIFT_COMPILER thrift PATH_SUFFIXES "bin") + extract_thrift_version() + endif() +endif() + +if(THRIFT_COMPILER) + set(Thrift_COMPILER_FOUND TRUE) +else() + set(Thrift_COMPILER_FOUND FALSE) +endif() + +find_package_handle_standard_args( + Thrift + REQUIRED_VARS THRIFT_LIB THRIFT_INCLUDE_DIR + VERSION_VAR Thrift_VERSION + HANDLE_COMPONENTS) + +if(Thrift_FOUND) + if(ARROW_THRIFT_USE_SHARED) + add_library(thrift::thrift SHARED IMPORTED) + else() + add_library(thrift::thrift STATIC IMPORTED) + endif() + set_target_properties( + thrift::thrift + PROPERTIES IMPORTED_LOCATION "${THRIFT_LIB}" INTERFACE_INCLUDE_DIRECTORIES + "${THRIFT_INCLUDE_DIR}") + if(WIN32 AND NOT MSVC_TOOLCHAIN) + # We don't need this for Visual C++ because Thrift uses "#pragma + # comment(lib, "Ws2_32.lib")" in thrift/windows/config.h for Visual C++. + set_target_properties(thrift::thrift PROPERTIES INTERFACE_LINK_LIBRARIES + "ws2_32") + endif() + + if(Thrift_COMPILER_FOUND) + add_executable(thrift::compiler IMPORTED) + set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION + "${THRIFT_COMPILER}") + endif() +endif() diff --git a/CMake/Findc-ares.cmake b/CMake/Findc-ares.cmake new file mode 100644 index 0000000000000..0e3f1fe0938a6 --- /dev/null +++ b/CMake/Findc-ares.cmake @@ -0,0 +1,40 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +find_package(c-ares CONFIG) +if(c-ares_FOUND) + if(TARGET c-ares::cares) + return() + endif() +endif() + +find_path( + C_ARES_INCLUDE_DIR + NAMES ares.h + PATH_SUFFIXES c-ares) +find_library(C_ARES_LIBRARY NAMES c-ares) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(c-ares DEFAULT_MSG C_ARES_LIBRARY + C_ARES_INCLUDE_DIR) + +if(c-ares_FOUND AND NOT TARGET c-ares::cares) + add_library(c-ares::cares UNKNOWN IMPORTED) + set_target_properties( + c-ares::cares + PROPERTIES IMPORTED_LOCATION "${C_ARES_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${C_ARES_INCLUDE_DIR}") +endif() + +mark_as_advanced(C_ARES_INCLUDE_DIR C_ARES_LIBRARY) diff --git a/CMake/Finddouble-conversion.cmake b/CMake/Finddouble-conversion.cmake new file mode 100644 index 0000000000000..457bd88e26c17 --- /dev/null +++ b/CMake/Finddouble-conversion.cmake @@ -0,0 +1,34 @@ +# Copyright (C) 2022 The Qt Company Ltd. +# SPDX-License-Identifier: BSD-3-Clause + +# Fallback find module for double-conversion if double-conversion is built with +# CMake it'll install a config module, which we prefer if it's built with Scons +# (their default), we search ourselves + +find_package(double-conversion CONFIG) +if(double-conversion_FOUND) + if(TARGET double-conversion::double-conversion) + return() + endif() +endif() + +find_path( + DOUBLE_CONVERSION_INCLUDE_DIR + NAMES double-conversion.h + PATH_SUFFIXES double-conversion) +find_library(DOUBLE_CONVERSION_LIBRARY NAMES double-conversion) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + double-conversion DEFAULT_MSG DOUBLE_CONVERSION_LIBRARY + DOUBLE_CONVERSION_INCLUDE_DIR) + +if(double-conversion_FOUND AND NOT TARGET double-conversion::double-conversion) + add_library(double-conversion::double-conversion UNKNOWN IMPORTED) + set_target_properties( + double-conversion::double-conversion + PROPERTIES IMPORTED_LOCATION "${DOUBLE_CONVERSION_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${DOUBLE_CONVERSION_INCLUDE_DIR}") +endif() + +mark_as_advanced(DOUBLE_CONVERSION_INCLUDE_DIR DOUBLE_CONVERSION_LIBRARY) diff --git a/CMake/Findre2.cmake b/CMake/Findre2.cmake index 8791e612ed536..1a438cce0e0ab 100644 --- a/CMake/Findre2.cmake +++ b/CMake/Findre2.cmake @@ -11,19 +11,55 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -find_library(re2_lib re2) -find_path(re2_include re2/re2.h) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(re2 REQUIRED_VARS re2_lib re2_include) +find_package(re2 QUIET CONFIG) if(re2_FOUND) - set(re2_LIBRARIES ${re2_lib}) - set(re2_INCLUDE_DIRS ${re2_include}) + message(STATUS "Found RE2 via CMake.") + return() +endif() + +# As per https://github.com/grpc/grpc/issues/25434, idempotence is necessary +# because CMake fails when another target with the same name already exists. +if(TARGET re2::re2) + message(STATUS "Found RE2 via pkg-config already?") + return() +endif() + +find_package(PkgConfig REQUIRED) +# TODO(junyer): Use the IMPORTED_TARGET option whenever CMake 3.6 (or newer) +# becomes the minimum required: that will take care of the add_library() and +# set_property() calls; then we can simply alias PkgConfig::RE2 as re2::re2. For +# now, we can only set INTERFACE_* properties that existed in CMake 3.5. 
+pkg_check_modules(RE2 QUIET re2) +if(RE2_FOUND) + set(re2_FOUND "${RE2_FOUND}") + add_library(re2::re2 INTERFACE IMPORTED) + if(RE2_INCLUDE_DIRS) + set_property(TARGET re2::re2 PROPERTY INTERFACE_INCLUDE_DIRECTORIES + "${RE2_INCLUDE_DIRS}") + endif() + if(RE2_CFLAGS_OTHER) + # Filter out the -std flag, which is handled by CMAKE_CXX_STANDARD. + # TODO(junyer): Use the FILTER option whenever CMake 3.6 (or newer) becomes + # the minimum required: that will allow this to be concise. + foreach(flag IN LISTS RE2_CFLAGS_OTHER) + if("${flag}" MATCHES "^-std=") + list(REMOVE_ITEM RE2_CFLAGS_OTHER "${flag}") + endif() + endforeach() + set_property(TARGET re2::re2 PROPERTY INTERFACE_COMPILE_OPTIONS + "${RE2_CFLAGS_OTHER}") + endif() + if(RE2_LDFLAGS) + set_property(TARGET re2::re2 PROPERTY INTERFACE_LINK_LIBRARIES + "${RE2_LDFLAGS}") + endif() + message(STATUS "Found RE2 via pkg-config.") + return() +endif() - add_library(re2::re2 UNKNOWN IMPORTED) - target_include_directories(re2::re2 INTERFACE ${re2_INCLUDE_DIRS}) - target_link_libraries(re2::re2 INTERFACE ${re2_LIBRARIES}) - set_target_properties(re2::re2 PROPERTIES IMPORTED_LOCATION - "${re2_LIBRARIES}") +if(re2_FIND_REQUIRED) + message(FATAL_ERROR "Failed to find RE2.") +elseif(NOT re2_FIND_QUIETLY) + message(WARNING "Failed to find RE2.") endif() diff --git a/CMake/ResolveDependency.cmake b/CMake/ResolveDependency.cmake index ecd861472afb8..ad8a8fd31bca0 100644 --- a/CMake/ResolveDependency.cmake +++ b/CMake/ResolveDependency.cmake @@ -43,7 +43,7 @@ macro(build_dependency dependency_name) include(${dependency_name_lower}) endmacro() -# * Macro to resolve thirparty dependencies. +# * Macro to resolve third-party dependencies. # # Provides the macro resolve_dependency(). This macro will allow us to find the # dependency via the usage of find_package or use the custom @@ -52,9 +52,9 @@ endmacro() # # resolve_dependency(dependency_name [...] ) # -# [...]: the macro will pass all arguments after DELPENDENCY_NAME on to +# [...]: the macro will pass all arguments after DEPENDENCY_NAME on to # find_package. ${dependency_name}_SOURCE is expected to be set to either AUTO, -# SYSTEM or BUNDLED. If ${dependency_name}_SOURCE is SYSTEM it will try to find +# SYSTEM or BUNDLED. If ${dependency_name}_SOURCE is AUTO it will try to find # the corresponding package via find_package and if not found it will call the # build_dependency macro to download and build the third party dependency. If # ${dependency_name}_SOURCE is SYSTEM it will force to find via find_package. If @@ -94,10 +94,14 @@ macro(set_source dependency_name) STATUS "Setting ${dependency_name} source to ${${dependency_name}_SOURCE}") endmacro() -# Set a variable to the value of $ENV{envvar_name} if defined, set to ${DEFAULT} -# if not defined. If called from within a nested scope the variable will not +# If the var_name is not defined then set var_name to the value of +# $ENV{envvar_name} if it is defined. If neither is defined then set var_name to +# ${DEFAULT}. If called from within a nested scope the variable will not # propagate into outer scopes automatically! Use PARENT_SCOPE. function(set_with_default var_name envvar_name default) + if(DEFINED ${var_name}) + return() + endif() if(DEFINED ENV{${envvar_name}}) set(${var_name} $ENV{${envvar_name}} diff --git a/CMake/VeloxUtils.cmake b/CMake/VeloxUtils.cmake new file mode 100644 index 0000000000000..fa8f9456d7e2a --- /dev/null +++ b/CMake/VeloxUtils.cmake @@ -0,0 +1,132 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+include_guard(GLOBAL)
+
+# TODO use file sets
+function(velox_install_library_headers)
+  # Find any headers and install them relative to the source tree in include.
+  file(GLOB _hdrs "*.h")
+  if(NOT "${_hdrs}" STREQUAL "")
+    cmake_path(
+      RELATIVE_PATH
+      CMAKE_CURRENT_SOURCE_DIR
+      BASE_DIRECTORY
+      "${CMAKE_SOURCE_DIR}"
+      OUTPUT_VARIABLE
+      _hdr_dir)
+    install(FILES ${_hdrs} DESTINATION include/${_hdr_dir})
+  endif()
+endfunction()
+
+# Base add velox library call to add a library and install it.
+function(velox_base_add_library TARGET)
+  add_library(${TARGET} ${ARGN})
+  install(TARGETS ${TARGET} DESTINATION lib/velox)
+  velox_install_library_headers()
+endfunction()
+
+# This is extremely hackish but presents an easy path to installation.
+function(velox_add_library TARGET)
+  set(options OBJECT STATIC SHARED INTERFACE)
+  set(oneValueArgs)
+  set(multiValueArgs)
+  cmake_parse_arguments(
+    VELOX
+    "${options}"
+    "${oneValueArgs}"
+    "${multiValueArgs}"
+    ${ARGN})
+
+  # Remove library type specifiers from ARGN
+  set(library_type)
+  if(VELOX_OBJECT)
+    set(library_type OBJECT)
+  elseif(VELOX_STATIC)
+    set(library_type STATIC)
+  elseif(VELOX_SHARED)
+    set(library_type SHARED)
+  elseif(VELOX_INTERFACE)
+    set(library_type INTERFACE)
+  endif()
+
+  list(REMOVE_ITEM ARGN OBJECT)
+  list(REMOVE_ITEM ARGN STATIC)
+  list(REMOVE_ITEM ARGN SHARED)
+  list(REMOVE_ITEM ARGN INTERFACE)
+  # Propagate to the underlying add_library and then install the target.
+  if(VELOX_MONO_LIBRARY)
+    if(TARGET velox)
+      # Target already exists, append sources to it.
+      target_sources(velox PRIVATE ${ARGN})
+    else()
+      # Create the target if this is the first invocation.
+      add_library(velox ${ARGN})
+      set_target_properties(velox PROPERTIES LIBRARY_OUTPUT_DIRECTORY
+                                             ${CMAKE_BINARY_DIR}/lib)
+      set_target_properties(velox PROPERTIES ARCHIVE_OUTPUT_DIRECTORY
+                                             ${CMAKE_BINARY_DIR}/lib)
+      install(TARGETS velox DESTINATION lib/velox)
+    endif()
+    # Create an alias for compatibility.
+    if(NOT TARGET ${TARGET})
+      add_library(${TARGET} ALIAS velox)
+    endif()
+  else()
+    # Create a library for each invocation.
+    velox_base_add_library(${TARGET} ${library_type} ${ARGN})
+  endif()
+  velox_install_library_headers()
+endfunction()
+
+function(velox_link_libraries TARGET)
+  # TODO(assignUser): Handle scope keywords (they currently are empty calls a la
+  # target_link_libraries(target PRIVATE))
+  if(VELOX_MONO_LIBRARY)
+    message(DEBUG "${TARGET}: ${ARGN}")
+    foreach(_lib ${ARGN})
+      if("${_lib}" MATCHES "^velox_*")
+        message(DEBUG "\t\tDROP: ${_lib}")
+      else()
+        message(DEBUG "\t\tADDING: ${_lib}")
+        target_link_libraries(velox ${_lib})
+      endif()
+    endforeach()
+  else()
+    target_link_libraries(${TARGET} ${ARGN})
+  endif()
+endfunction()
+
+function(velox_include_directories TARGET)
+  if(VELOX_MONO_LIBRARY)
+    target_include_directories(velox ${ARGN})
+  else()
+    target_include_directories(${TARGET} ${ARGN})
+  endif()
+endfunction()
+
+function(velox_compile_definitions TARGET)
+  if(VELOX_MONO_LIBRARY)
+    target_compile_definitions(velox ${ARGN})
+  else()
+    target_compile_definitions(${TARGET} ${ARGN})
+  endif()
+endfunction()
+
+function(velox_sources TARGET)
+  if(VELOX_MONO_LIBRARY)
+    target_sources(velox ${ARGN})
+  else()
+    target_sources(${TARGET} ${ARGN})
+  endif()
+endfunction()
diff --git a/CMake/resolve_dependency_modules/README.md b/CMake/resolve_dependency_modules/README.md
index e5c3346c9d2aa..a164e50ad3325 100644
--- a/CMake/resolve_dependency_modules/README.md
+++ b/CMake/resolve_dependency_modules/README.md
@@ -1,7 +1,51 @@
+# Dependency List
+Following is the list of libraries and their minimum versions
+that Velox requires. Some of these libraries can be installed
+via a platform's package manager (e.g. `brew` on macOS).
+The versions of certain libraries are the defaults provided by
+the platform's package manager. Some libraries can be bundled
+by Velox. See details on bundling below.
+
+| Library Name      | Minimum Version | Bundled? |
+|-------------------|-----------------|----------|
+| ninja             | default         | No       |
+| ccache            | default         | No       |
+| icu4c             | default         | Yes      |
+| gflags            | default         | Yes      |
+| glog              | default         | Yes      |
+| gtest (testing)   | default         | Yes      |
+| libevent          | default         | No       |
+| libsodium         | default         | No       |
+| lz4               | default         | No       |
+| snappy            | default         | No       |
+| lzo               | default         | No       |
+| xz                | default         | No       |
+| zstd              | default         | No       |
+| openssl           | default         | No       |
+| protobuf          | 21.7 >= x < 22  | Yes      |
+| boost             | 1.77.0          | Yes      |
+| flex              | 2.5.13          | No       |
+| bison             | 3.0.4           | No       |
+| cmake             | 3.28            | No       |
+| double-conversion | 3.1.5           | No       |
+| xsimd             | 10.0.0          | Yes      |
+| re2               | 2021-04-01      | Yes      |
+| fmt               | 10.1.1          | Yes      |
+| simdjson          | 3.9.3           | Yes      |
+| fast_float        | v6.1.6          | Yes      |
+| folly             | v2024.09.16.00  | Yes      |
+| fizz              | v2024.09.16.00  | No       |
+| wangle            | v2024.09.16.00  | No       |
+| mvfst             | v2024.09.16.00  | No       |
+| fbthrift          | v2024.09.16.00  | No       |
+| libstemmer        | 2.2.0           | Yes      |
+| DuckDB (testing)  | 0.8.1           | Yes      |
+| cpr (testing)     | 1.10.15         | Yes      |
+
 # Bundled Dependency Management
 This module provides a dependency management system that allows us to automatically fetch and build dependencies from source if needed.

-By default the system will use dependencies installed on the host and fallback to building from source. This behaviour can be changed by setting environment variables:
+By default, the system will use dependencies installed on the host and fall back to building from source.
+This behaviour can be changed by setting environment variables:
 - `VELOX_DEPENDENCY_SOURCE=AUTO|BUNDLED|SYSTEM` for all dependencies or
 - `<package>_SOURCE=AUTO|BUNDLED|SYSTEM` for each dependency individually
 "package" has to use the same spelling as used in `CMakeLists.txt`.
diff --git a/CMake/resolve_dependency_modules/absl.cmake b/CMake/resolve_dependency_modules/absl.cmake
new file mode 100644
index 0000000000000..11c267b688e80
--- /dev/null
+++ b/CMake/resolve_dependency_modules/absl.cmake
@@ -0,0 +1,36 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+include_guard(GLOBAL)
+
+set(VELOX_ABSL_BUILD_VERSION 20240116.2)
+set(VELOX_ABSL_BUILD_SHA256_CHECKSUM
+    733726b8c3a6d39a4120d7e45ea8b41a434cdacde401cba500f14236c49b39dc)
+string(CONCAT VELOX_ABSL_SOURCE_URL
+              "https://github.com/abseil/abseil-cpp/archive/refs/tags/"
+              "${VELOX_ABSL_BUILD_VERSION}.tar.gz")
+
+resolve_dependency_url(ABSL)
+
+message(STATUS "Building Abseil from source")
+
+FetchContent_Declare(
+  absl
+  URL ${VELOX_ABSL_SOURCE_URL}
+  URL_HASH ${VELOX_ABSL_BUILD_SHA256_CHECKSUM}
+  OVERRIDE_FIND_PACKAGE EXCLUDE_FROM_ALL SYSTEM)
+
+set(ABSL_BUILD_TESTING OFF)
+set(ABSL_PROPAGATE_CXX_STD ON)
+set(ABSL_ENABLE_INSTALL ON)
+FetchContent_MakeAvailable(absl)
diff --git a/CMake/resolve_dependency_modules/arrow.cmake b/CMake/resolve_dependency_modules/arrow.cmake
new file mode 100644
index 0000000000000..b9d04683d69d4
--- /dev/null
+++ b/CMake/resolve_dependency_modules/arrow.cmake
@@ -0,0 +1,16 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+include_guard(GLOBAL)
+
+add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/arrow)
diff --git a/CMake/resolve_dependency_modules/arrow/CMakeLists.txt b/CMake/resolve_dependency_modules/arrow/CMakeLists.txt
new file mode 100644
index 0000000000000..ddf1fac71b86c
--- /dev/null
+++ b/CMake/resolve_dependency_modules/arrow/CMakeLists.txt
@@ -0,0 +1,96 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/CMake/resolve_dependency_modules/arrow.cmake b/CMake/resolve_dependency_modules/arrow.cmake
new file mode 100644
index 0000000000000..b9d04683d69d4
--- /dev/null
+++ b/CMake/resolve_dependency_modules/arrow.cmake
@@ -0,0 +1,16 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+include_guard(GLOBAL)
+
+add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/arrow)
diff --git a/CMake/resolve_dependency_modules/arrow/CMakeLists.txt b/CMake/resolve_dependency_modules/arrow/CMakeLists.txt
new file mode 100644
index 0000000000000..ddf1fac71b86c
--- /dev/null
+++ b/CMake/resolve_dependency_modules/arrow/CMakeLists.txt
@@ -0,0 +1,96 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+project(Arrow)
+
+if(VELOX_ENABLE_ARROW)
+  find_package(Thrift)
+  if(Thrift_FOUND)
+    set(THRIFT_SOURCE "SYSTEM")
+  else()
+    set(THRIFT_SOURCE "BUNDLED")
+  endif()
+
+  set(ARROW_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/arrow_ep")
+  set(ARROW_CMAKE_ARGS
+      -DARROW_PARQUET=OFF
+      -DARROW_DEPENDENCY_SOURCE=AUTO
+      -DARROW_WITH_THRIFT=ON
+      -DARROW_WITH_LZ4=ON
+      -DARROW_WITH_SNAPPY=ON
+      -DARROW_WITH_ZLIB=ON
+      -DARROW_WITH_ZSTD=ON
+      -DARROW_JEMALLOC=OFF
+      -DARROW_SIMD_LEVEL=NONE
+      -DARROW_RUNTIME_SIMD_LEVEL=NONE
+      -DARROW_WITH_UTF8PROC=OFF
+      -DARROW_TESTING=ON
+      -DCMAKE_INSTALL_PREFIX=${ARROW_PREFIX}/install
+      -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+      -DARROW_BUILD_STATIC=ON
+      -DThrift_SOURCE=${THRIFT_SOURCE}
+      -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH})
+  set(ARROW_LIBDIR ${ARROW_PREFIX}/install/${CMAKE_INSTALL_LIBDIR})
+
+  add_library(thrift STATIC IMPORTED GLOBAL)
+  if(NOT Thrift_FOUND)
+    set(THRIFT_ROOT ${ARROW_PREFIX}/src/arrow_ep-build/thrift_ep-install)
+    set(THRIFT_LIB ${THRIFT_ROOT}/lib/libthrift.a)
+
+    file(MAKE_DIRECTORY ${THRIFT_ROOT}/include)
+    set(THRIFT_INCLUDE_DIR ${THRIFT_ROOT}/include)
+  endif()
+
+  set_property(TARGET thrift PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+                                      ${THRIFT_INCLUDE_DIR})
+  set_property(TARGET thrift PROPERTY IMPORTED_LOCATION ${THRIFT_LIB})
+
+  set(VELOX_ARROW_BUILD_VERSION 15.0.0)
+  set(VELOX_ARROW_BUILD_SHA256_CHECKSUM
+      01dd3f70e85d9b5b933ec92c0db8a4ef504a5105f78d2d8622e84279fb45c25d)
+  set(VELOX_ARROW_SOURCE_URL
+      "https://archive.apache.org/dist/arrow/arrow-${VELOX_ARROW_BUILD_VERSION}/apache-arrow-${VELOX_ARROW_BUILD_VERSION}.tar.gz"
+  )
+
+  resolve_dependency_url(ARROW)
+
+  ExternalProject_Add(
+    arrow_ep
+    PREFIX ${ARROW_PREFIX}
+    URL ${VELOX_ARROW_SOURCE_URL}
+    URL_HASH ${VELOX_ARROW_BUILD_SHA256_CHECKSUM}
+    SOURCE_SUBDIR cpp
+    CMAKE_ARGS ${ARROW_CMAKE_ARGS}
+    BUILD_BYPRODUCTS ${ARROW_LIBDIR}/libarrow.a ${ARROW_LIBDIR}/libparquet.a
+                     ${ARROW_LIBDIR}/libarrow_testing.a ${THRIFT_LIB})
+
+  add_library(arrow STATIC IMPORTED GLOBAL)
+  add_library(arrow_testing STATIC IMPORTED GLOBAL)
+  add_library(parquet STATIC IMPORTED GLOBAL)
+  add_dependencies(arrow arrow_ep)
+  add_dependencies(arrow_testing arrow)
+  add_dependencies(parquet arrow)
+  file(MAKE_DIRECTORY ${ARROW_PREFIX}/install/include)
+  set_target_properties(
+    arrow arrow_testing parquet PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
+                                           ${ARROW_PREFIX}/install/include)
+  set_target_properties(arrow PROPERTIES IMPORTED_LOCATION
+                                         ${ARROW_LIBDIR}/libarrow.a)
+  set_property(TARGET arrow PROPERTY INTERFACE_LINK_LIBRARIES ${RE2} thrift)
+  set_target_properties(
+    arrow_testing PROPERTIES IMPORTED_LOCATION
+                             ${ARROW_LIBDIR}/libarrow_testing.a)
+  set_target_properties(parquet PROPERTIES IMPORTED_LOCATION
+                                           ${ARROW_LIBDIR}/libparquet.a)
+
+endif()
diff --git a/CMake/resolve_dependency_modules/boost/CMakeLists.txt b/CMake/resolve_dependency_modules/boost/CMakeLists.txt
index 8f9bf06ed44b7..c6d32b88c2b4d 100644
--- a/CMake/resolve_dependency_modules/boost/CMakeLists.txt
+++ b/CMake/resolve_dependency_modules/boost/CMakeLists.txt
@@ -21,14 +21,14 @@ add_compile_options(-w)
 # We need to use boost > 1.70 to build it with CMake. 1.81 was the first to be
 # released as a github release INCLUDING the cmake files (which are not in the
 # official releases for some reason)
-set(VELOX_BOOST_BUILD_VERSION 1.81.0)
+set(VELOX_BOOST_BUILD_VERSION 1.84.0)
 string(
   CONCAT VELOX_BOOST_SOURCE_URL
          "https://github.com/boostorg/boost/releases/download/"
         "boost-${VELOX_BOOST_BUILD_VERSION}/"
         "boost-${VELOX_BOOST_BUILD_VERSION}.tar.gz")
 set(VELOX_BOOST_BUILD_SHA256_CHECKSUM
-    121da556b718fd7bd700b5f2e734f8004f1cfa78b7d30145471c526ba75a151c)
+    4d27e9efed0f6f152dc28db6430b9d3dfb40c0345da7342eaa5a987dde57bd95)

 resolve_dependency_url(BOOST)

 message(STATUS "Building boost from source")
@@ -53,6 +53,7 @@ set(BOOST_HEADER_ONLY
   circular_buffer
   math
   multi_index
+  multiprecision
   numeric_conversion
   random
   uuid
@@ -61,11 +62,9 @@ list(APPEND BOOST_INCLUDE_LIBRARIES ${BOOST_HEADER_ONLY})

 # The `headers` target is not created by Boost cmake and leads to a warning
 list(REMOVE_ITEM BOOST_INCLUDE_LIBRARIES headers)
-set(BUILD_SHARED_LIBS ON)
+set(BUILD_SHARED_LIBS OFF)
 FetchContent_MakeAvailable(Boost)

-# To aling with Boost system install we create Boost::headers target
-add_library(boost_headers INTERFACE)
-add_library(Boost::headers ALIAS boost_headers)
 list(TRANSFORM BOOST_HEADER_ONLY PREPEND Boost::)
 target_link_libraries(boost_headers INTERFACE ${BOOST_HEADER_ONLY})
+add_library(Boost::headers ALIAS boost_headers)
diff --git a/CMake/resolve_dependency_modules/boost/FindBoost.cmake.in b/CMake/resolve_dependency_modules/boost/FindBoost.cmake.in
index ab59b0238e7c6..7a1a5d81d6ae8 100644
--- a/CMake/resolve_dependency_modules/boost/FindBoost.cmake.in
+++ b/CMake/resolve_dependency_modules/boost/FindBoost.cmake.in
@@ -14,5 +14,6 @@ message(STATUS "Using Boost - Bundled")
 set(Boost_FOUND TRUE)
 set(Boost_LIBRARIES @BOOST_INCLUDE_LIBRARIES@)
+list(APPEND Boost_LIBRARIES headers)
 list(TRANSFORM Boost_LIBRARIES PREPEND Boost::)
 message(STATUS "Boost targets: ${Boost_LIBRARIES}")
diff --git a/CMake/resolve_dependency_modules/c-ares.cmake b/CMake/resolve_dependency_modules/c-ares.cmake
new file mode 100644
index 0000000000000..3dc5b27addeda
--- /dev/null
+++ b/CMake/resolve_dependency_modules/c-ares.cmake
@@ -0,0 +1,43 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+include_guard(GLOBAL) + +set(VELOX_CARES_BUILD_VERSION cares-1_13_0) +set(VELOX_CARES_BUILD_SHA256_CHECKSUM + 7c48c57706a38691041920e705d2a04426ad9c68d40edd600685323f214b2d57) +string( + CONCAT VELOX_CARES_SOURCE_URL + "https://github.com/c-ares/c-ares/archive/refs/tags/" + "${VELOX_CARES_BUILD_VERSION}.tar.gz") + +resolve_dependency_url(CARES) + +message(STATUS "Building C-ARES from source") + +FetchContent_Declare( + c-ares + URL ${VELOX_CARES_SOURCE_URL} + URL_HASH ${VELOX_CARES_BUILD_SHA256_CHECKSUM} + PATCH_COMMAND + git init && git apply + ${CMAKE_CURRENT_LIST_DIR}/c-ares/c-ares-random-file.patch + OVERRIDE_FIND_PACKAGE EXCLUDE_FROM_ALL SYSTEM) + +set(CARES_STATIC ON) +set(CARES_INSTALL ON) +set(CARES_SHARED OFF) +FetchContent_MakeAvailable(c-ares) +if(NOT TARGET c-ares::cares) + add_library(c-ares::cares ALIAS c-ares) +endif() diff --git a/CMake/resolve_dependency_modules/c-ares/c-ares-random-file.patch b/CMake/resolve_dependency_modules/c-ares/c-ares-random-file.patch new file mode 100644 index 0000000000000..49f5dadeb3434 --- /dev/null +++ b/CMake/resolve_dependency_modules/c-ares/c-ares-random-file.patch @@ -0,0 +1,73 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 364b6c7c..c96a5b6c 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -333,6 +333,8 @@ SET (CMAKE_EXTRA_INCLUDE_FILES) + SET (CMAKE_REQUIRED_DEFINITIONS) + SET (CMAKE_REQUIRED_LIBRARIES) + ++find_file(CARES_RANDOM_FILE urandom /dev) ++mark_as_advanced(CARES_RANDOM_FILE) + + ################################################################################ + # recv, recvfrom, send, getnameinfo, gethostname +diff --git a/ares_config.h.cmake b/ares_config.h.cmake +index 0cb2f6ae..b5da36d3 100644 +--- a/ares_config.h.cmake ++++ b/ares_config.h.cmake +@@ -338,7 +338,7 @@ + #cmakedefine NEED_MEMORY_H + + /* a suitable file/device to read random data from */ +-#cmakedefine RANDOM_FILE ++#cmakedefine CARES_RANDOM_FILE "@CARES_RANDOM_FILE@" + + /* Define to the type qualifier pointed by arg 5 for recvfrom. */ + #define RECVFROM_QUAL_ARG5 @RECVFROM_QUAL_ARG5@ +diff --git a/ares_init.c b/ares_init.c +index f7b700bf..29bb9784 100644 +--- a/ares_init.c ++++ b/ares_init.c +@@ -2376,8 +2376,8 @@ static void randomize_key(unsigned char* key,int key_data_len) + randomized = 1; + } + #else /* !WIN32 */ +-#ifdef RANDOM_FILE +- FILE *f = fopen(RANDOM_FILE, "rb"); ++#ifdef CARES_RANDOM_FILE ++ FILE *f = fopen(CARES_RANDOM_FILE, "rb"); + if(f) { + counter = aresx_uztosi(fread(key, 1, key_data_len, f)); + fclose(f); +diff --git a/configure.ac b/configure.ac +index 7e86ad67..ffdcfdf7 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -829,22 +829,22 @@ dnl Check for user-specified random device + AC_ARG_WITH(random, + AC_HELP_STRING([--with-random=FILE], + [read randomness from FILE (default=/dev/urandom)]), +- [ RANDOM_FILE="$withval" ], ++ [ CARES_RANDOM_FILE="$withval" ], + [ + dnl Check for random device. If we're cross compiling, we can't + dnl check, and it's better to assume it doesn't exist than it is + dnl to fail on AC_CHECK_FILE or later. 
+ if test "$cross_compiling" = "no"; then
+-  AC_CHECK_FILE("/dev/urandom", [ RANDOM_FILE="/dev/urandom"] )
++  AC_CHECK_FILE("/dev/urandom", [ CARES_RANDOM_FILE="/dev/urandom"] )
+ else
+   AC_MSG_WARN([cannot check for /dev/urandom while cross compiling; assuming none])
+ fi
+
+ ]
+ )
+-if test -n "$RANDOM_FILE" && test X"$RANDOM_FILE" != Xno ; then
+-  AC_SUBST(RANDOM_FILE)
+-  AC_DEFINE_UNQUOTED(RANDOM_FILE, "$RANDOM_FILE",
++if test -n "$CARES_RANDOM_FILE" && test X"$CARES_RANDOM_FILE" != Xno ; then
++  AC_SUBST(CARES_RANDOM_FILE)
++  AC_DEFINE_UNQUOTED(CARES_RANDOM_FILE, "$CARES_RANDOM_FILE",
+   [a suitable file/device to read random data from])
+ fi
+
diff --git a/CMake/resolve_dependency_modules/cpr.cmake b/CMake/resolve_dependency_modules/cpr.cmake
new file mode 100644
index 0000000000000..45e0d16375a81
--- /dev/null
+++ b/CMake/resolve_dependency_modules/cpr.cmake
@@ -0,0 +1,46 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+include_guard(GLOBAL)
+
+set(VELOX_CPR_VERSION 1.10.5)
+set(VELOX_CPR_BUILD_SHA256_CHECKSUM
+    c8590568996cea918d7cf7ec6845d954b9b95ab2c4980b365f582a665dea08d8)
+set(VELOX_CPR_SOURCE_URL
+    "https://github.com/libcpr/cpr/archive/refs/tags/${VELOX_CPR_VERSION}.tar.gz"
+)
+
+# Add the dependency for curl so that we can define the source URL for curl in
+# curl.cmake. This will override the curl version declared by cpr.
+set(curl_SOURCE BUNDLED)
+resolve_dependency(curl)
+
+resolve_dependency_url(CPR)
+
+message(STATUS "Building cpr from source")
+FetchContent_Declare(
+  cpr
+  URL ${VELOX_CPR_SOURCE_URL}
+  URL_HASH ${VELOX_CPR_BUILD_SHA256_CHECKSUM}
+  PATCH_COMMAND
+    git apply ${CMAKE_CURRENT_LIST_DIR}/cpr/cpr-libcurl-compatible.patch && git
+    apply ${CMAKE_CURRENT_LIST_DIR}/cpr/cpr-remove-sancheck.patch)
+set(BUILD_SHARED_LIBS OFF)
+set(CPR_USE_SYSTEM_CURL OFF)
+# ZLIB has already been found by find_package(ZLIB REQUIRED); set CURL_ZLIB=OFF
+# to save compile time.
+set(CURL_ZLIB OFF)
+FetchContent_MakeAvailable(cpr)
+# libcpr in its CMakeLists.txt file disables BUILD_TESTING globally when
+# CPR_USE_SYSTEM_CURL=OFF. Unset BUILD_TESTING here.
+unset(BUILD_TESTING)
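The plain `set()` calls above take effect inside the fetched project because the top-level `CMakeLists.txt` (later in this diff) sets policy CMP0077 to NEW, so ordinary variables override `option()` defaults in subprojects. A sketch with a hypothetical option name:

```cmake
# With CMP0077 defaulted to NEW for subprojects, a plain variable is enough
# to turn off an option() defined inside the fetched project.
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
set(DEP_BUILD_TESTS OFF) # hypothetical option name in the fetched project
FetchContent_MakeAvailable(dep)
```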
diff --git a/CMake/resolve_dependency_modules/cpr/cpr-libcurl-compatible.patch b/CMake/resolve_dependency_modules/cpr/cpr-libcurl-compatible.patch
new file mode 100644
index 0000000000000..49821889f2bfd
--- /dev/null
+++ b/CMake/resolve_dependency_modules/cpr/cpr-libcurl-compatible.patch
@@ -0,0 +1,41 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This can be removed once we upgrade to curl >= 7.68.0
+--- a/cpr/multiperform.cpp
++++ b/cpr/multiperform.cpp
+@@ -97,9 +97,9 @@ void MultiPerform::DoMultiPerform() {
+
+         if (still_running) {
+             const int timeout_ms{250};
+-            error_code = curl_multi_poll(multicurl_->handle, nullptr, 0, timeout_ms, nullptr);
++            error_code = curl_multi_wait(multicurl_->handle, nullptr, 0, timeout_ms, nullptr);
+             if (error_code) {
+-                std::cerr << "curl_multi_poll() failed, code " << static_cast<int>(error_code) << std::endl;
++                std::cerr << "curl_multi_wait() failed, code " << static_cast<int>(error_code) << std::endl;
+                 break;
+             }
+         }
+
+--- a/include/cpr/util.h
++++ b/include/cpr/util.h
+@@ -23,7 +23,7 @@ size_t writeUserFunction(char* ptr, size_t size, size_t nmemb, const WriteCallba
+ template <typename T>
+ int progressUserFunction(const T* progress, cpr_pf_arg_t dltotal, cpr_pf_arg_t dlnow, cpr_pf_arg_t ultotal, cpr_pf_arg_t ulnow) {
+     const int cancel_retval{1};
+-    static_assert(cancel_retval != CURL_PROGRESSFUNC_CONTINUE);
++    static_assert(cancel_retval != 0x10000001);
+     return (*progress)(dltotal, dlnow, ultotal, ulnow) ? 0 : cancel_retval;
+ }
+ int debugUserFunction(CURL* handle, curl_infotype type, char* data, size_t size, const DebugCallback* debug);
diff --git a/CMake/resolve_dependency_modules/cpr/cpr-remove-sancheck.patch b/CMake/resolve_dependency_modules/cpr/cpr-remove-sancheck.patch
new file mode 100644
index 0000000000000..4fca92831a206
--- /dev/null
+++ b/CMake/resolve_dependency_modules/cpr/cpr-remove-sancheck.patch
@@ -0,0 +1,24 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This hangs on CI and is not needed #9116
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -84,7 +84,6 @@ endif()
+ include(GNUInstallDirs)
+ include(FetchContent)
+ include(cmake/code_coverage.cmake)
+-include(cmake/sanitizer.cmake)
+ include(cmake/clear_variable.cmake)
+
+ # So CMake can find FindMbedTLS.cmake
diff --git a/CMake/resolve_dependency_modules/curl.cmake b/CMake/resolve_dependency_modules/curl.cmake
new file mode 100644
index 0000000000000..114aa30c9fcf3
--- /dev/null
+++ b/CMake/resolve_dependency_modules/curl.cmake
@@ -0,0 +1,30 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+include_guard(GLOBAL)
+
+set(VELOX_CURL_VERSION 8.4.0)
+string(REPLACE "." "_" VELOX_CURL_VERSION_UNDERSCORES ${VELOX_CURL_VERSION})
+set(VELOX_CURL_BUILD_SHA256_CHECKSUM
+    16c62a9c4af0f703d28bda6d7bbf37ba47055ad3414d70dec63e2e6336f2a82d)
+string(
+  CONCAT
+    VELOX_CURL_SOURCE_URL "https://github.com/curl/curl/releases/download/"
+    "curl-${VELOX_CURL_VERSION_UNDERSCORES}/curl-${VELOX_CURL_VERSION}.tar.xz")
+
+resolve_dependency_url(CURL)
+
+FetchContent_Declare(
+  curl
+  URL ${VELOX_CURL_SOURCE_URL}
+  URL_HASH ${VELOX_CURL_BUILD_SHA256_CHECKSUM})
diff --git a/CMake/resolve_dependency_modules/duckdb.cmake b/CMake/resolve_dependency_modules/duckdb.cmake
new file mode 100644
index 0000000000000..f606ab48ef5b9
--- /dev/null
+++ b/CMake/resolve_dependency_modules/duckdb.cmake
@@ -0,0 +1,56 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+include_guard(GLOBAL)
+
+set(VELOX_DUCKDB_VERSION 0.8.1)
+set(VELOX_DUCKDB_BUILD_SHA256_CHECKSUM
+    a0674f7e320dc7ebcf51990d7fc1c0e7f7b2c335c08f5953702b5285e6c30694)
+set(VELOX_DUCKDB_SOURCE_URL
+    "https://github.com/duckdb/duckdb/archive/refs/tags/v${VELOX_DUCKDB_VERSION}.tar.gz"
+)
+
+resolve_dependency_url(DUCKDB)
+
+message(STATUS "Building DuckDB from source")
+# We need remove-ccache.patch so that DuckDB does not add ccache to the build
+# command a second time; Velox already does this. We need
+# fix-duckdbversion.patch as DuckDB tries to infer the version via a git commit
+# hash or git tag. This inference can lead to errors when building in another
+# git project such as Prestissimo.
+FetchContent_Declare( + duckdb + URL ${VELOX_DUCKDB_SOURCE_URL} + URL_HASH ${VELOX_DUCKDB_BUILD_SHA256_CHECKSUM} + PATCH_COMMAND + git apply ${CMAKE_CURRENT_LIST_DIR}/duckdb/remove-ccache.patch && git apply + ${CMAKE_CURRENT_LIST_DIR}/duckdb/fix-duckdbversion.patch && git apply + ${CMAKE_CURRENT_LIST_DIR}/duckdb/re2.patch) + +set(BUILD_UNITTESTS OFF) +set(ENABLE_SANITIZER OFF) +set(ENABLE_UBSAN OFF) +set(BUILD_SHELL OFF) +set(EXPORT_DLL_SYMBOLS OFF) +set(PREVIOUS_BUILD_TYPE ${CMAKE_BUILD_TYPE}) +set(CMAKE_BUILD_TYPE Release) +set(PREVIOUS_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-non-virtual-dtor") + +FetchContent_MakeAvailable(duckdb) + +if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") + target_compile_options(duckdb_catalog PRIVATE -Wno-nonnull-compare) +endif() + +set(CMAKE_CXX_FLAGS ${PREVIOUS_CMAKE_CXX_FLAGS}) +set(CMAKE_BUILD_TYPE ${PREVIOUS_BUILD_TYPE}) diff --git a/CMake/resolve_dependency_modules/duckdb/fix-duckdbversion.patch b/CMake/resolve_dependency_modules/duckdb/fix-duckdbversion.patch new file mode 100644 index 0000000000000..d990646800f55 --- /dev/null +++ b/CMake/resolve_dependency_modules/duckdb/fix-duckdbversion.patch @@ -0,0 +1,59 @@ +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -210,56 +210,8 @@ + set(CXX_EXTRA "${CXX_EXTRA} -mimpure-text") + add_definitions(-DSUN=1) + set(SUN TRUE) +-endif() +- +-find_package(Git) +-if(Git_FOUND) +- if (NOT DEFINED GIT_COMMIT_HASH) +- execute_process( +- COMMAND ${GIT_EXECUTABLE} log -1 --format=%h +- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} +- RESULT_VARIABLE GIT_RESULT +- OUTPUT_VARIABLE GIT_COMMIT_HASH +- OUTPUT_STRIP_TRAILING_WHITESPACE) +- endif() +- execute_process( +- COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 +- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} +- OUTPUT_VARIABLE GIT_LAST_TAG +- OUTPUT_STRIP_TRAILING_WHITESPACE) +- execute_process( +- COMMAND ${GIT_EXECUTABLE} describe --tags --long +- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} +- OUTPUT_VARIABLE GIT_ITERATION +- OUTPUT_STRIP_TRAILING_WHITESPACE) +-else() +- message("Git NOT FOUND") +-endif() +- +-if(GIT_RESULT EQUAL "0") +- string(REGEX REPLACE "v([0-9]+).[0-9]+.[0-9]+" "\\1" DUCKDB_MAJOR_VERSION "${GIT_LAST_TAG}") +- string(REGEX REPLACE "v[0-9]+.([0-9]+).[0-9]+" "\\1" DUCKDB_MINOR_VERSION "${GIT_LAST_TAG}") +- string(REGEX REPLACE "v[0-9]+.[0-9]+.([0-9]+)" "\\1" DUCKDB_PATCH_VERSION "${GIT_LAST_TAG}") +- string(REGEX REPLACE ".*-([0-9]+)-.*" "\\1" DUCKDB_DEV_ITERATION "${GIT_ITERATION}") +- +- if(DUCKDB_DEV_ITERATION EQUAL 0) +- # on a tag; directly use the version +- set(DUCKDB_VERSION "${GIT_LAST_TAG}") +- else() +- # not on a tag, increment the patch version by one and add a -devX suffix +- math(EXPR DUCKDB_PATCH_VERSION "${DUCKDB_PATCH_VERSION}+1") +- set(DUCKDB_VERSION "v${DUCKDB_MAJOR_VERSION}.${DUCKDB_MINOR_VERSION}.${DUCKDB_PATCH_VERSION}-dev${DUCKDB_DEV_ITERATION}") +- endif() +-else() +- # fallback for when building from tarball +- set(DUCKDB_MAJOR_VERSION 0) +- set(DUCKDB_MINOR_VERSION 0) +- set(DUCKDB_PATCH_VERSION 1) +- set(DUCKDB_DEV_ITERATION 0) +- set(DUCKDB_VERSION "v${DUCKDB_MAJOR_VERSION}.${DUCKDB_MINOR_VERSION}.${DUCKDB_PATCH_VERSION}-dev${DUCKDB_DEV_ITERATION}") + endif() + +-message(STATUS "git hash ${GIT_COMMIT_HASH}, version ${DUCKDB_VERSION}") + + option(AMALGAMATION_BUILD + "Build from the amalgamation files, rather than from the normal sources." 
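The DuckDB module above temporarily switches global settings (build type, CXX flags) while the vendored project is configured, then restores them. Reduced to its core, the idiom looks like this; the dependency name is hypothetical:

```cmake
# Save, override, restore: keep vendored-build settings from leaking out.
set(PREVIOUS_BUILD_TYPE ${CMAKE_BUILD_TYPE})
set(CMAKE_BUILD_TYPE Release) # build the vendored code optimized regardless

FetchContent_MakeAvailable(dep) # hypothetical dependency

set(CMAKE_BUILD_TYPE ${PREVIOUS_BUILD_TYPE})
```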
diff --git a/CMake/resolve_dependency_modules/duckdb/re2.patch b/CMake/resolve_dependency_modules/duckdb/re2.patch
new file mode 100644
index 0000000000000..43ec173ee7838
--- /dev/null
+++ b/CMake/resolve_dependency_modules/duckdb/re2.patch
@@ -0,0 +1,11 @@
+--- a/third_party/re2/CMakeLists.txt
++++ b/third_party/re2/CMakeLists.txt
+@@ -90,7 +90,7 @@
+
+ target_include_directories(
+   duckdb_re2
+-  PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
++  PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
+
+ install(TARGETS duckdb_re2
+   EXPORT "${DUCKDB_EXPORT_SET}"
diff --git a/CMake/resolve_dependency_modules/duckdb/remove-ccache.patch b/CMake/resolve_dependency_modules/duckdb/remove-ccache.patch
new file mode 100644
index 0000000000000..4d5097f0adca8
--- /dev/null
+++ b/CMake/resolve_dependency_modules/duckdb/remove-ccache.patch
@@ -0,0 +1,19 @@
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -32,16 +32,6 @@ set(CMAKE_VERBOSE_MAKEFILE OFF)
+ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+ set(CMAKE_MACOSX_RPATH 1)
+
+-find_program(CCACHE_PROGRAM ccache)
+-if(CCACHE_PROGRAM)
+-  set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_PROGRAM}")
+-else()
+-  find_program(CCACHE_PROGRAM sccache)
+-  if(CCACHE_PROGRAM)
+-    set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_PROGRAM}")
+-  endif()
+-endif()
+-
+ # Determine install paths
+ set(INSTALL_LIB_DIR
+     lib
diff --git a/CMake/resolve_dependency_modules/fast_float.cmake b/CMake/resolve_dependency_modules/fast_float.cmake
new file mode 100644
index 0000000000000..a9f75c774d038
--- /dev/null
+++ b/CMake/resolve_dependency_modules/fast_float.cmake
@@ -0,0 +1,34 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+include_guard(GLOBAL)
+
+set(VELOX_FAST_FLOAT_VERSION 6.1.6)
+set(VELOX_FAST_FLOAT_BUILD_SHA256_CHECKSUM
+    4458aae4b0eb55717968edda42987cabf5f7fc737aee8fede87a70035dba9ab0)
+set(VELOX_FAST_FLOAT_SOURCE_URL
+    "https://github.com/fastfloat/fast_float/archive/v${VELOX_FAST_FLOAT_VERSION}.tar.gz"
+)
+
+resolve_dependency_url(FAST_FLOAT)
+
+message(STATUS "Building fast_float from source")
+FetchContent_Declare(
+  fast_float
+  URL ${VELOX_FAST_FLOAT_SOURCE_URL}
+  URL_HASH ${VELOX_FAST_FLOAT_BUILD_SHA256_CHECKSUM})
+
+FetchContent_MakeAvailable(fast_float)
+# Folly searches for the header path directly so we need to make sure to
+# search in the dependency path.
+list(APPEND CMAKE_PREFIX_PATH "${fast_float_SOURCE_DIR}")
diff --git a/CMake/resolve_dependency_modules/fmt.cmake b/CMake/resolve_dependency_modules/fmt.cmake
index 55be2629962ed..88d8d674d3a3d 100644
--- a/CMake/resolve_dependency_modules/fmt.cmake
+++ b/CMake/resolve_dependency_modules/fmt.cmake
@@ -13,9 +13,9 @@
 # limitations under the License.
 include_guard(GLOBAL)

-set(VELOX_FMT_VERSION 8.0.1)
+set(VELOX_FMT_VERSION 10.1.1)
 set(VELOX_FMT_BUILD_SHA256_CHECKSUM
-    b06ca3130158c625848f3fb7418f235155a4d389b2abc3a6245fb01cb0eb1e01)
+    78b8c0a72b1c35e4443a7e308df52498252d1cefc2b08c9a97bc9ee6cfe61f8b)
 set(VELOX_FMT_SOURCE_URL
     "https://github.com/fmtlib/fmt/archive/${VELOX_FMT_VERSION}.tar.gz")
@@ -25,9 +25,7 @@ message(STATUS "Building fmt from source")
 FetchContent_Declare(
   fmt
   URL ${VELOX_FMT_SOURCE_URL}
-  URL_HASH ${VELOX_FMT_BUILD_SHA256_CHECKSUM}
-  PATCH_COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/fmt/no-targets.patch)
-
+  URL_HASH ${VELOX_FMT_BUILD_SHA256_CHECKSUM})
 # Force fmt to create fmt-config.cmake which can be found by other dependencies
 # (e.g. folly)
 set(FMT_INSTALL ON)
diff --git a/CMake/resolve_dependency_modules/fmt/no-targets.patch b/CMake/resolve_dependency_modules/fmt/no-targets.patch
deleted file mode 100644
index 5ec962d32973c..0000000000000
--- a/CMake/resolve_dependency_modules/fmt/no-targets.patch
+++ /dev/null
@@ -1,12 +0,0 @@
-This can be removed once we upgrade to fmt >= 9
---- a/support/cmake/fmt-config.cmake.in
-+++ b/support/cmake/fmt-config.cmake.in
-@@ -1,4 +1,7 @@
- @PACKAGE_INIT@
-
--include(${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake)
-+if (NOT TARGET fmt::fmt)
-+  include(${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake)
-+endif ()
-+
- check_required_components(fmt)
diff --git a/CMake/resolve_dependency_modules/folly/CMakeLists.txt b/CMake/resolve_dependency_modules/folly/CMakeLists.txt
index a017409c1bd37..6479e3613bea6 100644
--- a/CMake/resolve_dependency_modules/folly/CMakeLists.txt
+++ b/CMake/resolve_dependency_modules/folly/CMakeLists.txt
@@ -14,13 +14,15 @@ project(Folly)
 cmake_minimum_required(VERSION 3.14)

-set(VELOX_FOLLY_BUILD_VERSION v2022.11.14.00)
+set(VELOX_FOLLY_BUILD_VERSION v2024.09.16.00)
 set(VELOX_FOLLY_BUILD_SHA256_CHECKSUM
-    b249436cb61b6dfd5288093565438d8da642b07ae021191a4042b221bc1bdc0e)
+    0a375f2f3e15a2679b4d21fa1064986830a52f59c74d82b3bda1aeeea4e77da0)
 set(VELOX_FOLLY_SOURCE_URL
-    "https://github.com/facebook/folly/archive/${VELOX_FOLLY_BUILD_VERSION}.tar.gz"
+    "https://github.com/facebook/folly/releases/download/${VELOX_FOLLY_BUILD_VERSION}/folly-${VELOX_FOLLY_BUILD_VERSION}.tar.gz"
 )

+set(fast_float_SOURCE BUNDLED)
+resolve_dependency(fast_float)
 resolve_dependency_url(FOLLY)

 message(STATUS "Building Folly from source")
@@ -52,11 +54,17 @@ add_library(Folly::folly ALIAS folly)

 # The folly target does not contain any include directories, they are
 # propagated from folly_base. This marks them as system headers which should
 # suppress warnings generated by them when they are included elsewhere.
-get_target_property(_inc folly_base INTERFACE_INCLUDE_DIRECTORIES)
-target_include_directories(folly_base SYSTEM INTERFACE ${_inc})
+set_target_properties(
+  folly_deps
+  PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES
+             $<TARGET_PROPERTY:folly_deps,INTERFACE_INCLUDE_DIRECTORIES>)
+set_target_properties(
+  folly_base
+  PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES
+             $<TARGET_PROPERTY:folly_base,INTERFACE_INCLUDE_DIRECTORIES>)

 if(${gflags_SOURCE} STREQUAL "BUNDLED")
-  add_dependencies(folly glog gflags)
+  add_dependencies(folly glog gflags_static fmt::fmt)
 endif()

 set(FOLLY_BENCHMARK_STATIC_LIB
diff --git a/CMake/resolve_dependency_modules/folly/folly-gflags-glog.patch b/CMake/resolve_dependency_modules/folly/folly-gflags-glog.patch
index 6ef25f5ac2317..a330989000423 100644
--- a/CMake/resolve_dependency_modules/folly/folly-gflags-glog.patch
+++ b/CMake/resolve_dependency_modules/folly/folly-gflags-glog.patch
@@ -13,10 +13,10 @@
 # limitations under the License.
--- a/CMake/folly-deps.cmake +++ b/CMake/folly-deps.cmake -@@ -52,19 +52,20 @@ find_package(DoubleConversion MODULE REQUIRED) - list(APPEND FOLLY_LINK_LIBRARIES ${DOUBLE_CONVERSION_LIBRARY}) - list(APPEND FOLLY_INCLUDE_DIRECTORIES ${DOUBLE_CONVERSION_INCLUDE_DIR}) - +@@ -55,19 +55,23 @@ list(APPEND FOLLY_INCLUDE_DIRECTORIES ${DOUBLE_CONVERSION_INCLUDE_DIR}) + find_package(FastFloat MODULE REQUIRED) + list(APPEND FOLLY_INCLUDE_DIRECTORIES ${FASTFLOAT_INCLUDE_DIR}) + -find_package(Gflags MODULE) -set(FOLLY_HAVE_LIBGFLAGS ${LIBGFLAGS_FOUND}) -if(LIBGFLAGS_FOUND) @@ -27,12 +27,14 @@ +find_package(gflags) +set(FOLLY_HAVE_LIBGFLAGS ${gflags_FOUND}) +if(gflags_FOUND) -+ list(APPEND FOLLY_LINK_LIBRARIES ${gflags_LIBRARY}) -+ list(APPEND FOLLY_INCLUDE_DIRECTORIES ${gflags_INCLUDE_DIR}) -+ set(FOLLY_LIBGFLAGS_LIBRARY ${gflags_LIBRARY}) -+ set(FOLLY_LIBGFLAGS_INCLUDE ${gflags_INCLUDE_DIR}) ++ list(APPEND FOLLY_LINK_LIBRARIES ${gflags_LIBRARY}) ++ list(APPEND FOLLY_INCLUDE_DIRECTORIES ${gflags_INCLUDE_DIR}) ++ set(FOLLY_LIBGFLAGS_LIBRARY ${gflags_LIBRARY}) ++ set(FOLLY_LIBGFLAGS_INCLUDE ${gflags_INCLUDE_DIR}) ++ message(STATUS "gflags_INCLUDE_DIR: ${gflags_INCLUDE_DIR}") ++ message(STATUS "gflags_LIBRARY: ${gflags_LIBRARY}") endif() - + -find_package(Glog MODULE) -set(FOLLY_HAVE_LIBGLOG ${GLOG_FOUND}) -list(APPEND FOLLY_LINK_LIBRARIES ${GLOG_LIBRARY}) @@ -41,7 +43,8 @@ +set(FOLLY_HAVE_LIBGLOG ${glog_FOUND}) +list(APPEND FOLLY_LINK_LIBRARIES ${glog_LIBRARY}) +list(APPEND FOLLY_INCLUDE_DIRECTORIES ${glog_INCLUDE_DIR}) -+message(STATUS "glog_INCLUDE_DIR: ${gflags_LINRARY}") - ++message(STATUS "glog_INCLUDE_DIR: ${glog_INCLUDE_DIR}") ++message(STATUS "glog_LIBRARY: ${glog_LIBRARY}") + find_package(LibEvent MODULE REQUIRED) list(APPEND FOLLY_LINK_LIBRARIES ${LIBEVENT_LIB}) diff --git a/CMake/resolve_dependency_modules/glog.cmake b/CMake/resolve_dependency_modules/glog.cmake index 2b77c8f7e9a7a..54836009ed7c8 100644 --- a/CMake/resolve_dependency_modules/glog.cmake +++ b/CMake/resolve_dependency_modules/glog.cmake @@ -37,7 +37,7 @@ set(BUILD_TESTING OFF) FetchContent_MakeAvailable(glog) unset(BUILD_TESTING) unset(BUILD_SHARED_LIBS) -add_dependencies(glog gflags) +add_dependencies(glog gflags_static) list(PREPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/glog) set(glog_INCLUDE_DIR ${glog_BINARY_DIR}) diff --git a/CMake/resolve_dependency_modules/google_cloud_cpp_storage.cmake b/CMake/resolve_dependency_modules/google_cloud_cpp_storage.cmake new file mode 100644 index 0000000000000..8106234925e7b --- /dev/null +++ b/CMake/resolve_dependency_modules/google_cloud_cpp_storage.cmake @@ -0,0 +1,41 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
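The storage module that follows, like most files in this directory, is an instance of one shared skeleton. For reference, a sketch of that skeleton with a hypothetical dependency `foo` and dummy values, to make the recurring pieces easier to spot:

```cmake
include_guard(GLOBAL) # each module is loaded at most once

set(VELOX_FOO_VERSION 1.2.3) # hypothetical version pin
set(VELOX_FOO_BUILD_SHA256_CHECKSUM
    0000000000000000000000000000000000000000000000000000000000000000)
set(VELOX_FOO_SOURCE_URL
    "https://example.org/foo-${VELOX_FOO_VERSION}.tar.gz")

resolve_dependency_url(FOO) # allows per-dependency URL/checksum overrides

FetchContent_Declare(
  foo
  URL ${VELOX_FOO_SOURCE_URL}
  URL_HASH ${VELOX_FOO_BUILD_SHA256_CHECKSUM})
FetchContent_MakeAvailable(foo)
```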
+include_guard(GLOBAL) + +set_source(gRPC) +resolve_dependency(gRPC CONFIG 1.48.1 REQUIRED) + +set(VELOX_GOOGLE_CLOUD_CPP_BUILD_VERSION 2.22.0) +set(VELOX_GOOGLE_CLOUD_CPP_BUILD_SHA256_CHECKSUM + 0c68782e57959c82e0c81def805c01460a042c1aae0c2feee905acaa2a2dc9bf) +string( + CONCAT VELOX_GOOGLE_CLOUD_CPP_SOURCE_URL + "https://github.com/googleapis/google-cloud-cpp/archive/refs/tags/" + "v${VELOX_GOOGLE_CLOUD_CPP_BUILD_VERSION}.tar.gz") + +resolve_dependency_url(GOOGLE_CLOUD_CPP) + +message(STATUS "Building Google Cloud CPP storage from source") + +FetchContent_Declare( + google_cloud_cpp + URL ${VELOX_GOOGLE_CLOUD_CPP_SOURCE_URL} + URL_HASH ${VELOX_GOOGLE_CLOUD_CPP_BUILD_SHA256_CHECKSUM} + OVERRIDE_FIND_PACKAGE EXCLUDE_FROM_ALL SYSTEM) + +set(GOOGLE_CLOUD_CPP_ENABLE_EXAMPLES OFF) +set(GOOGLE_CLOUD_CPP_ENABLE + "storage" + CACHE STRING "The list of libraries to build.") +FetchContent_MakeAvailable(google_cloud_cpp) diff --git a/CMake/resolve_dependency_modules/grpc.cmake b/CMake/resolve_dependency_modules/grpc.cmake new file mode 100644 index 0000000000000..a2e1a48672bf5 --- /dev/null +++ b/CMake/resolve_dependency_modules/grpc.cmake @@ -0,0 +1,61 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +include_guard(GLOBAL) + +set_source(absl) +resolve_dependency(absl CONFIG REQUIRED) + +set(VELOX_GRPC_BUILD_VERSION 1.48.1) +set(VELOX_GRPC_BUILD_SHA256_CHECKSUM + 320366665d19027cda87b2368c03939006a37e0388bfd1091c8d2a96fbc93bd8) +string( + CONCAT VELOX_GRPC_SOURCE_URL + "https://github.com/grpc/grpc/archive/refs/tags/" + "v${VELOX_GRPC_BUILD_VERSION}.tar.gz") + +resolve_dependency_url(GRPC) + +message(STATUS "Building gRPC from source") + +FetchContent_Declare( + gRPC + URL ${VELOX_GRPC_SOURCE_URL} + URL_HASH ${VELOX_GRPC_BUILD_SHA256_CHECKSUM} + OVERRIDE_FIND_PACKAGE EXCLUDE_FROM_ALL) + +set(gRPC_ABSL_PROVIDER + "package" + CACHE STRING "Provider of absl library") +set(gRPC_ZLIB_PROVIDER + "package" + CACHE STRING "Provider of zlib library") +set(gRPC_CARES_PROVIDER + "package" + CACHE STRING "Provider of c-ares library") +set(gRPC_RE2_PROVIDER + "package" + CACHE STRING "Provider of re2 library") +set(gRPC_SSL_PROVIDER + "package" + CACHE STRING "Provider of ssl library") +set(gRPC_PROTOBUF_PROVIDER + "package" + CACHE STRING "Provider of protobuf library") +set(gRPC_INSTALL + ON + CACHE BOOL "Generate installation target") +FetchContent_MakeAvailable(gRPC) +add_library(gRPC::grpc ALIAS grpc) +add_library(gRPC::grpc++ ALIAS grpc++) +add_executable(gRPC::grpc_cpp_plugin ALIAS grpc_cpp_plugin) diff --git a/CMake/resolve_dependency_modules/gtest.cmake b/CMake/resolve_dependency_modules/gtest.cmake index 5f0c29553c5da..8f35b9e954b95 100644 --- a/CMake/resolve_dependency_modules/gtest.cmake +++ b/CMake/resolve_dependency_modules/gtest.cmake @@ -29,3 +29,6 @@ FetchContent_Declare( URL_HASH ${VELOX_GTEST_BUILD_SHA256_CHECKSUM}) FetchContent_MakeAvailable(gtest) + +# Mask compilation warning in clang 16. 
+target_compile_options(gtest PRIVATE -Wno-implicit-int-float-conversion) diff --git a/CMake/resolve_dependency_modules/libstemmer/Makefile.patch b/CMake/resolve_dependency_modules/libstemmer/Makefile.patch new file mode 100644 index 0000000000000..e7d6910521115 --- /dev/null +++ b/CMake/resolve_dependency_modules/libstemmer/Makefile.patch @@ -0,0 +1,24 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- a/Makefile ++++ b/Makefile +@@ -3,7 +3,7 @@ + EXEEXT=.exe + endif + CFLAGS=-O2 +-CPPFLAGS=-Iinclude ++CPPFLAGS=-Iinclude -fPIC + all: libstemmer.a stemwords$(EXEEXT) + libstemmer.a: $(snowball_sources:.c=.o) + $(AR) -cru $@ $^ diff --git a/CMake/resolve_dependency_modules/protobuf.cmake b/CMake/resolve_dependency_modules/protobuf.cmake index 78f896fd655c2..e79d9871b9da5 100644 --- a/CMake/resolve_dependency_modules/protobuf.cmake +++ b/CMake/resolve_dependency_modules/protobuf.cmake @@ -13,15 +13,23 @@ # limitations under the License. include_guard(GLOBAL) -set(VELOX_PROTOBUF_BUILD_VERSION 21.4) +set(VELOX_PROTOBUF_BUILD_VERSION 21.8) set(VELOX_PROTOBUF_BUILD_SHA256_CHECKSUM - 6c5e1b0788afba4569aeebb2cfe205cb154aa01deacaba0cd26442f3b761a836) -string( - CONCAT - VELOX_PROTOBUF_SOURCE_URL - "https://github.com/protocolbuffers/protobuf/releases/download/" - "v${VELOX_PROTOBUF_BUILD_VERSION}/protobuf-all-${VELOX_PROTOBUF_BUILD_VERSION}.tar.gz" -) + 83ad4faf95ff9cbece7cb9c56eb3ca9e42c3497b77001840ab616982c6269fb6) +if(${VELOX_PROTOBUF_BUILD_VERSION} LESS 22.0) + string( + CONCAT + VELOX_PROTOBUF_SOURCE_URL + "https://github.com/protocolbuffers/protobuf/releases/download/" + "v${VELOX_PROTOBUF_BUILD_VERSION}/protobuf-all-${VELOX_PROTOBUF_BUILD_VERSION}.tar.gz" + ) +else() + set_source(absl) + resolve_dependency(absl CONFIG REQUIRED) + string(CONCAT VELOX_PROTOBUF_SOURCE_URL + "https://github.com/protocolbuffers/protobuf/archive/" + "v${VELOX_PROTOBUF_BUILD_VERSION}.tar.gz") +endif() resolve_dependency_url(PROTOBUF) @@ -30,44 +38,12 @@ message(STATUS "Building Protobuf from source") FetchContent_Declare( protobuf URL ${VELOX_PROTOBUF_SOURCE_URL} - URL_HASH ${VELOX_PROTOBUF_BUILD_SHA256_CHECKSUM}) - -if(NOT protobuf_POPULATED) - # We don't want to build tests. - set(protobuf_BUILD_TESTS - OFF - CACHE BOOL "Disable protobuf tests" FORCE) - set(CMAKE_CXX_FLAGS_BKP "${CMAKE_CXX_FLAGS}") - - # Disable warnings that would fail protobuf compilation. 
- string(APPEND CMAKE_CXX_FLAGS " -Wno-missing-field-initializers") - - check_cxx_compiler_flag("-Wstringop-overflow" - COMPILER_HAS_W_STRINGOP_OVERFLOW) - if(COMPILER_HAS_W_STRINGOP_OVERFLOW) - string(APPEND CMAKE_CXX_FLAGS " -Wno-stringop-overflow") - endif() - - check_cxx_compiler_flag("-Winvalid-noreturn" COMPILER_HAS_W_INVALID_NORETURN) - - if(COMPILER_HAS_W_INVALID_NORETURN) - string(APPEND CMAKE_CXX_FLAGS " -Wno-invalid-noreturn") - else() - # Currently reproduced on Ubuntu 22.04 with clang 14 - string(APPEND CMAKE_CXX_FLAGS " -Wno-error") - endif() - - # Fetch the content using previously declared details - FetchContent_Populate(protobuf) - - # Set right path to libprotobuf-dev include files. - set(Protobuf_INCLUDE_DIRS "${protobuf_SOURCE_DIR}/src/") - set(Protobuf_PROTOC_EXECUTABLE "${protobuf_BINARY_DIR}/protoc") - if(CMAKE_BUILD_TYPE MATCHES Debug) - set(Protobuf_LIBRARIES "${protobuf_BINARY_DIR}/libprotobufd.a") - else() - set(Protobuf_LIBRARIES "${protobuf_BINARY_DIR}/libprotobuf.a") - endif() - add_subdirectory(${protobuf_SOURCE_DIR} ${protobuf_BINARY_DIR}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_BKP}") -endif() + URL_HASH ${VELOX_PROTOBUF_BUILD_SHA256_CHECKSUM} + OVERRIDE_FIND_PACKAGE EXCLUDE_FROM_ALL SYSTEM) + +set(protobuf_BUILD_TESTS OFF) +set(protobuf_ABSL_PROVIDER + "package" + CACHE STRING "Provider of absl library") +FetchContent_MakeAvailable(protobuf) +set(Protobuf_INCLUDE_DIRS ${protobuf_SOURCE_DIR}/src) diff --git a/CMake/resolve_dependency_modules/re2.cmake b/CMake/resolve_dependency_modules/re2.cmake index a5dc4baed89b9..6bc00c66c0197 100644 --- a/CMake/resolve_dependency_modules/re2.cmake +++ b/CMake/resolve_dependency_modules/re2.cmake @@ -44,7 +44,6 @@ endif() set(re2_LIBRARIES ${re2_BINARY_DIR}/libre2.a) set(re2_INCLUDE_DIRS ${re2_SOURCE_DIR}) -add_library(re2::re2 ALIAS re2) set(RE2_ROOT ${re2_BINARY_DIR}) set(re2_ROOT ${re2_BINARY_DIR}) diff --git a/CMake/resolve_dependency_modules/simdjson.cmake b/CMake/resolve_dependency_modules/simdjson.cmake index 050a5df4cde51..69e7f204494b2 100644 --- a/CMake/resolve_dependency_modules/simdjson.cmake +++ b/CMake/resolve_dependency_modules/simdjson.cmake @@ -13,9 +13,9 @@ # limitations under the License. include_guard(GLOBAL) -set(VELOX_SIMDJSON_VERSION 3.1.5) +set(VELOX_SIMDJSON_VERSION 3.9.3) set(VELOX_SIMDJSON_BUILD_SHA256_CHECKSUM - 5b916be17343324426fc467a4041a30151e481700d60790acfd89716ecc37076) + 2e3d10abcde543d3dd8eba9297522cafdcebdd1db4f51b28f3bc95bf1d6ad23c) set(VELOX_SIMDJSON_SOURCE_URL "https://github.com/simdjson/simdjson/archive/refs/tags/v${VELOX_SIMDJSON_VERSION}.tar.gz" ) diff --git a/CMake/resolve_dependency_modules/stemmer.cmake b/CMake/resolve_dependency_modules/stemmer.cmake new file mode 100644 index 0000000000000..dbaca146341bc --- /dev/null +++ b/CMake/resolve_dependency_modules/stemmer.cmake @@ -0,0 +1,57 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
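Unlike the FetchContent-based modules above, the stemmer module that follows has to fall back to `ExternalProject_Add` because libstemmer ships only a plain Makefile. The pattern, reduced to a sketch with hypothetical names and paths:

```cmake
include(ExternalProject)

# Build a make-based project and consume the archive it produces.
set(FOO_PREFIX "${CMAKE_BINARY_DIR}/_deps/foo") # hypothetical layout
ExternalProject_Add(
  foo_ep
  PREFIX ${FOO_PREFIX}
  URL "https://example.org/foo.tar.gz"
  BUILD_IN_SOURCE TRUE
  CONFIGURE_COMMAND "" # no configure step, it is a plain Makefile
  BUILD_COMMAND make
  INSTALL_COMMAND ""
  BUILD_BYPRODUCTS ${FOO_PREFIX}/src/foo_ep/libfoo.a)
```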
+include_guard(GLOBAL) + +set(VELOX_STEMMER_VERSION 2.2.0) +set(VELOX_STEMMER_BUILD_SHA256_CHECKSUM + b941d9fe9cf36b4e2f8d3873cd4d8b8775bd94867a1df8d8c001bb8b688377c3) +set(VELOX_STEMMER_SOURCE_URL + "https://snowballstem.org/dist/libstemmer_c-${VELOX_STEMMER_VERSION}.tar.gz" +) + +resolve_dependency_url(STEMMER) + +message(STATUS "Building stemmer from source") +find_program(MAKE_PROGRAM make REQUIRED) + +set(STEMMER_PREFIX "${CMAKE_BINARY_DIR}/_deps/libstemmer") +set(STEMMER_INCLUDE_PATH ${STEMMER_PREFIX}/src/libstemmer/include) + +# We can not use FetchContent as libstemmer does not use cmake +ExternalProject_Add( + libstemmer + PREFIX ${STEMMER_PREFIX} + SOURCE_DIR ${STEMMER_PREFIX}/src/libstemmer + URL ${VELOX_STEMMER_SOURCE_URL} + URL_HASH ${VELOX_STEMMER_BUILD_SHA256_CHECKSUM} + BUILD_IN_SOURCE TRUE + CONFIGURE_COMMAND "" + BUILD_COMMAND ${MAKE_PROGRAM} + INSTALL_COMMAND "" + PATCH_COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/libstemmer/Makefile.patch + BUILD_BYPRODUCTS + ${STEMMER_PREFIX}/src/libstemmer/${CMAKE_STATIC_LIBRARY_PREFIX}stemmer${CMAKE_STATIC_LIBRARY_SUFFIX} +) + +add_library(stemmer STATIC IMPORTED GLOBAL) +add_library(stemmer::stemmer ALIAS stemmer) +file(MAKE_DIRECTORY ${STEMMER_INCLUDE_PATH}) +set_target_properties( + stemmer + PROPERTIES + IMPORTED_LOCATION + ${STEMMER_PREFIX}/src/libstemmer/${CMAKE_STATIC_LIBRARY_PREFIX}stemmer${CMAKE_STATIC_LIBRARY_SUFFIX} + INTERFACE_INCLUDE_DIRECTORIES ${STEMMER_INCLUDE_PATH}) + +add_dependencies(stemmer libstemmer) diff --git a/CMakeLists.txt b/CMakeLists.txt index 31df59045c044..8b0d3a67c21c1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,9 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.14) +cmake_minimum_required(VERSION 3.28) +message(STATUS "Building using CMake version: ${CMAKE_VERSION}") -# the policy allows us to change options without caching +# The policy allows us to change options without caching. cmake_policy(SET CMP0077 NEW) set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) @@ -25,14 +26,39 @@ if(POLICY CMP0135) set(CMAKE_POLICY_DEFAULT_CMP0135 NEW) endif() -# set the project name +# Set the project name. 
project(velox) +# If we are in an active conda env disable search in system paths and add env to +# prefix path +if(DEFINED ENV{CONDA_PREFIX}) + if(NOT DEFINED ENV{VELOX_DEPENDENCY_SOURCE} OR "$ENV{VELOX_DEPENDENCY_SOURCE}" + STREQUAL "CONDA") + message(STATUS "Using Conda environment: $ENV{CONDA_PREFIX}") + set(CMAKE_FIND_USE_SYSTEM_ENVIRONMENT_PATH FALSE) + list(APPEND CMAKE_PREFIX_PATH "$ENV{CONDA_PREFIX}") + # Override in case it was set to CONDA + set(ENV{VELOX_DEPENDENCY_SOURCE} AUTO) + elseif(DEFINED ENV{VELOX_DEPENDENCY_SOURCE} + AND NOT "$ENV{VELOX_DEPENDENCY_SOURCE}" STREQUAL "CONDA") + message(STATUS "Overriding Conda environment: $ENV{CONDA_PREFIX}") + endif() +endif() + +if(DEFINED ENV{INSTALL_PREFIX}) + message(STATUS "Dependency install directory set to: $ENV{INSTALL_PREFIX}") + list(APPEND CMAKE_PREFIX_PATH "$ENV{INSTALL_PREFIX}") + # Allow installed package headers to be picked up before brew/system package + # headers + include_directories(BEFORE "$ENV{INSTALL_PREFIX}/include") +endif() + list(PREPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/CMake" "${PROJECT_SOURCE_DIR}/CMake/third-party") # Include our ThirdPartyToolchain dependencies macros include(ResolveDependency) +include(VeloxUtils) set_with_default(VELOX_DEPENDENCY_SOURCE_DEFAULT VELOX_DEPENDENCY_SOURCE AUTO) message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") @@ -46,6 +72,13 @@ option( VELOX_BUILD_MINIMAL "Build a minimal set of components only. This will override other build options." OFF) +option( + VELOX_BUILD_MINIMAL_WITH_DWIO + "Build a minimal set of components, including DWIO (file format readers/writers). + This will override other build options." + OFF) +option(VELOX_MONO_LIBRARY "Build single unified library." OFF) + # option() always creates a BOOL variable so we have to use a normal cache # variable with STRING type for this option. # @@ -58,7 +91,12 @@ set(VELOX_DEPENDENCY_SOURCE STRING "Default source for all dependencies with source builds enabled: AUTO SYSTEM BUNDLED." ) -option(VELOX_ENABLE_DUCKDB "Build duckDB to enable differential testing." ON) +set(VELOX_GFLAGS_TYPE + "shared" + CACHE + STRING + "Specify whether to find the gflags package as a shared or static package" +) option(VELOX_ENABLE_EXEC "Build exec." ON) option(VELOX_ENABLE_AGGREGATES "Build aggregates." ON) option(VELOX_ENABLE_HIVE_CONNECTOR "Build Hive connector." ON) @@ -66,10 +104,9 @@ option(VELOX_ENABLE_TPCH_CONNECTOR "Build TPC-H connector." ON) option(VELOX_ENABLE_PRESTO_FUNCTIONS "Build Presto SQL functions." ON) option(VELOX_ENABLE_SPARK_FUNCTIONS "Build Spark SQL functions." ON) option(VELOX_ENABLE_EXPRESSION "Build expression." ON) -option(VELOX_ENABLE_PARSE "Build parser used for unit tests." ON) -option(VELOX_ENABLE_EXAMPLES - "Build examples. This will enable VELOX_ENABLE_EXPRESSION automatically." - OFF) +option( + VELOX_ENABLE_EXAMPLES + "Build examples. This will enable VELOX_ENABLE_EXPRESSION automatically." OFF) option(VELOX_ENABLE_SUBSTRAIT "Build Substrait-to-Velox converter." OFF) option(VELOX_ENABLE_BENCHMARKS "Enable Velox top level benchmarks." OFF) option(VELOX_ENABLE_BENCHMARKS_BASIC "Enable Velox basic benchmarks." OFF) @@ -83,20 +120,24 @@ option(VELOX_ENABLE_REMOTE_FUNCTIONS "Enable remote function support" OFF) option(VELOX_ENABLE_CCACHE "Use ccache if installed." 
ON)
 option(VELOX_BUILD_TEST_UTILS "Builds Velox test utilities" OFF)
+option(VELOX_BUILD_VECTOR_TEST_UTILS "Builds Velox vector test utilities" OFF)
 option(VELOX_BUILD_PYTHON_PACKAGE "Builds Velox Python bindings" OFF)
-option(VELOX_BUILD_BENCHMARKS "Builds Velox benchmarks" OFF)
 option(
   VELOX_ENABLE_INT64_BUILD_PARTITION_BOUND
   "make buildPartitionBounds_ a vector int64 instead of int32 to avoid integer
   overflow when the hashtable has billions of records"
   OFF)

-if(${VELOX_BUILD_MINIMAL})
+# Explicitly force compilers to generate colored output. Compilers usually do
+# this by default if they detect the output is a terminal, but this assumption
+# is broken if you use ninja.
+option(VELOX_FORCE_COLORED_OUTPUT
+       "Always produce ANSI-colored output (GNU/Clang only)." OFF)
+
+if(${VELOX_BUILD_MINIMAL} OR ${VELOX_BUILD_MINIMAL_WITH_DWIO})
   # Enable and disable components for velox base build
   set(VELOX_BUILD_TESTING OFF)
   set(VELOX_ENABLE_PRESTO_FUNCTIONS ON)
-  set(VELOX_ENABLE_DUCKDB OFF)
   set(VELOX_ENABLE_EXPRESSION ON)
-  set(VELOX_ENABLE_PARSE OFF)
   set(VELOX_ENABLE_EXEC OFF)
   set(VELOX_ENABLE_AGGREGATES OFF)
   set(VELOX_ENABLE_HIVE_CONNECTOR OFF)
@@ -107,64 +148,54 @@ if(${VELOX_BUILD_MINIMAL})
   set(VELOX_ENABLE_GCS OFF)
   set(VELOX_ENABLE_ABFS OFF)
   set(VELOX_ENABLE_SUBSTRAIT OFF)
-  set(VELOX_CODEGEN_SUPPORT OFF)
 endif()

 if(${VELOX_BUILD_TESTING})
   # Enable all components to build testing binaries
   set(VELOX_ENABLE_PRESTO_FUNCTIONS ON)
-  set(VELOX_ENABLE_DUCKDB ON)
   set(VELOX_ENABLE_EXPRESSION ON)
-  set(VELOX_ENABLE_PARSE ON)
   set(VELOX_ENABLE_EXEC ON)
   set(VELOX_ENABLE_AGGREGATES ON)
   set(VELOX_ENABLE_HIVE_CONNECTOR ON)
   set(VELOX_ENABLE_TPCH_CONNECTOR ON)
   set(VELOX_ENABLE_SPARK_FUNCTIONS ON)
-  set(VELOX_ENABLE_TEST_UTILS OFF)
   set(VELOX_ENABLE_EXAMPLES ON)
+  set(VELOX_ENABLE_PARQUET ON)
 endif()

-if(${VELOX_ENABLE_EXAMPLES})
-  set(VELOX_ENABLE_EXPRESSION ON)
-  set(VELOX_ENABLE_TEST_UTILS ON)
+if(${VELOX_ENABLE_BENCHMARKS})
+  set(VELOX_ENABLE_BENCHMARKS_BASIC ON)
 endif()

-if(${VELOX_BUILD_BENCHMARKS})
-  set(VELOX_ENABLE_BENCHMARKS ON)
-  set(VELOX_ENABLE_BENCHMARKS_BASIC ON)
+if(VELOX_ENABLE_BENCHMARKS_BASIC)
+  set(VELOX_BUILD_TEST_UTILS ON)
+endif()
+
+if(VELOX_BUILD_TESTING OR VELOX_BUILD_TEST_UTILS)
+  set(cpr_SOURCE BUNDLED)
+  resolve_dependency(cpr)
   set(VELOX_ENABLE_DUCKDB ON)
   set(VELOX_ENABLE_PARSE ON)
-  set(VELOX_ENABLE_PARQUET ON)
-  set(VELOX_BUILD_TEST_UTILS ON)
-  set(VELOX_BUILD_TESTING OFF)
-  set(VELOX_ENABLE_EXAMPLES OFF)
-  set(VELOX_ENABLE_GCS OFF)
-  set(VELOX_ENABLE_ABFS OFF)
-  set(VELOX_ENABLE_SUBSTRAIT OFF)
-  set(VELOX_CODEGEN_SUPPORT OFF)
+endif()
+
+if(${VELOX_ENABLE_EXAMPLES})
+  set(VELOX_ENABLE_EXPRESSION ON)
 endif()

 if(${VELOX_BUILD_PYTHON_PACKAGE})
-  set(VELOX_BUILD_TESTING OFF)
   set(VELOX_ENABLE_PRESTO_FUNCTIONS ON)
   set(VELOX_ENABLE_DUCKDB ON)
   set(VELOX_ENABLE_EXPRESSION ON)
   set(VELOX_ENABLE_PARSE ON)
   set(VELOX_ENABLE_EXEC ON)
-  set(VELOX_ENABLE_AGGREGATES OFF)
-  set(VELOX_ENABLE_HIVE_CONNECTOR OFF)
-  set(VELOX_ENABLE_TPCH_CONNECTOR OFF)
+  set(VELOX_ENABLE_AGGREGATES ON)
   set(VELOX_ENABLE_SPARK_FUNCTIONS ON)
-  set(VELOX_ENABLE_EXAMPLES OFF)
-  set(VELOX_ENABLE_S3 OFF)
-  set(VELOX_ENABLE_GCS OFF)
-  set(VELOX_ENABLE_ABFS OFF)
-  set(VELOX_ENABLE_SUBSTRAIT OFF)
-  set(VELOX_CODEGEN_SUPPORT OFF)
-  set(VELOX_ENABLE_BENCHMARKS_BASIC OFF)
 endif()

+# We look for OpenSSL here to cache the result and enforce the version across
+# our dependencies.
+find_package(OpenSSL REQUIRED) + if(VELOX_ENABLE_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAUNCHER) @@ -180,6 +211,20 @@ if(VELOX_ENABLE_CCACHE endif() endif() +if(${VELOX_FORCE_COLORED_OUTPUT}) + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + add_compile_options(-fdiagnostics-color=always) + elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" + OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") + add_compile_options(-fcolor-diagnostics) + endif() +endif() + +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" + AND "${CMAKE_CXX_COMPILER_VERSION}" VERSION_GREATER_EQUAL 15) + set(CMAKE_EXE_LINKER_FLAGS "-latomic") +endif() + # At the moment we prefer static linking but by default cmake looks for shared # libs first. This will still fallback to shared libs when static ones are not # found @@ -193,22 +238,14 @@ if(VELOX_ENABLE_S3) add_definitions(-DVELOX_ENABLE_S3) endif() -if(VELOX_ENABLE_GCS) - # Set GCS_ROOT_DIR if you have a custom install location of GCS SDK CPP. - if(GCSSDK_ROOT_DIR) - list(APPEND CMAKE_PREFIX_PATH ${GCSSDK_ROOT_DIR}) - endif() - find_package(google_cloud_cpp_storage REQUIRED) - add_definitions(-DVELOX_ENABLE_GCS) -endif() - if(VELOX_ENABLE_ABFS) # Set AZURESDK_ROOT_DIR if you have a custom install location of Azure Storage # SDK CPP. if(AZURESDK_ROOT_DIR) list(APPEND CMAKE_PREFIX_PATH ${AZURESDK_ROOT_DIR}) endif() - find_package(azure-storage-blobs-cpp CONFIG REQUIRED) + # files-datalake is built on blobs + find_package(azure-storage-files-datalake-cpp CONFIG REQUIRED) add_definitions(-DVELOX_ENABLE_ABFS) endif() @@ -227,30 +264,6 @@ if(VELOX_ENABLE_PARQUET) set(VELOX_ENABLE_ARROW ON) endif() -if(VELOX_ENABLE_REMOTE_FUNCTIONS) - # TODO: Move this to use resolve_dependency(). For some reason, FBThrift - # requires clients to explicitly install fizz and wangle. - find_package(fizz CONFIG REQUIRED) - find_package(wangle CONFIG REQUIRED) - find_package(FBThrift CONFIG REQUIRED) -endif() - -# Turn on Codegen only for Clang and non Mac systems. -if((NOT DEFINED VELOX_CODEGEN_SUPPORT) - AND (CMAKE_CXX_COMPILER_ID MATCHES "Clang") - AND NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")) - message(STATUS "Enabling Codegen") - set(VELOX_CODEGEN_SUPPORT True) -else() - message(STATUS "Disabling Codegen") - set(VELOX_CODEGEN_SUPPORT False) -endif() - -# define processor variable for conditional compilation -if(${VELOX_CODEGEN_SUPPORT}) - add_compile_definitions(CODEGEN_ENABLED=1) -endif() - # make buildPartitionBounds_ a vector int64 instead of int32 to avoid integer # overflow if(${VELOX_ENABLE_INT64_BUILD_PARTITION_BOUND}) @@ -265,7 +278,7 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") endif() if(UNIX AND NOT APPLE) - # codegen linker flags, -export-dynamic for rtti + # linker flags, -export-dynamic for rtti add_link_options("-Wl,-export-dynamic") endif() @@ -324,7 +337,6 @@ if("${ENABLE_ALL_WARNINGS}") elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set(KNOWN_COMPILER_SPECIFIC_WARNINGS "-Wno-implicit-fallthrough \ - -Wno-empty-body \ -Wno-class-memaccess \ -Wno-comment \ -Wno-int-in-bool-context \ @@ -333,11 +345,7 @@ if("${ENABLE_ALL_WARNINGS}") -Wno-maybe-uninitialized \ -Wno-unused-result \ -Wno-format-overflow \ - -Wno-strict-aliasing \ - -Wno-type-limits \ - -Wno-stringop-overflow \ - -Wno-stringop-overread \ - -Wno-return-type") + -Wno-strict-aliasing") endif() set(KNOWN_WARNINGS @@ -356,6 +364,10 @@ if(${VELOX_ENABLE_GPU}) enable_language(CUDA) # Determine CUDA_ARCHITECTURES automatically. 
 cmake_policy(SET CMP0104 NEW)
+  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+    # it will fail later in the build otherwise
+    message(FATAL_ERROR "-DCMAKE_CUDA_ARCHITECTURES= must be set")
+  endif()
   if(CMAKE_BUILD_TYPE MATCHES Debug)
     add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:-G>")
   endif()
@@ -365,7 +377,6 @@ endif()

 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

 set(BOOST_INCLUDE_LIBRARIES
-    headers
     atomic
     context
     date_time
@@ -376,13 +387,13 @@ set(BOOST_INCLUDE_LIBRARIES
     thread)

 set_source(Boost)
-resolve_dependency(Boost 1.66.0 COMPONENTS ${BOOST_INCLUDE_LIBRARIES})
+resolve_dependency(Boost 1.77.0 COMPONENTS ${BOOST_INCLUDE_LIBRARIES})

 # Range-v3 will be enabled when the codegen code actually lands; keeping it
 # here for reference.
 find_package(range-v3)

 set_source(gflags)
-resolve_dependency(gflags COMPONENTS shared)
+resolve_dependency(gflags COMPONENTS ${VELOX_GFLAGS_TYPE})

 if(NOT TARGET gflags::gflags)
   # This is a bit convoluted, but we want to be able to use gflags::gflags as a
   # target even when velox is built as a subproject which uses
@@ -402,10 +413,18 @@ else()
 endif()

 resolve_dependency(glog)

+if(${VELOX_ENABLE_DUCKDB})
+  set_source(DuckDB)
+  resolve_dependency(DuckDB)
+endif()
+
 set_source(fmt)
-resolve_dependency(fmt)
+resolve_dependency(fmt 9.0.0)

-if(NOT ${VELOX_BUILD_MINIMAL})
+if(${VELOX_BUILD_MINIMAL_WITH_DWIO} OR ${VELOX_ENABLE_HIVE_CONNECTOR})
+  # DWIO needs all sorts of stream compression libraries.
+  #
+  # TODO: make these optional and pluggable.
   find_package(ZLIB REQUIRED)
   find_package(lz4 REQUIRED)
   find_package(lzo2 REQUIRED)
@@ -430,36 +449,72 @@ if(${VELOX_BUILD_PYTHON_PACKAGE})
   add_subdirectory(pyvelox)
 endif()

+# DWIO (ORC/DWRF) and Substrait depend on protobuf.
+if(${VELOX_BUILD_MINIMAL_WITH_DWIO}
+   OR ${VELOX_ENABLE_HIVE_CONNECTOR}
+   OR ${VELOX_ENABLE_SUBSTRAIT}
+   OR VELOX_ENABLE_GCS)
+
+  # Locate or build protobuf.
+  set_source(Protobuf)
+  resolve_dependency(Protobuf CONFIG 3.21.7 REQUIRED)
+  include_directories(${Protobuf_INCLUDE_DIRS})
+endif()
+
 set_source(simdjson)
-resolve_dependency(simdjson 3.1.5)
+resolve_dependency(simdjson 3.9.3)

 # Locate or build folly.
 add_compile_definitions(FOLLY_HAVE_INT128_T=1)
 set_source(folly)
 resolve_dependency(folly)

+if(${VELOX_BUILD_TESTING})
+  # Spark query runner depends on absl, gRPC.
+  set_source(absl)
+  resolve_dependency(absl)
+
+  # 'gRPC_CARES_PROVIDER' is set as 'package', which means c-ares library needs
+  # to be installed on the system, instead of being built by gRPC.
+  set_source(c-ares)
+  resolve_dependency(c-ares)
+
+  set_source(gRPC)
+  resolve_dependency(gRPC)
+endif()
+
+if(VELOX_ENABLE_REMOTE_FUNCTIONS)
+  # TODO: Move this to use resolve_dependency(). For some reason, FBThrift
+  # requires clients to explicitly install fizz and wangle.
+  find_package(fizz CONFIG REQUIRED)
+  find_package(wangle CONFIG REQUIRED)
+  find_package(FBThrift CONFIG REQUIRED)
+endif()
+
 if(DEFINED FOLLY_BENCHMARK_STATIC_LIB)
   set(FOLLY_BENCHMARK ${FOLLY_BENCHMARK_STATIC_LIB})
 else()
   set(FOLLY_BENCHMARK Folly::follybenchmark)
 endif()

-if(NOT ${VELOX_BUILD_MINIMAL})
-  # Locate or build protobuf.
-  set_source(Protobuf)
-  resolve_dependency(Protobuf 3.21 EXACT)
-  include_directories(${Protobuf_INCLUDE_DIRS})
+if(VELOX_ENABLE_GCS)
+  set_source(google_cloud_cpp_storage)
+  resolve_dependency(google_cloud_cpp_storage CONFIG 2.22.0 REQUIRED)
+  add_definitions(-DVELOX_ENABLE_GCS)
 endif()

 # GCC needs to link a library to enable std::filesystem.
 if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
   set(FILESYSTEM "stdc++fs")
-  # Ensure we have gcc at least 8+.
-  if(CMAKE_CXX_COMPILER_VERSION LESS 8.0)
+  # Ensure we have gcc 9 or newer.
+  if(CMAKE_CXX_COMPILER_VERSION LESS 9.0)
     message(
-      FATAL_ERROR "VELOX requires gcc > 8. Found ${CMAKE_CXX_COMPILER_VERSION}")
+      FATAL_ERROR "VELOX requires gcc >= 9. Found ${CMAKE_CXX_COMPILER_VERSION}")
   endif()
+
+  # Find Threads library
+  find_package(Threads REQUIRED)
 else()
   set(FILESYSTEM "")
 endif()
@@ -505,25 +560,30 @@ if(CMAKE_HOST_SYSTEM_NAME MATCHES "Darwin")
 endif()
 find_package(BISON 3.0.4 REQUIRED)
 find_package(FLEX 2.5.13 REQUIRED)
+find_package(double-conversion 3.1.5 REQUIRED)

 include_directories(SYSTEM velox)
 include_directories(SYSTEM velox/external)
-include_directories(SYSTEM velox/external/duckdb)
-include_directories(SYSTEM velox/external/duckdb/tpch/dbgen/include)

 # these were previously vendored in third-party/
 if(NOT VELOX_DISABLE_GOOGLETEST)
-  set(gtest_SOURCE BUNDLED)
-  resolve_dependency(gtest)
+  set(GTest_SOURCE AUTO)
+  resolve_dependency(GTest)
   set(VELOX_GTEST_INCUDE_DIR
       "${gtest_SOURCE_DIR}/googletest/include"
       PARENT_SCOPE)
 endif()

-set(xsimd_SOURCE BUNDLED)
-resolve_dependency(xsimd)
+set_source(xsimd)
+resolve_dependency(xsimd 10.0.0)
+
+set(stemmer_SOURCE BUNDLED)
+resolve_dependency(stemmer)

-include(CTest) # include after project() but before add_subdirectory()
+if(VELOX_BUILD_TESTING)
+  set(BUILD_TESTING ON)
+  include(CTest) # include after project() but before add_subdirectory()
+endif()

 include_directories(.)
@@ -537,5 +597,9 @@ if("${TREAT_WARNINGS_AS_ERRORS}")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
 endif()

-add_subdirectory(third_party)
+if(VELOX_ENABLE_ARROW)
+  set_source(Arrow)
+  resolve_dependency(Arrow)
+endif()
+
 add_subdirectory(velox)
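Before moving on to the contributor docs: the `VELOX_MONO_LIBRARY` option introduced above ties back to the `velox_*` wrapper functions from `CMake/VeloxUtils.cmake` at the start of this section. A hypothetical call site to illustrate the two modes (the target name is made up, and `velox_add_library` is assumed to be the wrapper whose tail is visible there):

```cmake
# With VELOX_MONO_LIBRARY=OFF this behaves like the plain target_* commands
# on 'velox_example'; with it ON, sources and link dependencies are folded
# into the single 'velox' target and velox_*-to-velox_* links are dropped.
velox_add_library(velox_example Example.cpp)
velox_link_libraries(velox_example Folly::folly)
```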
-  if(CMAKE_CXX_COMPILER_VERSION LESS 8.0)
+  # Ensure we have gcc at least 9.
+  if(CMAKE_CXX_COMPILER_VERSION LESS 9.0)
    message(
-      FATAL_ERROR "VELOX requires gcc > 8. Found ${CMAKE_CXX_COMPILER_VERSION}")
+      FATAL_ERROR "VELOX requires GCC 9 or later. Found ${CMAKE_CXX_COMPILER_VERSION}")
  endif()
+
+  # Find Threads library
+  find_package(Threads REQUIRED)
else()
  set(FILESYSTEM "")
endif()
@@ -505,25 +560,30 @@ if(CMAKE_HOST_SYSTEM_NAME MATCHES "Darwin")
endif()

find_package(BISON 3.0.4 REQUIRED)
find_package(FLEX 2.5.13 REQUIRED)
+find_package(double-conversion 3.1.5 REQUIRED)

include_directories(SYSTEM velox)
include_directories(SYSTEM velox/external)
-include_directories(SYSTEM velox/external/duckdb)
-include_directories(SYSTEM velox/external/duckdb/tpch/dbgen/include)

# these were previously vendored in third-party/
if(NOT VELOX_DISABLE_GOOGLETEST)
-  set(gtest_SOURCE BUNDLED)
-  resolve_dependency(gtest)
+  set(GTest_SOURCE AUTO)
+  resolve_dependency(GTest)
  set(VELOX_GTEST_INCUDE_DIR
      "${gtest_SOURCE_DIR}/googletest/include"
      PARENT_SCOPE)
endif()

-set(xsimd_SOURCE BUNDLED)
-resolve_dependency(xsimd)
+set_source(xsimd)
+resolve_dependency(xsimd 10.0.0)
+
+set(stemmer_SOURCE BUNDLED)
+resolve_dependency(stemmer)

-include(CTest) # include after project() but before add_subdirectory()
+if(VELOX_BUILD_TESTING)
+  set(BUILD_TESTING ON)
+  include(CTest) # include after project() but before add_subdirectory()
+endif()

include_directories(.)

@@ -537,5 +597,9 @@ if("${TREAT_WARNINGS_AS_ERRORS}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
endif()

-add_subdirectory(third_party)
+if(VELOX_ENABLE_ARROW)
+  set_source(Arrow)
+  resolve_dependency(Arrow)
+endif()
+
add_subdirectory(velox)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 0be82b463ad19..6c8d4f75ca513 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -9,7 +9,8 @@ make contributions to the project and community.

## Code of Conduct

-First and foremost, the Velox project and all its contributors and maintainers
+First and foremost, the Velox project and all its contributors and
+[maintainers](https://velox-lib.io/docs/community/components-and-maintainers)
are governed by a [Code of Conduct](CODE_OF_CONDUCT.md). When participating,
you are expected to uphold this code.

@@ -21,6 +22,12 @@ and [Discussions](https://github.com/facebookincubator/velox/discussions), and j
[the Velox-OSS Slack workspace](http://velox-oss.slack.com) - please reach out to
**velox@meta.com** to get access.

+## Components and Maintainers
+
+Velox is logically organized into components, each maintained by a group of
+individuals. The list of components and their respective maintainers [can be
+found here](https://velox-lib.io/docs/community/components-and-maintainers).
+
## Documentation

Help the community understand how to use the Velox library by proposing
@@ -166,7 +173,8 @@ Velox and the code review process.
In addition to the general contribution guidelines presented above, here are
specific guidelines for contributing functions:

-1. Read [How to add a scalar function?](https://facebookincubator.github.io/velox/develop/scalar-functions.html) guide.
+1. Read [How to add a scalar function?](https://facebookincubator.github.io/velox/develop/scalar-functions.html) guide. When implementing a function, a simple function is preferred unless a vector function implementation provides a significant performance gain that can be demonstrated
+with a benchmark.
2.
Use the following template for the PR title: Add xxx [Presto|Spark] function (replace xxx with the function name). * Ensure the PR description contains a link to the function documentation @@ -174,6 +182,7 @@ functions: * Describe the function semantics and edge cases clearly. 3. Use Presto or Spark to check the function semantics. + * When implementing a Spark function, check the function semantics using Spark 3.5 with ANSI OFF. * Try different edge cases to check whether the function returns null, or throws, etc. * Make sure to replicate the exact semantics. @@ -197,11 +206,11 @@ functions: ``` # Test the new function in isolation. Use --only flag to restrict the set of functions # and run for 60 seconds or longer. - velox_expression_fuzzer_test --only --duration_sec 60 --logtostderr=1 --enable_variadic_signatures --velox_fuzzer_enable_complex_types --lazy_vector_generation_ratio 0.2 --velox_fuzzer_enable_column_reuse --velox_fuzzer_enable_expression_reuse + velox_expression_fuzzer_test --only --duration_sec 60 --logtostderr=1 --enable_variadic_signatures --velox_fuzzer_enable_complex_types --velox_fuzzer_enable_decimal_type --lazy_vector_generation_ratio 0.2 --velox_fuzzer_enable_column_reuse --velox_fuzzer_enable_expression_reuse # Test the new function in combination with other functions. Do not restrict the set # of functions and run for 10 minutes (600 seconds) or longer. - velox_expression_fuzzer_test --duration_sec 600 --logtostderr=1 --enable_variadic_signatures --velox_fuzzer_enable_complex_types --lazy_vector_generation_ratio 0.2 --velox_fuzzer_enable_column_reuse --velox_fuzzer_enable_expression_reuse + velox_expression_fuzzer_test --duration_sec 600 --logtostderr=1 --enable_variadic_signatures --velox_fuzzer_enable_complex_types --velox_fuzzer_enable_decimal_type --lazy_vector_generation_ratio 0.2 --velox_fuzzer_enable_column_reuse --velox_fuzzer_enable_expression_reuse ``` Here are example PRs: diff --git a/Makefile b/Makefile index c6cdc3a714554..2e6144c844c53 100644 --- a/Makefile +++ b/Makefile @@ -13,6 +13,7 @@ # limitations under the License. .PHONY: all cmake build clean debug release unit +SHELL=/bin/bash BUILD_BASE_DIR=_build BUILD_DIR=release BUILD_TYPE=Release @@ -20,6 +21,7 @@ BENCHMARKS_BASIC_DIR=$(BUILD_BASE_DIR)/$(BUILD_DIR)/velox/benchmarks/basic/ BENCHMARKS_DUMP_DIR=dumps TREAT_WARNINGS_AS_ERRORS ?= 1 ENABLE_WALL ?= 1 +PYTHON_VENV ?= .venv # Option to make a minimal build. By default set to "OFF"; set to # "ON" to only build a minimal set of components. This may override @@ -50,17 +52,28 @@ ifdef AZURESDK_ROOT_DIR CMAKE_FLAGS += -DAZURESDK_ROOT_DIR=$(AZURESDK_ROOT_DIR) endif +ifdef CUDA_ARCHITECTURES +CMAKE_FLAGS += -DCMAKE_CUDA_ARCHITECTURES="$(CUDA_ARCHITECTURES)" +endif + +ifdef CUDA_COMPILER +CMAKE_FLAGS += -DCMAKE_CUDA_COMPILER="$(CUDA_COMPILER)" +endif + +ifdef CUDA_FLAGS +CMAKE_FLAGS += -DCMAKE_CUDA_FLAGS="$(CUDA_FLAGS)" +endif + # Use Ninja if available. If Ninja is used, pass through parallelism control flags. USE_NINJA ?= 1 ifeq ($(USE_NINJA), 1) ifneq ($(shell which ninja), ) -GENERATOR=-GNinja -DMAX_LINK_JOBS=$(MAX_LINK_JOBS) -DMAX_HIGH_MEM_JOBS=$(MAX_HIGH_MEM_JOBS) -endif -endif +GENERATOR := -GNinja +GENERATOR += -DMAX_LINK_JOBS=$(MAX_LINK_JOBS) +GENERATOR += -DMAX_HIGH_MEM_JOBS=$(MAX_HIGH_MEM_JOBS) -ifndef USE_CCACHE -ifneq ($(shell which ccache), ) -USE_CCACHE=-DCMAKE_CXX_COMPILER_LAUNCHER=ccache +# Ninja makes compilers disable colored output by default. 
+GENERATOR += -DVELOX_FORCE_COLORED_OUTPUT=ON endif endif @@ -79,16 +92,14 @@ clean: #: Delete all build artifacts cmake: #: Use CMake to create a Makefile build system mkdir -p $(BUILD_BASE_DIR)/$(BUILD_DIR) && \ - cmake -B \ + cmake -B \ "$(BUILD_BASE_DIR)/$(BUILD_DIR)" \ ${CMAKE_FLAGS} \ $(GENERATOR) \ - $(USE_CCACHE) \ - $(FORCE_COLOR) \ ${EXTRA_CMAKE_FLAGS} cmake-gpu: - $(MAKE) EXTRA_CMAKE_FLAGS=-DVELOX_ENABLE_GPU=ON cmake + $(MAKE) EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON" cmake build: #: Build the software based in BUILD_DIR and BUILD_TYPE variables cmake --build $(BUILD_BASE_DIR)/$(BUILD_DIR) -j $(NUM_THREADS) @@ -101,12 +112,43 @@ release: #: Build the release version $(MAKE) cmake BUILD_DIR=release BUILD_TYPE=Release && \ $(MAKE) build BUILD_DIR=release -min_debug: #: Minimal build with debugging symbols - $(MAKE) cmake BUILD_DIR=debug BUILD_TYPE=debug EXTRA_CMAKE_FLAGS=-DVELOX_BUILD_MINIMAL=ON +minimal_debug: #: Minimal build with debugging symbols + $(MAKE) cmake BUILD_DIR=debug BUILD_TYPE=debug EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_BUILD_MINIMAL=ON" + $(MAKE) build BUILD_DIR=debug + +min_debug: minimal_debug + +minimal: #: Minimal build + $(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_BUILD_MINIMAL=ON" + $(MAKE) build BUILD_DIR=release + +gpu: #: Build with GPU support + $(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON" + $(MAKE) build BUILD_DIR=release + +gpu_debug: #: Build with debugging symbols and GPU support + $(MAKE) cmake BUILD_DIR=debug BUILD_TYPE=debug EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON" + $(MAKE) build BUILD_DIR=debug + +dwio: #: Minimal build with dwio enabled. + $(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} \ + -DVELOX_BUILD_MINIMAL_WITH_DWIO=ON" + $(MAKE) build BUILD_DIR=release + +dwio_debug: #: Minimal build with dwio debugging symbols. + $(MAKE) cmake BUILD_DIR=debug BUILD_TYPE=debug EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} \ + -DVELOX_BUILD_MINIMAL_WITH_DWIO=ON" $(MAKE) build BUILD_DIR=debug benchmarks-basic-build: - $(MAKE) release EXTRA_CMAKE_FLAGS="-DVELOX_BUILD_BENCHMARKS=ON" + $(MAKE) release EXTRA_CMAKE_FLAGS=" ${EXTRA_CMAKE_FLAGS} \ + -DVELOX_BUILD_TESTING=OFF \ + -DVELOX_ENABLE_BENCHMARKS_BASIC=ON" + +benchmarks-build: + $(MAKE) release EXTRA_CMAKE_FLAGS=" ${EXTRA_CMAKE_FLAGS} \ + -DVELOX_BUILD_TESTING=OFF \ + -DVELOX_ENABLE_BENCHMARKS=ON" benchmarks-basic-run: scripts/benchmark-runner.py run \ @@ -121,7 +163,7 @@ unittest: debug #: Build with debugging and run unit tests # Build with debugging and run expression fuzzer test. Use a fixed seed to # ensure the tests are reproducible. 
fuzzertest: debug
-	$(BUILD_BASE_DIR)/debug/velox/expression/tests/velox_expression_fuzzer_test \
+	$(BUILD_BASE_DIR)/debug/velox/expression/fuzzer/velox_expression_fuzzer_test \
		--seed $(FUZZER_SEED) \
		--duration_sec $(FUZZER_DURATION_SEC) \
		--repro_persist_path $(FUZZER_REPRO_PERSIST_PATH) \
@@ -129,17 +171,33 @@ fuzzertest: debug
		--minloglevel=0

format-fix: #: Fix formatting issues in the main branch
+ifneq ("$(wildcard ${PYTHON_VENV}/pyvenv.cfg)","")
+	source ${PYTHON_VENV}/bin/activate; scripts/check.py format main --fix
+else
	scripts/check.py format main --fix
+endif

format-check: #: Check for formatting issues on the main branch
	clang-format --version
+ifneq ("$(wildcard ${PYTHON_VENV}/pyvenv.cfg)","")
+	source ${PYTHON_VENV}/bin/activate; scripts/check.py format main
+else
	scripts/check.py format main
+endif

-header-fix: #: Fix license header issues in the current branch
+header-fix: #: Fix license header issues in the current branch
+ifneq ("$(wildcard ${PYTHON_VENV}/pyvenv.cfg)","")
+	source ${PYTHON_VENV}/bin/activate; scripts/check.py header main --fix
+else
	scripts/check.py header main --fix
+endif

header-check: #: Check for license header issues on the main branch
+ifneq ("$(wildcard ${PYTHON_VENV}/pyvenv.cfg)","")
+	source ${PYTHON_VENV}/bin/activate; scripts/check.py header main
+else
	scripts/check.py header main
+endif

circleci-container: #: Build the linux container for CircleCi
	$(MAKE) linux-container CONTAINER_NAME=circleci
@@ -147,9 +205,6 @@ circleci-container: #: Build the linux container for CircleCi
check-container:
	$(MAKE) linux-container CONTAINER_NAME=check

-velox-torcharrow-container:
-	$(MAKE) linux-container CONTAINER_NAME=velox-torcharrow
-
linux-container:
	rm -rf /tmp/docker && \
	mkdir -p /tmp/docker && \
@@ -166,8 +221,8 @@ python-clean:
	DEBUG=1 ${PYTHON_EXECUTABLE} setup.py clean

python-build:
-	DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=4 ${PYTHON_EXECUTABLE} -m pip install -e .$(extras) --verbose
+	DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=${NUM_THREADS} ${PYTHON_EXECUTABLE} -m pip install -e .$(extras) --verbose

-python-test:
+python-test:
	$(MAKE) python-build extras="[tests]"
	DEBUG=1 ${PYTHON_EXECUTABLE} -m unittest -v
diff --git a/NOTICE.txt b/NOTICE.txt
new file mode 100644
index 0000000000000..58655beb3ca76
--- /dev/null
+++ b/NOTICE.txt
@@ -0,0 +1,11 @@
+The Velox Project
+Copyright © 2024 Meta Platforms, Inc.
+
+This product includes software from the gRPC project (Apache 2.0).
+* https://github.com/grpc/grpc/blob/v1.64.2/cmake/re2.cmake
+
+This product includes software from the Qt project (BSD, 3-clause).
+* https://github.com/qt/qtbase/blob/6.6.3/cmake/FindWrapSystemDoubleConversion.cmake
+
+This product includes software from Howard Hinnant's date library (MIT License).
+* https://github.com/HowardHinnant/date/tree/master
diff --git a/README.md b/README.md
index 03ff41fefef81..20d55392a2f99 100644
--- a/README.md
+++ b/README.md
@@ -4,8 +4,8 @@ Velox is a C++ database acceleration library which provides reusable,
extensible, and high-performance data processing components. These components
can be reused to build compute engines focused on different analytical
workloads, including batch, interactive, stream processing, and AI/ML.
-Velox was created by Facebook and it is currently developed in partnership with
-Intel, ByteDance, and Ahana.
+Velox was created by Meta and is currently developed in partnership with
+IBM/Ahana, Intel, Voltron Data, Microsoft, ByteDance, and many other companies.
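
The `format-*` and `header-*` targets above now prefer a Python virtual environment when one exists: they probe for `${PYTHON_VENV}/pyvenv.cfg` (with `PYTHON_VENV` defaulting to `.venv`) and source its `bin/activate` before invoking `scripts/check.py`. A minimal sketch of preparing such an environment; the requirements file name is illustrative, not something this diff defines:

```shell
# Create the venv the Makefile probes for (it checks for .venv/pyvenv.cfg).
python3 -m venv .venv
source .venv/bin/activate
# Hypothetical requirements file; install whatever scripts/check.py needs.
pip install -r scripts/requirements.txt
deactivate

# The targets detect and activate .venv themselves before running the checks.
make format-check
make header-check
```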
In common usage scenarios, Velox takes a fully optimized query plan as input
and performs the described computation. Considering Velox does not provide a
@@ -70,50 +70,95 @@ Blog posts are available [here](https://velox-lib.io/blog).

## Getting Started

-We provide scripts to help developers setup and install Velox dependencies.
-
### Get the Velox Source
```
-git clone --recursive https://github.com/facebookincubator/velox.git
+git clone https://github.com/facebookincubator/velox.git
cd velox
-# if you are updating an existing checkout
-git submodule sync --recursive
-git submodule update --init --recursive
```
+Once Velox is checked out, the first step is to install the dependencies.
+Details on the dependencies and how Velox manages some of them for you
+[can be found here](CMake/resolve_dependency_modules/README.md).
+
+Velox also provides the following scripts to help developers set up and install Velox
+dependencies for a given platform.
+
+### Setting up dependencies
+
+The following setup scripts use the `DEPENDENCY_DIR` environment variable to set the
+location to download and build packages. This defaults to `deps-download` in the current
+working directory.
+
+Use `INSTALL_PREFIX` to set the install directory of the packages. This defaults to
+`deps-install` in the current working directory on macOS and to the default install
+location (e.g. `/usr/local`) on Linux.
+Using the default install location `/usr/local` on macOS is discouraged since this
+location is used by certain Homebrew versions.
+
+Manually add the `INSTALL_PREFIX` value to your IDE or shell environment, e.g. add
+`export INSTALL_PREFIX=/Users/$USERNAME/velox/deps-install` to `~/.zshrc`, so that
+subsequent Velox builds can use the installed packages.
+
+*You can reuse `DEPENDENCY_DIR` and `INSTALL_PREFIX` for Velox clients such as Prestissimo
+by specifying a common shared directory.*

### Setting up on macOS

-Once you have checked out Velox, on an Intel MacOS machine you can setup and then build like so:
+On a macOS machine (either Intel or Apple silicon) you can set up and then build like so:

```shell
-$ ./scripts/setup-macos.sh
+$ ./scripts/setup-macos.sh
$ make
```

-On an M1 MacOS machine you can build like so:
+With macOS 14.4 and Xcode 15.3 where `m4` is missing, you can either
+1. install `m4` via `brew`:
+```shell
+$ brew install m4
+$ export PATH=/opt/homebrew/opt/m4/bin:$PATH
+```

+2. or use `gm4` instead:
```shell
-$ CPU_TARGET="arm64" ./scripts/setup-macos.sh
-$ CPU_TARGET="arm64" make
+$ M4=/usr/bin/gm4 make
```

-You can also produce intel binaries on an M1, use `CPU_TARGET="sse"` for the above.
+### Setting up on Ubuntu (20.04 or later)

-### Setting up on aarch64 Linux (Ubuntu 20.04 or later)
+The supported architectures are x86_64 (avx, sse) and AArch64 (apple-m1+crc, neoverse-n1).
+You can build like so:

-On an aarch64 based machine, you can build like so:
+```shell
+$ ./scripts/setup-ubuntu.sh
+$ make
+```
+
+### Setting up on CentOS 9 Stream with adapters
+
+Velox adapters include file systems such as AWS S3, Google Cloud Storage,
+and Azure Blob File System. These adapters require installation of additional
+libraries. Once you have checked out Velox, you can set up and build like so:

```shell
-$ CPU_TARGET="aarch64" ./scripts/setup-ubuntu.sh
-$ CPU_TARGET="aarch64" make
+$ ./scripts/setup-centos9.sh
+$ ./scripts/setup-adapters.sh
+$ make
```

-### Setting up on x86_64 Linux (Ubuntu 20.04 or later)
+Note that `setup-adapters.sh` supports macOS and Ubuntu 20.04 or later.
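
Putting the pieces above together, a from-scratch setup on macOS might look as follows. This is a sketch assuming the directory conventions described above; the paths are examples, not requirements:

```shell
# Keep downloaded/built dependencies and their install prefix outside the
# default locations so later Velox (or Prestissimo) builds can reuse them.
export DEPENDENCY_DIR=$HOME/velox-deps
export INSTALL_PREFIX=$HOME/velox-deps-install

./scripts/setup-macos.sh   # or setup-ubuntu.sh / setup-centos9.sh on Linux
make
```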
+
+### Using Clang on Linux

-Once you have checked out Velox, you can setup and build like so:
+Clang 15 can be additionally installed during the setup step for Ubuntu 22.04/24.04
+and CentOS 9 by setting the `USE_CLANG` environment variable prior to running the platform-specific setup script.
+```shell
+$ export USE_CLANG=true
+```
+This will install and use Clang 15 to build the dependencies instead of using the default GCC compiler.
+Once completed, and before running any `make` command, set the compiler to be used:

```shell
-$ ./scripts/setup-ubuntu.sh
+$ export CC=/usr/bin/clang-15
+$ export CXX=/usr/bin/clang++-15
$ make
```

@@ -124,20 +169,22 @@ Run `make` in the root directory to compile the sources. For development,
use an optimized version. Use `make unittest` to build and run tests.

Note that,
-* Velox requires C++17 , thus minimum supported compiler is GCC 5.0 and Clang 5.0.
+* Velox requires GCC 11.0 or Clang 15.0 at a minimum.
* Velox requires the CPU to support instruction sets:
  * bmi
  * bmi2
  * f16c
* Velox tries to use the following (or equivalent) instruction sets where available:
  * On Intel CPUs
-    * avx
+    * avx
    * avx2
    * sse
  * On ARM
    * Neon
    * Neon64

+Build metrics for Velox are published at <https://facebookincubator.github.io/velox/bm-report/>
+
### Building Velox with docker-compose

If you don't want to install the system dependencies required to build Velox,
@@ -162,11 +209,15 @@ contribute to the project.

## Community

+Velox's technical governance mechanics are described [in this
+document](https://velox-lib.io/docs/community/technical-governance).
+Components and maintainers [are listed
+here](https://velox-lib.io/docs/community/components-and-maintainers).
+
The main communication channel with the Velox OSS community is through the
-[the Velox-OSS Slack workspace](http://velox-oss.slack.com).
+[Velox-OSS Slack workspace](http://velox-oss.slack.com).
Please reach out to **velox@meta.com** to get access to Velox Slack Channel.
-
## License

Velox is licensed under the Apache 2.0 License. A copy of the license
diff --git a/build/deps/github_hashes/facebook/folly-rev.txt b/build/deps/github_hashes/facebook/folly-rev.txt
deleted file mode 100644
index 890b8aa6943c5..0000000000000
--- a/build/deps/github_hashes/facebook/folly-rev.txt
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 338aee2892ad2d4f480776a01f900c1b366e84df
diff --git a/build/fbcode_builder/.gitignore b/build/fbcode_builder/.gitignore
deleted file mode 100644
index b98f3edfa6f9b..0000000000000
--- a/build/fbcode_builder/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-# Facebook-internal CI builds don't have write permission outside of the
-# source tree, so we install all projects into this directory.
-/facebook_ci
-__pycache__/
-*.pyc
diff --git a/build/fbcode_builder/CMake/FBBuildOptions.cmake b/build/fbcode_builder/CMake/FBBuildOptions.cmake
deleted file mode 100644
index dbaa29933a05b..0000000000000
--- a/build/fbcode_builder/CMake/FBBuildOptions.cmake
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-
-function (fb_activate_static_library_option)
-  option(USE_STATIC_DEPS_ON_UNIX
-    "If enabled, use static dependencies on unix systems. This is generally discouraged."
-    OFF
-  )
-  # Mark USE_STATIC_DEPS_ON_UNIX as an "advanced" option, since enabling it
-  # is generally discouraged.
- mark_as_advanced(USE_STATIC_DEPS_ON_UNIX) - - if(UNIX AND USE_STATIC_DEPS_ON_UNIX) - SET(CMAKE_FIND_LIBRARY_SUFFIXES ".a" PARENT_SCOPE) - endif() -endfunction() diff --git a/build/fbcode_builder/CMake/FBCMakeParseArgs.cmake b/build/fbcode_builder/CMake/FBCMakeParseArgs.cmake deleted file mode 100644 index 933180189d076..0000000000000 --- a/build/fbcode_builder/CMake/FBCMakeParseArgs.cmake +++ /dev/null @@ -1,141 +0,0 @@ -# -# Copyright (c) Facebook, Inc. and its affiliates. -# -# Helper function for parsing arguments to a CMake function. -# -# This function is very similar to CMake's built-in cmake_parse_arguments() -# function, with some improvements: -# - This function correctly handles empty arguments. (cmake_parse_arguments() -# ignores empty arguments.) -# - If a multi-value argument is specified more than once, the subsequent -# arguments are appended to the original list rather than replacing it. e.g. -# if "SOURCES" is a multi-value argument, and the argument list contains -# "SOURCES a b c SOURCES x y z" then the resulting value for SOURCES will be -# "a;b;c;x;y;z" rather than "x;y;z" -# - This function errors out by default on unrecognized arguments. You can -# pass in an extra "ALLOW_UNPARSED_ARGS" argument to make it behave like -# cmake_parse_arguments(), and return the unparsed arguments in a -# _UNPARSED_ARGUMENTS variable instead. -# -# It does look like cmake_parse_arguments() handled empty arguments correctly -# from CMake 3.0 through 3.3, but it seems like this was probably broken when -# it was turned into a built-in function in CMake 3.4. Here is discussion and -# patches that fixed this behavior prior to CMake 3.0: -# https://cmake.org/pipermail/cmake-developers/2013-November/020607.html -# -# The one downside to this function over the built-in cmake_parse_arguments() -# is that I don't think we can achieve the PARSE_ARGV behavior in a non-builtin -# function, so we can't properly handle arguments that contain ";". CMake will -# treat the ";" characters as list element separators, and treat it as multiple -# separate arguments. -# -function(fb_cmake_parse_args PREFIX OPTIONS ONE_VALUE_ARGS MULTI_VALUE_ARGS ARGS) - foreach(option IN LISTS ARGN) - if ("${option}" STREQUAL "ALLOW_UNPARSED_ARGS") - set(ALLOW_UNPARSED_ARGS TRUE) - else() - message( - FATAL_ERROR - "unknown optional argument for fb_cmake_parse_args(): ${option}" - ) - endif() - endforeach() - - # Define all options as FALSE in the parent scope to start with - foreach(var_name IN LISTS OPTIONS) - set("${PREFIX}_${var_name}" "FALSE" PARENT_SCOPE) - endforeach() - - # TODO: We aren't extremely strict about error checking for one-value - # arguments here. e.g., we don't complain if a one-value argument is - # followed by another option/one-value/multi-value name rather than an - # argument. We also don't complain if a one-value argument is the last - # argument and isn't followed by a value. - - list(APPEND all_args ${ONE_VALUE_ARGS}) - list(APPEND all_args ${MULTI_VALUE_ARGS}) - set(current_variable) - set(unparsed_args) - foreach(arg IN LISTS ARGS) - list(FIND OPTIONS "${arg}" opt_index) - if("${opt_index}" EQUAL -1) - list(FIND all_args "${arg}" arg_index) - if("${arg_index}" EQUAL -1) - # This argument does not match an argument name, - # must be an argument value - if("${current_variable}" STREQUAL "") - list(APPEND unparsed_args "${arg}") - else() - # Ugh, CMake lists have a pretty fundamental flaw: they cannot - # distinguish between an empty list and a list with a single empty - # element. 
We track our own SEEN_VALUES_arg setting to help - # distinguish this and behave properly here. - if ("${SEEN_${current_variable}}" AND "${${current_variable}}" STREQUAL "") - set("${current_variable}" ";${arg}") - else() - list(APPEND "${current_variable}" "${arg}") - endif() - set("SEEN_${current_variable}" TRUE) - endif() - else() - # We found a single- or multi-value argument name - set(current_variable "VALUES_${arg}") - set("SEEN_${arg}" TRUE) - endif() - else() - # We found an option variable - set("${PREFIX}_${arg}" "TRUE" PARENT_SCOPE) - set(current_variable) - endif() - endforeach() - - foreach(arg_name IN LISTS ONE_VALUE_ARGS) - if(NOT "${SEEN_${arg_name}}") - unset("${PREFIX}_${arg_name}" PARENT_SCOPE) - elseif(NOT "${SEEN_VALUES_${arg_name}}") - # If the argument was seen but a value wasn't specified, error out. - # We require exactly one value to be specified. - message( - FATAL_ERROR "argument ${arg_name} was specified without a value" - ) - else() - list(LENGTH "VALUES_${arg_name}" num_args) - if("${num_args}" EQUAL 0) - # We know an argument was specified and that we called list(APPEND). - # If CMake thinks the list is empty that means there is really a single - # empty element in the list. - set("${PREFIX}_${arg_name}" "" PARENT_SCOPE) - elseif("${num_args}" EQUAL 1) - list(GET "VALUES_${arg_name}" 0 arg_value) - set("${PREFIX}_${arg_name}" "${arg_value}" PARENT_SCOPE) - else() - message( - FATAL_ERROR "too many arguments specified for ${arg_name}: " - "${VALUES_${arg_name}}" - ) - endif() - endif() - endforeach() - - foreach(arg_name IN LISTS MULTI_VALUE_ARGS) - # If this argument name was never seen, then unset the parent scope - if (NOT "${SEEN_${arg_name}}") - unset("${PREFIX}_${arg_name}" PARENT_SCOPE) - else() - # TODO: Our caller still won't be able to distinguish between an empty - # list and a list with a single empty element. We can tell which is - # which, but CMake lists don't make it easy to show this to our caller. - set("${PREFIX}_${arg_name}" "${VALUES_${arg_name}}" PARENT_SCOPE) - endif() - endforeach() - - # By default we fatal out on unparsed arguments, but return them to the - # caller if ALLOW_UNPARSED_ARGS was specified. - if (DEFINED unparsed_args) - if ("${ALLOW_UNPARSED_ARGS}") - set("${PREFIX}_UNPARSED_ARGUMENTS" "${unparsed_args}" PARENT_SCOPE) - else() - message(FATAL_ERROR "unrecognized arguments: ${unparsed_args}") - endif() - endif() -endfunction() diff --git a/build/fbcode_builder/CMake/FBCompilerSettings.cmake b/build/fbcode_builder/CMake/FBCompilerSettings.cmake deleted file mode 100644 index 585c953203c8f..0000000000000 --- a/build/fbcode_builder/CMake/FBCompilerSettings.cmake +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. - -# This file applies common compiler settings that are shared across -# a number of Facebook opensource projects. -# Please use caution and your best judgement before making changes -# to these shared compiler settings in order to avoid accidentally -# breaking a build in another project! - -if (WIN32) - include(FBCompilerSettingsMSVC) -else() - include(FBCompilerSettingsUnix) -endif() diff --git a/build/fbcode_builder/CMake/FBCompilerSettingsMSVC.cmake b/build/fbcode_builder/CMake/FBCompilerSettingsMSVC.cmake deleted file mode 100644 index 4efd7e9668f08..0000000000000 --- a/build/fbcode_builder/CMake/FBCompilerSettingsMSVC.cmake +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
- -# This file applies common compiler settings that are shared across -# a number of Facebook opensource projects. -# Please use caution and your best judgement before making changes -# to these shared compiler settings in order to avoid accidentally -# breaking a build in another project! - -add_compile_options( - /wd4250 # 'class1' : inherits 'class2::member' via dominance -) diff --git a/build/fbcode_builder/CMake/FBCompilerSettingsUnix.cmake b/build/fbcode_builder/CMake/FBCompilerSettingsUnix.cmake deleted file mode 100644 index c26ce78b1d20e..0000000000000 --- a/build/fbcode_builder/CMake/FBCompilerSettingsUnix.cmake +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. - -# This file applies common compiler settings that are shared across -# a number of Facebook opensource projects. -# Please use caution and your best judgement before making changes -# to these shared compiler settings in order to avoid accidentally -# breaking a build in another project! - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Wextra -Wno-deprecated -Wno-deprecated-declarations") diff --git a/build/fbcode_builder/CMake/FBPythonBinary.cmake b/build/fbcode_builder/CMake/FBPythonBinary.cmake deleted file mode 100644 index f91ebaf326452..0000000000000 --- a/build/fbcode_builder/CMake/FBPythonBinary.cmake +++ /dev/null @@ -1,697 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. - -include(FBCMakeParseArgs) - -# -# This file contains helper functions for building self-executing Python -# binaries. -# -# This is somewhat different than typical python installation with -# distutils/pip/virtualenv/etc. We primarily want to build a standalone -# executable, isolated from other Python packages on the system. We don't want -# to install files into the standard library python paths. This is more -# similar to PEX (https://github.com/pantsbuild/pex) and XAR -# (https://github.com/facebookincubator/xar). (In the future it would be nice -# to update this code to also support directly generating XAR files if XAR is -# available.) -# -# We also want to be able to easily define "libraries" of python files that can -# be shared and re-used between these standalone python executables, and can be -# shared across projects in different repositories. This means that we do need -# a way to "install" libraries so that they are visible to CMake builds in -# other repositories, without actually installing them in the standard python -# library paths. -# - -# If the caller has not already found Python, do so now. -# If we fail to find python now we won't fail immediately, but -# add_fb_python_executable() or add_fb_python_library() will fatal out if they -# are used. -if(NOT TARGET Python3::Interpreter) - # CMake 3.12+ ships with a FindPython3.cmake module. Try using it first. - # We find with QUIET here, since otherwise this generates some noisy warnings - # on versions of CMake before 3.12 - if (WIN32) - # On Windows we need both the Interpreter as well as the Development - # libraries. - find_package(Python3 COMPONENTS Interpreter Development QUIET) - else() - find_package(Python3 COMPONENTS Interpreter QUIET) - endif() - if(Python3_Interpreter_FOUND) - message(STATUS "Found Python 3: ${Python3_EXECUTABLE}") - else() - # Try with the FindPythonInterp.cmake module available in older CMake - # versions. Check to see if the caller has already searched for this - # themselves first. 
- if(NOT PYTHONINTERP_FOUND) - set(Python_ADDITIONAL_VERSIONS 3 3.6 3.5 3.4 3.3 3.2 3.1) - find_package(PythonInterp) - # TODO: On Windows we require the Python libraries as well. - # We currently do not search for them on this code path. - # For now we require building with CMake 3.12+ on Windows, so that the - # FindPython3 code path above is available. - endif() - if(PYTHONINTERP_FOUND) - if("${PYTHON_VERSION_MAJOR}" GREATER_EQUAL 3) - set(Python3_EXECUTABLE "${PYTHON_EXECUTABLE}") - add_custom_target(Python3::Interpreter) - else() - string( - CONCAT FBPY_FIND_PYTHON_ERR - "found Python ${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}, " - "but need Python 3" - ) - endif() - endif() - endif() -endif() - -# Find our helper program. -# We typically install this in the same directory as this .cmake file. -find_program( - FB_MAKE_PYTHON_ARCHIVE "make_fbpy_archive.py" - PATHS ${CMAKE_MODULE_PATH} -) -set(FB_PY_TEST_MAIN "${CMAKE_CURRENT_LIST_DIR}/fb_py_test_main.py") -set( - FB_PY_TEST_DISCOVER_SCRIPT - "${CMAKE_CURRENT_LIST_DIR}/FBPythonTestAddTests.cmake" -) -set( - FB_PY_WIN_MAIN_C - "${CMAKE_CURRENT_LIST_DIR}/fb_py_win_main.c" -) - -# An option to control the default installation location for -# install_fb_python_library(). This is relative to ${CMAKE_INSTALL_PREFIX} -set( - FBPY_LIB_INSTALL_DIR "lib/fb-py-libs" CACHE STRING - "The subdirectory where FB python libraries should be installed" -) - -# -# Build a self-executing python binary. -# -# This accepts the same arguments as add_fb_python_library(). -# -# In addition, a MAIN_MODULE argument is accepted. This argument specifies -# which module should be started as the __main__ module when the executable is -# run. If left unspecified, a __main__.py script must be present in the -# manifest. -# -function(add_fb_python_executable TARGET) - fb_py_check_available() - - # Parse the arguments - set(one_value_args BASE_DIR NAMESPACE MAIN_MODULE TYPE) - set(multi_value_args SOURCES DEPENDS) - fb_cmake_parse_args( - ARG "" "${one_value_args}" "${multi_value_args}" "${ARGN}" - ) - fb_py_process_default_args(ARG_NAMESPACE ARG_BASE_DIR) - - # Use add_fb_python_library() to perform most of our source handling - add_fb_python_library( - "${TARGET}.main_lib" - BASE_DIR "${ARG_BASE_DIR}" - NAMESPACE "${ARG_NAMESPACE}" - SOURCES ${ARG_SOURCES} - DEPENDS ${ARG_DEPENDS} - ) - - set( - manifest_files - "$" - ) - set( - source_files - "$" - ) - - # The command to build the executable archive. - # - # If we are using CMake 3.8+ we can use COMMAND_EXPAND_LISTS. - # CMP0067 isn't really the policy we care about, but seems like the best way - # to check if we are running 3.8+. - if (POLICY CMP0067) - set(extra_cmd_params COMMAND_EXPAND_LISTS) - set(make_py_args "${manifest_files}") - else() - set(extra_cmd_params) - set(make_py_args --manifest-separator "::" "$") - endif() - - set(output_file "${TARGET}${CMAKE_EXECUTABLE_SUFFIX}") - if(WIN32) - set(zipapp_output "${TARGET}.py_zipapp") - else() - set(zipapp_output "${output_file}") - endif() - set(zipapp_output_file "${zipapp_output}") - - set(is_dir_output FALSE) - if(DEFINED ARG_TYPE) - list(APPEND make_py_args "--type" "${ARG_TYPE}") - if ("${ARG_TYPE}" STREQUAL "dir") - set(is_dir_output TRUE) - # CMake doesn't really seem to like having a directory specified as an - # output; specify the __main__.py file as the output instead. 
- set(zipapp_output_file "${zipapp_output}/__main__.py") - list(APPEND - extra_cmd_params - COMMAND "${CMAKE_COMMAND}" -E remove_directory "${zipapp_output}" - ) - endif() - endif() - - if(DEFINED ARG_MAIN_MODULE) - list(APPEND make_py_args "--main" "${ARG_MAIN_MODULE}") - endif() - - add_custom_command( - OUTPUT "${zipapp_output_file}" - ${extra_cmd_params} - COMMAND - "${Python3_EXECUTABLE}" "${FB_MAKE_PYTHON_ARCHIVE}" - -o "${zipapp_output}" - ${make_py_args} - DEPENDS - ${source_files} - "${TARGET}.main_lib.py_sources_built" - "${FB_MAKE_PYTHON_ARCHIVE}" - ) - - if(WIN32) - if(is_dir_output) - # TODO: generate a main executable that will invoke Python3 - # with the correct main module inside the output directory - else() - add_executable("${TARGET}.winmain" "${FB_PY_WIN_MAIN_C}") - target_link_libraries("${TARGET}.winmain" Python3::Python) - # The Python3::Python target doesn't seem to be set up completely - # correctly on Windows for some reason, and we have to explicitly add - # ${Python3_LIBRARY_DIRS} to the target link directories. - target_link_directories( - "${TARGET}.winmain" - PUBLIC ${Python3_LIBRARY_DIRS} - ) - add_custom_command( - OUTPUT "${output_file}" - DEPENDS "${TARGET}.winmain" "${zipapp_output_file}" - COMMAND - "cmd.exe" "/c" "copy" "/b" - "${TARGET}.winmain${CMAKE_EXECUTABLE_SUFFIX}+${zipapp_output}" - "${output_file}" - ) - endif() - endif() - - # Add an "ALL" target that depends on force ${TARGET}, - # so that ${TARGET} will be included in the default list of build targets. - add_custom_target("${TARGET}.GEN_PY_EXE" ALL DEPENDS "${output_file}") - - # Allow resolving the executable path for the target that we generate - # via a generator expression like: - # "WATCHMAN_WAIT_PATH=$" - set_property(TARGET "${TARGET}.GEN_PY_EXE" - PROPERTY EXECUTABLE "${CMAKE_CURRENT_BINARY_DIR}/${output_file}") -endfunction() - -# Define a python unittest executable. -# The executable is built using add_fb_python_executable and has the -# following differences: -# -# Each of the source files specified in SOURCES will be imported -# and have unittest discovery performed upon them. -# Those sources will be imported in the top level namespace. -# -# The ENV argument allows specifying a list of "KEY=VALUE" -# pairs that will be used by the test runner to set up the environment -# in the child process prior to running the test. This is useful for -# passing additional configuration to the test. -function(add_fb_python_unittest TARGET) - # Parse the arguments - set(multi_value_args SOURCES DEPENDS ENV PROPERTIES) - set( - one_value_args - WORKING_DIRECTORY BASE_DIR NAMESPACE TEST_LIST DISCOVERY_TIMEOUT - ) - fb_cmake_parse_args( - ARG "" "${one_value_args}" "${multi_value_args}" "${ARGN}" - ) - fb_py_process_default_args(ARG_NAMESPACE ARG_BASE_DIR) - if(NOT ARG_WORKING_DIRECTORY) - # Default the working directory to the current binary directory. - # This matches the default behavior of add_test() and other standard - # test functions like gtest_discover_tests() - set(ARG_WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}") - endif() - if(NOT ARG_TEST_LIST) - set(ARG_TEST_LIST "${TARGET}_TESTS") - endif() - if(NOT ARG_DISCOVERY_TIMEOUT) - set(ARG_DISCOVERY_TIMEOUT 5) - endif() - - # Tell our test program the list of modules to scan for tests. - # We scan all modules directly listed in our SOURCES argument, and skip - # modules that came from dependencies in the DEPENDS list. - # - # This is written into a __test_modules__.py module that the test runner - # will look at. 
- set( - test_modules_path - "${CMAKE_CURRENT_BINARY_DIR}/${TARGET}_test_modules.py" - ) - file(WRITE "${test_modules_path}" "TEST_MODULES = [\n") - string(REPLACE "." "/" namespace_dir "${ARG_NAMESPACE}") - if (NOT "${namespace_dir}" STREQUAL "") - set(namespace_dir "${namespace_dir}/") - endif() - set(test_modules) - foreach(src_path IN LISTS ARG_SOURCES) - fb_py_compute_dest_path( - abs_source dest_path - "${src_path}" "${namespace_dir}" "${ARG_BASE_DIR}" - ) - string(REPLACE "/" "." module_name "${dest_path}") - string(REGEX REPLACE "\\.py$" "" module_name "${module_name}") - list(APPEND test_modules "${module_name}") - file(APPEND "${test_modules_path}" " '${module_name}',\n") - endforeach() - file(APPEND "${test_modules_path}" "]\n") - - # The __main__ is provided by our runner wrapper/bootstrap - list(APPEND ARG_SOURCES "${FB_PY_TEST_MAIN}=__main__.py") - list(APPEND ARG_SOURCES "${test_modules_path}=__test_modules__.py") - - add_fb_python_executable( - "${TARGET}" - NAMESPACE "${ARG_NAMESPACE}" - BASE_DIR "${ARG_BASE_DIR}" - SOURCES ${ARG_SOURCES} - DEPENDS ${ARG_DEPENDS} - ) - - # Run test discovery after the test executable is built. - # This logic is based on the code for gtest_discover_tests() - set(ctest_file_base "${CMAKE_CURRENT_BINARY_DIR}/${TARGET}") - set(ctest_include_file "${ctest_file_base}_include.cmake") - set(ctest_tests_file "${ctest_file_base}_tests.cmake") - add_custom_command( - TARGET "${TARGET}.GEN_PY_EXE" POST_BUILD - BYPRODUCTS "${ctest_tests_file}" - COMMAND - "${CMAKE_COMMAND}" - -D "TEST_TARGET=${TARGET}" - -D "TEST_INTERPRETER=${Python3_EXECUTABLE}" - -D "TEST_ENV=${ARG_ENV}" - -D "TEST_EXECUTABLE=$" - -D "TEST_WORKING_DIR=${ARG_WORKING_DIRECTORY}" - -D "TEST_LIST=${ARG_TEST_LIST}" - -D "TEST_PREFIX=${TARGET}::" - -D "TEST_PROPERTIES=${ARG_PROPERTIES}" - -D "CTEST_FILE=${ctest_tests_file}" - -P "${FB_PY_TEST_DISCOVER_SCRIPT}" - VERBATIM - ) - - file( - WRITE "${ctest_include_file}" - "if(EXISTS \"${ctest_tests_file}\")\n" - " include(\"${ctest_tests_file}\")\n" - "else()\n" - " add_test(\"${TARGET}_NOT_BUILT\" \"${TARGET}_NOT_BUILT\")\n" - "endif()\n" - ) - set_property( - DIRECTORY APPEND PROPERTY TEST_INCLUDE_FILES - "${ctest_include_file}" - ) -endfunction() - -# -# Define a python library. -# -# If you want to install a python library generated from this rule note that -# you need to use install_fb_python_library() rather than CMake's built-in -# install() function. This will make it available for other downstream -# projects to use in their add_fb_python_executable() and -# add_fb_python_library() calls. (You do still need to use `install(EXPORT)` -# later to install the CMake exports.) -# -# Parameters: -# - BASE_DIR : -# The base directory path to strip off from each source path. All source -# files must be inside this directory. If not specified it defaults to -# ${CMAKE_CURRENT_SOURCE_DIR}. -# - NAMESPACE : -# The destination namespace where these files should be installed in python -# binaries. If not specified, this defaults to the current relative path of -# ${CMAKE_CURRENT_SOURCE_DIR} inside ${CMAKE_SOURCE_DIR}. e.g., a python -# library defined in the directory repo_root/foo/bar will use a default -# namespace of "foo.bar" -# - SOURCES <...>: -# The python source files. -# You may optionally specify as source using the form: PATH=ALIAS where -# PATH is a relative path in the source tree and ALIAS is the relative -# path into which PATH should be rewritten. 
This is useful for mapping -# an executable script to the main module in a python executable. -# e.g.: `python/bin/watchman-wait=__main__.py` -# - DEPENDS <...>: -# Other python libraries that this one depends on. -# - INSTALL_DIR : -# The directory where this library should be installed. -# install_fb_python_library() must still be called later to perform the -# installation. If a relative path is given it will be treated relative to -# ${CMAKE_INSTALL_PREFIX} -# -# CMake is unfortunately pretty crappy at being able to define custom build -# rules & behaviors. It doesn't support transitive property propagation -# between custom targets; only the built-in add_executable() and add_library() -# targets support transitive properties. -# -# We hack around this janky CMake behavior by (ab)using interface libraries to -# propagate some of the data we want between targets, without actually -# generating a C library. -# -# add_fb_python_library(SOMELIB) generates the following things: -# - An INTERFACE library rule named SOMELIB.py_lib which tracks some -# information about transitive dependencies: -# - the transitive set of source files in the INTERFACE_SOURCES property -# - the transitive set of manifest files that this library depends on in -# the INTERFACE_INCLUDE_DIRECTORIES property. -# - A custom command that generates a SOMELIB.manifest file. -# This file contains the mapping of source files to desired destination -# locations in executables that depend on this library. This manifest file -# will then be read at build-time in order to build executables. -# -function(add_fb_python_library LIB_NAME) - fb_py_check_available() - - # Parse the arguments - # We use fb_cmake_parse_args() rather than cmake_parse_arguments() since - # cmake_parse_arguments() does not handle empty arguments, and it is common - # for callers to want to specify an empty NAMESPACE parameter. - set(one_value_args BASE_DIR NAMESPACE INSTALL_DIR) - set(multi_value_args SOURCES DEPENDS) - fb_cmake_parse_args( - ARG "" "${one_value_args}" "${multi_value_args}" "${ARGN}" - ) - fb_py_process_default_args(ARG_NAMESPACE ARG_BASE_DIR) - - string(REPLACE "." "/" namespace_dir "${ARG_NAMESPACE}") - if (NOT "${namespace_dir}" STREQUAL "") - set(namespace_dir "${namespace_dir}/") - endif() - - if(NOT DEFINED ARG_INSTALL_DIR) - set(install_dir "${FBPY_LIB_INSTALL_DIR}/") - elseif("${ARG_INSTALL_DIR}" STREQUAL "") - set(install_dir "") - else() - set(install_dir "${ARG_INSTALL_DIR}/") - endif() - - # message(STATUS "fb py library ${LIB_NAME}: " - # "NS=${namespace_dir} BASE=${ARG_BASE_DIR}") - - # TODO: In the future it would be nice to support pre-compiling the source - # files. We could emit a rule to compile each source file and emit a - # .pyc/.pyo file here, and then have the manifest reference the pyc/pyo - # files. - - # Define a library target to help pass around information about the library, - # and propagate dependency information. - # - # CMake make a lot of assumptions that libraries are C++ libraries. To help - # avoid confusion we name our target "${LIB_NAME}.py_lib" rather than just - # "${LIB_NAME}". This helps avoid confusion if callers try to use - # "${LIB_NAME}" on their own as a target name. (e.g., attempting to install - # it directly with install(TARGETS) won't work. Callers must use - # install_fb_python_library() instead.) - add_library("${LIB_NAME}.py_lib" INTERFACE) - - # Emit the manifest file. - # - # We write the manifest file to a temporary path first, then copy it with - # configure_file(COPYONLY). 
This is necessary to get CMake to understand - # that "${manifest_path}" is generated by the CMake configure phase, - # and allow using it as a dependency for add_custom_command(). - # (https://gitlab.kitware.com/cmake/cmake/issues/16367) - set(manifest_path "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}.manifest") - set(tmp_manifest "${manifest_path}.tmp") - file(WRITE "${tmp_manifest}" "FBPY_MANIFEST 1\n") - set(abs_sources) - foreach(src_path IN LISTS ARG_SOURCES) - fb_py_compute_dest_path( - abs_source dest_path - "${src_path}" "${namespace_dir}" "${ARG_BASE_DIR}" - ) - list(APPEND abs_sources "${abs_source}") - target_sources( - "${LIB_NAME}.py_lib" INTERFACE - "$" - "$" - ) - file( - APPEND "${tmp_manifest}" - "${abs_source} :: ${dest_path}\n" - ) - endforeach() - configure_file("${tmp_manifest}" "${manifest_path}" COPYONLY) - - target_include_directories( - "${LIB_NAME}.py_lib" INTERFACE - "$" - "$" - ) - - # Add a target that depends on all of the source files. - # This is needed in case some of the source files are generated. This will - # ensure that these source files are brought up-to-date before we build - # any python binaries that depend on this library. - add_custom_target("${LIB_NAME}.py_sources_built" DEPENDS ${abs_sources}) - add_dependencies("${LIB_NAME}.py_lib" "${LIB_NAME}.py_sources_built") - - # Hook up library dependencies, and also make the *.py_sources_built target - # depend on the sources for all of our dependencies also being up-to-date. - foreach(dep IN LISTS ARG_DEPENDS) - target_link_libraries("${LIB_NAME}.py_lib" INTERFACE "${dep}.py_lib") - - # Mark that our .py_sources_built target depends on each our our dependent - # libraries. This serves two functions: - # - This causes CMake to generate an error message if one of the - # dependencies is never defined. The target_link_libraries() call above - # won't complain if one of the dependencies doesn't exist (since it is - # intended to allow passing in file names for plain library files rather - # than just targets). - # - It ensures that sources for our dependencies are built before any - # executable that depends on us. Note that we depend on "${dep}.py_lib" - # rather than "${dep}.py_sources_built" for this purpose because the - # ".py_sources_built" target won't be available for imported targets. - add_dependencies("${LIB_NAME}.py_sources_built" "${dep}.py_lib") - endforeach() - - # Add a custom command to help with library installation, in case - # install_fb_python_library() is called later for this library. - # add_custom_command() only works with file dependencies defined in the same - # CMakeLists.txt file, so we want to make sure this is defined here, rather - # then where install_fb_python_library() is called. - # This command won't be run by default, but will only be run if it is needed - # by a subsequent install_fb_python_library() call. - # - # This command copies the library contents into the build directory. - # It would be nicer if we could skip this intermediate copy, and just run - # make_fbpy_archive.py at install time to copy them directly to the desired - # installation directory. Unfortunately this is difficult to do, and seems - # to interfere with some of the CMake code that wants to generate a manifest - # of installed files. 
- set(build_install_dir "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}.lib_install") - add_custom_command( - OUTPUT - "${build_install_dir}/${LIB_NAME}.manifest" - COMMAND "${CMAKE_COMMAND}" -E remove_directory "${build_install_dir}" - COMMAND - "${Python3_EXECUTABLE}" "${FB_MAKE_PYTHON_ARCHIVE}" --type lib-install - --install-dir "${LIB_NAME}" - -o "${build_install_dir}/${LIB_NAME}" "${manifest_path}" - DEPENDS - "${abs_sources}" - "${manifest_path}" - "${FB_MAKE_PYTHON_ARCHIVE}" - ) - add_custom_target( - "${LIB_NAME}.py_lib_install" - DEPENDS "${build_install_dir}/${LIB_NAME}.manifest" - ) - - # Set some properties to pass through the install paths to - # install_fb_python_library() - # - # Passing through ${build_install_dir} allows install_fb_python_library() - # to work even if used from a different CMakeLists.txt file than where - # add_fb_python_library() was called (i.e. such that - # ${CMAKE_CURRENT_BINARY_DIR} is different between the two calls). - set(abs_install_dir "${install_dir}") - if(NOT IS_ABSOLUTE "${abs_install_dir}") - set(abs_install_dir "${CMAKE_INSTALL_PREFIX}/${abs_install_dir}") - endif() - string(REGEX REPLACE "/$" "" abs_install_dir "${abs_install_dir}") - set_target_properties( - "${LIB_NAME}.py_lib_install" - PROPERTIES - INSTALL_DIR "${abs_install_dir}" - BUILD_INSTALL_DIR "${build_install_dir}" - ) -endfunction() - -# -# Install an FB-style packaged python binary. -# -# - DESTINATION : -# Associate the installed target files with the given export-name. -# -function(install_fb_python_executable TARGET) - # Parse the arguments - set(one_value_args DESTINATION) - set(multi_value_args) - fb_cmake_parse_args( - ARG "" "${one_value_args}" "${multi_value_args}" "${ARGN}" - ) - - if(NOT DEFINED ARG_DESTINATION) - set(ARG_DESTINATION bin) - endif() - - install( - PROGRAMS "$" - DESTINATION "${ARG_DESTINATION}" - ) -endfunction() - -# -# Install a python library. -# -# - EXPORT : -# Associate the installed target files with the given export-name. -# -# Note that unlike the built-in CMake install() function we do not accept a -# DESTINATION parameter. Instead, use the INSTALL_DIR parameter to -# add_fb_python_library() to set the installation location. -# -function(install_fb_python_library LIB_NAME) - set(one_value_args EXPORT) - fb_cmake_parse_args(ARG "" "${one_value_args}" "" "${ARGN}") - - # Export our "${LIB_NAME}.py_lib" target so that it will be available to - # downstream projects in our installed CMake config files. - if(DEFINED ARG_EXPORT) - install(TARGETS "${LIB_NAME}.py_lib" EXPORT "${ARG_EXPORT}") - endif() - - # add_fb_python_library() emits a .py_lib_install target that will prepare - # the installation directory. However, it isn't part of the "ALL" target and - # therefore isn't built by default. - # - # Make sure the ALL target depends on it now. We have to do this by - # introducing yet another custom target. - # Add it as a dependency to the ALL target now. - add_custom_target("${LIB_NAME}.py_lib_install_all" ALL) - add_dependencies( - "${LIB_NAME}.py_lib_install_all" "${LIB_NAME}.py_lib_install" - ) - - # Copy the intermediate install directory generated at build time into - # the desired install location. 
- get_target_property(dest_dir "${LIB_NAME}.py_lib_install" "INSTALL_DIR") - get_target_property( - build_install_dir "${LIB_NAME}.py_lib_install" "BUILD_INSTALL_DIR" - ) - install( - DIRECTORY "${build_install_dir}/${LIB_NAME}" - DESTINATION "${dest_dir}" - ) - install( - FILES "${build_install_dir}/${LIB_NAME}.manifest" - DESTINATION "${dest_dir}" - ) -endfunction() - -# Helper macro to process the BASE_DIR and NAMESPACE arguments for -# add_fb_python_executable() and add_fb_python_executable() -macro(fb_py_process_default_args NAMESPACE_VAR BASE_DIR_VAR) - # If the namespace was not specified, default to the relative path to the - # current directory (starting from the repository root). - if(NOT DEFINED "${NAMESPACE_VAR}") - file( - RELATIVE_PATH "${NAMESPACE_VAR}" - "${CMAKE_SOURCE_DIR}" - "${CMAKE_CURRENT_SOURCE_DIR}" - ) - endif() - - if(NOT DEFINED "${BASE_DIR_VAR}") - # If the base directory was not specified, default to the current directory - set("${BASE_DIR_VAR}" "${CMAKE_CURRENT_SOURCE_DIR}") - else() - # If the base directory was specified, always convert it to an - # absolute path. - get_filename_component("${BASE_DIR_VAR}" "${${BASE_DIR_VAR}}" ABSOLUTE) - endif() -endmacro() - -function(fb_py_check_available) - # Make sure that Python 3 and our make_fbpy_archive.py helper script are - # available. - if(NOT Python3_EXECUTABLE) - if(FBPY_FIND_PYTHON_ERR) - message(FATAL_ERROR "Unable to find Python 3: ${FBPY_FIND_PYTHON_ERR}") - else() - message(FATAL_ERROR "Unable to find Python 3") - endif() - endif() - - if (NOT FB_MAKE_PYTHON_ARCHIVE) - message( - FATAL_ERROR "unable to find make_fbpy_archive.py helper program (it " - "should be located in the same directory as FBPythonBinary.cmake)" - ) - endif() -endfunction() - -function( - fb_py_compute_dest_path - src_path_output dest_path_output src_path namespace_dir base_dir -) - if("${src_path}" MATCHES "=") - # We want to split the string on the `=` sign, but cmake doesn't - # provide much in the way of helpers for this, so we rewrite the - # `=` sign to `;` so that we can treat it as a cmake list and - # then index into the components - string(REPLACE "=" ";" src_path_list "${src_path}") - list(GET src_path_list 0 src_path) - # Note that we ignore the `namespace_dir` in the alias case - # in order to allow aliasing a source to the top level `__main__.py` - # filename. - list(GET src_path_list 1 dest_path) - else() - unset(dest_path) - endif() - - get_filename_component(abs_source "${src_path}" ABSOLUTE) - if(NOT DEFINED dest_path) - file(RELATIVE_PATH rel_src "${ARG_BASE_DIR}" "${abs_source}") - if("${rel_src}" MATCHES "^../") - message( - FATAL_ERROR "${LIB_NAME}: source file \"${abs_source}\" is not inside " - "the base directory ${ARG_BASE_DIR}" - ) - endif() - set(dest_path "${namespace_dir}${rel_src}") - endif() - - set("${src_path_output}" "${abs_source}" PARENT_SCOPE) - set("${dest_path_output}" "${dest_path}" PARENT_SCOPE) -endfunction() diff --git a/build/fbcode_builder/CMake/FBPythonTestAddTests.cmake b/build/fbcode_builder/CMake/FBPythonTestAddTests.cmake deleted file mode 100644 index d73c055d8245a..0000000000000 --- a/build/fbcode_builder/CMake/FBPythonTestAddTests.cmake +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
- -# Add a command to be emitted to the CTest file -set(ctest_script) -function(add_command CMD) - set(escaped_args "") - foreach(arg ${ARGN}) - # Escape all arguments using "Bracket Argument" syntax - # We could skip this for argument that don't contain any special - # characters if we wanted to make the output slightly more human-friendly. - set(escaped_args "${escaped_args} [==[${arg}]==]") - endforeach() - set(ctest_script "${ctest_script}${CMD}(${escaped_args})\n" PARENT_SCOPE) -endfunction() - -if(NOT EXISTS "${TEST_EXECUTABLE}") - message(FATAL_ERROR "Test executable does not exist: ${TEST_EXECUTABLE}") -endif() -execute_process( - COMMAND ${CMAKE_COMMAND} -E env ${TEST_ENV} "${TEST_INTERPRETER}" "${TEST_EXECUTABLE}" --list-tests - WORKING_DIRECTORY "${TEST_WORKING_DIR}" - OUTPUT_VARIABLE output - RESULT_VARIABLE result -) -if(NOT "${result}" EQUAL 0) - string(REPLACE "\n" "\n " output "${output}") - message( - FATAL_ERROR - "Error running test executable: ${TEST_EXECUTABLE}\n" - "Output:\n" - " ${output}\n" - ) -endif() - -# Parse output -string(REPLACE "\n" ";" tests_list "${output}") -foreach(test_name ${tests_list}) - add_command( - add_test - "${TEST_PREFIX}${test_name}" - ${CMAKE_COMMAND} -E env ${TEST_ENV} - "${TEST_INTERPRETER}" "${TEST_EXECUTABLE}" "${test_name}" - ) - add_command( - set_tests_properties - "${TEST_PREFIX}${test_name}" - PROPERTIES - WORKING_DIRECTORY "${TEST_WORKING_DIR}" - ${TEST_PROPERTIES} - ) -endforeach() - -# Set a list of discovered tests in the parent scope, in case users -# want access to this list as a CMake variable -if(TEST_LIST) - add_command(set ${TEST_LIST} ${tests_list}) -endif() - -file(WRITE "${CTEST_FILE}" "${ctest_script}") diff --git a/build/fbcode_builder/CMake/FBThriftCppLibrary.cmake b/build/fbcode_builder/CMake/FBThriftCppLibrary.cmake deleted file mode 100644 index 7688d80960dfd..0000000000000 --- a/build/fbcode_builder/CMake/FBThriftCppLibrary.cmake +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. - -include(FBCMakeParseArgs) - -# Generate a C++ library from a thrift file -# -# Parameters: -# - SERVICES [ ...] -# The names of the services defined in the thrift file. -# - DEPENDS [ ...] -# A list of other thrift C++ libraries that this library depends on. -# - OPTIONS [ ...] -# A list of options to pass to the thrift compiler. -# - INCLUDE_DIR -# The sub-directory where generated headers will be installed. -# Defaults to "include" if not specified. The caller must still call -# install() to install the thrift library if desired. -# - THRIFT_INCLUDE_DIR -# The sub-directory where generated headers will be installed. -# Defaults to "${INCLUDE_DIR}/thrift-files" if not specified. -# The caller must still call install() to install the thrift library if -# desired. 
-function(add_fbthrift_cpp_library LIB_NAME THRIFT_FILE) - # Parse the arguments - set(one_value_args INCLUDE_DIR THRIFT_INCLUDE_DIR) - set(multi_value_args SERVICES DEPENDS OPTIONS) - fb_cmake_parse_args( - ARG "" "${one_value_args}" "${multi_value_args}" "${ARGN}" - ) - if(NOT DEFINED ARG_INCLUDE_DIR) - set(ARG_INCLUDE_DIR "include") - endif() - if(NOT DEFINED ARG_THRIFT_INCLUDE_DIR) - set(ARG_THRIFT_INCLUDE_DIR "${ARG_INCLUDE_DIR}/thrift-files") - endif() - - get_filename_component(base ${THRIFT_FILE} NAME_WE) - get_filename_component( - output_dir - ${CMAKE_CURRENT_BINARY_DIR}/${THRIFT_FILE} - DIRECTORY - ) - - # Generate relative paths in #includes - file( - RELATIVE_PATH include_prefix - "${CMAKE_SOURCE_DIR}" - "${CMAKE_CURRENT_SOURCE_DIR}/${THRIFT_FILE}" - ) - get_filename_component(include_prefix ${include_prefix} DIRECTORY) - - if (NOT "${include_prefix}" STREQUAL "") - list(APPEND ARG_OPTIONS "include_prefix=${include_prefix}") - endif() - # CMake 3.12 is finally getting a list(JOIN) function, but until then - # treating the list as a string and replacing the semicolons is good enough. - string(REPLACE ";" "," GEN_ARG_STR "${ARG_OPTIONS}") - - # Compute the list of generated files - list(APPEND generated_headers - "${output_dir}/gen-cpp2/${base}_constants.h" - "${output_dir}/gen-cpp2/${base}_types.h" - "${output_dir}/gen-cpp2/${base}_types.tcc" - "${output_dir}/gen-cpp2/${base}_types_custom_protocol.h" - "${output_dir}/gen-cpp2/${base}_metadata.h" - ) - list(APPEND generated_sources - "${output_dir}/gen-cpp2/${base}_constants.cpp" - "${output_dir}/gen-cpp2/${base}_data.h" - "${output_dir}/gen-cpp2/${base}_data.cpp" - "${output_dir}/gen-cpp2/${base}_types.cpp" - "${output_dir}/gen-cpp2/${base}_metadata.cpp" - ) - foreach(service IN LISTS ARG_SERVICES) - list(APPEND generated_headers - "${output_dir}/gen-cpp2/${service}.h" - "${output_dir}/gen-cpp2/${service}.tcc" - "${output_dir}/gen-cpp2/${service}AsyncClient.h" - "${output_dir}/gen-cpp2/${service}_custom_protocol.h" - ) - list(APPEND generated_sources - "${output_dir}/gen-cpp2/${service}.cpp" - "${output_dir}/gen-cpp2/${service}AsyncClient.cpp" - "${output_dir}/gen-cpp2/${service}_processmap_binary.cpp" - "${output_dir}/gen-cpp2/${service}_processmap_compact.cpp" - ) - endforeach() - - # This generator expression gets the list of include directories required - # for all of our dependencies. - # It requires using COMMAND_EXPAND_LISTS in the add_custom_command() call - # below. COMMAND_EXPAND_LISTS is only available in CMake 3.8+ - # If we really had to support older versions of CMake we would probably need - # to use a wrapper script around the thrift compiler that could take the - # include list as a single argument and split it up before invoking the - # thrift compiler. 
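- #
- # As an illustrative sketch (paths are hypothetical): if the dependencies
- # export /repo/a and /repo/b as thrift include directories, the JOIN
- # expression below evaluates to the list
- #   -I;/repo/a;-I;/repo/b
- # and COMMAND_EXPAND_LISTS splits it into "-I /repo/a -I /repo/b"
- # arguments for the thrift compiler.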
- if (NOT POLICY CMP0067)
- message(FATAL_ERROR "add_fbthrift_cpp_library() requires CMake 3.8+")
- endif()
- set(
- thrift_include_options
- "-I;$<JOIN:$<TARGET_PROPERTY:${LIB_NAME}.thrift_includes,INTERFACE_INCLUDE_DIRECTORIES>,;-I;>"
- )
-
- # Emit the rule to run the thrift compiler
- add_custom_command(
- OUTPUT
- ${generated_headers}
- ${generated_sources}
- COMMAND_EXPAND_LISTS
- COMMAND
- "${CMAKE_COMMAND}" -E make_directory "${output_dir}"
- COMMAND
- "${FBTHRIFT_COMPILER}"
- --legacy-strict
- --gen "mstch_cpp2:${GEN_ARG_STR}"
- "${thrift_include_options}"
- -I "${FBTHRIFT_INCLUDE_DIR}"
- -o "${output_dir}"
- "${CMAKE_CURRENT_SOURCE_DIR}/${THRIFT_FILE}"
- WORKING_DIRECTORY
- "${CMAKE_BINARY_DIR}"
- MAIN_DEPENDENCY
- "${THRIFT_FILE}"
- DEPENDS
- ${ARG_DEPENDS}
- "${FBTHRIFT_COMPILER}"
- )
-
- # Now emit the library rule to compile the sources
- if (BUILD_SHARED_LIBS)
- set(LIB_TYPE SHARED)
- else ()
- set(LIB_TYPE STATIC)
- endif ()
-
- add_library(
- "${LIB_NAME}" ${LIB_TYPE}
- ${generated_sources}
- )
-
- target_include_directories(
- "${LIB_NAME}"
- PUBLIC
- "$<BUILD_INTERFACE:${CMAKE_BINARY_DIR}>"
- "$<INSTALL_INTERFACE:${ARG_INCLUDE_DIR}>"
- )
- target_link_libraries(
- "${LIB_NAME}"
- PUBLIC
- ${ARG_DEPENDS}
- FBThrift::thriftcpp2
- Folly::folly
- mvfst::mvfst_server_async_tran
- mvfst::mvfst_server
- )
-
- # Add ${generated_headers} to the PUBLIC_HEADER property for ${LIB_NAME}
- #
- # This allows callers to install it using
- # "install(TARGETS ${LIB_NAME} PUBLIC_HEADER)"
- # However, note that CMake's PUBLIC_HEADER behavior is rather inflexible,
- # and does not have any way to preserve header directory structure. Callers
- # must be careful to use the correct PUBLIC_HEADER DESTINATION parameter
- # when doing this, to put the files in the correct directory themselves.
- # We define a HEADER_INSTALL_DIR property with the include directory prefix,
- # so typically callers should specify the PUBLIC_HEADER DESTINATION as
- # "$<TARGET_PROPERTY:${LIB_NAME},HEADER_INSTALL_DIR>"
- set_property(
- TARGET "${LIB_NAME}"
- PROPERTY PUBLIC_HEADER ${generated_headers}
- )
-
- # Define a dummy interface library to help propagate the thrift include
- # directories between dependencies.
- add_library("${LIB_NAME}.thrift_includes" INTERFACE)
- target_include_directories(
- "${LIB_NAME}.thrift_includes"
- INTERFACE
- "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
- "$<INSTALL_INTERFACE:${ARG_THRIFT_INCLUDE_DIR}>"
- )
- foreach(dep IN LISTS ARG_DEPENDS)
- target_link_libraries(
- "${LIB_NAME}.thrift_includes"
- INTERFACE "${dep}.thrift_includes"
- )
- endforeach()
-
- set_target_properties(
- "${LIB_NAME}"
- PROPERTIES
- EXPORT_PROPERTIES "THRIFT_INSTALL_DIR"
- THRIFT_INSTALL_DIR "${ARG_THRIFT_INCLUDE_DIR}/${include_prefix}"
- HEADER_INSTALL_DIR "${ARG_INCLUDE_DIR}/${include_prefix}/gen-cpp2"
- )
-endfunction()
diff --git a/build/fbcode_builder/CMake/FBThriftLibrary.cmake b/build/fbcode_builder/CMake/FBThriftLibrary.cmake
deleted file mode 100644
index e4280e2a4092b..0000000000000
--- a/build/fbcode_builder/CMake/FBThriftLibrary.cmake
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-
-include(FBCMakeParseArgs)
-include(FBThriftPyLibrary)
-include(FBThriftCppLibrary)
-
-#
-# add_fbthrift_library()
-#
-# This is a convenience function that generates thrift libraries for multiple
-# languages.
-# -# For example: -# add_fbthrift_library( -# foo foo.thrift -# LANGUAGES cpp py -# SERVICES Foo -# DEPENDS bar) -# -# will be expanded into two separate calls: -# -# add_fbthrift_cpp_library(foo_cpp foo.thrift SERVICES Foo DEPENDS bar_cpp) -# add_fbthrift_py_library(foo_py foo.thrift SERVICES Foo DEPENDS bar_py) -# -function(add_fbthrift_library LIB_NAME THRIFT_FILE) - # Parse the arguments - set(one_value_args PY_NAMESPACE INCLUDE_DIR THRIFT_INCLUDE_DIR) - set(multi_value_args SERVICES DEPENDS LANGUAGES CPP_OPTIONS PY_OPTIONS) - fb_cmake_parse_args( - ARG "" "${one_value_args}" "${multi_value_args}" "${ARGN}" - ) - - if(NOT DEFINED ARG_INCLUDE_DIR) - set(ARG_INCLUDE_DIR "include") - endif() - if(NOT DEFINED ARG_THRIFT_INCLUDE_DIR) - set(ARG_THRIFT_INCLUDE_DIR "${ARG_INCLUDE_DIR}/thrift-files") - endif() - - # CMake 3.12+ adds list(TRANSFORM) which would be nice to use here, but for - # now we still want to support older versions of CMake. - set(CPP_DEPENDS) - set(PY_DEPENDS) - foreach(dep IN LISTS ARG_DEPENDS) - list(APPEND CPP_DEPENDS "${dep}_cpp") - list(APPEND PY_DEPENDS "${dep}_py") - endforeach() - - foreach(lang IN LISTS ARG_LANGUAGES) - if ("${lang}" STREQUAL "cpp") - add_fbthrift_cpp_library( - "${LIB_NAME}_cpp" "${THRIFT_FILE}" - SERVICES ${ARG_SERVICES} - DEPENDS ${CPP_DEPENDS} - OPTIONS ${ARG_CPP_OPTIONS} - INCLUDE_DIR "${ARG_INCLUDE_DIR}" - THRIFT_INCLUDE_DIR "${ARG_THRIFT_INCLUDE_DIR}" - ) - elseif ("${lang}" STREQUAL "py" OR "${lang}" STREQUAL "python") - if (DEFINED ARG_PY_NAMESPACE) - set(namespace_args NAMESPACE "${ARG_PY_NAMESPACE}") - endif() - add_fbthrift_py_library( - "${LIB_NAME}_py" "${THRIFT_FILE}" - SERVICES ${ARG_SERVICES} - ${namespace_args} - DEPENDS ${PY_DEPENDS} - OPTIONS ${ARG_PY_OPTIONS} - THRIFT_INCLUDE_DIR "${ARG_THRIFT_INCLUDE_DIR}" - ) - else() - message( - FATAL_ERROR "unknown language for thrift library ${LIB_NAME}: ${lang}" - ) - endif() - endforeach() -endfunction() diff --git a/build/fbcode_builder/CMake/FBThriftPyLibrary.cmake b/build/fbcode_builder/CMake/FBThriftPyLibrary.cmake deleted file mode 100644 index fa77cde715533..0000000000000 --- a/build/fbcode_builder/CMake/FBThriftPyLibrary.cmake +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. - -include(FBCMakeParseArgs) -include(FBPythonBinary) - -# Generate a Python library from a thrift file -function(add_fbthrift_py_library LIB_NAME THRIFT_FILE) - # Parse the arguments - set(one_value_args NAMESPACE THRIFT_INCLUDE_DIR) - set(multi_value_args SERVICES DEPENDS OPTIONS) - fb_cmake_parse_args( - ARG "" "${one_value_args}" "${multi_value_args}" "${ARGN}" - ) - - if(NOT DEFINED ARG_THRIFT_INCLUDE_DIR) - set(ARG_THRIFT_INCLUDE_DIR "include/thrift-files") - endif() - - get_filename_component(base ${THRIFT_FILE} NAME_WE) - set(output_dir "${CMAKE_CURRENT_BINARY_DIR}/${THRIFT_FILE}-py") - - # Parse the namespace value - if (NOT DEFINED ARG_NAMESPACE) - set(ARG_NAMESPACE "${base}") - endif() - - string(REPLACE "." "/" namespace_dir "${ARG_NAMESPACE}") - set(py_output_dir "${output_dir}/gen-py/${namespace_dir}") - list(APPEND generated_sources - "${py_output_dir}/__init__.py" - "${py_output_dir}/ttypes.py" - "${py_output_dir}/constants.py" - ) - foreach(service IN LISTS ARG_SERVICES) - list(APPEND generated_sources - ${py_output_dir}/${service}.py - ) - endforeach() - - # Define a dummy interface library to help propagate the thrift include - # directories between dependencies. 
- add_library("${LIB_NAME}.thrift_includes" INTERFACE) - target_include_directories( - "${LIB_NAME}.thrift_includes" - INTERFACE - "$" - "$" - ) - foreach(dep IN LISTS ARG_DEPENDS) - target_link_libraries( - "${LIB_NAME}.thrift_includes" - INTERFACE "${dep}.thrift_includes" - ) - endforeach() - - # This generator expression gets the list of include directories required - # for all of our dependencies. - # It requires using COMMAND_EXPAND_LISTS in the add_custom_command() call - # below. COMMAND_EXPAND_LISTS is only available in CMake 3.8+ - # If we really had to support older versions of CMake we would probably need - # to use a wrapper script around the thrift compiler that could take the - # include list as a single argument and split it up before invoking the - # thrift compiler. - if (NOT POLICY CMP0067) - message(FATAL_ERROR "add_fbthrift_py_library() requires CMake 3.8+") - endif() - set( - thrift_include_options - "-I;$,;-I;>" - ) - - # Always force generation of "new-style" python classes for Python 2 - list(APPEND ARG_OPTIONS "new_style") - # CMake 3.12 is finally getting a list(JOIN) function, but until then - # treating the list as a string and replacing the semicolons is good enough. - string(REPLACE ";" "," GEN_ARG_STR "${ARG_OPTIONS}") - - # Emit the rule to run the thrift compiler - add_custom_command( - OUTPUT - ${generated_sources} - COMMAND_EXPAND_LISTS - COMMAND - "${CMAKE_COMMAND}" -E make_directory "${output_dir}" - COMMAND - "${FBTHRIFT_COMPILER}" - --legacy-strict - --gen "py:${GEN_ARG_STR}" - "${thrift_include_options}" - -o "${output_dir}" - "${CMAKE_CURRENT_SOURCE_DIR}/${THRIFT_FILE}" - WORKING_DIRECTORY - "${CMAKE_BINARY_DIR}" - MAIN_DEPENDENCY - "${THRIFT_FILE}" - DEPENDS - "${FBTHRIFT_COMPILER}" - ) - - # We always want to pass the namespace as "" to this call: - # thrift will already emit the files with the desired namespace prefix under - # gen-py. We don't want add_fb_python_library() to prepend the namespace a - # second time. - add_fb_python_library( - "${LIB_NAME}" - BASE_DIR "${output_dir}/gen-py" - NAMESPACE "" - SOURCES ${generated_sources} - DEPENDS ${ARG_DEPENDS} FBThrift::thrift_py - ) -endfunction() diff --git a/build/fbcode_builder/CMake/FindDoubleConversion.cmake b/build/fbcode_builder/CMake/FindDoubleConversion.cmake deleted file mode 100644 index 12a423bc15103..0000000000000 --- a/build/fbcode_builder/CMake/FindDoubleConversion.cmake +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# Finds libdouble-conversion. -# -# This module defines: -# DOUBLE_CONVERSION_INCLUDE_DIR -# DOUBLE_CONVERSION_LIBRARY -# - -find_path(DOUBLE_CONVERSION_INCLUDE_DIR double-conversion/double-conversion.h) -find_library(DOUBLE_CONVERSION_LIBRARY NAMES double-conversion) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args( - DoubleConversion - DEFAULT_MSG - DOUBLE_CONVERSION_LIBRARY DOUBLE_CONVERSION_INCLUDE_DIR) - -mark_as_advanced(DOUBLE_CONVERSION_INCLUDE_DIR DOUBLE_CONVERSION_LIBRARY) diff --git a/build/fbcode_builder/CMake/FindGMock.cmake b/build/fbcode_builder/CMake/FindGMock.cmake deleted file mode 100644 index cd042dd9c4fa6..0000000000000 --- a/build/fbcode_builder/CMake/FindGMock.cmake +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# Find libgmock -# -# LIBGMOCK_DEFINES - List of defines when using libgmock. -# LIBGMOCK_INCLUDE_DIR - where to find gmock/gmock.h, etc. -# LIBGMOCK_LIBRARIES - List of libraries when using libgmock. 
-# LIBGMOCK_FOUND - True if libgmock found. - -IF (LIBGMOCK_INCLUDE_DIR) - # Already in cache, be silent - SET(LIBGMOCK_FIND_QUIETLY TRUE) -ENDIF () - -find_package(GTest CONFIG QUIET) -if (TARGET GTest::gmock) - get_target_property(LIBGMOCK_DEFINES GTest::gtest INTERFACE_COMPILE_DEFINITIONS) - if (NOT ${LIBGMOCK_DEFINES}) - # Explicitly set to empty string if not found to avoid it being - # set to NOTFOUND and breaking compilation - set(LIBGMOCK_DEFINES "") - endif() - get_target_property(LIBGMOCK_INCLUDE_DIR GTest::gtest INTERFACE_INCLUDE_DIRECTORIES) - set(LIBGMOCK_LIBRARIES GTest::gmock_main GTest::gmock GTest::gtest) - set(LIBGMOCK_FOUND ON) - message(STATUS "Found gmock via config, defines=${LIBGMOCK_DEFINES}, include=${LIBGMOCK_INCLUDE_DIR}, libs=${LIBGMOCK_LIBRARIES}") -else() - - FIND_PATH(LIBGMOCK_INCLUDE_DIR gmock/gmock.h) - - FIND_LIBRARY(LIBGMOCK_MAIN_LIBRARY_DEBUG NAMES gmock_maind) - FIND_LIBRARY(LIBGMOCK_MAIN_LIBRARY_RELEASE NAMES gmock_main) - FIND_LIBRARY(LIBGMOCK_LIBRARY_DEBUG NAMES gmockd) - FIND_LIBRARY(LIBGMOCK_LIBRARY_RELEASE NAMES gmock) - FIND_LIBRARY(LIBGTEST_LIBRARY_DEBUG NAMES gtestd) - FIND_LIBRARY(LIBGTEST_LIBRARY_RELEASE NAMES gtest) - - find_package(Threads REQUIRED) - INCLUDE(SelectLibraryConfigurations) - SELECT_LIBRARY_CONFIGURATIONS(LIBGMOCK_MAIN) - SELECT_LIBRARY_CONFIGURATIONS(LIBGMOCK) - SELECT_LIBRARY_CONFIGURATIONS(LIBGTEST) - - set(LIBGMOCK_LIBRARIES - ${LIBGMOCK_MAIN_LIBRARY} - ${LIBGMOCK_LIBRARY} - ${LIBGTEST_LIBRARY} - Threads::Threads - ) - - if(CMAKE_SYSTEM_NAME STREQUAL "Windows") - # The GTEST_LINKED_AS_SHARED_LIBRARY macro must be set properly on Windows. - # - # There isn't currently an easy way to determine if a library was compiled as - # a shared library on Windows, so just assume we've been built against a - # shared build of gmock for now. - SET(LIBGMOCK_DEFINES "GTEST_LINKED_AS_SHARED_LIBRARY=1" CACHE STRING "") - endif() - - # handle the QUIETLY and REQUIRED arguments and set LIBGMOCK_FOUND to TRUE if - # all listed variables are TRUE - INCLUDE(FindPackageHandleStandardArgs) - FIND_PACKAGE_HANDLE_STANDARD_ARGS( - GMock - DEFAULT_MSG - LIBGMOCK_MAIN_LIBRARY - LIBGMOCK_LIBRARY - LIBGTEST_LIBRARY - LIBGMOCK_LIBRARIES - LIBGMOCK_INCLUDE_DIR - ) - - MARK_AS_ADVANCED( - LIBGMOCK_DEFINES - LIBGMOCK_MAIN_LIBRARY - LIBGMOCK_LIBRARY - LIBGTEST_LIBRARY - LIBGMOCK_LIBRARIES - LIBGMOCK_INCLUDE_DIR - ) -endif() diff --git a/build/fbcode_builder/CMake/FindGflags.cmake b/build/fbcode_builder/CMake/FindGflags.cmake deleted file mode 100644 index 0101203e03635..0000000000000 --- a/build/fbcode_builder/CMake/FindGflags.cmake +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# Find libgflags. -# There's a lot of compatibility cruft going on in here, both -# to deal with changes across the FB consumers of this and also -# to deal with variances in behavior of cmake itself. -# -# Since this file is named FindGflags.cmake the cmake convention -# is for the module to export both GFLAGS_FOUND and Gflags_FOUND. -# The convention expected by consumers is that we export the -# following variables, even though these do not match the cmake -# conventions: -# -# LIBGFLAGS_INCLUDE_DIR - where to find gflags/gflags.h, etc. -# LIBGFLAGS_LIBRARY - List of libraries when using libgflags. -# LIBGFLAGS_FOUND - True if libgflags found. 
-# -# We need to be able to locate gflags both from an installed -# cmake config file and just from the raw headers and libs, so -# test for the former and then the latter, and then stick -# the results together and export them into the variables -# listed above. -# -# For forwards compatibility, we export the following variables: -# -# gflags_INCLUDE_DIR - where to find gflags/gflags.h, etc. -# gflags_TARGET / GFLAGS_TARGET / gflags_LIBRARIES -# - List of libraries when using libgflags. -# gflags_FOUND - True if libgflags found. -# - -IF (LIBGFLAGS_INCLUDE_DIR) - # Already in cache, be silent - SET(Gflags_FIND_QUIETLY TRUE) -ENDIF () - -find_package(gflags CONFIG QUIET) -if (gflags_FOUND) - if (NOT Gflags_FIND_QUIETLY) - message(STATUS "Found gflags from package config ${gflags_CONFIG}") - endif() - # Re-export the config-specified libs with our local names - set(LIBGFLAGS_LIBRARY ${gflags_LIBRARIES}) - set(LIBGFLAGS_INCLUDE_DIR ${gflags_INCLUDE_DIR}) - if(NOT EXISTS "${gflags_INCLUDE_DIR}") - # The gflags-devel RPM on recent RedHat-based systems is somewhat broken. - # RedHat symlinks /lib64 to /usr/lib64, and this breaks some of the - # relative path computation performed in gflags-config.cmake. The package - # config file ends up being found via /lib64, but the relative path - # computation it does only works if it was found in /usr/lib64. - # If gflags_INCLUDE_DIR does not actually exist, simply default it to - # /usr/include on these systems. - set(LIBGFLAGS_INCLUDE_DIR "/usr/include") - set(GFLAGS_INCLUDE_DIR "/usr/include") - endif() - set(LIBGFLAGS_FOUND ${gflags_FOUND}) - # cmake module compat - set(GFLAGS_FOUND ${gflags_FOUND}) - set(Gflags_FOUND ${gflags_FOUND}) -else() - FIND_PATH(LIBGFLAGS_INCLUDE_DIR gflags/gflags.h) - - FIND_LIBRARY(LIBGFLAGS_LIBRARY_DEBUG NAMES gflagsd gflags_staticd) - FIND_LIBRARY(LIBGFLAGS_LIBRARY_RELEASE NAMES gflags gflags_static) - - INCLUDE(SelectLibraryConfigurations) - SELECT_LIBRARY_CONFIGURATIONS(LIBGFLAGS) - - # handle the QUIETLY and REQUIRED arguments and set LIBGFLAGS_FOUND to TRUE if - # all listed variables are TRUE - INCLUDE(FindPackageHandleStandardArgs) - FIND_PACKAGE_HANDLE_STANDARD_ARGS(gflags DEFAULT_MSG LIBGFLAGS_LIBRARY LIBGFLAGS_INCLUDE_DIR) - # cmake module compat - set(Gflags_FOUND ${GFLAGS_FOUND}) - # compat with some existing FindGflags consumers - set(LIBGFLAGS_FOUND ${GFLAGS_FOUND}) - - # Compat with the gflags CONFIG based detection - set(gflags_FOUND ${GFLAGS_FOUND}) - set(gflags_INCLUDE_DIR ${LIBGFLAGS_INCLUDE_DIR}) - set(gflags_LIBRARIES ${LIBGFLAGS_LIBRARY}) - set(GFLAGS_TARGET ${LIBGFLAGS_LIBRARY}) - set(gflags_TARGET ${LIBGFLAGS_LIBRARY}) - - MARK_AS_ADVANCED(LIBGFLAGS_LIBRARY LIBGFLAGS_INCLUDE_DIR) -endif() - -# Compat with the gflags CONFIG based detection -if (LIBGFLAGS_FOUND AND NOT TARGET gflags) - add_library(gflags UNKNOWN IMPORTED) - if(TARGET gflags-shared) - # If the installed gflags CMake package config defines a gflags-shared - # target but not gflags, just make the gflags target that we define - # depend on the gflags-shared target. - target_link_libraries(gflags INTERFACE gflags-shared) - # Export LIBGFLAGS_LIBRARY as the gflags-shared target in this case. 
- set(LIBGFLAGS_LIBRARY gflags-shared) - else() - set_target_properties( - gflags - PROPERTIES - IMPORTED_LINK_INTERFACE_LANGUAGES "C" - IMPORTED_LOCATION "${LIBGFLAGS_LIBRARY}" - INTERFACE_INCLUDE_DIRECTORIES "${LIBGFLAGS_INCLUDE_DIR}" - ) - endif() -endif() diff --git a/build/fbcode_builder/CMake/FindGlog.cmake b/build/fbcode_builder/CMake/FindGlog.cmake deleted file mode 100644 index 38ee54a7cf899..0000000000000 --- a/build/fbcode_builder/CMake/FindGlog.cmake +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# - Try to find Glog -# Once done, this will define -# -# GLOG_FOUND - system has Glog -# GLOG_INCLUDE_DIRS - the Glog include directories -# GLOG_LIBRARIES - link these to use Glog - -include(FindPackageHandleStandardArgs) -include(SelectLibraryConfigurations) - -find_library(GLOG_LIBRARY_RELEASE glog - PATHS ${GLOG_LIBRARYDIR}) -find_library(GLOG_LIBRARY_DEBUG glogd - PATHS ${GLOG_LIBRARYDIR}) - -find_path(GLOG_INCLUDE_DIR glog/logging.h - PATHS ${GLOG_INCLUDEDIR}) - -select_library_configurations(GLOG) - -find_package_handle_standard_args(Glog DEFAULT_MSG - GLOG_LIBRARY - GLOG_INCLUDE_DIR) - -mark_as_advanced( - GLOG_LIBRARY - GLOG_INCLUDE_DIR) - -set(GLOG_LIBRARIES ${GLOG_LIBRARY}) -set(GLOG_INCLUDE_DIRS ${GLOG_INCLUDE_DIR}) - -if (NOT TARGET glog::glog) - add_library(glog::glog UNKNOWN IMPORTED) - set_target_properties(glog::glog PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${GLOG_INCLUDE_DIRS}") - set_target_properties(glog::glog PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES "C" IMPORTED_LOCATION "${GLOG_LIBRARIES}") - - find_package(Gflags) - if(GFLAGS_FOUND) - message(STATUS "Found gflags as a dependency of glog::glog, include=${LIBGFLAGS_INCLUDE_DIR}, libs=${LIBGFLAGS_LIBRARY}") - set_target_properties(glog::glog PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES ${LIBGFLAGS_LIBRARY}) - endif() -endif() diff --git a/build/fbcode_builder/CMake/FindLibEvent.cmake b/build/fbcode_builder/CMake/FindLibEvent.cmake deleted file mode 100644 index dd11ebd8435d7..0000000000000 --- a/build/fbcode_builder/CMake/FindLibEvent.cmake +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# - Find LibEvent (a cross event library) -# This module defines -# LIBEVENT_INCLUDE_DIR, where to find LibEvent headers -# LIBEVENT_LIB, LibEvent libraries -# LibEvent_FOUND, If false, do not try to use libevent - -set(LibEvent_EXTRA_PREFIXES /usr/local /opt/local "$ENV{HOME}") -foreach(prefix ${LibEvent_EXTRA_PREFIXES}) - list(APPEND LibEvent_INCLUDE_PATHS "${prefix}/include") - list(APPEND LibEvent_LIB_PATHS "${prefix}/lib") -endforeach() - -find_package(Libevent CONFIG QUIET) -if (TARGET event) - # Re-export the config under our own names - - # Somewhat gross, but some vcpkg installed libevents have a relative - # `include` path exported into LIBEVENT_INCLUDE_DIRS, which triggers - # a cmake error because it resolves to the `include` dir within the - # folly repo, which is not something cmake allows to be in the - # INTERFACE_INCLUDE_DIRECTORIES. Thankfully on such a system the - # actual include directory is already part of the global include - # directories, so we can just skip it. - if (NOT "${LIBEVENT_INCLUDE_DIRS}" STREQUAL "include") - set(LIBEVENT_INCLUDE_DIR ${LIBEVENT_INCLUDE_DIRS}) - else() - set(LIBEVENT_INCLUDE_DIR) - endif() - - # Unfortunately, with a bare target name `event`, downstream consumers - # of the package that depends on `Libevent` located via CONFIG end - # up exporting just a bare `event` in their libraries. 
This is problematic
- # because this is interpreted as just `-levent` with no library path.
- # When libevent is not installed in the default installation prefix
- # this results in linker errors.
- # To resolve this, we ask cmake to look up the full path to the library
- # and use that instead.
- cmake_policy(PUSH)
- if(POLICY CMP0026)
- # Allow reading the LOCATION property
- cmake_policy(SET CMP0026 OLD)
- endif()
- get_target_property(LIBEVENT_LIB event LOCATION)
- cmake_policy(POP)
-
- set(LibEvent_FOUND ${Libevent_FOUND})
- if (NOT LibEvent_FIND_QUIETLY)
- message(STATUS "Found libevent from package config include=${LIBEVENT_INCLUDE_DIRS} lib=${LIBEVENT_LIB}")
- endif()
-else()
- find_path(LIBEVENT_INCLUDE_DIR event.h PATHS ${LibEvent_INCLUDE_PATHS})
- find_library(LIBEVENT_LIB NAMES event PATHS ${LibEvent_LIB_PATHS})
-
- if (LIBEVENT_LIB AND LIBEVENT_INCLUDE_DIR)
- set(LibEvent_FOUND TRUE)
- set(LIBEVENT_LIB ${LIBEVENT_LIB})
- else ()
- set(LibEvent_FOUND FALSE)
- endif ()
-
- if (LibEvent_FOUND)
- if (NOT LibEvent_FIND_QUIETLY)
- message(STATUS "Found libevent: ${LIBEVENT_LIB}")
- endif ()
- else ()
- if (LibEvent_FIND_REQUIRED)
- message(FATAL_ERROR "Could NOT find libevent.")
- endif ()
- message(STATUS "libevent NOT found.")
- endif ()
-
- mark_as_advanced(
- LIBEVENT_LIB
- LIBEVENT_INCLUDE_DIR
- )
-endif()
diff --git a/build/fbcode_builder/CMake/FindLibUnwind.cmake b/build/fbcode_builder/CMake/FindLibUnwind.cmake
deleted file mode 100644
index b01a674a5ba0f..0000000000000
--- a/build/fbcode_builder/CMake/FindLibUnwind.cmake
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-find_path(LIBUNWIND_INCLUDE_DIR NAMES libunwind.h)
-mark_as_advanced(LIBUNWIND_INCLUDE_DIR)
-
-find_library(LIBUNWIND_LIBRARY NAMES unwind)
-mark_as_advanced(LIBUNWIND_LIBRARY)
-
-include(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(
- LIBUNWIND
- REQUIRED_VARS LIBUNWIND_LIBRARY LIBUNWIND_INCLUDE_DIR)
-
-if(LIBUNWIND_FOUND)
- set(LIBUNWIND_LIBRARIES ${LIBUNWIND_LIBRARY})
- set(LIBUNWIND_INCLUDE_DIRS ${LIBUNWIND_INCLUDE_DIR})
-endif()
diff --git a/build/fbcode_builder/CMake/FindPCRE.cmake b/build/fbcode_builder/CMake/FindPCRE.cmake
deleted file mode 100644
index 32ccb372536f9..0000000000000
--- a/build/fbcode_builder/CMake/FindPCRE.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-include(FindPackageHandleStandardArgs)
-find_path(PCRE_INCLUDE_DIR NAMES pcre.h)
-find_library(PCRE_LIBRARY NAMES pcre)
-find_package_handle_standard_args(
- PCRE
- DEFAULT_MSG
- PCRE_LIBRARY
- PCRE_INCLUDE_DIR
-)
-mark_as_advanced(PCRE_INCLUDE_DIR PCRE_LIBRARY)
diff --git a/build/fbcode_builder/CMake/FindPCRE2.cmake b/build/fbcode_builder/CMake/FindPCRE2.cmake
deleted file mode 100644
index c2c64a29bb4ee..0000000000000
--- a/build/fbcode_builder/CMake/FindPCRE2.cmake
+++ /dev/null
@@ -1,12 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
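-#
-# Finds libpcre2-8 and defines PCRE2_INCLUDE_DIR, PCRE2_LIBRARY, and
-# PCRE2_DEFINES. A hedged consumption sketch (my_target is hypothetical);
-# PCRE2_DEFINES selects the 8-bit code-unit width that must be defined
-# before pcre2.h is included:
-#
-#   target_include_directories(my_target PRIVATE ${PCRE2_INCLUDE_DIR})
-#   target_compile_definitions(my_target PRIVATE ${PCRE2_DEFINES})
-#   target_link_libraries(my_target PRIVATE ${PCRE2_LIBRARY})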
-include(FindPackageHandleStandardArgs)
-find_path(PCRE2_INCLUDE_DIR NAMES pcre2.h)
-find_library(PCRE2_LIBRARY NAMES pcre2-8)
-find_package_handle_standard_args(
- PCRE2
- DEFAULT_MSG
- PCRE2_LIBRARY
- PCRE2_INCLUDE_DIR
-)
-set(PCRE2_DEFINES "PCRE2_CODE_UNIT_WIDTH=8")
-mark_as_advanced(PCRE2_INCLUDE_DIR PCRE2_LIBRARY PCRE2_DEFINES)
diff --git a/build/fbcode_builder/CMake/FindRe2.cmake b/build/fbcode_builder/CMake/FindRe2.cmake
deleted file mode 100644
index 013ae7761e9c7..0000000000000
--- a/build/fbcode_builder/CMake/FindRe2.cmake
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This software may be used and distributed according to the terms of the
-# GNU General Public License version 2.
-
-find_library(RE2_LIBRARY re2)
-mark_as_advanced(RE2_LIBRARY)
-
-find_path(RE2_INCLUDE_DIR NAMES re2/re2.h)
-mark_as_advanced(RE2_INCLUDE_DIR)
-
-include(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(
- RE2
- REQUIRED_VARS RE2_LIBRARY RE2_INCLUDE_DIR)
-
-if(RE2_FOUND)
- set(RE2_LIBRARY ${RE2_LIBRARY})
- set(RE2_INCLUDE_DIR ${RE2_INCLUDE_DIR})
-endif()
diff --git a/build/fbcode_builder/CMake/FindSodium.cmake b/build/fbcode_builder/CMake/FindSodium.cmake
deleted file mode 100644
index 3c3f1245c1dcc..0000000000000
--- a/build/fbcode_builder/CMake/FindSodium.cmake
+++ /dev/null
@@ -1,297 +0,0 @@
-# Written in 2016 by Henrik Steffen Gaßmann
-#
-# To the extent possible under law, the author(s) have dedicated all
-# copyright and related and neighboring rights to this software to the
-# public domain worldwide. This software is distributed without any warranty.
-#
-# You should have received a copy of the CC0 Public Domain Dedication
-# along with this software. If not, see
-#
-# http://creativecommons.org/publicdomain/zero/1.0/
-#
-########################################################################
-# Tries to find the local libsodium installation.
-#
-# On Windows the sodium_DIR environment variable is used as a default
-# hint which can be overridden by setting the corresponding cmake variable.
-#
-# Once done the following variables will be defined:
-#
-# sodium_FOUND
-# sodium_INCLUDE_DIR
-# sodium_LIBRARY_DEBUG
-# sodium_LIBRARY_RELEASE
-#
-#
-# Furthermore an imported "sodium" target is created.
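-#
-# A hedged consumption sketch (my_target is hypothetical): set
-# sodium_USE_STATIC_LIBS before the find_package() call if static linking
-# is wanted, then link the imported target:
-#
-#   set(sodium_USE_STATIC_LIBS ON)
-#   find_package(Sodium REQUIRED)
-#   target_link_libraries(my_target PRIVATE sodium)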
-# - -if (CMAKE_C_COMPILER_ID STREQUAL "GNU" - OR CMAKE_C_COMPILER_ID STREQUAL "Clang") - set(_GCC_COMPATIBLE 1) -endif() - -# static library option -if (NOT DEFINED sodium_USE_STATIC_LIBS) - option(sodium_USE_STATIC_LIBS "enable to statically link against sodium" OFF) -endif() -if(NOT (sodium_USE_STATIC_LIBS EQUAL sodium_USE_STATIC_LIBS_LAST)) - unset(sodium_LIBRARY CACHE) - unset(sodium_LIBRARY_DEBUG CACHE) - unset(sodium_LIBRARY_RELEASE CACHE) - unset(sodium_DLL_DEBUG CACHE) - unset(sodium_DLL_RELEASE CACHE) - set(sodium_USE_STATIC_LIBS_LAST ${sodium_USE_STATIC_LIBS} CACHE INTERNAL "internal change tracking variable") -endif() - - -######################################################################## -# UNIX -if (UNIX) - # import pkg-config - find_package(PkgConfig QUIET) - if (PKG_CONFIG_FOUND) - pkg_check_modules(sodium_PKG QUIET libsodium) - endif() - - if(sodium_USE_STATIC_LIBS) - foreach(_libname ${sodium_PKG_STATIC_LIBRARIES}) - if (NOT _libname MATCHES "^lib.*\\.a$") # ignore strings already ending with .a - list(INSERT sodium_PKG_STATIC_LIBRARIES 0 "lib${_libname}.a") - endif() - endforeach() - list(REMOVE_DUPLICATES sodium_PKG_STATIC_LIBRARIES) - - # if pkgconfig for libsodium doesn't provide - # static lib info, then override PKG_STATIC here.. - if (NOT sodium_PKG_STATIC_FOUND) - set(sodium_PKG_STATIC_LIBRARIES libsodium.a) - endif() - - set(XPREFIX sodium_PKG_STATIC) - else() - if (NOT sodium_PKG_FOUND) - set(sodium_PKG_LIBRARIES sodium) - endif() - - set(XPREFIX sodium_PKG) - endif() - - find_path(sodium_INCLUDE_DIR sodium.h - HINTS ${${XPREFIX}_INCLUDE_DIRS} - ) - find_library(sodium_LIBRARY_DEBUG NAMES ${${XPREFIX}_LIBRARIES} - HINTS ${${XPREFIX}_LIBRARY_DIRS} - ) - find_library(sodium_LIBRARY_RELEASE NAMES ${${XPREFIX}_LIBRARIES} - HINTS ${${XPREFIX}_LIBRARY_DIRS} - ) - - -######################################################################## -# Windows -elseif (WIN32) - set(sodium_DIR "$ENV{sodium_DIR}" CACHE FILEPATH "sodium install directory") - mark_as_advanced(sodium_DIR) - - find_path(sodium_INCLUDE_DIR sodium.h - HINTS ${sodium_DIR} - PATH_SUFFIXES include - ) - - if (MSVC) - # detect target architecture - file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/arch.cpp" [=[ - #if defined _M_IX86 - #error ARCH_VALUE x86_32 - #elif defined _M_X64 - #error ARCH_VALUE x86_64 - #endif - #error ARCH_VALUE unknown - ]=]) - try_compile(_UNUSED_VAR "${CMAKE_CURRENT_BINARY_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/arch.cpp" - OUTPUT_VARIABLE _COMPILATION_LOG - ) - string(REGEX REPLACE ".*ARCH_VALUE ([a-zA-Z0-9_]+).*" "\\1" _TARGET_ARCH "${_COMPILATION_LOG}") - - # construct library path - if (_TARGET_ARCH STREQUAL "x86_32") - string(APPEND _PLATFORM_PATH "Win32") - elseif(_TARGET_ARCH STREQUAL "x86_64") - string(APPEND _PLATFORM_PATH "x64") - else() - message(FATAL_ERROR "the ${_TARGET_ARCH} architecture is not supported by Findsodium.cmake.") - endif() - string(APPEND _PLATFORM_PATH "/$$CONFIG$$") - - if (MSVC_VERSION LESS 1900) - math(EXPR _VS_VERSION "${MSVC_VERSION} / 10 - 60") - else() - math(EXPR _VS_VERSION "${MSVC_VERSION} / 10 - 50") - endif() - string(APPEND _PLATFORM_PATH "/v${_VS_VERSION}") - - if (sodium_USE_STATIC_LIBS) - string(APPEND _PLATFORM_PATH "/static") - else() - string(APPEND _PLATFORM_PATH "/dynamic") - endif() - - string(REPLACE "$$CONFIG$$" "Debug" _DEBUG_PATH_SUFFIX "${_PLATFORM_PATH}") - string(REPLACE "$$CONFIG$$" "Release" _RELEASE_PATH_SUFFIX "${_PLATFORM_PATH}") - - find_library(sodium_LIBRARY_DEBUG libsodium.lib - HINTS ${sodium_DIR} - PATH_SUFFIXES 
${_DEBUG_PATH_SUFFIX}
- )
- find_library(sodium_LIBRARY_RELEASE libsodium.lib
- HINTS ${sodium_DIR}
- PATH_SUFFIXES ${_RELEASE_PATH_SUFFIX}
- )
- if (NOT sodium_USE_STATIC_LIBS)
- set(CMAKE_FIND_LIBRARY_SUFFIXES_BCK ${CMAKE_FIND_LIBRARY_SUFFIXES})
- set(CMAKE_FIND_LIBRARY_SUFFIXES ".dll")
- find_library(sodium_DLL_DEBUG libsodium
- HINTS ${sodium_DIR}
- PATH_SUFFIXES ${_DEBUG_PATH_SUFFIX}
- )
- find_library(sodium_DLL_RELEASE libsodium
- HINTS ${sodium_DIR}
- PATH_SUFFIXES ${_RELEASE_PATH_SUFFIX}
- )
- set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES_BCK})
- endif()
-
- elseif(_GCC_COMPATIBLE)
- if (sodium_USE_STATIC_LIBS)
- find_library(sodium_LIBRARY_DEBUG libsodium.a
- HINTS ${sodium_DIR}
- PATH_SUFFIXES lib
- )
- find_library(sodium_LIBRARY_RELEASE libsodium.a
- HINTS ${sodium_DIR}
- PATH_SUFFIXES lib
- )
- else()
- find_library(sodium_LIBRARY_DEBUG libsodium.dll.a
- HINTS ${sodium_DIR}
- PATH_SUFFIXES lib
- )
- find_library(sodium_LIBRARY_RELEASE libsodium.dll.a
- HINTS ${sodium_DIR}
- PATH_SUFFIXES lib
- )
-
- file(GLOB _DLL
- LIST_DIRECTORIES false
- RELATIVE "${sodium_DIR}/bin"
- "${sodium_DIR}/bin/libsodium*.dll"
- )
- find_library(sodium_DLL_DEBUG ${_DLL} libsodium
- HINTS ${sodium_DIR}
- PATH_SUFFIXES bin
- )
- find_library(sodium_DLL_RELEASE ${_DLL} libsodium
- HINTS ${sodium_DIR}
- PATH_SUFFIXES bin
- )
- endif()
- else()
- message(FATAL_ERROR "this platform is not supported by FindSodium.cmake")
- endif()
-
-
-########################################################################
-# unsupported
-else()
- message(FATAL_ERROR "this platform is not supported by FindSodium.cmake")
-endif()
-
-
-########################################################################
-# common stuff
-
-# extract sodium version
-if (sodium_INCLUDE_DIR)
- set(_VERSION_HEADER "${sodium_INCLUDE_DIR}/sodium/version.h")
- if (EXISTS "${_VERSION_HEADER}")
- file(READ "${_VERSION_HEADER}" _VERSION_HEADER_CONTENT)
- string(REGEX REPLACE ".*#[ \t]*define[ \t]*SODIUM_VERSION_STRING[ \t]*\"([^\n]*)\".*" "\\1"
- sodium_VERSION "${_VERSION_HEADER_CONTENT}")
- set(sodium_VERSION "${sodium_VERSION}" PARENT_SCOPE)
- endif()
-endif()
-
-# communicate results
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(
- Sodium # The name must be either uppercase or match the filename case.
- REQUIRED_VARS - sodium_LIBRARY_RELEASE - sodium_LIBRARY_DEBUG - sodium_INCLUDE_DIR - VERSION_VAR - sodium_VERSION -) - -if(Sodium_FOUND) - set(sodium_LIBRARIES - optimized ${sodium_LIBRARY_RELEASE} debug ${sodium_LIBRARY_DEBUG}) -endif() - -# mark file paths as advanced -mark_as_advanced(sodium_INCLUDE_DIR) -mark_as_advanced(sodium_LIBRARY_DEBUG) -mark_as_advanced(sodium_LIBRARY_RELEASE) -if (WIN32) - mark_as_advanced(sodium_DLL_DEBUG) - mark_as_advanced(sodium_DLL_RELEASE) -endif() - -# create imported target -if(sodium_USE_STATIC_LIBS) - set(_LIB_TYPE STATIC) -else() - set(_LIB_TYPE SHARED) -endif() - -if(NOT TARGET sodium) - add_library(sodium ${_LIB_TYPE} IMPORTED) -endif() - -set_target_properties(sodium PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${sodium_INCLUDE_DIR}" - IMPORTED_LINK_INTERFACE_LANGUAGES "C" -) - -if (sodium_USE_STATIC_LIBS) - set_target_properties(sodium PROPERTIES - INTERFACE_COMPILE_DEFINITIONS "SODIUM_STATIC" - IMPORTED_LOCATION "${sodium_LIBRARY_RELEASE}" - IMPORTED_LOCATION_DEBUG "${sodium_LIBRARY_DEBUG}" - ) -else() - if (UNIX) - set_target_properties(sodium PROPERTIES - IMPORTED_LOCATION "${sodium_LIBRARY_RELEASE}" - IMPORTED_LOCATION_DEBUG "${sodium_LIBRARY_DEBUG}" - ) - elseif (WIN32) - set_target_properties(sodium PROPERTIES - IMPORTED_IMPLIB "${sodium_LIBRARY_RELEASE}" - IMPORTED_IMPLIB_DEBUG "${sodium_LIBRARY_DEBUG}" - ) - if (NOT (sodium_DLL_DEBUG MATCHES ".*-NOTFOUND")) - set_target_properties(sodium PROPERTIES - IMPORTED_LOCATION_DEBUG "${sodium_DLL_DEBUG}" - ) - endif() - if (NOT (sodium_DLL_RELEASE MATCHES ".*-NOTFOUND")) - set_target_properties(sodium PROPERTIES - IMPORTED_LOCATION_RELWITHDEBINFO "${sodium_DLL_RELEASE}" - IMPORTED_LOCATION_MINSIZEREL "${sodium_DLL_RELEASE}" - IMPORTED_LOCATION_RELEASE "${sodium_DLL_RELEASE}" - ) - endif() - endif() -endif() diff --git a/build/fbcode_builder/CMake/FindZstd.cmake b/build/fbcode_builder/CMake/FindZstd.cmake deleted file mode 100644 index 89300ddfd3987..0000000000000 --- a/build/fbcode_builder/CMake/FindZstd.cmake +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-#
-# - Try to find Facebook zstd library
-# This will define
-# ZSTD_FOUND
-# ZSTD_INCLUDE_DIR
-# ZSTD_LIBRARY
-#
-
-find_path(ZSTD_INCLUDE_DIR NAMES zstd.h)
-
-find_library(ZSTD_LIBRARY_DEBUG NAMES zstdd zstd_staticd)
-find_library(ZSTD_LIBRARY_RELEASE NAMES zstd zstd_static)
-
-include(SelectLibraryConfigurations)
-SELECT_LIBRARY_CONFIGURATIONS(ZSTD)
-
-include(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(
- ZSTD DEFAULT_MSG
- ZSTD_LIBRARY ZSTD_INCLUDE_DIR
-)
-
-if (ZSTD_FOUND)
- message(STATUS "Found Zstd: ${ZSTD_LIBRARY}")
-endif()
-
-mark_as_advanced(ZSTD_INCLUDE_DIR ZSTD_LIBRARY)
diff --git a/build/fbcode_builder/CMake/RustStaticLibrary.cmake b/build/fbcode_builder/CMake/RustStaticLibrary.cmake
deleted file mode 100644
index dd57b2b3dcaa5..0000000000000
--- a/build/fbcode_builder/CMake/RustStaticLibrary.cmake
+++ /dev/null
@@ -1,316 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-
-include(FBCMakeParseArgs)
-
-set(
- USE_CARGO_VENDOR AUTO CACHE STRING
- "Download Rust Crates from an internally vendored location"
-)
-set_property(CACHE USE_CARGO_VENDOR PROPERTY STRINGS AUTO ON OFF)
-
-set(
- GENERATE_CARGO_VENDOR_CONFIG AUTO CACHE STRING
- "Whether to generate Rust cargo vendor config or use existing"
-)
-set_property(CACHE GENERATE_CARGO_VENDOR_CONFIG PROPERTY STRINGS AUTO ON OFF)
-
-set(RUST_VENDORED_CRATES_DIR "$ENV{RUST_VENDORED_CRATES_DIR}")
-
-if("${USE_CARGO_VENDOR}" STREQUAL "AUTO")
- if(EXISTS "${RUST_VENDORED_CRATES_DIR}")
- set(USE_CARGO_VENDOR ON)
- else()
- set(USE_CARGO_VENDOR OFF)
- endif()
-endif()
-
-if("${GENERATE_CARGO_VENDOR_CONFIG}" STREQUAL "AUTO")
- set(GENERATE_CARGO_VENDOR_CONFIG "${USE_CARGO_VENDOR}")
-endif()
-
-if(GENERATE_CARGO_VENDOR_CONFIG)
- if(NOT EXISTS "${RUST_VENDORED_CRATES_DIR}")
- message(
- FATAL_ERROR "vendored rust crates not present: "
- "${RUST_VENDORED_CRATES_DIR}"
- )
- endif()
-
- set(RUST_CARGO_HOME "${CMAKE_BINARY_DIR}/_cargo_home")
- file(MAKE_DIRECTORY "${RUST_CARGO_HOME}")
-
- file(
- TO_NATIVE_PATH "${RUST_VENDORED_CRATES_DIR}"
- ESCAPED_RUST_VENDORED_CRATES_DIR
- )
- string(
- REPLACE "\\" "\\\\"
- ESCAPED_RUST_VENDORED_CRATES_DIR
- "${ESCAPED_RUST_VENDORED_CRATES_DIR}"
- )
- file(
- WRITE "${RUST_CARGO_HOME}/config"
- "[source.crates-io]\n"
- "replace-with = \"vendored-sources\"\n"
- "\n"
- "[source.vendored-sources]\n"
- "directory = \"${ESCAPED_RUST_VENDORED_CRATES_DIR}\"\n"
- )
-endif()
-
-find_program(CARGO_COMMAND cargo REQUIRED)
-
-# Cargo is a build system in itself, and thus will try to take advantage of all
-# the cores on the system. Unfortunately, this conflicts with Ninja, since it
-# also tries to utilize all the cores. This can lead to a system that is
-# completely overloaded with compile jobs to the point where nothing else can
-# be achieved on the system.
-#
-# Let's inform Ninja of this fact so it won't try to spawn other jobs while
-# Rust is being compiled.
-set_property(GLOBAL APPEND PROPERTY JOB_POOLS rust_job_pool=1)
-
-# This function creates an interface library target based on the static library
-# built by Cargo. It will call Cargo to build a staticlib and generate a CMake
-# interface library with it.
-#
-# This function requires `find_package(Python COMPONENTS Interpreter)`.
-#
-# You need to set `lib:crate-type = ["staticlib"]` in your Cargo.toml to make
-# Cargo build a static library.
-#
-# ```cmake
-# rust_static_library(<TARGET> [CRATE <CRATE_NAME>] [FEATURES <FEATURE_NAME>])
-# ```
-#
-# Parameters:
-# - TARGET:
-# Name of the target. This function will create an interface library
-# target with this name.
-# - CRATE_NAME:
-# Name of the crate. This parameter is optional. If unspecified, it will
-# fall back to `${TARGET}`.
-# - FEATURE_NAME:
-# Name of the Rust feature to enable.
-#
-# This function creates two targets:
-# - "${TARGET}": an interface library target that contains the static library
-# built from Cargo.
-# - "${TARGET}.cargo": an internal custom target that invokes Cargo.
-#
-# If you are going to use this static library from C/C++, you will need to
-# write header files for the library (or generate with cbindgen) and bind these
-# headers with the interface library.
-#
-function(rust_static_library TARGET)
- fb_cmake_parse_args(ARG "" "CRATE;FEATURES" "" "${ARGN}")
-
- if(DEFINED ARG_CRATE)
- set(crate_name "${ARG_CRATE}")
- else()
- set(crate_name "${TARGET}")
- endif()
- if(DEFINED ARG_FEATURES)
- set(features --features ${ARG_FEATURES})
- else()
- set(features )
- endif()
-
- set(cargo_target "${TARGET}.cargo")
- set(target_dir $<IF:$<CONFIG:Debug>,debug,release>)
- set(staticlib_name "${CMAKE_STATIC_LIBRARY_PREFIX}${crate_name}${CMAKE_STATIC_LIBRARY_SUFFIX}")
- set(rust_staticlib "${CMAKE_CURRENT_BINARY_DIR}/${target_dir}/${staticlib_name}")
-
- if(DEFINED ARG_FEATURES)
- set(cargo_flags build $<IF:$<CONFIG:Debug>,,--release> -p ${crate_name} --features ${ARG_FEATURES})
- else()
- set(cargo_flags build $<IF:$<CONFIG:Debug>,,--release> -p ${crate_name})
- endif()
- if(USE_CARGO_VENDOR)
- set(extra_cargo_env "CARGO_HOME=${RUST_CARGO_HOME}")
- set(cargo_flags ${cargo_flags})
- endif()
-
- add_custom_target(
- ${cargo_target}
- COMMAND
- "${CMAKE_COMMAND}" -E remove -f "${CMAKE_CURRENT_SOURCE_DIR}/Cargo.lock"
- COMMAND
- "${CMAKE_COMMAND}" -E env
- "CARGO_TARGET_DIR=${CMAKE_CURRENT_BINARY_DIR}"
- ${extra_cargo_env}
- ${CARGO_COMMAND}
- ${cargo_flags}
- COMMENT "Building Rust crate '${crate_name}'..."
- JOB_POOL rust_job_pool
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- BYPRODUCTS
- "${CMAKE_CURRENT_BINARY_DIR}/debug/${staticlib_name}"
- "${CMAKE_CURRENT_BINARY_DIR}/release/${staticlib_name}"
- )
-
- add_library(${TARGET} INTERFACE)
- add_dependencies(${TARGET} ${cargo_target})
- set_target_properties(
- ${TARGET}
- PROPERTIES
- INTERFACE_STATICLIB_OUTPUT_PATH "${rust_staticlib}"
- INTERFACE_INSTALL_LIBNAME
- "${CMAKE_STATIC_LIBRARY_PREFIX}${crate_name}_rs${CMAKE_STATIC_LIBRARY_SUFFIX}"
- )
- target_link_libraries(
- ${TARGET}
- INTERFACE "$<BUILD_INTERFACE:${rust_staticlib}>"
- )
-endfunction()
-
-# This function instructs CMake to define a target that will use `cargo build`
-# to build a bin crate referenced by the Cargo.toml file in the current source
-# directory.
-# It accepts a single `TARGET` parameter which will be passed as the package
-# name to `cargo build -p TARGET`. If the binary has a different name than the
-# package, use the optional flag BINARY_NAME to override it.
-# It also accepts a `FEATURES` parameter if you want to enable certain features
-# in your Rust binary.
-# The CMake target will be registered to build by default as part of the
-# ALL target.
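-#
-# A hedged usage sketch (crate and binary names are hypothetical): for a
-# Cargo.toml package "my_crate" whose [[bin]] is named "my_tool":
-#
-#   rust_executable(my_crate BINARY_NAME my_tool)
-#   install_rust_executable(my_crate DESTINATION bin)
-#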
-function(rust_executable TARGET)
- fb_cmake_parse_args(ARG "" "BINARY_NAME;FEATURES" "" "${ARGN}")
-
- set(crate_name "${TARGET}")
- set(cargo_target "${TARGET}.cargo")
- set(target_dir $<IF:$<CONFIG:Debug>,debug,release>)
-
- if(DEFINED ARG_BINARY_NAME)
- set(executable_name "${ARG_BINARY_NAME}${CMAKE_EXECUTABLE_SUFFIX}")
- else()
- set(executable_name "${crate_name}${CMAKE_EXECUTABLE_SUFFIX}")
- endif()
- if(DEFINED ARG_FEATURES)
- set(features --features ${ARG_FEATURES})
- else()
- set(features )
- endif()
-
- if(DEFINED ARG_FEATURES)
- set(cargo_flags build $<IF:$<CONFIG:Debug>,,--release> -p ${crate_name} --features ${ARG_FEATURES})
- else()
- set(cargo_flags build $<IF:$<CONFIG:Debug>,,--release> -p ${crate_name})
- endif()
- if(USE_CARGO_VENDOR)
- set(extra_cargo_env "CARGO_HOME=${RUST_CARGO_HOME}")
- set(cargo_flags ${cargo_flags})
- endif()
-
- add_custom_target(
- ${cargo_target}
- ALL
- COMMAND
- "${CMAKE_COMMAND}" -E remove -f "${CMAKE_CURRENT_SOURCE_DIR}/Cargo.lock"
- COMMAND
- "${CMAKE_COMMAND}" -E env
- "CARGO_TARGET_DIR=${CMAKE_CURRENT_BINARY_DIR}"
- ${extra_cargo_env}
- ${CARGO_COMMAND}
- ${cargo_flags}
- COMMENT "Building Rust executable '${crate_name}'..."
- JOB_POOL rust_job_pool
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- BYPRODUCTS
- "${CMAKE_CURRENT_BINARY_DIR}/debug/${executable_name}"
- "${CMAKE_CURRENT_BINARY_DIR}/release/${executable_name}"
- )
-
- set_property(TARGET "${cargo_target}"
- PROPERTY EXECUTABLE "${CMAKE_CURRENT_BINARY_DIR}/${target_dir}/${executable_name}")
-endfunction()
-
-# This function can be used to install the executable generated by a prior
-# call to the `rust_executable` function.
-# It requires a `TARGET` parameter to identify the target to be installed,
-# and an optional `DESTINATION` parameter to specify the installation
-# directory. If DESTINATION is not specified then the `bin` directory
-# will be assumed.
-function(install_rust_executable TARGET)
- # Parse the arguments
- set(one_value_args DESTINATION)
- set(multi_value_args)
- fb_cmake_parse_args(
- ARG "" "${one_value_args}" "${multi_value_args}" "${ARGN}"
- )
-
- if(NOT DEFINED ARG_DESTINATION)
- set(ARG_DESTINATION bin)
- endif()
-
- get_target_property(executable_path "${TARGET}.cargo" EXECUTABLE)
-
- install(
- PROGRAMS "${executable_path}"
- DESTINATION "${ARG_DESTINATION}"
- )
-endfunction()
-
-# This function installs the interface target generated from the function
-# `rust_static_library`. Use this function if you want to export your Rust
-# target to external CMake targets.
-#
-# ```cmake
-# install_rust_static_library(
-#   <TARGET>
-#   INSTALL_DIR <install_dir>
-#   [EXPORT <export_name>]
-# )
-# ```
-#
-# Parameters:
-# - TARGET: Name of the Rust static library target.
-# - EXPORT_NAME: Name of the exported target.
-# - INSTALL_DIR: Path to the directory where this library will be installed.
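-#
-# A hedged usage sketch (names are hypothetical), pairing with an earlier
-# rust_static_library(my_crate) call:
-#
-#   install_rust_static_library(
-#     my_crate
-#     EXPORT my_project-exports
-#     INSTALL_DIR lib
-#   )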
-
-function(install_rust_static_library TARGET)
- fb_cmake_parse_args(ARG "" "EXPORT;INSTALL_DIR" "" "${ARGN}")
-
- get_property(
- staticlib_output_path
- TARGET "${TARGET}"
- PROPERTY INTERFACE_STATICLIB_OUTPUT_PATH
- )
- get_property(
- staticlib_output_name
- TARGET "${TARGET}"
- PROPERTY INTERFACE_INSTALL_LIBNAME
- )
-
- if(NOT DEFINED staticlib_output_path)
- message(FATAL_ERROR "Not a rust_static_library target.")
- endif()
-
- if(NOT DEFINED ARG_INSTALL_DIR)
- message(FATAL_ERROR "Missing required INSTALL_DIR argument.")
- endif()
-
- if(DEFINED ARG_EXPORT)
- set(install_export_args EXPORT "${ARG_EXPORT}")
- endif()
-
- set(install_interface_dir "${ARG_INSTALL_DIR}")
- if(NOT IS_ABSOLUTE "${install_interface_dir}")
- set(install_interface_dir "\${_IMPORT_PREFIX}/${install_interface_dir}")
- endif()
-
- target_link_libraries(
- ${TARGET} INTERFACE
- "$<INSTALL_INTERFACE:${install_interface_dir}/${staticlib_output_name}>"
- )
- install(
- TARGETS ${TARGET}
- ${install_export_args}
- LIBRARY DESTINATION ${ARG_INSTALL_DIR}
- )
- install(
- FILES ${staticlib_output_path}
- RENAME ${staticlib_output_name}
- DESTINATION ${ARG_INSTALL_DIR}
- )
-endfunction()
diff --git a/build/fbcode_builder/CMake/fb_py_test_main.py b/build/fbcode_builder/CMake/fb_py_test_main.py
deleted file mode 100644
index 41626181b1ec8..0000000000000
--- a/build/fbcode_builder/CMake/fb_py_test_main.py
+++ /dev/null
@@ -1,819 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-"""
-This file contains the main module code for Python test programs.
-"""
-
-
-import contextlib
-import ctypes
-import fnmatch
-import json
-import logging
-import optparse
-import os
-import platform
-import re
-import sys
-import tempfile
-import time
-import traceback
-import unittest
-import warnings
-
-# Hide warning about importing "imp"; remove once python2 is gone.
-with warnings.catch_warnings():
- warnings.filterwarnings("ignore", category=DeprecationWarning)
- import imp
-
-try:
- from StringIO import StringIO
-except ImportError:
- from io import StringIO
-try:
- import coverage
-except ImportError:
- coverage = None # type: ignore
-try:
- from importlib.machinery import SourceFileLoader
-except ImportError:
- SourceFileLoader = None # type: ignore
-
-
-class get_cpu_instr_counter(object):
- def read(self):
- # TODO
- return 0
-
-
-EXIT_CODE_SUCCESS = 0
-EXIT_CODE_TEST_FAILURE = 70
-
-
-class TestStatus(object):
-
- ABORTED = "FAILURE"
- PASSED = "SUCCESS"
- FAILED = "FAILURE"
- EXPECTED_FAILURE = "SUCCESS"
- UNEXPECTED_SUCCESS = "FAILURE"
- SKIPPED = "ASSUMPTION_VIOLATION"
-
-
-class PathMatcher(object):
- def __init__(self, include_patterns, omit_patterns):
- self.include_patterns = include_patterns
- self.omit_patterns = omit_patterns
-
- def omit(self, path):
- """
- Omit iff matches any of the omit_patterns or the include patterns are
- not empty and none is matched
- """
- path = os.path.realpath(path)
- return any(fnmatch.fnmatch(path, p) for p in self.omit_patterns) or (
- self.include_patterns
- and not any(fnmatch.fnmatch(path, p) for p in self.include_patterns)
- )
-
- def include(self, path):
- return not self.omit(path)
-
-
-class DebugWipeFinder(object):
- """
- PEP 302 finder that uses a DebugWipeLoader for all files which do not need
- coverage
- """
-
- def __init__(self, matcher):
- self.matcher = matcher
-
- def find_module(self, fullname, path=None):
- _, _, basename = fullname.rpartition(".")
- try:
- fd, pypath, (_, _, kind) = imp.find_module(basename, path)
- except Exception:
- # Finding without hooks using the imp module failed. One reason
- # could be that there is a zip file on sys.path. The imp module
- # does not support loading from there. Leave finding this module to
- # the other finders in sys.meta_path.
- return None
-
- if hasattr(fd, "close"):
- fd.close()
- if kind != imp.PY_SOURCE:
- return None
- if self.matcher.include(pypath):
- return None
-
- """
- This is defined to match CPython's PyVarObject struct
- """
-
- class PyVarObject(ctypes.Structure):
- _fields_ = [
- ("ob_refcnt", ctypes.c_long),
- ("ob_type", ctypes.c_void_p),
- ("ob_size", ctypes.c_ulong),
- ]
-
- class DebugWipeLoader(SourceFileLoader):
- """
- PEP302 loader that zeros out debug information before execution
- """
-
- def get_code(self, fullname):
- code = super(DebugWipeLoader, self).get_code(fullname)
- if code:
- # Ideally we'd do
- # code.co_lnotab = b''
- # But code objects are READONLY. Not to worry though; we'll
- # directly modify CPython's object
- code_impl = PyVarObject.from_address(id(code.co_lnotab))
- code_impl.ob_size = 0
- return code
-
- return DebugWipeLoader(fullname, pypath)
-
-
-def optimize_for_coverage(cov, include_patterns, omit_patterns):
- """
- We get better performance if we zero out debug information for files which
- we're not interested in. Only available in CPython 3.3+
- """
- matcher = PathMatcher(include_patterns, omit_patterns)
- if SourceFileLoader and platform.python_implementation() == "CPython":
- sys.meta_path.insert(0, DebugWipeFinder(matcher))
-
-
-class TeeStream(object):
- def __init__(self, *streams):
- self._streams = streams
-
- def write(self, data):
- for stream in self._streams:
- stream.write(data)
-
- def flush(self):
- for stream in self._streams:
- stream.flush()
-
- def isatty(self):
- return False
-
-
-class CallbackStream(object):
- def __init__(self, callback, bytes_callback=None, orig=None):
- self._callback = callback
- self._fileno = orig.fileno() if orig else None
-
- # Python 3 APIs:
- # - `encoding` is a string holding the encoding name
- # - `errors` is a string holding the error-handling mode for encoding
- # - `buffer` should look like an io.BufferedIOBase object
-
- self.errors = orig.errors if orig else None
- if bytes_callback:
- # those members are only on the io.TextIOWrapper
- self.encoding = orig.encoding if orig else "UTF-8"
- self.buffer = CallbackStream(bytes_callback, orig=orig)
-
- def write(self, data):
- self._callback(data)
-
- def flush(self):
- pass
-
- def isatty(self):
- return False
-
- def fileno(self):
- return self._fileno
-
-
-class BuckTestResult(unittest._TextTestResult):
- """
- Our own TestResult class that outputs data in a format that can be easily
- parsed by buck's test runner.
- """
-
- _instr_counter = get_cpu_instr_counter()
-
- def __init__(
- self, stream, descriptions, verbosity, show_output, main_program, suite
- ):
- super(BuckTestResult, self).__init__(stream, descriptions, verbosity)
- self._main_program = main_program
- self._suite = suite
- self._results = []
- self._current_test = None
- self._saved_stdout = sys.stdout
- self._saved_stderr = sys.stderr
- self._show_output = show_output
-
- def getResults(self):
- return self._results
-
- def startTest(self, test):
- super(BuckTestResult, self).startTest(test)
-
- # Pass in the real stdout and stderr filenos. We can't really do much
- # here to intercept callers who directly operate on these fileno
- # objects.
- sys.stdout = CallbackStream(
- self.addStdout, self.addStdoutBytes, orig=sys.stdout
- )
- sys.stderr = CallbackStream(
- self.addStderr, self.addStderrBytes, orig=sys.stderr
- )
- self._current_test = test
- self._test_start_time = time.time()
- self._current_status = TestStatus.ABORTED
- self._messages = []
- self._stacktrace = None
- self._stdout = ""
- self._stderr = ""
- self._start_instr_count = self._instr_counter.read()
-
- def _find_next_test(self, suite):
- """
- Find the next test that has not been run.
- """
-
- for test in suite:
-
- # We identify test suites by tests that are iterable (as is done in
- # the builtin python test harness). If we see one, recurse on it.
- if hasattr(test, "__iter__"):
- test = self._find_next_test(test)
-
- # The builtin python test harness sets test references to `None`
- # after they have run, so we know we've found the next test up
- # if it's not `None`.
- if test is not None:
- return test
-
- def stopTest(self, test):
- sys.stdout = self._saved_stdout
- sys.stderr = self._saved_stderr
-
- super(BuckTestResult, self).stopTest(test)
-
- # If a failure occurred during module/class setup, then this "test" may
- # actually be a `_ErrorHolder`, which doesn't contain explicit info
- # about the upcoming test. Since we really only care about the test
- # name field (i.e. `_testMethodName`), we use that to detect an actual
- # test case, and fall back to looking the test up from the suite
- # otherwise.
- if not hasattr(test, "_testMethodName"):
- test = self._find_next_test(self._suite)
-
- result = {
- "testCaseName": "{0}.{1}".format(
- test.__class__.__module__, test.__class__.__name__
- ),
- "testCase": test._testMethodName,
- "type": self._current_status,
- "time": int((time.time() - self._test_start_time) * 1000),
- "message": os.linesep.join(self._messages),
- "stacktrace": self._stacktrace,
- "stdOut": self._stdout,
- "stdErr": self._stderr,
- }
-
- # TestPilot supports an instruction count field.
- if "TEST_PILOT" in os.environ:
- result["instrCount"] = int(
- self._instr_counter.read() - self._start_instr_count
- )
-
- self._results.append(result)
- self._current_test = None
-
- def stopTestRun(self):
- cov = self._main_program.get_coverage()
- if cov is not None:
- self._results.append({"coverage": cov})
-
- @contextlib.contextmanager
- def _withTest(self, test):
- self.startTest(test)
- yield
- self.stopTest(test)
-
- def _setStatus(self, test, status, message=None, stacktrace=None):
- assert test == self._current_test
- self._current_status = status
- self._stacktrace = stacktrace
- if message is not None:
- if message.endswith(os.linesep):
- message = message[:-1]
- self._messages.append(message)
-
- def setStatus(self, test, status, message=None, stacktrace=None):
- # addError() may be called outside of a test if one of the shared
- # fixtures (setUpClass/tearDownClass/setUpModule/tearDownModule)
- # throws an error.
- #
- # In this case, create a fake test result to record the error.
- if self._current_test is None: - with self._withTest(test): - self._setStatus(test, status, message, stacktrace) - else: - self._setStatus(test, status, message, stacktrace) - - def setException(self, test, status, excinfo): - exctype, value, tb = excinfo - self.setStatus( - test, - status, - "{0}: {1}".format(exctype.__name__, value), - "".join(traceback.format_tb(tb)), - ) - - def addSuccess(self, test): - super(BuckTestResult, self).addSuccess(test) - self.setStatus(test, TestStatus.PASSED) - - def addError(self, test, err): - super(BuckTestResult, self).addError(test, err) - self.setException(test, TestStatus.ABORTED, err) - - def addFailure(self, test, err): - super(BuckTestResult, self).addFailure(test, err) - self.setException(test, TestStatus.FAILED, err) - - def addSkip(self, test, reason): - super(BuckTestResult, self).addSkip(test, reason) - self.setStatus(test, TestStatus.SKIPPED, "Skipped: %s" % (reason,)) - - def addExpectedFailure(self, test, err): - super(BuckTestResult, self).addExpectedFailure(test, err) - self.setException(test, TestStatus.EXPECTED_FAILURE, err) - - def addUnexpectedSuccess(self, test): - super(BuckTestResult, self).addUnexpectedSuccess(test) - self.setStatus(test, TestStatus.UNEXPECTED_SUCCESS, "Unexpected success") - - def addStdout(self, val): - self._stdout += val - if self._show_output: - self._saved_stdout.write(val) - self._saved_stdout.flush() - - def addStdoutBytes(self, val): - string = val.decode("utf-8", errors="backslashreplace") - self.addStdout(string) - - def addStderr(self, val): - self._stderr += val - if self._show_output: - self._saved_stderr.write(val) - self._saved_stderr.flush() - - def addStderrBytes(self, val): - string = val.decode("utf-8", errors="backslashreplace") - self.addStderr(string) - - -class BuckTestRunner(unittest.TextTestRunner): - def __init__(self, main_program, suite, show_output=True, **kwargs): - super(BuckTestRunner, self).__init__(**kwargs) - self.show_output = show_output - self._main_program = main_program - self._suite = suite - - def _makeResult(self): - return BuckTestResult( - self.stream, - self.descriptions, - self.verbosity, - self.show_output, - self._main_program, - self._suite, - ) - - -def _format_test_name(test_class, attrname): - return "{0}.{1}.{2}".format(test_class.__module__, test_class.__name__, attrname) - - -class StderrLogHandler(logging.StreamHandler): - """ - This class is very similar to logging.StreamHandler, except that it - always uses the current sys.stderr object. - - StreamHandler caches the current sys.stderr object when it is constructed. - This makes it behave poorly in unit tests, which may replace sys.stderr - with a StringIO buffer during tests. The StreamHandler will continue using - the old sys.stderr object instead of the desired StringIO buffer. 
- """ - - def __init__(self): - logging.Handler.__init__(self) - - @property - def stream(self): - return sys.stderr - - -class RegexTestLoader(unittest.TestLoader): - def __init__(self, regex=None): - self.regex = regex - super(RegexTestLoader, self).__init__() - - def getTestCaseNames(self, testCaseClass): - """ - Return a sorted sequence of method names found within testCaseClass - """ - - testFnNames = super(RegexTestLoader, self).getTestCaseNames(testCaseClass) - if self.regex is None: - return testFnNames - robj = re.compile(self.regex) - matched = [] - for attrname in testFnNames: - fullname = _format_test_name(testCaseClass, attrname) - if robj.search(fullname): - matched.append(attrname) - return matched - - -class Loader(object): - - suiteClass = unittest.TestSuite - - def __init__(self, modules, regex=None): - self.modules = modules - self.regex = regex - - def load_all(self): - loader = RegexTestLoader(self.regex) - test_suite = self.suiteClass() - for module_name in self.modules: - __import__(module_name, level=0) - module = sys.modules[module_name] - module_suite = loader.loadTestsFromModule(module) - test_suite.addTest(module_suite) - return test_suite - - def load_args(self, args): - loader = RegexTestLoader(self.regex) - - suites = [] - for arg in args: - suite = loader.loadTestsFromName(arg) - # loadTestsFromName() can only process names that refer to - # individual test functions or modules. It can't process package - # names. If there were no module/function matches, check to see if - # this looks like a package name. - if suite.countTestCases() != 0: - suites.append(suite) - continue - - # Load all modules whose name is . - prefix = arg + "." - for module in self.modules: - if module.startswith(prefix): - suite = loader.loadTestsFromName(module) - suites.append(suite) - - return loader.suiteClass(suites) - - -_COVERAGE_INI = """\ -[report] -exclude_lines = - pragma: no cover - pragma: nocover - pragma:.*no${PLATFORM} - pragma:.*no${PY_IMPL}${PY_MAJOR}${PY_MINOR} - pragma:.*no${PY_IMPL}${PY_MAJOR} - pragma:.*nopy${PY_MAJOR} - pragma:.*nopy${PY_MAJOR}${PY_MINOR} -""" - - -class MainProgram(object): - """ - This class implements the main program. It can be subclassed by - users who wish to customize some parts of the main program. - (Adding additional command line options, customizing test loading, etc.) - """ - - DEFAULT_VERBOSITY = 2 - - def __init__(self, argv): - self.init_option_parser() - self.parse_options(argv) - self.setup_logging() - - def init_option_parser(self): - usage = "%prog [options] [TEST] ..." 
- op = optparse.OptionParser(usage=usage, add_help_option=False) - self.option_parser = op - - op.add_option( - "--hide-output", - dest="show_output", - action="store_false", - default=True, - help="Suppress data that tests print to stdout/stderr, and only " - "show it if the test fails.", - ) - op.add_option( - "-o", - "--output", - help="Write results to a file in a JSON format to be read by Buck", - ) - op.add_option( - "-f", - "--failfast", - action="store_true", - default=False, - help="Stop after the first failure", - ) - op.add_option( - "-l", - "--list-tests", - action="store_true", - dest="list", - default=False, - help="List tests and exit", - ) - op.add_option( - "-r", - "--regex", - default=None, - help="Regex to apply to tests, to only run those tests", - ) - op.add_option( - "--collect-coverage", - action="store_true", - default=False, - help="Collect test coverage information", - ) - op.add_option( - "--coverage-include", - default="*", - help='File globs to include in coverage (split by ",")', - ) - op.add_option( - "--coverage-omit", - default="", - help='File globs to omit from coverage (split by ",")', - ) - op.add_option( - "--logger", - action="append", - metavar="<name>=<level>", - default=[], - help="Configure log levels for specific logger categories", - ) - op.add_option( - "-q", - "--quiet", - action="count", - default=0, - help="Decrease the verbosity (may be specified multiple times)", - ) - op.add_option( - "-v", - "--verbosity", - action="count", - default=self.DEFAULT_VERBOSITY, - help="Increase the verbosity (may be specified multiple times)", - ) - op.add_option( - "-?", "--help", action="help", help="Show this help message and exit" - ) - - def parse_options(self, argv): - self.options, self.test_args = self.option_parser.parse_args(argv[1:]) - self.options.verbosity -= self.options.quiet - - if self.options.collect_coverage and coverage is None: - self.option_parser.error("coverage module is not available") - self.options.coverage_include = self.options.coverage_include.split(",") - if self.options.coverage_omit == "": - self.options.coverage_omit = [] - else: - self.options.coverage_omit = self.options.coverage_omit.split(",") - - def setup_logging(self): - # Configure the root logger to log at INFO level. - # This is similar to logging.basicConfig(), but uses our - # StderrLogHandler instead of a StreamHandler.
- fmt = logging.Formatter("%(pathname)s:%(lineno)s: %(message)s") - log_handler = StderrLogHandler() - log_handler.setFormatter(fmt) - root_logger = logging.getLogger() - root_logger.addHandler(log_handler) - root_logger.setLevel(logging.INFO) - - level_names = { - "debug": logging.DEBUG, - "info": logging.INFO, - "warn": logging.WARNING, - "warning": logging.WARNING, - "error": logging.ERROR, - "critical": logging.CRITICAL, - "fatal": logging.FATAL, - } - - for value in self.options.logger: - parts = value.rsplit("=", 1) - if len(parts) != 2: - self.option_parser.error( - "--logger argument must be of the " - "form <name>=<level>: %s" % value - ) - name = parts[0] - level_name = parts[1].lower() - level = level_names.get(level_name) - if level is None: - self.option_parser.error( - "invalid log level %r for log " "category %s" % (parts[1], name) - ) - logging.getLogger(name).setLevel(level) - - def create_loader(self): - import __test_modules__ - - return Loader(__test_modules__.TEST_MODULES, self.options.regex) - - def load_tests(self): - loader = self.create_loader() - if self.options.collect_coverage: - self.start_coverage() - include = self.options.coverage_include - omit = self.options.coverage_omit - if include and "*" not in include: - optimize_for_coverage(self.cov, include, omit) - - if self.test_args: - suite = loader.load_args(self.test_args) - else: - suite = loader.load_all() - if self.options.collect_coverage: - self.cov.start() - return suite - - def get_tests(self, test_suite): - tests = [] - - for test in test_suite: - if isinstance(test, unittest.TestSuite): - tests.extend(self.get_tests(test)) - else: - tests.append(test) - - return tests - - def run(self): - test_suite = self.load_tests() - - if self.options.list: - for test in self.get_tests(test_suite): - method_name = getattr(test, "_testMethodName", "") - name = _format_test_name(test.__class__, method_name) - print(name) - return EXIT_CODE_SUCCESS - else: - result = self.run_tests(test_suite) - if self.options.output is not None: - with open(self.options.output, "w") as f: - json.dump(result.getResults(), f, indent=4, sort_keys=True) - if not result.wasSuccessful(): - return EXIT_CODE_TEST_FAILURE - return EXIT_CODE_SUCCESS - - def run_tests(self, test_suite): - # Install a signal handler to catch Ctrl-C and display the results - # (but only if running Python > 2.6).
- if sys.version_info[0] > 2 or sys.version_info[1] > 6: - unittest.installHandler() - - # Run the tests - runner = BuckTestRunner( - self, - test_suite, - verbosity=self.options.verbosity, - show_output=self.options.show_output, - ) - result = runner.run(test_suite) - - if self.options.collect_coverage and self.options.show_output: - self.cov.stop() - try: - self.cov.report(file=sys.stdout) - except coverage.misc.CoverageException: - print("No lines were covered, potentially restricted by file filters") - - return result - - def get_abbr_impl(self): - """Return abbreviated implementation name.""" - impl = platform.python_implementation() - if impl == "PyPy": - return "pp" - elif impl == "Jython": - return "jy" - elif impl == "IronPython": - return "ip" - elif impl == "CPython": - return "cp" - else: - raise RuntimeError("unknown python runtime") - - def start_coverage(self): - if not self.options.collect_coverage: - return - - with tempfile.NamedTemporaryFile("w", delete=False) as coverage_ini: - coverage_ini.write(_COVERAGE_INI) - self._coverage_ini_path = coverage_ini.name - - # Keep the original working dir in case tests use os.chdir - self._original_working_dir = os.getcwd() - - # for coverage config ignores by platform/python version - os.environ["PLATFORM"] = sys.platform - os.environ["PY_IMPL"] = self.get_abbr_impl() - os.environ["PY_MAJOR"] = str(sys.version_info.major) - os.environ["PY_MINOR"] = str(sys.version_info.minor) - - self.cov = coverage.Coverage( - include=self.options.coverage_include, - omit=self.options.coverage_omit, - config_file=coverage_ini.name, - ) - self.cov.erase() - self.cov.start() - - def get_coverage(self): - if not self.options.collect_coverage: - return None - - try: - os.remove(self._coverage_ini_path) - except OSError: - pass # Better to litter than to fail the test - - # Switch back to the original working directory. - os.chdir(self._original_working_dir) - - result = {} - - self.cov.stop() - - try: - f = StringIO() - self.cov.report(file=f) - lines = f.getvalue().split("\n") - except coverage.misc.CoverageException: - # Nothing was covered. That's fine by us - return result - - # N.B.: the format of the coverage library's output differs - # depending on whether one or more files are in the results - for line in lines[2:]: - if line.strip("-") == "": - break - r = line.split()[0] - analysis = self.cov.analysis2(r) - covString = self.convert_to_diff_cov_str(analysis) - if covString: - result[r] = covString - - return result - - def convert_to_diff_cov_str(self, analysis): - # Info on the format of analysis: - # http://nedbatchelder.com/code/coverage/api.html - if not analysis: - return None - numLines = max( - analysis[1][-1] if len(analysis[1]) else 0, - analysis[2][-1] if len(analysis[2]) else 0, - analysis[3][-1] if len(analysis[3]) else 0, - ) - lines = ["N"] * numLines - for l in analysis[1]: - lines[l - 1] = "C" - for l in analysis[2]: - lines[l - 1] = "X" - for l in analysis[3]: - lines[l - 1] = "U" - return "".join(lines) - - -def main(argv): - return MainProgram(argv).run() - - -if __name__ == "__main__": - sys.exit(main(sys.argv)) diff --git a/build/fbcode_builder/CMake/fb_py_win_main.c b/build/fbcode_builder/CMake/fb_py_win_main.c deleted file mode 100644 index 85a95b31563f8..0000000000000 --- a/build/fbcode_builder/CMake/fb_py_win_main.c +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates.
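Before the Windows launcher below: the `convert_to_diff_cov_str` helper above encodes per-file coverage as one character per line. A standalone sketch of the same encoding, assuming the three sorted line-number lists that `coverage`'s `analysis2` returns (statements, excluded, missing); the letter meanings are inferred from how the original fills them in:

```
def encode_line_coverage(statements, excluded, missing):
    # N = not executable, C = covered, X = excluded, U = uncovered.
    num_lines = max(
        statements[-1] if statements else 0,
        excluded[-1] if excluded else 0,
        missing[-1] if missing else 0,
    )
    lines = ["N"] * num_lines
    for l in statements:
        lines[l - 1] = "C"
    for l in excluded:
        lines[l - 1] = "X"
    for l in missing:  # missing lines are statements, so "U" overwrites "C"
        lines[l - 1] = "U"
    return "".join(lines)

print(encode_line_coverage([1, 2, 4], [3], [4]))  # -> "CCXU"
```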
- -#define WIN32_LEAN_AND_MEAN - -#include <windows.h> -#include <stdio.h> -#include <stdlib.h> - -#define PATH_SIZE 32768 - -typedef int (*Py_Main)(int, wchar_t**); - -// Add the given path to Windows's DLL search path. -// For Windows DLL search path resolution, see: -// https://docs.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-search-order -void add_search_path(const wchar_t* path) { - wchar_t buffer[PATH_SIZE]; - wchar_t** lppPart = NULL; - - if (!GetFullPathNameW(path, PATH_SIZE, buffer, lppPart)) { - fwprintf( - stderr, - L"warning: %d unable to expand path %s\n", - GetLastError(), - path); - return; - } - - if (!AddDllDirectory(buffer)) { - DWORD error = GetLastError(); - if (error != ERROR_FILE_NOT_FOUND) { - fwprintf( - stderr, - L"warning: %d unable to set DLL search path for %s\n", - GetLastError(), - path); - } - } -} - -int locate_py_main(int argc, wchar_t** argv) { - /* - * We have to dynamically locate Python3.dll because we may be loading a - * Python native module while running. If that module is built with a - * different Python version, we will end up with a DLL import error. To - * resolve this, we can either ship an embedded version of Python with us - * or dynamically look up an existing Python distribution installed on the - * user's machine. This way, we should be able to get a consistent version - * of Python3.dll and .pyd modules. - */ - HINSTANCE python_dll; - Py_Main pymain; - - // last added directory has highest priority - add_search_path(L"C:\\Python36\\"); - add_search_path(L"C:\\tools\\fb-python\\fb-python36\\"); - add_search_path(L"C:\\Python37\\"); - add_search_path(L"C:\\tools\\fb-python\\fb-python37\\"); - add_search_path(L"C:\\Python38\\"); - add_search_path(L"C:\\tools\\fb-python\\fb-python38\\"); - // TODO(T123615656): Re-enable Python 3.9 after the fix - // add_search_path(L"C:\\tools\\fb-python\\fb-python39\\"); - - python_dll = - LoadLibraryExW(L"python3.dll", NULL, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS); - - int returncode = 0; - if (python_dll != NULL) { - pymain = (Py_Main)GetProcAddress(python_dll, "Py_Main"); - - if (pymain != NULL) { - returncode = (pymain)(argc, argv); - } else { - fprintf(stderr, "error: %d unable to load Py_Main\n", GetLastError()); - } - - FreeLibrary(python_dll); - } else { - fprintf(stderr, "error: %d unable to locate python3.dll\n", GetLastError()); - return 1; - } - return returncode; -} - -int wmain() { - /* - * This executable will be prepended to the start of a Python ZIP archive. - * Python will be able to directly execute the ZIP archive, so we simply - * need to tell Py_Main() to run our own file. Duplicate the argument list - * and add our file name to the beginning to tell Python what file to invoke. - */ - wchar_t** pyargv = malloc(sizeof(wchar_t*) * (__argc + 1)); - if (!pyargv) { - fprintf(stderr, "error: failed to allocate argument vector\n"); - return 1; - } - - /* Py_Main wants the wide character version of the argv so we pull those - * values from the global __wargv array that has been prepared by MSVCRT. - * - * In order for the zipapp to run we need to insert an extra argument in - * the front of the argument vector that points to ourselves. - * - * An additional complication is that, depending on who prepared the argument - * string used to start our process, the computed __wargv[0] can be a simple - * shell word like `watchman-wait` which is normally resolved together with - * the PATH by the shell. - * That unresolved path isn't sufficient to start the zipapp on windows; - * we need the fully qualified path.
- * - * Given: - * __wargv == {"watchman-wait", "-h"} - * - * we want to pass the following to Py_Main: - * - * { - * "z:\build\watchman\python\watchman-wait.exe", - * "z:\build\watchman\python\watchman-wait.exe", - * "-h" - * } - */ - wchar_t full_path_to_argv0[PATH_SIZE]; - DWORD len = GetModuleFileNameW(NULL, full_path_to_argv0, PATH_SIZE); - if (len == 0 || - len == PATH_SIZE && GetLastError() == ERROR_INSUFFICIENT_BUFFER) { - fprintf( - stderr, - "error: %d while retrieving full path to this executable\n", - GetLastError()); - return 1; - } - - for (int n = 1; n < __argc; ++n) { - pyargv[n + 1] = __wargv[n]; - } - pyargv[0] = full_path_to_argv0; - pyargv[1] = full_path_to_argv0; - - return locate_py_main(__argc + 1, pyargv); -} diff --git a/build/fbcode_builder/CMake/make_fbpy_archive.py b/build/fbcode_builder/CMake/make_fbpy_archive.py deleted file mode 100755 index 3724feb2183f7..0000000000000 --- a/build/fbcode_builder/CMake/make_fbpy_archive.py +++ /dev/null @@ -1,327 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright (c) Facebook, Inc. and its affiliates. -# -import argparse -import collections -import errno -import os -import shutil -import sys -import tempfile -import zipapp - -MANIFEST_SEPARATOR = " :: " -MANIFEST_HEADER_V1 = "FBPY_MANIFEST 1\n" - - -class UsageError(Exception): - def __init__(self, message): - self.message = message - - def __str__(self): - return self.message - - -class BadManifestError(UsageError): - def __init__(self, path, line_num, message): - full_msg = "%s:%s: %s" % (path, line_num, message) - super().__init__(full_msg) - self.path = path - self.line_num = line_num - self.raw_message = message - - -PathInfo = collections.namedtuple( - "PathInfo", ("src", "dest", "manifest_path", "manifest_line") -) - - -def parse_manifest(manifest, path_map): - bad_prefix = ".." + os.path.sep - manifest_dir = os.path.dirname(manifest) - with open(manifest, "r") as f: - line_num = 1 - line = f.readline() - if line != MANIFEST_HEADER_V1: - raise BadManifestError( - manifest, line_num, "Unexpected manifest file header" - ) - - for line in f: - line_num += 1 - if line.startswith("#"): - continue - line = line.rstrip("\n") - parts = line.split(MANIFEST_SEPARATOR) - if len(parts) != 2: - msg = "line must be of the form SRC %s DEST" % MANIFEST_SEPARATOR - raise BadManifestError(manifest, line_num, msg) - src, dest = parts - dest = os.path.normpath(dest) - if dest.startswith(bad_prefix): - msg = "destination path starts with %s: %s" % (bad_prefix, dest) - raise BadManifestError(manifest, line_num, msg) - - if not os.path.isabs(src): - src = os.path.normpath(os.path.join(manifest_dir, src)) - - if dest in path_map: - prev_info = path_map[dest] - msg = ( - "multiple source paths specified for destination " - "path %s. 
Previous source was %s from %s:%s" - % ( - dest, - prev_info.src, - prev_info.manifest_path, - prev_info.manifest_line, - ) - ) - raise BadManifestError(manifest, line_num, msg) - - info = PathInfo( - src=src, - dest=dest, - manifest_path=manifest, - manifest_line=line_num, - ) - path_map[dest] = info - - -def populate_install_tree(inst_dir, path_map): - os.mkdir(inst_dir) - dest_dirs = {"": False} - - def make_dest_dir(path): - if path in dest_dirs: - return - parent = os.path.dirname(path) - make_dest_dir(parent) - abs_path = os.path.join(inst_dir, path) - os.mkdir(abs_path) - dest_dirs[path] = False - - def install_file(info): - dir_name, base_name = os.path.split(info.dest) - make_dest_dir(dir_name) - if base_name == "__init__.py": - dest_dirs[dir_name] = True - abs_dest = os.path.join(inst_dir, info.dest) - shutil.copy2(info.src, abs_dest) - - # Copy all of the destination files - for info in path_map.values(): - install_file(info) - - # Create __init__ files in any directories that don't have them. - for dir_path, has_init in dest_dirs.items(): - if has_init: - continue - init_path = os.path.join(inst_dir, dir_path, "__init__.py") - with open(init_path, "w"): - pass - - -def build_zipapp(args, path_map): - """Create a self executing python binary using Python 3's built-in - zipapp module. - - This type of Python binary is relatively simple, as zipapp is part of the - standard library, but it does not support native language extensions - (.so/.dll files). - """ - dest_dir = os.path.dirname(args.output) - with tempfile.TemporaryDirectory(prefix="make_fbpy.", dir=dest_dir) as tmpdir: - inst_dir = os.path.join(tmpdir, "tree") - populate_install_tree(inst_dir, path_map) - - tmp_output = os.path.join(tmpdir, "output.exe") - zipapp.create_archive( - inst_dir, target=tmp_output, interpreter=args.python, main=args.main - ) - os.replace(tmp_output, args.output) - - -def create_main_module(args, inst_dir, path_map): - if not args.main: - assert "__main__.py" in path_map - return - - dest_path = os.path.join(inst_dir, "__main__.py") - main_module, main_fn = args.main.split(":") - main_contents = """\ -#!{python} - -if __name__ == "__main__": - import {main_module} - {main_module}.{main_fn}() -""".format( - python=args.python, main_module=main_module, main_fn=main_fn - ) - with open(dest_path, "w") as f: - f.write(main_contents) - os.chmod(dest_path, 0o755) - - -def build_install_dir(args, path_map): - """Create a directory that contains all of the sources, with a __main__ - module to run the program. - """ - # Populate a temporary directory first, then rename to the destination - # location. This ensures that we don't ever leave a halfway-built - # directory behind at the output path if something goes wrong. 
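For orientation, the manifest format consumed by `parse_manifest` above is a one-line header followed by `SRC :: DEST` pairs; lines starting with `#` are skipped, and relative sources are resolved against the manifest's own directory. A hypothetical example (file names invented):

```
FBPY_MANIFEST 1
# comment lines are ignored
src/mylib/__init__.py :: mylib/__init__.py
src/mylib/util.py :: mylib/util.py
```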
- dest_dir = os.path.dirname(args.output) - with tempfile.TemporaryDirectory(prefix="make_fbpy.", dir=dest_dir) as tmpdir: - inst_dir = os.path.join(tmpdir, "tree") - populate_install_tree(inst_dir, path_map) - create_main_module(args, inst_dir, path_map) - os.rename(inst_dir, args.output) - - -def ensure_directory(path): - try: - os.makedirs(path) - except OSError as ex: - if ex.errno != errno.EEXIST: - raise - - -def install_library(args, path_map): - """Create an installation directory for a python library.""" - out_dir = args.output - out_manifest = args.output + ".manifest" - - install_dir = args.install_dir - if not install_dir: - install_dir = out_dir - - os.makedirs(out_dir) - with open(out_manifest, "w") as manifest: - manifest.write(MANIFEST_HEADER_V1) - for info in path_map.values(): - abs_dest = os.path.join(out_dir, info.dest) - ensure_directory(os.path.dirname(abs_dest)) - print("copy %r --> %r" % (info.src, abs_dest)) - shutil.copy2(info.src, abs_dest) - installed_dest = os.path.join(install_dir, info.dest) - manifest.write("%s%s%s\n" % (installed_dest, MANIFEST_SEPARATOR, info.dest)) - - -def parse_manifests(args): - # Process args.manifest_separator to help support older versions of CMake - if args.manifest_separator: - manifests = [] - for manifest_arg in args.manifests: - split_arg = manifest_arg.split(args.manifest_separator) - manifests.extend(split_arg) - args.manifests = manifests - - path_map = {} - for manifest in args.manifests: - parse_manifest(manifest, path_map) - - return path_map - - -def check_main_module(args, path_map): - # Translate an empty string in the --main argument to None, - # just to allow the CMake logic to be slightly simpler and pass in an - # empty string when it really wants the default __main__.py module to be - # used. - if args.main == "": - args.main = None - - if args.type == "lib-install": - if args.main is not None: - raise UsageError("cannot specify a --main argument with --type=lib-install") - return - - main_info = path_map.get("__main__.py") - if args.main: - if main_info is not None: - msg = ( - "specified an explicit main module with --main, " - "but the file listing already includes __main__.py" - ) - raise BadManifestError( - main_info.manifest_path, main_info.manifest_line, msg - ) - parts = args.main.split(":") - if len(parts) != 2: - raise UsageError( - "argument to --main must be of the form MODULE:CALLABLE " - "(received %s)" % (args.main,) - ) - else: - if main_info is None: - raise UsageError( - "no main module specified with --main, " - "and no __main__.py module present" - ) - - -BUILD_TYPES = { - "zipapp": build_zipapp, - "dir": build_install_dir, - "lib-install": install_library, -} - - -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("-o", "--output", required=True, help="The output file path") - ap.add_argument( - "--install-dir", - help="When used with --type=lib-install, this parameter specifies the " - "final location where the library will be installed. This can be " - "used to generate the library in one directory first, when you plan " - "to move or copy it to another final location later.", - ) - ap.add_argument( - "--manifest-separator", - help="Split manifest arguments around this separator. This is used " - "to support older versions of CMake that cannot supply the manifests " - "as separate arguments.", - ) - ap.add_argument( - "--main", - help="The main module to run, specified as <module>:<callable>. 
" - "This must be specified if and only if the archive does not contain " - "a __main__.py file.", - ) - ap.add_argument( - "--python", - help="Explicitly specify the python interpreter to use for the " "executable.", - ) - ap.add_argument( - "--type", choices=BUILD_TYPES.keys(), help="The type of output to build." - ) - ap.add_argument( - "manifests", - nargs="+", - help="The manifest files specifying how to construct the archive", - ) - args = ap.parse_args() - - if args.python is None: - args.python = sys.executable - - if args.type is None: - # In the future we might want different default output types - # for different platforms. - args.type = "zipapp" - build_fn = BUILD_TYPES[args.type] - - try: - path_map = parse_manifests(args) - check_main_module(args, path_map) - except UsageError as ex: - print("error: %s" % (ex,), file=sys.stderr) - sys.exit(1) - - build_fn(args, path_map) - - -if __name__ == "__main__": - main() diff --git a/build/fbcode_builder/LICENSE b/build/fbcode_builder/LICENSE deleted file mode 100644 index b96dcb0480a0b..0000000000000 --- a/build/fbcode_builder/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) Facebook, Inc. and its affiliates. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/build/fbcode_builder/README.md b/build/fbcode_builder/README.md deleted file mode 100644 index d47dd41c01492..0000000000000 --- a/build/fbcode_builder/README.md +++ /dev/null @@ -1,43 +0,0 @@ -# Easy builds for Facebook projects - -This directory contains tools designed to simplify continuous-integration -(and other builds) of Facebook open source projects. In particular, this helps -manage builds for cross-project dependencies. - -The main entry point is the `getdeps.py` script. This script has several -subcommands, but the most notable is the `build` command. This will download -and build all dependencies for a project, and then build the project itself. - -## Deployment - -This directory is copied literally into a number of different Facebook open -source repositories. Any change made to code in this directory will be -automatically be replicated by our open source tooling into all GitHub hosted -repositories that use `fbcode_builder`. Typically this directory is copied -into the open source repositories as `build/fbcode_builder/`. - - -# Project Configuration Files - -The `manifests` subdirectory contains configuration files for many different -projects, describing how to build each project. 
These files also list -dependencies between projects, enabling `getdeps.py` to build all dependencies -for a project before building the project itself. - - -# Shared CMake utilities - -Since this directory is copied into many Facebook open source repositories, -it is also used to help share some CMake utility files across projects. The -`CMake/` subdirectory contains a number of `.cmake` files that are shared by -the CMake-based build systems across several different projects. - - -# Older Build Scripts - -This directory also still contains a handful of older build scripts that -pre-date the current `getdeps.py` build system. Most of the other `.py` files -in this top directory, apart from `getdeps.py` itself, are from this older -build system. This older system is only used by a few remaining projects, and -new projects should generally use the newer `getdeps.py` script, by adding a -new configuration file in the `manifests/` subdirectory. diff --git a/build/fbcode_builder/getdeps.py b/build/fbcode_builder/getdeps.py deleted file mode 100755 index b6ea1aa2c6161..0000000000000 --- a/build/fbcode_builder/getdeps.py +++ /dev/null @@ -1,1340 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import argparse -import json -import os -import shutil -import subprocess -import sys -import tarfile -import tempfile - -# We don't import cache.create_cache directly as the facebook -# specific import below may monkey patch it, and we want to -# observe the patched version of this function! -import getdeps.cache as cache_module -from getdeps.buildopts import setup_build_options -from getdeps.dyndeps import create_dyn_dep_munger -from getdeps.errors import TransientFailure -from getdeps.fetcher import ( - file_name_is_cmake_file, - list_files_under_dir_newer_than_timestamp, - SystemPackageFetcher, -) -from getdeps.load import ManifestLoader -from getdeps.manifest import ManifestParser -from getdeps.platform import HostType -from getdeps.runcmd import run_cmd -from getdeps.subcmd import add_subcommands, cmd, SubCmd - -try: - import getdeps.facebook # noqa: F401 -except ImportError: - # we don't ship the facebook specific subdir, - # so allow that to fail silently - pass - - -sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "getdeps")) - - -class UsageError(Exception): - pass - - -@cmd("validate-manifest", "parse a manifest and validate that it is correct") -class ValidateManifest(SubCmd): - def run(self, args): - try: - ManifestParser(file_name=args.file_name) - print("OK", file=sys.stderr) - return 0 - except Exception as exc: - print("ERROR: %s" % str(exc), file=sys.stderr) - return 1 - - def setup_parser(self, parser): - parser.add_argument("file_name", help="path to the manifest file") - - -@cmd("show-host-type", "outputs the host type tuple for the host machine") -class ShowHostType(SubCmd): - def run(self, args): - host = HostType() - print("%s" % host.as_tuple_string()) - return 0 - - -class ProjectCmdBase(SubCmd): - def run(self, args): - opts = setup_build_options(args) - - if args.current_project is not None: - opts.repo_project = args.current_project - if args.project is None: - if opts.repo_project is None: - raise UsageError( - "no project name specified, and no .projectid file found" - ) - if opts.repo_project == "fbsource": - # The fbsource repository is a little special. 
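As the README above notes, `getdeps.py` is subcommand-driven; the commands defined in this file (`validate-manifest`, `show-host-type`, `fetch`, `build`, `test`, and so on) are invoked as `getdeps.py <subcommand> [project]`. Some hypothetical invocations (the project name is a placeholder; `--allow-system-packages` appears later in this file's generated CI commands):

```
python3 build/fbcode_builder/getdeps.py show-host-type
python3 build/fbcode_builder/getdeps.py build --allow-system-packages myproject
python3 build/fbcode_builder/getdeps.py test --src-dir=. myproject
```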
There is no project - # manifest file for it. A specific project must always be explicitly - # specified when building from fbsource. - raise UsageError( - "no project name specified (required when building in fbsource)" - ) - args.project = opts.repo_project - - ctx_gen = opts.get_context_generator() - if args.test_dependencies: - ctx_gen.set_value_for_all_projects("test", "on") - if args.enable_tests: - ctx_gen.set_value_for_project(args.project, "test", "on") - else: - ctx_gen.set_value_for_project(args.project, "test", "off") - - if opts.shared_libs: - ctx_gen.set_value_for_all_projects("shared_libs", "on") - - loader = ManifestLoader(opts, ctx_gen) - self.process_project_dir_arguments(args, loader) - - manifest = loader.load_manifest(args.project) - - self.run_project_cmd(args, loader, manifest) - - def process_project_dir_arguments(self, args, loader): - def parse_project_arg(arg, arg_type): - parts = arg.split(":") - if len(parts) == 2: - project, path = parts - elif len(parts) == 1: - project = args.project - path = parts[0] - # On Windows path contains colon, e.g. C:\open - elif os.name == "nt" and len(parts) == 3: - project = parts[0] - path = parts[1] + ":" + parts[2] - else: - raise UsageError( - "invalid %s argument; too many ':' characters: %s" % (arg_type, arg) - ) - - return project, os.path.abspath(path) - - # If we are currently running from a project repository, - # use the current repository for the project sources. - build_opts = loader.build_opts - if build_opts.repo_project is not None and build_opts.repo_root is not None: - loader.set_project_src_dir(build_opts.repo_project, build_opts.repo_root) - - for arg in args.src_dir: - project, path = parse_project_arg(arg, "--src-dir") - loader.set_project_src_dir(project, path) - - for arg in args.build_dir: - project, path = parse_project_arg(arg, "--build-dir") - loader.set_project_build_dir(project, path) - - for arg in args.install_dir: - project, path = parse_project_arg(arg, "--install-dir") - loader.set_project_install_dir(project, path) - - for arg in args.project_install_prefix: - project, path = parse_project_arg(arg, "--install-prefix") - loader.set_project_install_prefix(project, path) - - def setup_parser(self, parser): - parser.add_argument( - "project", - nargs="?", - help=( - "name of the project or path to a manifest " - "file describing the project" - ), - ) - parser.add_argument( - "--no-tests", - action="store_false", - dest="enable_tests", - default=True, - help="Disable building tests for this project.", - ) - parser.add_argument( - "--test-dependencies", - action="store_true", - help="Enable building tests for dependencies as well.", - ) - parser.add_argument( - "--current-project", - help="Specify the name of the fbcode_builder manifest file for the " - "current repository. If not specified, the code will attempt to find " - "this in a .projectid file in the repository root.", - ) - parser.add_argument( - "--src-dir", - default=[], - action="append", - help="Specify a local directory to use for the project source, " - "rather than fetching it.", - ) - parser.add_argument( - "--build-dir", - default=[], - action="append", - help="Explicitly specify the build directory to use for the " - "project, instead of the default location in the scratch path. 
" - "This only affects the project specified, and not its dependencies.", - ) - parser.add_argument( - "--install-dir", - default=[], - action="append", - help="Explicitly specify the install directory to use for the " - "project, instead of the default location in the scratch path. " - "This only affects the project specified, and not its dependencies.", - ) - parser.add_argument( - "--project-install-prefix", - default=[], - action="append", - help="Specify the final deployment installation path for a project", - ) - - self.setup_project_cmd_parser(parser) - - def setup_project_cmd_parser(self, parser): - pass - - -class CachedProject(object): - """A helper that allows calling the cache logic for a project - from both the build and the fetch code""" - - def __init__(self, cache, loader, m): - self.m = m - self.inst_dir = loader.get_project_install_dir(m) - self.project_hash = loader.get_project_hash(m) - self.ctx = loader.ctx_gen.get_context(m.name) - self.loader = loader - self.cache = cache - - self.cache_file_name = "-".join( - ( - m.name, - self.ctx.get("os"), - self.ctx.get("distro") or "none", - self.ctx.get("distro_vers") or "none", - self.project_hash, - "buildcache.tgz", - ) - ) - - def is_cacheable(self): - """We only cache third party projects""" - return self.cache and self.m.shipit_project is None - - def was_cached(self): - cached_marker = os.path.join(self.inst_dir, ".getdeps-cached-build") - return os.path.exists(cached_marker) - - def download(self): - if self.is_cacheable() and not os.path.exists(self.inst_dir): - print("check cache for %s" % self.cache_file_name) - dl_dir = os.path.join(self.loader.build_opts.scratch_dir, "downloads") - if not os.path.exists(dl_dir): - os.makedirs(dl_dir) - try: - target_file_name = os.path.join(dl_dir, self.cache_file_name) - if self.cache.download_to_file(self.cache_file_name, target_file_name): - tf = tarfile.open(target_file_name, "r") - print( - "Extracting %s -> %s..." % (self.cache_file_name, self.inst_dir) - ) - tf.extractall(self.inst_dir) - - cached_marker = os.path.join(self.inst_dir, ".getdeps-cached-build") - with open(cached_marker, "w") as f: - f.write("\n") - - return True - except Exception as exc: - print("%s" % str(exc)) - - return False - - def upload(self): - if self.is_cacheable(): - # We can prepare an archive and stick it in LFS - tempdir = tempfile.mkdtemp() - tarfilename = os.path.join(tempdir, self.cache_file_name) - print("Archiving for cache: %s..." 
% tarfilename) - tf = tarfile.open(tarfilename, "w:gz") - tf.add(self.inst_dir, arcname=".") - tf.close() - try: - self.cache.upload_from_file(self.cache_file_name, tarfilename) - except Exception as exc: - print( - "Failed to upload to cache (%s), continuing anyway" % str(exc), - file=sys.stderr, - ) - shutil.rmtree(tempdir) - - -@cmd("fetch", "fetch the code for a given project") -class FetchCmd(ProjectCmdBase): - def setup_project_cmd_parser(self, parser): - parser.add_argument( - "--recursive", - help="fetch the transitive deps also", - action="store_true", - default=False, - ) - parser.add_argument( - "--host-type", - help=( - "When recursively fetching, fetch deps for " - "this host type rather than the current system" - ), - ) - - def run_project_cmd(self, args, loader, manifest): - if args.recursive: - projects = loader.manifests_in_dependency_order() - else: - projects = [manifest] - - cache = cache_module.create_cache() - for m in projects: - cached_project = CachedProject(cache, loader, m) - if cached_project.download(): - continue - - inst_dir = loader.get_project_install_dir(m) - built_marker = os.path.join(inst_dir, ".built-by-getdeps") - if os.path.exists(built_marker): - with open(built_marker, "r") as f: - built_hash = f.read().strip() - - project_hash = loader.get_project_hash(m) - if built_hash == project_hash: - continue - - # We need to fetch the sources - fetcher = loader.create_fetcher(m) - fetcher.update() - - -@cmd("install-system-deps", "Install system packages to satisfy the deps for a project") -class InstallSysDepsCmd(ProjectCmdBase): - def setup_project_cmd_parser(self, parser): - parser.add_argument( - "--recursive", - help="install the transitive deps also", - action="store_true", - default=False, - ) - parser.add_argument( - "--dry-run", - action="store_true", - default=False, - help="Don't install, just print the commands we would run", - ) - parser.add_argument( - "--os-type", - help="Filter to just this OS type to run", - choices=["linux", "darwin", "windows"], - action="store", - dest="ostype", - default=None, - ) - parser.add_argument( - "--distro", - help="Filter to just this distro to run", - choices=["ubuntu", "centos_stream"], - action="store", - dest="distro", - default=None, - ) - parser.add_argument( - "--distro-version", - help="Filter to just this distro version", - action="store", - dest="distrovers", - default=None, - ) - - def run_project_cmd(self, args, loader, manifest): - if args.recursive: - projects = loader.manifests_in_dependency_order() - else: - projects = [manifest] - - rebuild_ctx_gen = False - if args.ostype: - loader.build_opts.host_type.ostype = args.ostype - loader.build_opts.host_type.distro = None - loader.build_opts.host_type.distrovers = None - rebuild_ctx_gen = True - - if args.distro: - loader.build_opts.host_type.distro = args.distro - loader.build_opts.host_type.distrovers = None - rebuild_ctx_gen = True - - if args.distrovers: - loader.build_opts.host_type.distrovers = args.distrovers - rebuild_ctx_gen = True - - if rebuild_ctx_gen: - loader.ctx_gen = loader.build_opts.get_context_generator() - - manager = loader.build_opts.host_type.get_package_manager() - - all_packages = {} - for m in projects: - ctx = loader.ctx_gen.get_context(m.name) - packages = m.get_required_system_packages(ctx) - for k, v in packages.items(): - merged = all_packages.get(k, []) - merged += v - all_packages[k] = merged - - cmd_args = None - if manager == "rpm": - packages = sorted(set(all_packages["rpm"])) - if packages: - cmd_args =
["sudo", "dnf", "install", "-y"] + packages - elif manager == "deb": - packages = sorted(set(all_packages["deb"])) - if packages: - cmd_args = ["sudo", "apt", "install", "-y"] + packages - elif manager == "homebrew": - packages = sorted(set(all_packages["homebrew"])) - if packages: - cmd_args = ["brew", "install"] + packages - - else: - host_tuple = loader.build_opts.host_type.as_tuple_string() - print( - f"I don't know how to install any packages on this system {host_tuple}" - ) - return - - if cmd_args: - if args.dry_run: - print(" ".join(cmd_args)) - else: - run_cmd(cmd_args) - else: - print("no packages to install") - - -@cmd("list-deps", "lists the transitive deps for a given project") -class ListDepsCmd(ProjectCmdBase): - def run_project_cmd(self, args, loader, manifest): - for m in loader.manifests_in_dependency_order(): - print(m.name) - return 0 - - def setup_project_cmd_parser(self, parser): - parser.add_argument( - "--host-type", - help=( - "Produce the list for the specified host type, " - "rather than that of the current system" - ), - ) - - -def clean_dirs(opts): - for d in ["build", "installed", "extracted", "shipit"]: - d = os.path.join(opts.scratch_dir, d) - print("Cleaning %s..." % d) - if os.path.exists(d): - shutil.rmtree(d) - - -@cmd("clean", "clean up the scratch dir") -class CleanCmd(SubCmd): - def run(self, args): - opts = setup_build_options(args) - clean_dirs(opts) - - -@cmd("show-build-dir", "print the build dir for a given project") -class ShowBuildDirCmd(ProjectCmdBase): - def run_project_cmd(self, args, loader, manifest): - if args.recursive: - manifests = loader.manifests_in_dependency_order() - else: - manifests = [manifest] - - for m in manifests: - inst_dir = loader.get_project_build_dir(m) - print(inst_dir) - - def setup_project_cmd_parser(self, parser): - parser.add_argument( - "--recursive", - help="print the transitive deps also", - action="store_true", - default=False, - ) - - -@cmd("show-inst-dir", "print the installation dir for a given project") -class ShowInstDirCmd(ProjectCmdBase): - def run_project_cmd(self, args, loader, manifest): - if args.recursive: - manifests = loader.manifests_in_dependency_order() - else: - manifests = [manifest] - - for m in manifests: - inst_dir = loader.get_project_install_dir_respecting_install_prefix(m) - print(inst_dir) - - def setup_project_cmd_parser(self, parser): - parser.add_argument( - "--recursive", - help="print the transitive deps also", - action="store_true", - default=False, - ) - - -@cmd("show-source-dir", "print the source dir for a given project") -class ShowSourceDirCmd(ProjectCmdBase): - def run_project_cmd(self, args, loader, manifest): - if args.recursive: - manifests = loader.manifests_in_dependency_order() - else: - manifests = [manifest] - - for m in manifests: - fetcher = loader.create_fetcher(m) - print(fetcher.get_src_dir()) - - def setup_project_cmd_parser(self, parser): - parser.add_argument( - "--recursive", - help="print the transitive deps also", - action="store_true", - default=False, - ) - - -@cmd("build", "build a given project") -class BuildCmd(ProjectCmdBase): - def run_project_cmd(self, args, loader, manifest): - if args.clean: - clean_dirs(loader.build_opts) - - print("Building on %s" % loader.ctx_gen.get_context(args.project)) - projects = loader.manifests_in_dependency_order() - - cache = cache_module.create_cache() if args.use_build_cache else None - - # Accumulate the install directories so that the build steps - # can find their dep installation - install_dirs = [] - - for m 
in projects: - fetcher = loader.create_fetcher(m) - - if isinstance(fetcher, SystemPackageFetcher): - # We are guaranteed that if the fetcher is set to - # SystemPackageFetcher then this item is completely - # satisfied by the appropriate system packages - continue - - if args.clean: - fetcher.clean() - - build_dir = loader.get_project_build_dir(m) - inst_dir = loader.get_project_install_dir(m) - - if ( - m == manifest - and not args.only_deps - or m != manifest - and not args.no_deps - ): - print("Assessing %s..." % m.name) - project_hash = loader.get_project_hash(m) - ctx = loader.ctx_gen.get_context(m.name) - built_marker = os.path.join(inst_dir, ".built-by-getdeps") - - cached_project = CachedProject(cache, loader, m) - - reconfigure, sources_changed = self.compute_source_change_status( - cached_project, fetcher, m, built_marker, project_hash - ) - - if os.path.exists(built_marker) and not cached_project.was_cached(): - # We've previously built this. We may need to reconfigure if - # our deps have changed, so let's check them. - dep_reconfigure, dep_build = self.compute_dep_change_status( - m, built_marker, loader - ) - if dep_reconfigure: - reconfigure = True - if dep_build: - sources_changed = True - - extra_cmake_defines = ( - json.loads(args.extra_cmake_defines) - if args.extra_cmake_defines - else {} - ) - - extra_b2_args = args.extra_b2_args or [] - - if sources_changed or reconfigure or not os.path.exists(built_marker): - if os.path.exists(built_marker): - os.unlink(built_marker) - src_dir = fetcher.get_src_dir() - # Prepare builders write out config before the main builder runs - prepare_builders = m.create_prepare_builders( - loader.build_opts, - ctx, - src_dir, - build_dir, - inst_dir, - loader, - ) - for preparer in prepare_builders: - preparer.prepare(install_dirs, reconfigure=reconfigure) - - builder = m.create_builder( - loader.build_opts, - src_dir, - build_dir, - inst_dir, - ctx, - loader, - final_install_prefix=loader.get_project_install_prefix(m), - extra_cmake_defines=extra_cmake_defines, - cmake_target=args.cmake_target if m == manifest else "install", - extra_b2_args=extra_b2_args, - ) - builder.build(install_dirs, reconfigure=reconfigure) - - # If we are building the project (not dependency) and a specific - # cmake_target (not 'install') has been requested, then we don't - # set the built_marker. This allows subsequent runs of getdeps.py - # for the project to run with different cmake_targets to trigger - # cmake - has_built_marker = False - if not (m == manifest and args.cmake_target != "install"): - with open(built_marker, "w") as f: - f.write(project_hash) - has_built_marker = True - - # Only populate the cache from continuous build runs, and - # only if we have a built_marker. - if args.schedule_type == "continuous" and has_built_marker: - cached_project.upload() - elif args.verbose: - print("found good %s" % built_marker) - - # Paths are resolved from front. We prepend rather than append as - # the last project in topo order is the project itself, which - # should be first in the path, then its deps and so on. 
- install_dirs.insert(0, inst_dir) - - def compute_dep_change_status(self, m, built_marker, loader): - reconfigure = False - sources_changed = False - st = os.lstat(built_marker) - - ctx = loader.ctx_gen.get_context(m.name) - dep_list = m.get_dependencies(ctx) - for dep in dep_list: - if reconfigure and sources_changed: - break - - dep_manifest = loader.load_manifest(dep) - dep_root = loader.get_project_install_dir(dep_manifest) - for dep_file in list_files_under_dir_newer_than_timestamp( - dep_root, st.st_mtime - ): - if os.path.basename(dep_file) == ".built-by-getdeps": - continue - if file_name_is_cmake_file(dep_file): - if not reconfigure: - reconfigure = True - print( - f"Will reconfigure cmake because {dep_file} is newer than {built_marker}" - ) - else: - if not sources_changed: - sources_changed = True - print( - f"Will run build because {dep_file} is newer than {built_marker}" - ) - - if reconfigure and sources_changed: - break - - return reconfigure, sources_changed - - def compute_source_change_status( - self, cached_project, fetcher, m, built_marker, project_hash - ): - reconfigure = False - sources_changed = False - if cached_project.download(): - if not os.path.exists(built_marker): - fetcher.update() - else: - check_fetcher = True - if os.path.exists(built_marker): - check_fetcher = False - with open(built_marker, "r") as f: - built_hash = f.read().strip() - if built_hash == project_hash: - if cached_project.is_cacheable(): - # We can blindly trust the build status - reconfigure = False - sources_changed = False - else: - # Otherwise, we may have changed the source, so let's - # check in with the fetcher layer - check_fetcher = True - else: - # Some kind of inconsistency with a prior build, - # let's run it again to be sure - os.unlink(built_marker) - reconfigure = True - sources_changed = True - # While we don't need to consult the fetcher for the - # status in this case, we may still need to have eg: shipit - # run in order to have a correct source tree. - fetcher.update() - - if check_fetcher: - change_status = fetcher.update() - reconfigure = change_status.build_changed() - sources_changed = change_status.sources_changed() - - return reconfigure, sources_changed - - def setup_project_cmd_parser(self, parser): - parser.add_argument( - "--clean", - action="store_true", - default=False, - help=( - "Clean up the build and installation area prior to building, " - "causing the projects to be built from scratch" - ), - ) - parser.add_argument( - "--no-deps", - action="store_true", - default=False, - help=( - "Only build the named project, not its deps. " - "This is most useful after you've built all of the deps, " - "and helps to avoid waiting for relatively " - "slow up-to-date-ness checks" - ), - ) - parser.add_argument( - "--only-deps", - action="store_true", - default=False, - help=( - "Only build the named project's deps. " - "This is most useful when you want to separate out building " - "of all of the deps and your project" - ), - ) - parser.add_argument( - "--no-build-cache", - action="store_false", - default=True, - dest="use_build_cache", - help="Do not attempt to use the build cache.", - ) - parser.add_argument( - "--schedule-type", help="Indicates how the build was activated" - ) - parser.add_argument( - "--extra-cmake-defines", - help=( - "Input json map that contains extra cmake defines to be used " - "when compiling the current project and all its deps. 
" - 'e.g: \'{"CMAKE_CXX_FLAGS": "--bla"}\'' - ), - ) - parser.add_argument( - "--cmake-target", - help=("Target for cmake build."), - default="install", - ) - parser.add_argument( - "--extra-b2-args", - help=( - "Repeatable argument that contains extra arguments to pass " - "to b2, which compiles boost. " - "e.g.: 'cxxflags=-fPIC' 'cflags=-fPIC'" - ), - action="append", - ) - parser.add_argument( - "--shared-libs", - help="Build shared libraries if possible", - action="store_true", - default=False, - ) - parser.add_argument( - "--free-up-disk", - help="Remove unused tools and clean up intermediate files if possible to maximise space for the build", - action="store_true", - default=False, - ) - - -@cmd("fixup-dyn-deps", "Adjusts dynamic dependencies for packaging purposes") -class FixupDeps(ProjectCmdBase): - def run_project_cmd(self, args, loader, manifest): - projects = loader.manifests_in_dependency_order() - - # Accumulate the install directories so that the build steps - # can find their dep installation - install_dirs = [] - - for m in projects: - inst_dir = loader.get_project_install_dir_respecting_install_prefix(m) - install_dirs.append(inst_dir) - - if m == manifest: - dep_munger = create_dyn_dep_munger( - loader.build_opts, install_dirs, args.strip - ) - if dep_munger is None: - print(f"dynamic dependency fixups not supported on {sys.platform}") - else: - dep_munger.process_deps(args.destdir, args.final_install_prefix) - - def setup_project_cmd_parser(self, parser): - parser.add_argument("destdir", help="Where to copy the fixed up executables") - parser.add_argument( - "--final-install-prefix", help="specify the final installation prefix" - ) - parser.add_argument( - "--strip", - action="store_true", - default=False, - help="Strip debug info while processing executables", - ) - - -@cmd("test", "test a given project") -class TestCmd(ProjectCmdBase): - def run_project_cmd(self, args, loader, manifest): - projects = loader.manifests_in_dependency_order() - - # Accumulate the install directories so that the test steps - # can find their dep installation - install_dirs = [] - - for m in projects: - inst_dir = loader.get_project_install_dir(m) - - if m == manifest or args.test_dependencies: - built_marker = os.path.join(inst_dir, ".built-by-getdeps") - if not os.path.exists(built_marker): - print("project %s has not been built" % m.name) - # TODO: we could just go ahead and build it here, but I - # want to tackle that as part of adding build-for-test - # support. 
- return 1 - fetcher = loader.create_fetcher(m) - src_dir = fetcher.get_src_dir() - ctx = loader.ctx_gen.get_context(m.name) - build_dir = loader.get_project_build_dir(m) - builder = m.create_builder( - loader.build_opts, src_dir, build_dir, inst_dir, ctx, loader - ) - - builder.run_tests( - install_dirs, - schedule_type=args.schedule_type, - owner=args.test_owner, - test_filter=args.filter, - retry=args.retry, - no_testpilot=args.no_testpilot, - ) - - install_dirs.append(inst_dir) - - def setup_project_cmd_parser(self, parser): - parser.add_argument( - "--schedule-type", help="Indicates how the build was activated" - ) - parser.add_argument("--test-owner", help="Owner for testpilot") - parser.add_argument("--filter", help="Only run the tests matching the regex") - parser.add_argument( - "--retry", - type=int, - default=3, - help="Number of immediate retries for failed tests " - "(noop in continuous and testwarden runs)", - ) - parser.add_argument( - "--no-testpilot", - help="Do not use Test Pilot even when available", - action="store_true", - ) - - -@cmd("generate-github-actions", "generate a GitHub actions configuration") -class GenerateGitHubActionsCmd(ProjectCmdBase): - RUN_ON_ALL = """ [push, pull_request]""" - - def run_project_cmd(self, args, loader, manifest): - platforms = [ - HostType("linux", "ubuntu", "18"), - HostType("darwin", None, None), - HostType("windows", None, None), - ] - - for p in platforms: - if args.os_types and p.ostype not in args.os_types: - continue - self.write_job_for_platform(p, args) - - def get_run_on(self, args): - if args.run_on_all_branches: - return self.RUN_ON_ALL - return f""" - push: - branches: - - {args.main_branch} - pull_request: - branches: - - {args.main_branch}""" - - # TODO: Break up complex function - def write_job_for_platform(self, platform, args): # noqa: C901 - build_opts = setup_build_options(args, platform) - ctx_gen = build_opts.get_context_generator() - loader = ManifestLoader(build_opts, ctx_gen) - manifest = loader.load_manifest(args.project) - manifest_ctx = loader.ctx_gen.get_context(manifest.name) - run_on = self.get_run_on(args) - - # Some projects don't do anything "useful" as a leaf project, only - # as a dep for a leaf project. Check for those here; we don't want - # to waste the effort scheduling them on CI. - # We do this by looking at the builder type in the manifest file - # rather than creating a builder and checking its type because we - # don't know enough to create the full builder instance here. - builder_name = manifest.get("build", "builder", ctx=manifest_ctx) - if builder_name == "nop": - return None - - # We want to be sure that we're running things with python 3 - # but python versioning is honestly a bit of a frustrating mess. - # `python` may be version 2 or version 3 depending on the system. - # python3 may not be a thing at all! - # Assume an optimistic default - py3 = "python3" - - if build_opts.is_linux(): - artifacts = "linux" - runs_on = f"ubuntu-{args.ubuntu_version}" - elif build_opts.is_windows(): - artifacts = "windows" - runs_on = "windows-2019" - # The windows runners are python 3 by default; python2.exe - # is available if needed. 
- py3 = "python" - else: - artifacts = "mac" - runs_on = "macOS-latest" - - os.makedirs(args.output_dir, exist_ok=True) - - job_file_prefix = "getdeps_" - if args.job_file_prefix: - job_file_prefix = args.job_file_prefix - - output_file = os.path.join(args.output_dir, f"{job_file_prefix}{artifacts}.yml") - - if args.job_name_prefix: - job_name = args.job_name_prefix + artifacts.capitalize() - else: - job_name = artifacts - - with open(output_file, "w") as out: - # Deliberate line break here because the @ and the generated - # symbols are meaningful to our internal tooling when they - # appear in a single token - out.write("# This file was @") - out.write("generated by getdeps.py\n") - out.write( - f""" -name: {job_name} - -on:{run_on} - -permissions: - contents: read # to fetch code (actions/checkout) - -jobs: -""" - ) - - getdepscmd = f"{py3} build/fbcode_builder/getdeps.py" - - out.write(" build:\n") - out.write(" runs-on: %s\n" % runs_on) - out.write(" steps:\n") - - if build_opts.is_windows(): - # cmake relies on BOOST_ROOT but GH deliberately don't set it in order - # to avoid versioning issues: - # https://github.com/actions/virtual-environments/issues/319 - # Instead, set the version we think we need; this is effectively - # coupled with the boost manifest - # This is the unusual syntax for setting an env var for the rest of - # the steps in a workflow: - # https://github.blog/changelog/2020-10-01-github-actions-deprecating-set-env-and-add-path-commands/ - out.write(" - name: Export boost environment\n") - out.write( - ' run: "echo BOOST_ROOT=%BOOST_ROOT_1_83_0% >> %GITHUB_ENV%"\n' - ) - out.write(" shell: cmd\n") - - # The git installation may not like long filenames, so tell it - # that we want it to use them! - out.write(" - name: Fix Git config\n") - out.write(" run: git config --system core.longpaths true\n") - out.write(" - name: Disable autocrlf\n") - out.write(" run: git config --system core.autocrlf false\n") - - out.write(" - uses: actions/checkout@v2\n") - - if build_opts.free_up_disk: - free_up_disk = "--free-up-disk " - if not build_opts.is_windows(): - out.write(" - name: Show disk space at start\n") - out.write(" run: df -h\n") - # remove the unused github supplied android dev tools - out.write(" - name: Free up disk space\n") - out.write(" run: sudo rm -rf /usr/local/lib/android\n") - out.write(" - name: Show disk space after freeing up\n") - out.write(" run: df -h\n") - else: - free_up_disk = "" - - allow_sys_arg = "" - if ( - build_opts.allow_system_packages - and build_opts.host_type.get_package_manager() - ): - sudo_arg = "sudo " - allow_sys_arg = " --allow-system-packages" - if build_opts.host_type.get_package_manager() == "deb": - out.write(" - name: Update system package info\n") - out.write(f" run: {sudo_arg}apt-get update\n") - - out.write(" - name: Install system deps\n") - if build_opts.is_darwin(): - # brew is installed as regular user - sudo_arg = "" - out.write( - f" run: {sudo_arg}python3 build/fbcode_builder/getdeps.py --allow-system-packages install-system-deps --recursive {manifest.name}\n" - ) - - projects = loader.manifests_in_dependency_order() - - main_repo_url = manifest.get_repo_url(manifest_ctx) - has_same_repo_dep = False - - # Add the rust dep which doesn't have a manifest - for m in projects: - if m == manifest: - continue - mbuilder_name = m.get("build", "builder", ctx=manifest_ctx) - if ( - m.name == "rust" - or builder_name == "cargo" - or mbuilder_name == "cargo" - ): - out.write(" - name: Install Rust Stable\n") - out.write(" uses: 
dtolnay/rust-toolchain@stable\n")
-                    break
-
-            # Normal deps that have manifests
-            for m in projects:
-                if m == manifest or m.name == "rust":
-                    continue
-                ctx = loader.ctx_gen.get_context(m.name)
-                if m.get_repo_url(ctx) != main_repo_url:
-                    out.write("    - name: Fetch %s\n" % m.name)
-                    out.write(
-                        f"      run: {getdepscmd}{allow_sys_arg} fetch --no-tests {m.name}\n"
-                    )
-
-            for m in projects:
-                if m != manifest:
-                    if m.name == "rust":
-                        continue
-                    else:
-                        src_dir_arg = ""
-                        ctx = loader.ctx_gen.get_context(m.name)
-                        if main_repo_url and m.get_repo_url(ctx) == main_repo_url:
-                            # It's in the same repo, so src-dir is also "."
-                            src_dir_arg = "--src-dir=. "
-                            has_same_repo_dep = True
-                        out.write("    - name: Build %s\n" % m.name)
-                        out.write(
-                            f"      run: {getdepscmd}{allow_sys_arg} build {src_dir_arg}{free_up_disk}--no-tests {m.name}\n"
-                        )
-
-            out.write("    - name: Build %s\n" % manifest.name)
-
-            project_prefix = ""
-            if not build_opts.is_windows():
-                project_prefix = (
-                    " --project-install-prefix %s:/usr/local" % manifest.name
-                )
-
-            # If we have a dep from the same repo, we already built it and
-            # don't want to build it a second time
-            no_deps_arg = ""
-            if has_same_repo_dep:
-                no_deps_arg = "--no-deps "
-
-            no_tests_arg = ""
-            if not args.enable_tests:
-                no_tests_arg = "--no-tests "
-
-            out.write(
-                f"      run: {getdepscmd}{allow_sys_arg} build {no_tests_arg}{no_deps_arg}--src-dir=. {manifest.name} {project_prefix}\n"
-            )
-
-            out.write("    - name: Copy artifacts\n")
-            if build_opts.is_linux():
-                # Strip debug info from the binaries, but only on linux.
-                # While the `strip` utility is also available on macOS,
-                # attempting to strip there results in an error.
-                # The `strip` utility is not available on Windows.
-                strip = " --strip"
-            else:
-                strip = ""
-
-            out.write(
-                f"      run: {getdepscmd}{allow_sys_arg} fixup-dyn-deps{strip} "
-                f"--src-dir=. {manifest.name} _artifacts/{artifacts} {project_prefix} "
-                f"--final-install-prefix /usr/local\n"
-            )
-
-            out.write("    - uses: actions/upload-artifact@v2\n")
-            out.write("      with:\n")
-            out.write("        name: %s\n" % manifest.name)
-            out.write("        path: _artifacts\n")
-
-            if (
-                args.enable_tests
-                and manifest.get("github.actions", "run_tests", ctx=manifest_ctx)
-                != "off"
-            ):
-                out.write("    - name: Test %s\n" % manifest.name)
-                out.write(
-                    f"      run: {getdepscmd}{allow_sys_arg} test --src-dir=. 
{manifest.name} {project_prefix}\n" - ) - if build_opts.free_up_disk and not build_opts.is_windows(): - out.write(" - name: Show disk space at end\n") - out.write(" run: df -h\n") - - def setup_project_cmd_parser(self, parser): - parser.add_argument( - "--disallow-system-packages", - help="Disallow satisfying third party deps from installed system packages", - action="store_true", - default=False, - ) - parser.add_argument( - "--output-dir", help="The directory that will contain the yml files" - ) - parser.add_argument( - "--run-on-all-branches", - action="store_true", - help="Allow CI to fire on all branches - Handy for testing", - ) - parser.add_argument( - "--ubuntu-version", default="20.04", help="Version of Ubuntu to use" - ) - parser.add_argument( - "--main-branch", - default="main", - help="Main branch to trigger GitHub Action on", - ) - parser.add_argument( - "--os-type", - help="Filter to just this OS type to run", - choices=["linux", "darwin", "windows"], - action="append", - dest="os_types", - default=[], - ) - parser.add_argument( - "--job-file-prefix", - type=str, - help="add a prefix to all job file names", - default=None, - ) - parser.add_argument( - "--job-name-prefix", - type=str, - help="add a prefix to all job names", - default=None, - ) - parser.add_argument( - "--free-up-disk", - help="Remove unused tools and clean up intermediate files if possible to maximise space for the build", - action="store_true", - default=False, - ) - - -def get_arg_var_name(args): - for arg in args: - if arg.startswith("--"): - return arg[2:].replace("-", "_") - - raise Exception("unable to determine argument variable name from %r" % (args,)) - - -def parse_args(): - # We want to allow common arguments to be specified either before or after - # the subcommand name. In order to do this we add them to the main parser - # and to subcommand parsers. In order for this to work, we need to tell - # argparse that the default value is SUPPRESS, so that the default values - # from the subparser arguments won't override values set by the user from - # the main parser. We maintain our own list of desired defaults in the - # common_defaults dictionary, and manually set those if the argument wasn't - # present at all. - common_args = argparse.ArgumentParser(add_help=False) - common_defaults = {} - - def add_common_arg(*args, **kwargs): - var_name = get_arg_var_name(args) - default_value = kwargs.pop("default", None) - common_defaults[var_name] = default_value - kwargs["default"] = argparse.SUPPRESS - common_args.add_argument(*args, **kwargs) - - add_common_arg("--scratch-path", help="Where to maintain checkouts and build dirs") - add_common_arg( - "--vcvars-path", default=None, help="Path to the vcvarsall.bat on Windows." - ) - add_common_arg( - "--install-prefix", - help=( - "Where the final build products will be installed " - "(default is [scratch-path]/installed)" - ), - ) - add_common_arg( - "--num-jobs", - type=int, - help=( - "Number of concurrent jobs to use while building. 
" - "(default=number of cpu cores)" - ), - ) - add_common_arg( - "--use-shipit", - help="use the real ShipIt instead of the simple shipit transformer", - action="store_true", - default=False, - ) - add_common_arg( - "--facebook-internal", - help="Setup the build context as an FB internal build", - action="store_true", - default=None, - ) - add_common_arg( - "--no-facebook-internal", - help="Perform a non-FB internal build, even when in an fbsource repository", - action="store_false", - dest="facebook_internal", - ) - add_common_arg( - "--allow-system-packages", - help="Allow satisfying third party deps from installed system packages", - action="store_true", - default=False, - ) - add_common_arg( - "-v", - "--verbose", - help="Print more output", - action="store_true", - default=False, - ) - add_common_arg( - "--lfs-path", - help="Provide a parent directory for lfs when fbsource is unavailable", - default=None, - ) - - ap = argparse.ArgumentParser( - description="Get and build dependencies and projects", parents=[common_args] - ) - sub = ap.add_subparsers( - # metavar suppresses the long and ugly default list of subcommands on a - # single line. We still render the nicer list below where we would - # have shown the nasty one. - metavar="", - title="Available commands", - help="", - ) - - add_subcommands(sub, common_args) - - args = ap.parse_args() - for var_name, default_value in common_defaults.items(): - if not hasattr(args, var_name): - setattr(args, var_name, default_value) - - return ap, args - - -def main(): - ap, args = parse_args() - if getattr(args, "func", None) is None: - ap.print_help() - return 0 - try: - return args.func(args) - except UsageError as exc: - ap.error(str(exc)) - return 1 - except TransientFailure as exc: - print("TransientFailure: %s" % str(exc)) - # This return code is treated as a retryable transient infrastructure - # error by Facebook's internal CI, rather than eg: a build or code - # related error that needs to be fixed before progress can be made. - return 128 - except subprocess.CalledProcessError as exc: - print("%s" % str(exc), file=sys.stderr) - print("!! Failed", file=sys.stderr) - return 1 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/build/fbcode_builder/getdeps/__init__.py b/build/fbcode_builder/getdeps/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/build/fbcode_builder/getdeps/builder.py b/build/fbcode_builder/getdeps/builder.py deleted file mode 100644 index 3bcadb5d6e986..0000000000000 --- a/build/fbcode_builder/getdeps/builder.py +++ /dev/null @@ -1,1202 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import glob -import json -import os -import pathlib -import shutil -import stat -import subprocess -import sys -import typing -from typing import Optional - -from .dyndeps import create_dyn_dep_munger -from .envfuncs import add_path_entry, Env, path_search -from .fetcher import copy_if_different -from .runcmd import run_cmd - -if typing.TYPE_CHECKING: - from .buildopts import BuildOptions - - -class BuilderBase(object): - def __init__( - self, - build_opts: "BuildOptions", - ctx, - manifest, - src_dir, - build_dir, - inst_dir, - env=None, - final_install_prefix=None, - ) -> None: - self.env = Env() - if env: - self.env.update(env) - - subdir = manifest.get("build", "subdir", ctx=ctx) - if subdir: - src_dir = os.path.join(src_dir, subdir) - - self.patchfile = manifest.get("build", "patchfile", ctx=ctx) - self.patchfile_opts = manifest.get("build", "patchfile_opts", ctx=ctx) or "" - self.ctx = ctx - self.src_dir = src_dir - self.build_dir = build_dir or src_dir - self.inst_dir = inst_dir - self.build_opts = build_opts - self.manifest = manifest - self.final_install_prefix = final_install_prefix - - def _get_cmd_prefix(self): - if self.build_opts.is_windows(): - vcvarsall = self.build_opts.get_vcvars_path() - if vcvarsall is not None: - # Since it sets rather a large number of variables we mildly abuse - # the cmd quoting rules to assemble a command that calls the script - # to prep the environment and then triggers the actual command that - # we wanted to run. - return [vcvarsall, "amd64", "&&"] - return [] - - def _run_cmd( - self, - cmd, - cwd=None, - env=None, - use_cmd_prefix: bool = True, - allow_fail: bool = False, - ) -> int: - if env: - e = self.env.copy() - e.update(env) - env = e - else: - env = self.env - - if use_cmd_prefix: - cmd_prefix = self._get_cmd_prefix() - if cmd_prefix: - cmd = cmd_prefix + cmd - - log_file = os.path.join(self.build_dir, "getdeps_build.log") - return run_cmd( - cmd=cmd, - env=env, - cwd=cwd or self.build_dir, - log_file=log_file, - allow_fail=allow_fail, - ) - - def _reconfigure(self, reconfigure: bool) -> bool: - if self.build_dir is not None: - if not os.path.isdir(self.build_dir): - os.makedirs(self.build_dir) - reconfigure = True - return reconfigure - - def _apply_patchfile(self) -> None: - if self.patchfile is None: - return - patched_sentinel_file = pathlib.Path(self.src_dir + "/.getdeps_patched") - if patched_sentinel_file.exists(): - return - old_wd = os.getcwd() - os.chdir(self.src_dir) - print(f"Patching {self.manifest.name} with {self.patchfile} in {self.src_dir}") - patchfile = os.path.join( - self.build_opts.fbcode_builder_dir, "patches", self.patchfile - ) - patchcmd = ["git", "apply"] - if self.patchfile_opts: - patchcmd.append(self.patchfile_opts) - try: - subprocess.check_call(patchcmd + [patchfile]) - except subprocess.CalledProcessError: - raise ValueError(f"Failed to apply patch to {self.manifest.name}") - os.chdir(old_wd) - patched_sentinel_file.touch() - - def prepare(self, install_dirs, reconfigure: bool) -> None: - print("Preparing %s..." % self.manifest.name) - reconfigure = self._reconfigure(reconfigure) - self._apply_patchfile() - self._prepare(install_dirs=install_dirs, reconfigure=reconfigure) - - def build(self, install_dirs, reconfigure: bool) -> None: - print("Building %s..." 
% self.manifest.name) - reconfigure = self._reconfigure(reconfigure) - self._apply_patchfile() - self._prepare(install_dirs=install_dirs, reconfigure=reconfigure) - self._build(install_dirs=install_dirs, reconfigure=reconfigure) - - if self.build_opts.free_up_disk: - # don't clean --src-dir=. case as user may want to build again or run tests on the build - if self.src_dir.startswith(self.build_opts.scratch_dir) and os.path.isdir( - self.build_dir - ): - if os.path.islink(self.build_dir): - os.remove(self.build_dir) - else: - shutil.rmtree(self.build_dir) - - # On Windows, emit a wrapper script that can be used to run build artifacts - # directly from the build directory, without installing them. On Windows $PATH - # needs to be updated to include all of the directories containing the runtime - # library dependencies in order to run the binaries. - if self.build_opts.is_windows(): - script_path = self.get_dev_run_script_path() - dep_munger = create_dyn_dep_munger(self.build_opts, install_dirs) - dep_dirs = self.get_dev_run_extra_path_dirs(install_dirs, dep_munger) - # pyre-fixme[16]: Optional type has no attribute `emit_dev_run_script`. - dep_munger.emit_dev_run_script(script_path, dep_dirs) - - @property - def num_jobs(self) -> int: - # This is a hack, but we don't have a "defaults manifest" that we can - # customize per platform. - # TODO: Introduce some sort of defaults config that can select by - # platform, just like manifest contexts. - if sys.platform.startswith("freebsd"): - # clang on FreeBSD is quite memory-efficient. - default_job_weight = 512 - else: - # 1.5 GiB is a lot to assume, but it's typical of Facebook-style C++. - # Some manifests are even heavier and should override. - default_job_weight = 1536 - return self.build_opts.get_num_jobs( - int( - self.manifest.get( - "build", "job_weight_mib", default_job_weight, ctx=self.ctx - ) - ) - ) - - def run_tests( - self, install_dirs, schedule_type, owner, test_filter, retry, no_testpilot - ) -> None: - """Execute any tests that we know how to run. If they fail, - raise an exception.""" - pass - - def _prepare(self, install_dirs, reconfigure) -> None: - """Prepare the build. Useful when need to generate config, - but builder is not the primary build system. - e.g. cargo when called from cmake""" - pass - - def _build(self, install_dirs, reconfigure) -> None: - """Perform the build. - install_dirs contains the list of installation directories for - the dependencies of this project. 
- reconfigure will be set to true if the fetcher determined - that the sources have changed in such a way that the build - system needs to regenerate its rules.""" - pass - - def _compute_env(self, install_dirs): - # CMAKE_PREFIX_PATH is only respected when passed through the - # environment, so we construct an appropriate path to pass down - return self.build_opts.compute_env_for_install_dirs( - install_dirs, env=self.env, manifest=self.manifest - ) - - def get_dev_run_script_path(self): - assert self.build_opts.is_windows() - return os.path.join(self.build_dir, "run.ps1") - - def get_dev_run_extra_path_dirs(self, install_dirs, dep_munger=None): - assert self.build_opts.is_windows() - if dep_munger is None: - dep_munger = create_dyn_dep_munger(self.build_opts, install_dirs) - return dep_munger.compute_dependency_paths(self.build_dir) - - -class MakeBuilder(BuilderBase): - def __init__( - self, - build_opts, - ctx, - manifest, - src_dir, - build_dir, - inst_dir, - build_args, - install_args, - test_args, - ) -> None: - super(MakeBuilder, self).__init__( - build_opts, ctx, manifest, src_dir, build_dir, inst_dir - ) - self.build_args = build_args or [] - self.install_args = install_args or [] - self.test_args = test_args - - @property - def _make_binary(self): - return self.manifest.get("build", "make_binary", "make", ctx=self.ctx) - - def _get_prefix(self): - return ["PREFIX=" + self.inst_dir, "prefix=" + self.inst_dir] - - def _build(self, install_dirs, reconfigure) -> None: - - env = self._compute_env(install_dirs) - - # Need to ensure that PREFIX is set prior to install because - # libbpf uses it when generating its pkg-config file. - # The lowercase prefix is used by some projects. - cmd = ( - [self._make_binary, "-j%s" % self.num_jobs] - + self.build_args - + self._get_prefix() - ) - self._run_cmd(cmd, env=env) - - install_cmd = [self._make_binary] + self.install_args + self._get_prefix() - self._run_cmd(install_cmd, env=env) - - # bz2's Makefile doesn't install its .so properly - if self.manifest and self.manifest.name == "bz2": - libdir = os.path.join(self.inst_dir, "lib") - srcpattern = os.path.join(self.src_dir, "lib*.so.*") - print(f"copying to {libdir} from {srcpattern}") - for file in glob.glob(srcpattern): - shutil.copy(file, libdir) - - def run_tests( - self, install_dirs, schedule_type, owner, test_filter, retry, no_testpilot - ) -> None: - if not self.test_args: - return - - env = self._compute_env(install_dirs) - - cmd = [self._make_binary] + self.test_args + self._get_prefix() - self._run_cmd(cmd, env=env) - - -class CMakeBootStrapBuilder(MakeBuilder): - def _build(self, install_dirs, reconfigure) -> None: - self._run_cmd( - [ - "./bootstrap", - "--prefix=" + self.inst_dir, - f"--parallel={self.num_jobs}", - ] - ) - super(CMakeBootStrapBuilder, self)._build(install_dirs, reconfigure) - - -class AutoconfBuilder(BuilderBase): - def __init__( - self, - build_opts, - ctx, - manifest, - src_dir, - build_dir, - inst_dir, - args, - conf_env_args, - ) -> None: - super(AutoconfBuilder, self).__init__( - build_opts, ctx, manifest, src_dir, build_dir, inst_dir - ) - self.args = args or [] - self.conf_env_args = conf_env_args or {} - - @property - def _make_binary(self): - return self.manifest.get("build", "make_binary", "make", ctx=self.ctx) - - def _build(self, install_dirs, reconfigure) -> None: - configure_path = os.path.join(self.src_dir, "configure") - autogen_path = os.path.join(self.src_dir, "autogen.sh") - - env = self._compute_env(install_dirs) - - # Some configure scripts 
need additional env values derived from running commands
-        for (k, cmd_args) in self.conf_env_args.items():
-            out = (
-                subprocess.check_output(cmd_args, env=dict(env.items()))
-                .decode("utf-8")
-                .strip()
-            )
-            if out:
-                env.set(k, out)
-
-        if not os.path.exists(configure_path):
-            print("%s doesn't exist, so reconfiguring" % configure_path)
-            # This libtoolize call is a bit gross; the issue is that
-            # `autoreconf` as invoked by libsodium's `autogen.sh` doesn't
-            # seem to realize that it should invoke libtoolize, and it then
-            # errors out when the configure script references a libtool
-            # related symbol.
-            self._run_cmd(["libtoolize"], cwd=self.src_dir, env=env)
-
-            # We generally prefer to call the `autogen.sh` script provided
-            # by the project on the basis that it may know more than plain
-            # autoreconf does.
-            if os.path.exists(autogen_path):
-                self._run_cmd(["bash", autogen_path], cwd=self.src_dir, env=env)
-            else:
-                self._run_cmd(["autoreconf", "-ivf"], cwd=self.src_dir, env=env)
-        configure_cmd = [configure_path, "--prefix=" + self.inst_dir] + self.args
-        self._run_cmd(configure_cmd, env=env)
-        self._run_cmd([self._make_binary, "-j%s" % self.num_jobs], env=env)
-        self._run_cmd([self._make_binary, "install"], env=env)
-
-
-class Iproute2Builder(BuilderBase):
-    # ./configure --prefix does not work for iproute2.
-    # Thus, explicitly copy sources from src_dir to build_dir, build there,
-    # and then install to inst_dir using DESTDIR;
-    # lastly, also copy include from build_dir to inst_dir.
-    def __init__(self, build_opts, ctx, manifest, src_dir, build_dir, inst_dir) -> None:
-        super(Iproute2Builder, self).__init__(
-            build_opts, ctx, manifest, src_dir, build_dir, inst_dir
-        )
-
-    def _patch(self) -> None:
-        # FBOSS build currently depends on an old version of iproute2 (commit
-        # 7ca63aef7d1b0c808da0040c6b366ef7a61f38c1). This is missing a commit
-        # (ae717baf15fb4d30749ada3948d9445892bac239) needed to build iproute2
-        # successfully.
-        # Apply it here by prepending the missing `#include <stdint.h>`.
-        # Reference: https://fburl.com/ilx9g5xm
-        with open(self.build_dir + "/tc/tc_core.c", "r") as f:
-            data = f.read()
-
-        with open(self.build_dir + "/tc/tc_core.c", "w") as f:
-            f.write("#include <stdint.h>\n")
-            f.write(data)
-
-    def _build(self, install_dirs, reconfigure) -> None:
-        configure_path = os.path.join(self.src_dir, "configure")
-
-        env = self.env.copy()
-        self._run_cmd([configure_path], env=env)
-        shutil.rmtree(self.build_dir)
-        shutil.copytree(self.src_dir, self.build_dir)
-        self._patch()
-        self._run_cmd(["make", "-j%s" % self.num_jobs], env=env)
-        install_cmd = ["make", "install", "DESTDIR=" + self.inst_dir]
-
-        for d in ["include", "lib"]:
-            if not os.path.isdir(os.path.join(self.inst_dir, d)):
-                shutil.copytree(
-                    os.path.join(self.build_dir, d), os.path.join(self.inst_dir, d)
-                )
-
-        self._run_cmd(install_cmd, env=env)
-
-
-class CMakeBuilder(BuilderBase):
-    MANUAL_BUILD_SCRIPT = """\
-#!{sys.executable}
-
-
-import argparse
-import subprocess
-import sys
-
-CMAKE = {cmake!r}
-CTEST = {ctest!r}
-SRC_DIR = {src_dir!r}
-BUILD_DIR = {build_dir!r}
-INSTALL_DIR = {install_dir!r}
-CMD_PREFIX = {cmd_prefix!r}
-CMAKE_ENV = {env_str}
-CMAKE_DEFINE_ARGS = {define_args_str}
-
-
-def get_jobs_argument(num_jobs_arg: int) -> str:
-    if num_jobs_arg > 0:
-        return "-j" + str(num_jobs_arg)
-
-    import multiprocessing
-    num_jobs = multiprocessing.cpu_count() // 2
-    return "-j" + str(num_jobs)
-
-
-def main():
-    ap = argparse.ArgumentParser()
-    ap.add_argument(
-        "cmake_args",
-        nargs=argparse.REMAINDER,
-        help='Any extra arguments after an "--" argument will be passed '
-        "directly to CMake."
-    )
-    ap.add_argument(
-        "--mode",
-        choices=["configure", "build", "install", "test"],
-        default="configure",
-        help="The mode to run: configure, build, install, or test. 
" - "Defaults to configure", - ) - ap.add_argument( - "--build", - action="store_const", - const="build", - dest="mode", - help="An alias for --mode=build", - ) - ap.add_argument( - "-j", - "--num-jobs", - action="store", - type=int, - default=0, - help="Run the build or tests with the specified number of parallel jobs", - ) - ap.add_argument( - "--install", - action="store_const", - const="install", - dest="mode", - help="An alias for --mode=install", - ) - ap.add_argument( - "--test", - action="store_const", - const="test", - dest="mode", - help="An alias for --mode=test", - ) - args = ap.parse_args() - - # Strip off a leading "--" from the additional CMake arguments - if args.cmake_args and args.cmake_args[0] == "--": - args.cmake_args = args.cmake_args[1:] - - env = CMAKE_ENV - - if args.mode == "configure": - full_cmd = CMD_PREFIX + [CMAKE, SRC_DIR] + CMAKE_DEFINE_ARGS + args.cmake_args - elif args.mode in ("build", "install"): - target = "all" if args.mode == "build" else "install" - full_cmd = CMD_PREFIX + [ - CMAKE, - "--build", - BUILD_DIR, - "--target", - target, - "--config", - "Release", - get_jobs_argument(args.num_jobs), - ] + args.cmake_args - elif args.mode == "test": - full_cmd = CMD_PREFIX + [ - {dev_run_script}CTEST, - "--output-on-failure", - get_jobs_argument(args.num_jobs), - ] + args.cmake_args - else: - ap.error("unknown invocation mode: %s" % (args.mode,)) - - cmd_str = " ".join(full_cmd) - print("Running: %r" % (cmd_str,)) - proc = subprocess.run(full_cmd, env=env, cwd=BUILD_DIR) - sys.exit(proc.returncode) - - -if __name__ == "__main__": - main() -""" - - def __init__( - self, - build_opts, - ctx, - manifest, - src_dir, - build_dir, - inst_dir, - defines, - loader=None, - final_install_prefix=None, - extra_cmake_defines=None, - cmake_target="install", - ) -> None: - super(CMakeBuilder, self).__init__( - build_opts, - ctx, - manifest, - src_dir, - build_dir, - inst_dir, - final_install_prefix=final_install_prefix, - ) - self.defines = defines or {} - if extra_cmake_defines: - self.defines.update(extra_cmake_defines) - self.cmake_target = cmake_target - - try: - from .facebook.vcvarsall import extra_vc_cmake_defines - except ImportError: - pass - else: - self.defines.update(extra_vc_cmake_defines) - - self.loader = loader - if build_opts.shared_libs: - self.defines["BUILD_SHARED_LIBS"] = "ON" - - def _invalidate_cache(self) -> None: - for name in [ - "CMakeCache.txt", - "CMakeFiles/CMakeError.log", - "CMakeFiles/CMakeOutput.log", - ]: - name = os.path.join(self.build_dir, name) - if os.path.isdir(name): - shutil.rmtree(name) - elif os.path.exists(name): - os.unlink(name) - - def _needs_reconfigure(self) -> bool: - for name in ["CMakeCache.txt", "build.ninja"]: - name = os.path.join(self.build_dir, name) - if not os.path.exists(name): - return True - return False - - def _write_build_script(self, **kwargs) -> None: - env_lines = [" {!r}: {!r},".format(k, v) for k, v in kwargs["env"].items()] - kwargs["env_str"] = "\n".join(["{"] + env_lines + ["}"]) - - if self.build_opts.is_windows(): - kwargs["dev_run_script"] = '"powershell.exe", {!r}, '.format( - self.get_dev_run_script_path() - ) - else: - kwargs["dev_run_script"] = "" - - define_arg_lines = ["["] - for arg in kwargs["define_args"]: - # Replace the CMAKE_INSTALL_PREFIX argument to use the INSTALL_DIR - # variable that we define in the MANUAL_BUILD_SCRIPT code. 
- if arg.startswith("-DCMAKE_INSTALL_PREFIX="): - value = " {!r}.format(INSTALL_DIR),".format( - "-DCMAKE_INSTALL_PREFIX={}" - ) - else: - value = " {!r},".format(arg) - define_arg_lines.append(value) - define_arg_lines.append("]") - kwargs["define_args_str"] = "\n".join(define_arg_lines) - - # In order to make it easier for developers to manually run builds for - # CMake-based projects, write out some build scripts that can be used to invoke - # CMake manually. - build_script_path = os.path.join(self.build_dir, "run_cmake.py") - script_contents = self.MANUAL_BUILD_SCRIPT.format(**kwargs) - with open(build_script_path, "wb") as f: - f.write(script_contents.encode()) - os.chmod(build_script_path, 0o755) - - def _compute_cmake_define_args(self, env): - defines = { - "CMAKE_INSTALL_PREFIX": self.final_install_prefix or self.inst_dir, - "BUILD_SHARED_LIBS": "OFF", - # Some of the deps (rsocket) default to UBSAN enabled if left - # unspecified. Some of the deps fail to compile in release mode - # due to warning->error promotion. RelWithDebInfo is the happy - # medium. - "CMAKE_BUILD_TYPE": "RelWithDebInfo", - } - if "SANDCASTLE" not in os.environ: - # We sometimes see intermittent ccache related breakages on some - # of the FB internal CI hosts, so we prefer to disable ccache - # when running in that environment. - ccache = path_search(env, "ccache") - if ccache: - defines["CMAKE_CXX_COMPILER_LAUNCHER"] = ccache - else: - # rocksdb does its own probing for ccache. - # Ensure that it is disabled on sandcastle - env["CCACHE_DISABLE"] = "1" - # Some sandcastle hosts have broken ccache related dirs, and - # even though we've asked for it to be disabled ccache is - # still invoked by rocksdb's cmake. - # Redirect its config directory to somewhere that is guaranteed - # fresh to us, and that won't have any ccache data inside. - env["CCACHE_DIR"] = f"{self.build_opts.scratch_dir}/ccache" - - if "GITHUB_ACTIONS" in os.environ and self.build_opts.is_windows(): - # GitHub actions: the host has both gcc and msvc installed, and - # the default behavior of cmake is to prefer gcc. - # Instruct cmake that we want it to use cl.exe; this is important - # because Boost prefers cl.exe and the mismatch results in cmake - # with gcc not being able to find boost built with cl.exe. - defines["CMAKE_C_COMPILER"] = "cl.exe" - defines["CMAKE_CXX_COMPILER"] = "cl.exe" - - if self.build_opts.is_darwin(): - # Try to persuade cmake to set the rpath to match the lib - # dirs of the dependencies. This isn't automatic, and to - # make things more interesting, cmake uses `;` as the path - # separator, so translate the runtime path to something - # that cmake will parse - defines["CMAKE_INSTALL_RPATH"] = ";".join( - env.get("DYLD_LIBRARY_PATH", "").split(":") - ) - # Tell cmake that we want to set the rpath in the tree - # at build time. Without this the rpath is only set - # at the moment that the binaries are installed. That - # default is problematic for example when using the - # gtest integration in cmake which runs the built test - # executables during the build to discover the set of - # tests. 
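-            # (Illustrative: a DYLD_LIBRARY_PATH of "/opt/a/lib:/opt/b/lib"
-            # becomes a CMAKE_INSTALL_RPATH of "/opt/a/lib;/opt/b/lib".)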
- defines["CMAKE_BUILD_WITH_INSTALL_RPATH"] = "ON" - - boost_169_is_required = False - if self.loader: - for m in self.loader.manifests_in_dependency_order(): - preinstalled = m.get_section_as_dict("preinstalled.env", self.ctx) - boost_169_is_required = "BOOST_ROOT_1_69_0" in preinstalled.keys() - if boost_169_is_required: - break - - if ( - boost_169_is_required - and self.build_opts.allow_system_packages - and self.build_opts.host_type.get_package_manager() - and self.build_opts.host_type.get_package_manager() == "rpm" - ): - # Boost 1.69 rpms don't install cmake config to the system, so to point to them explicitly - defines["BOOST_INCLUDEDIR"] = "/usr/include/boost169" - defines["BOOST_LIBRARYDIR"] = "/usr/lib64/boost169" - - defines.update(self.defines) - define_args = ["-D%s=%s" % (k, v) for (k, v) in defines.items()] - - # if self.build_opts.is_windows(): - # define_args += ["-G", "Visual Studio 15 2017 Win64"] - define_args += ["-G", "Ninja"] - - return define_args - - def _build(self, install_dirs, reconfigure: bool) -> None: - reconfigure = reconfigure or self._needs_reconfigure() - - env = self._compute_env(install_dirs) - if not self.build_opts.is_windows() and self.final_install_prefix: - env["DESTDIR"] = self.inst_dir - - # Resolve the cmake that we installed - cmake = path_search(env, "cmake") - if cmake is None: - raise Exception("Failed to find CMake") - - if reconfigure: - define_args = self._compute_cmake_define_args(env) - self._write_build_script( - cmd_prefix=self._get_cmd_prefix(), - cmake=cmake, - ctest=path_search(env, "ctest"), - env=env, - define_args=define_args, - src_dir=self.src_dir, - build_dir=self.build_dir, - install_dir=self.inst_dir, - sys=sys, - ) - - self._invalidate_cache() - self._run_cmd([cmake, self.src_dir] + define_args, env=env) - - self._run_cmd( - [ - cmake, - "--build", - self.build_dir, - "--target", - self.cmake_target, - "--config", - "Release", - "-j", - str(self.num_jobs), - ], - env=env, - ) - - def run_tests( - self, install_dirs, schedule_type, owner, test_filter, retry: int, no_testpilot - ) -> None: - env = self._compute_env(install_dirs) - ctest = path_search(env, "ctest") - cmake = path_search(env, "cmake") - - def require_command(path: Optional[str], name: str) -> str: - if path is None: - raise RuntimeError("unable to find command `{}`".format(name)) - return path - - # On Windows, we also need to update $PATH to include the directories that - # contain runtime library dependencies. This is not needed on other platforms - # since CMake will emit RPATH properly in the binary so they can find these - # dependencies. - if self.build_opts.is_windows(): - path_entries = self.get_dev_run_extra_path_dirs(install_dirs) - path = env.get("PATH") - if path: - path_entries.insert(0, path) - env["PATH"] = ";".join(path_entries) - - # Don't use the cmd_prefix when running tests. This is vcvarsall.bat on - # Windows. vcvarsall.bat is only needed for the build, not tests. It - # unfortunately fails if invoked with a long PATH environment variable when - # running the tests. - use_cmd_prefix = False - - def get_property(test, propname, defval=None): - """extracts a named property from a cmake test info json blob. - The properties look like: - [{"name": "WORKING_DIRECTORY"}, - {"value": "something"}] - We assume that it is invalid for the same named property to be - listed more than once. 
- """ - props = test.get("properties", []) - for p in props: - if p.get("name", None) == propname: - return p.get("value", defval) - return defval - - def list_tests(): - output = subprocess.check_output( - [require_command(ctest, "ctest"), "--show-only=json-v1"], - env=env, - cwd=self.build_dir, - ) - try: - data = json.loads(output.decode("utf-8")) - except ValueError as exc: - raise Exception( - "Failed to decode cmake test info using %s: %s. Output was: %r" - % (ctest, str(exc), output) - ) - - tests = [] - machine_suffix = self.build_opts.host_type.as_tuple_string() - for test in data["tests"]: - working_dir = get_property(test, "WORKING_DIRECTORY") - labels = [] - machine_suffix = self.build_opts.host_type.as_tuple_string() - labels.append("tpx-fb-test-type=3") - labels.append("tpx_test_config::buildsystem=getdeps") - labels.append("tpx_test_config::platform={}".format(machine_suffix)) - - if get_property(test, "DISABLED"): - labels.append("disabled") - command = test["command"] - if working_dir: - command = [ - require_command(cmake, "cmake"), - "-E", - "chdir", - working_dir, - ] + command - - import os - - tests.append( - { - "type": "custom", - "target": "%s-%s-getdeps-%s" - % (self.manifest.name, test["name"], machine_suffix), - "command": command, - "labels": labels, - "env": {}, - "required_paths": [], - "contacts": [], - "cwd": os.getcwd(), - } - ) - return tests - - if schedule_type == "continuous" or schedule_type == "testwarden": - # for continuous and testwarden runs, disabling retry can give up - # better signals for flaky tests. - retry = 0 - - tpx = path_search(env, "tpx") - if tpx and not no_testpilot: - buck_test_info = list_tests() - import os - - from .facebook.testinfra import start_run - - buck_test_info_name = os.path.join(self.build_dir, ".buck-test-info.json") - with open(buck_test_info_name, "w") as f: - json.dump(buck_test_info, f) - - env.set("http_proxy", "") - env.set("https_proxy", "") - runs = [] - from sys import platform - - with start_run(env["FBSOURCE_HASH"]) as run_id: - testpilot_args = [ - tpx, - "--force-local-execution", - "--buck-test-info", - buck_test_info_name, - "--retry=%d" % retry, - "-j=%s" % str(self.num_jobs), - "--print-long-results", - ] - - if owner: - testpilot_args += ["--contacts", owner] - - if env: - testpilot_args.append("--env") - testpilot_args.extend(f"{key}={val}" for key, val in env.items()) - - if run_id is not None: - testpilot_args += ["--run-id", run_id] - - if test_filter: - testpilot_args += ["--", test_filter] - - if schedule_type == "diff": - runs.append(["--collection", "oss-diff", "--purpose", "diff"]) - elif schedule_type == "continuous": - runs.append( - [ - "--tag-new-tests", - "--collection", - "oss-continuous", - "--purpose", - "continuous", - ] - ) - elif schedule_type == "testwarden": - # One run to assess new tests - runs.append( - [ - "--tag-new-tests", - "--collection", - "oss-new-test-stress", - "--stress-runs", - "10", - "--purpose", - "stress-run-new-test", - ] - ) - # And another for existing tests - runs.append( - [ - "--tag-new-tests", - "--collection", - "oss-existing-test-stress", - "--stress-runs", - "10", - "--purpose", - "stress-run", - ] - ) - else: - runs.append([]) - - for run in runs: - self._run_cmd( - testpilot_args + run, - cwd=self.build_opts.fbcode_builder_dir, - env=env, - use_cmd_prefix=use_cmd_prefix, - ) - else: - args = [ - require_command(ctest, "ctest"), - "--output-on-failure", - "-j", - str(self.num_jobs), - ] - if test_filter: - args += ["-R", test_filter] - - count = 0 - 
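-            # Plain ctest path: run the suite, then retry only the failures
-            # (via --rerun-failed) up to `retry` more times.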
-            while count <= retry:
-                retcode = self._run_cmd(
-                    args, env=env, use_cmd_prefix=use_cmd_prefix, allow_fail=True
-                )
-
-                if retcode == 0:
-                    break
-                if count == 0:
-                    # Only add this option in the second run.
-                    args += ["--rerun-failed"]
-                count += 1
-            # pyre-fixme[61]: `retcode` is undefined, or not always defined.
-            if retcode != 0:
-                # Allow the except clause in getdeps.main to catch this and
-                # exit gracefully. Failing non-testpilot runs through the same
-                # logic as failed testpilot runs may come in handy if post-test
-                # processing is ever needed in the future.
-                # pyre-fixme[61]: `retcode` is undefined, or not always defined.
-                raise subprocess.CalledProcessError(retcode, args)
-
-
-class NinjaBootstrap(BuilderBase):
-    def __init__(self, build_opts, ctx, manifest, build_dir, src_dir, inst_dir) -> None:
-        super(NinjaBootstrap, self).__init__(
-            build_opts, ctx, manifest, src_dir, build_dir, inst_dir
-        )
-
-    def _build(self, install_dirs, reconfigure) -> None:
-        self._run_cmd([sys.executable, "configure.py", "--bootstrap"], cwd=self.src_dir)
-        src_ninja = os.path.join(self.src_dir, "ninja")
-        dest_ninja = os.path.join(self.inst_dir, "bin/ninja")
-        bin_dir = os.path.dirname(dest_ninja)
-        if not os.path.exists(bin_dir):
-            os.makedirs(bin_dir)
-        shutil.copyfile(src_ninja, dest_ninja)
-        shutil.copymode(src_ninja, dest_ninja)
-
-
-class OpenSSLBuilder(BuilderBase):
-    def __init__(self, build_opts, ctx, manifest, build_dir, src_dir, inst_dir) -> None:
-        super(OpenSSLBuilder, self).__init__(
-            build_opts, ctx, manifest, src_dir, build_dir, inst_dir
-        )
-
-    def _build(self, install_dirs, reconfigure) -> None:
-        configure = os.path.join(self.src_dir, "Configure")
-
-        # prefer to resolve the perl that we installed from
-        # our manifest on windows, but fall back to the system
-        # path on e.g. darwin
-        env = self.env.copy()
-        for d in install_dirs:
-            bindir = os.path.join(d, "bin")
-            add_path_entry(env, "PATH", bindir, append=False)
-
-        perl = typing.cast(str, path_search(env, "perl", "perl"))
-
-        make_j_args = []
-        if self.build_opts.is_windows():
-            make = "nmake.exe"
-            args = ["VC-WIN64A-masm", "-utf-8"]
-        elif self.build_opts.is_darwin():
-            make = "make"
-            make_j_args = ["-j%s" % self.num_jobs]
-            args = (
-                ["darwin64-x86_64-cc"]
-                if not self.build_opts.is_arm()
-                else ["darwin64-arm64-cc"]
-            )
-        elif self.build_opts.is_linux():
-            make = "make"
-            make_j_args = ["-j%s" % self.num_jobs]
-            args = (
-                ["linux-x86_64"] if not self.build_opts.is_arm() else ["linux-aarch64"]
-            )
-        else:
-            raise Exception("don't know how to build openssl for %r" % self.ctx)
-
-        self._run_cmd(
-            [
-                perl,
-                configure,
-                "--prefix=%s" % self.inst_dir,
-                "--openssldir=%s" % self.inst_dir,
-            ]
-            + args
-            + [
-                "enable-static-engine",
-                "enable-capieng",
-                "no-makedepend",
-                "no-unit-test",
-                "no-tests",
-            ]
-        )
-        make_build = [make] + make_j_args
-        self._run_cmd(make_build)
-        make_install = [make, "install_sw", "install_ssldirs"]
-        self._run_cmd(make_install)
-
-
-class Boost(BuilderBase):
-    def __init__(
-        self, build_opts, ctx, manifest, src_dir, build_dir, inst_dir, b2_args
-    ) -> None:
-        children = os.listdir(src_dir)
-        assert len(children) == 1, "expected a single directory entry: %r" % (children,)
-        boost_src = children[0]
-        assert boost_src.startswith("boost")
-        src_dir = os.path.join(src_dir, children[0])
-        super(Boost, self).__init__(
-            build_opts, ctx, manifest, src_dir, build_dir, inst_dir
-        )
-        self.b2_args = b2_args
-
-    def _build(self, install_dirs, reconfigure) -> None:
-        env = 
self._compute_env(install_dirs) - linkage = ["static"] - if self.build_opts.is_windows() or self.build_opts.shared_libs: - linkage.append("shared") - - args = [] - if self.build_opts.is_darwin(): - clang = subprocess.check_output(["xcrun", "--find", "clang"]) - user_config = os.path.join(self.build_dir, "project-config.jam") - with open(user_config, "w") as jamfile: - jamfile.write("using clang : : %s ;\n" % clang.decode().strip()) - args.append("--user-config=%s" % user_config) - - for link in linkage: - bootstrap_args = self.manifest.get_section_as_args( - "bootstrap.args", self.ctx - ) - if self.build_opts.is_windows(): - bootstrap = os.path.join(self.src_dir, "bootstrap.bat") - self._run_cmd([bootstrap] + bootstrap_args, cwd=self.src_dir, env=env) - args += ["address-model=64"] - else: - bootstrap = os.path.join(self.src_dir, "bootstrap.sh") - self._run_cmd( - [bootstrap, "--prefix=%s" % self.inst_dir] + bootstrap_args, - cwd=self.src_dir, - env=env, - ) - - b2 = os.path.join(self.src_dir, "b2") - self._run_cmd( - [ - b2, - "-j%s" % self.num_jobs, - "--prefix=%s" % self.inst_dir, - "--builddir=%s" % self.build_dir, - ] - + args - + self.b2_args - + [ - "link=%s" % link, - "runtime-link=shared", - "variant=release", - "threading=multi", - "debug-symbols=on", - "visibility=global", - "-d2", - "install", - ], - cwd=self.src_dir, - env=env, - ) - - -class NopBuilder(BuilderBase): - def __init__(self, build_opts, ctx, manifest, src_dir, inst_dir) -> None: - super(NopBuilder, self).__init__( - build_opts, ctx, manifest, src_dir, None, inst_dir - ) - - def build(self, install_dirs, reconfigure: bool) -> None: - print("Installing %s -> %s" % (self.src_dir, self.inst_dir)) - parent = os.path.dirname(self.inst_dir) - if not os.path.exists(parent): - os.makedirs(parent) - - install_files = self.manifest.get_section_as_ordered_pairs( - "install.files", self.ctx - ) - if install_files: - for src_name, dest_name in self.manifest.get_section_as_ordered_pairs( - "install.files", self.ctx - ): - full_dest = os.path.join(self.inst_dir, dest_name) - full_src = os.path.join(self.src_dir, src_name) - - dest_parent = os.path.dirname(full_dest) - if not os.path.exists(dest_parent): - os.makedirs(dest_parent) - if os.path.isdir(full_src): - if not os.path.exists(full_dest): - shutil.copytree(full_src, full_dest) - else: - shutil.copyfile(full_src, full_dest) - shutil.copymode(full_src, full_dest) - # This is a bit gross, but the mac ninja.zip doesn't - # give ninja execute permissions, so force them on - # for things that look like they live in a bin dir - if os.path.dirname(dest_name) == "bin": - st = os.lstat(full_dest) - os.chmod(full_dest, st.st_mode | stat.S_IXUSR) - else: - if not os.path.exists(self.inst_dir): - shutil.copytree(self.src_dir, self.inst_dir) - - -class SqliteBuilder(BuilderBase): - def __init__(self, build_opts, ctx, manifest, src_dir, build_dir, inst_dir) -> None: - super(SqliteBuilder, self).__init__( - build_opts, ctx, manifest, src_dir, build_dir, inst_dir - ) - - def _build(self, install_dirs, reconfigure) -> None: - for f in ["sqlite3.c", "sqlite3.h", "sqlite3ext.h"]: - src = os.path.join(self.src_dir, f) - dest = os.path.join(self.build_dir, f) - copy_if_different(src, dest) - - cmake_lists = """ -cmake_minimum_required(VERSION 3.1.3 FATAL_ERROR) -project(sqlite3 C) -add_library(sqlite3 STATIC sqlite3.c) -# These options are taken from the defaults in Makefile.msc in -# the sqlite distribution -target_compile_definitions(sqlite3 PRIVATE - -DSQLITE_ENABLE_COLUMN_METADATA=1 - 
-DSQLITE_ENABLE_FTS3=1 - -DSQLITE_ENABLE_RTREE=1 - -DSQLITE_ENABLE_GEOPOLY=1 - -DSQLITE_ENABLE_JSON1=1 - -DSQLITE_ENABLE_STMTVTAB=1 - -DSQLITE_ENABLE_DBPAGE_VTAB=1 - -DSQLITE_ENABLE_DBSTAT_VTAB=1 - -DSQLITE_INTROSPECTION_PRAGMAS=1 - -DSQLITE_ENABLE_DESERIALIZE=1 -) -install(TARGETS sqlite3) -install(FILES sqlite3.h sqlite3ext.h DESTINATION include) - """ - - with open(os.path.join(self.build_dir, "CMakeLists.txt"), "w") as f: - f.write(cmake_lists) - - defines = { - "CMAKE_INSTALL_PREFIX": self.inst_dir, - "BUILD_SHARED_LIBS": "ON" if self.build_opts.shared_libs else "OFF", - "CMAKE_BUILD_TYPE": "RelWithDebInfo", - } - define_args = ["-D%s=%s" % (k, v) for (k, v) in defines.items()] - define_args += ["-G", "Ninja"] - - env = self._compute_env(install_dirs) - - # Resolve the cmake that we installed - cmake = path_search(env, "cmake") - - self._run_cmd([cmake, self.build_dir] + define_args, env=env) - self._run_cmd( - [ - cmake, - "--build", - self.build_dir, - "--target", - "install", - "--config", - "Release", - "-j", - str(self.num_jobs), - ], - env=env, - ) diff --git a/build/fbcode_builder/getdeps/buildopts.py b/build/fbcode_builder/getdeps/buildopts.py deleted file mode 100644 index 48b000f90ebc0..0000000000000 --- a/build/fbcode_builder/getdeps/buildopts.py +++ /dev/null @@ -1,619 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import errno -import glob -import ntpath -import os -import subprocess -import sys -import tempfile -from typing import Mapping, Optional - -from .copytree import containing_repo_type -from .envfuncs import add_flag, add_path_entry, Env -from .fetcher import get_fbsource_repo_data, homebrew_package_prefix -from .manifest import ContextGenerator -from .platform import get_available_ram, HostType, is_windows - - -def detect_project(path): - repo_type, repo_root = containing_repo_type(path) - if repo_type is None: - return None, None - - # Look for a .projectid file. If it exists, read the project name from it. - project_id_path = os.path.join(repo_root, ".projectid") - try: - with open(project_id_path, "r") as f: - project_name = f.read().strip() - return repo_root, project_name - except EnvironmentError as ex: - if ex.errno != errno.ENOENT: - raise - - return repo_root, None - - -class BuildOptions(object): - def __init__( - self, - fbcode_builder_dir, - scratch_dir, - host_type, - install_dir=None, - num_jobs: int = 0, - use_shipit: bool = False, - vcvars_path=None, - allow_system_packages: bool = False, - lfs_path=None, - shared_libs: bool = False, - facebook_internal=None, - free_up_disk: bool = False, - ) -> None: - """fbcode_builder_dir - the path to either the in-fbsource fbcode_builder dir, - or for shipit-transformed repos, the build dir that - has been mapped into that dir. - scratch_dir - a place where we can store repos and build bits. 
- This path should be stable across runs and ideally - should not be in the repo of the project being built, - but that is ultimately where we generally fall back - for builds outside of FB - install_dir - where the project will ultimately be installed - num_jobs - the level of concurrency to use while building - use_shipit - use real shipit instead of the simple shipit transformer - vcvars_path - Path to external VS toolchain's vsvarsall.bat - shared_libs - whether to build shared libraries - free_up_disk - take extra actions to save runner disk space - """ - - if not install_dir: - install_dir = os.path.join(scratch_dir, "installed") - - self.project_hashes = None - for p in ["../deps/github_hashes", "../project_hashes"]: - hashes = os.path.join(fbcode_builder_dir, p) - if os.path.exists(hashes): - self.project_hashes = hashes - break - - # Detect what repository and project we are being run from. - self.repo_root, self.repo_project = detect_project(os.getcwd()) - - # If we are running from an fbsource repository, set self.fbsource_dir - # to allow the ShipIt-based fetchers to use it. - if self.repo_project == "fbsource": - self.fbsource_dir: Optional[str] = self.repo_root - else: - self.fbsource_dir = None - - if facebook_internal is None: - if self.fbsource_dir: - facebook_internal = True - else: - facebook_internal = False - - self.facebook_internal = facebook_internal - self.specified_num_jobs = num_jobs - self.scratch_dir = scratch_dir - self.install_dir = install_dir - self.fbcode_builder_dir = fbcode_builder_dir - self.host_type = host_type - self.use_shipit = use_shipit - self.allow_system_packages = allow_system_packages - self.lfs_path = lfs_path - self.shared_libs = shared_libs - self.free_up_disk = free_up_disk - - lib_path = None - if self.is_darwin(): - lib_path = "DYLD_LIBRARY_PATH" - elif self.is_linux(): - lib_path = "LD_LIBRARY_PATH" - elif self.is_windows(): - lib_path = "PATH" - else: - lib_path = None - self.lib_path = lib_path - - if vcvars_path is None and is_windows(): - - try: - # Allow a site-specific vcvarsall path. - from .facebook.vcvarsall import build_default_vcvarsall - except ImportError: - vcvarsall = [] - else: - vcvarsall = ( - build_default_vcvarsall(self.fbsource_dir) - if self.fbsource_dir is not None - else [] - ) - - # On Windows, the compiler is not available in the PATH by - # default so we need to run the vcvarsall script to populate the - # environment. We use a glob to find some version of this script - # as deployed with Visual Studio 2017. This logic can also - # locate Visual Studio 2019 but note that at the time of writing - # the version of boost in our manifest cannot be built with - # VS 2019, so we're effectively tied to VS 2017 until we upgrade - # the boost dependency. 
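-            # (Illustrative: a typical glob match is
-            #   C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat
-            # where the "*" component is the edition, e.g. Community or BuildTools.)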
- for year in ["2017", "2019"]: - vcvarsall += glob.glob( - os.path.join( - os.environ["ProgramFiles(x86)"], - "Microsoft Visual Studio", - year, - "*", - "VC", - "Auxiliary", - "Build", - "vcvarsall.bat", - ) - ) - vcvars_path = vcvarsall[0] - - self.vcvars_path = vcvars_path - - @property - def manifests_dir(self): - return os.path.join(self.fbcode_builder_dir, "manifests") - - def is_darwin(self): - return self.host_type.is_darwin() - - def is_windows(self): - return self.host_type.is_windows() - - def is_arm(self): - return self.host_type.is_arm() - - def get_vcvars_path(self): - return self.vcvars_path - - def is_linux(self): - return self.host_type.is_linux() - - def is_freebsd(self): - return self.host_type.is_freebsd() - - def get_num_jobs(self, job_weight: int) -> int: - """Given an estimated job_weight in MiB, compute a reasonable concurrency limit.""" - if self.specified_num_jobs: - return self.specified_num_jobs - - available_ram = get_available_ram() - - import multiprocessing - - return max(1, min(multiprocessing.cpu_count(), available_ram // job_weight)) - - def get_context_generator(self, host_tuple=None): - """Create a manifest ContextGenerator for the specified target platform.""" - if host_tuple is None: - host_type = self.host_type - elif isinstance(host_tuple, HostType): - host_type = host_tuple - else: - host_type = HostType.from_tuple_string(host_tuple) - - return ContextGenerator( - { - "os": host_type.ostype, - "distro": host_type.distro, - "distro_vers": host_type.distrovers, - "fb": "on" if self.facebook_internal else "off", - "fbsource": "on" if self.fbsource_dir else "off", - "test": "off", - "shared_libs": "on" if self.shared_libs else "off", - } - ) - - def compute_env_for_install_dirs( - self, install_dirs, env=None, manifest=None - ): # noqa: C901 - if env is not None: - env = env.copy() - else: - env = Env() - - env["GETDEPS_BUILD_DIR"] = os.path.join(self.scratch_dir, "build") - env["GETDEPS_INSTALL_DIR"] = self.install_dir - - # Python setuptools attempts to discover a local MSVC for - # building Python extensions. On Windows, getdeps already - # supports invoking a vcvarsall prior to compilation. - # - # Tell setuptools to bypass its own search. This fixes a bug - # where setuptools would fail when run from CMake on GitHub - # Actions with the inscrutable message 'error: Microsoft - # Visual C++ 14.0 is required. Get it with "Build Tools for - # Visual Studio"'. I suspect the actual error is that the - # environment or PATH is overflowing. - # - # For extra credit, someone could patch setuptools to - # propagate the actual error message from vcvarsall, because - # often it does not mean Visual C++ is not available. - # - # Related discussions: - # - https://github.com/pypa/setuptools/issues/2028 - # - https://github.com/pypa/setuptools/issues/2307 - # - https://developercommunity.visualstudio.com/t/error-microsoft-visual-c-140-is-required/409173 - # - https://github.com/OpenMS/OpenMS/pull/4779 - # - https://github.com/actions/virtual-environments/issues/1484 - - if self.is_windows() and self.get_vcvars_path(): - env["DISTUTILS_USE_SDK"] = "1" - - # On macOS we need to set `SDKROOT` when we use clang for system - # header files. 
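-        # (Equivalent to `export SDKROOT="$(xcrun --show-sdk-path)"` in a shell.)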
- if self.is_darwin() and "SDKROOT" not in env: - sdkroot = subprocess.check_output(["xcrun", "--show-sdk-path"]) - env["SDKROOT"] = sdkroot.decode().strip() - - if ( - self.is_darwin() - and self.allow_system_packages - and self.host_type.get_package_manager() == "homebrew" - and manifest - and manifest.resolved_system_packages - ): - # Homebrew packages may not be on the default PATHs - brew_packages = manifest.resolved_system_packages.get("homebrew", []) - for p in brew_packages: - found = self.add_homebrew_package_to_env(p, env) - # Try extra hard to find openssl, needed with homebrew on macOS - if found and p.startswith("openssl"): - candidate = homebrew_package_prefix("openssl@1.1") - if os.path.exists(candidate): - os.environ["OPENSSL_ROOT_DIR"] = candidate - env["OPENSSL_ROOT_DIR"] = os.environ["OPENSSL_ROOT_DIR"] - - if self.fbsource_dir: - env["YARN_YARN_OFFLINE_MIRROR"] = os.path.join( - self.fbsource_dir, "xplat/third-party/yarn/offline-mirror" - ) - yarn_exe = "yarn.bat" if self.is_windows() else "yarn" - env["YARN_PATH"] = os.path.join( - self.fbsource_dir, "xplat/third-party/yarn/", yarn_exe - ) - node_exe = "node-win-x64.exe" if self.is_windows() else "node" - env["NODE_BIN"] = os.path.join( - self.fbsource_dir, "xplat/third-party/node/bin/", node_exe - ) - env["RUST_VENDORED_CRATES_DIR"] = os.path.join( - self.fbsource_dir, "third-party/rust/vendor" - ) - hash_data = get_fbsource_repo_data(self) - env["FBSOURCE_HASH"] = hash_data.hash - env["FBSOURCE_DATE"] = hash_data.date - - # reverse as we are prepending to the PATHs - for d in reversed(install_dirs): - self.add_prefix_to_env(d, env, append=False) - - # Linux is always system openssl - system_openssl = self.is_linux() - - # For other systems lets see if package is requested - if not system_openssl and manifest and manifest.resolved_system_packages: - for _pkg_type, pkgs in manifest.resolved_system_packages.items(): - for p in pkgs: - if p.startswith("openssl") or p.startswith("libssl"): - system_openssl = True - break - - # Let openssl know to pick up the system certs if present - if system_openssl or "OPENSSL_DIR" in env: - for system_ssl_cfg in ["/etc/pki/tls", "/etc/ssl"]: - if os.path.isdir(system_ssl_cfg): - cert_dir = system_ssl_cfg + "/certs" - if os.path.isdir(cert_dir): - env["SSL_CERT_DIR"] = cert_dir - cert_file = system_ssl_cfg + "/cert.pem" - if os.path.isfile(cert_file): - env["SSL_CERT_FILE"] = cert_file - - return env - - def add_homebrew_package_to_env(self, package, env) -> bool: - prefix = homebrew_package_prefix(package) - if prefix and os.path.exists(prefix): - return self.add_prefix_to_env( - prefix, env, append=False, add_library_path=True - ) - return False - - def add_prefix_to_env( - self, d, env, append: bool = True, add_library_path: bool = False - ) -> bool: # noqa: C901 - bindir = os.path.join(d, "bin") - found = False - pkgconfig = os.path.join(d, "lib", "pkgconfig") - if os.path.exists(pkgconfig): - found = True - add_path_entry(env, "PKG_CONFIG_PATH", pkgconfig, append=append) - - pkgconfig = os.path.join(d, "lib64", "pkgconfig") - if os.path.exists(pkgconfig): - found = True - add_path_entry(env, "PKG_CONFIG_PATH", pkgconfig, append=append) - - add_path_entry(env, "CMAKE_PREFIX_PATH", d, append=append) - - # Tell the thrift compiler about includes it needs to consider - thriftdir = os.path.join(d, "include", "thrift-files") - if os.path.exists(thriftdir): - found = True - add_path_entry(env, "THRIFT_INCLUDE_PATH", thriftdir, append=append) - - # module detection for python is old 
fashioned and needs flags
-        includedir = os.path.join(d, "include")
-        if os.path.exists(includedir):
-            found = True
-            ncursesincludedir = os.path.join(d, "include", "ncurses")
-            if os.path.exists(ncursesincludedir):
-                add_path_entry(env, "C_INCLUDE_PATH", ncursesincludedir, append=append)
-                add_flag(env, "CPPFLAGS", f"-I{includedir}", append=append)
-                add_flag(env, "CPPFLAGS", f"-I{ncursesincludedir}", append=append)
-            elif "/bz2-" in d:
-                add_flag(env, "CPPFLAGS", f"-I{includedir}", append=append)
-
-        # Map from FB python manifests to PYTHONPATH
-        pydir = os.path.join(d, "lib", "fb-py-libs")
-        if os.path.exists(pydir):
-            found = True
-            manifest_ext = ".manifest"
-            pymanifestfiles = [
-                f
-                for f in os.listdir(pydir)
-                if f.endswith(manifest_ext) and os.path.isfile(os.path.join(pydir, f))
-            ]
-            for f in pymanifestfiles:
-                subdir = f[: -len(manifest_ext)]
-                add_path_entry(
-                    env, "PYTHONPATH", os.path.join(pydir, subdir), append=append
-                )
-
-        # Allow resolving shared objects built earlier (e.g. zstd
-        # doesn't include the full path to the dylib in its linkage
-        # so we need to give it an assist)
-        if self.lib_path:
-            for lib in ["lib", "lib64"]:
-                libdir = os.path.join(d, lib)
-                if os.path.exists(libdir):
-                    found = True
-                    add_path_entry(env, self.lib_path, libdir, append=append)
-                    # module detection for python is old-fashioned and needs flags
-                    if "/ncurses-" in d:
-                        add_flag(env, "LDFLAGS", f"-L{libdir}", append=append)
-                    elif "/bz2-" in d:
-                        add_flag(env, "LDFLAGS", f"-L{libdir}", append=append)
-                    if add_library_path:
-                        add_path_entry(env, "LIBRARY_PATH", libdir, append=append)
-
-        # Allow resolving binaries (e.g. cmake, ninja) and dlls
-        # built by earlier steps
-        if os.path.exists(bindir):
-            found = True
-            add_path_entry(env, "PATH", bindir, append=append)
-
-        # If rustc is present in the `bin` directory, set RUSTC to prevent
-        # cargo from using the rustc installed on the system.
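-        # (Illustrative: if a manifest installed <inst_dir>/bin/rustc, the
-        # CARGO_BIN/RUSTC/RUSTDOC settings below pin builds to that toolchain.)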
- if self.is_windows(): - cargo_path = os.path.join(bindir, "cargo.exe") - rustc_path = os.path.join(bindir, "rustc.exe") - rustdoc_path = os.path.join(bindir, "rustdoc.exe") - else: - cargo_path = os.path.join(bindir, "cargo") - rustc_path = os.path.join(bindir, "rustc") - rustdoc_path = os.path.join(bindir, "rustdoc") - - if os.path.isfile(rustc_path): - env["CARGO_BIN"] = cargo_path - env["RUSTC"] = rustc_path - env["RUSTDOC"] = rustdoc_path - - openssl_include = os.path.join(d, "include", "openssl") - if os.path.isdir(openssl_include) and any( - os.path.isfile(os.path.join(d, "lib", libcrypto)) - for libcrypto in ("libcrypto.lib", "libcrypto.so", "libcrypto.a") - ): - # This must be the openssl library, let Rust know about it - env["OPENSSL_DIR"] = d - - return found - - -def list_win32_subst_letters(): - output = subprocess.check_output(["subst"]).decode("utf-8") - # The output is a set of lines like: `F:\: => C:\open\some\where` - lines = output.strip().split("\r\n") - mapping = {} - for line in lines: - fields = line.split(": => ") - if len(fields) != 2: - continue - letter = fields[0] - path = fields[1] - mapping[letter] = path - - return mapping - - -def find_existing_win32_subst_for_path( - path: str, - subst_mapping: Mapping[str, str], -) -> Optional[str]: - path = ntpath.normcase(ntpath.normpath(path)) - for letter, target in subst_mapping.items(): - if ntpath.normcase(target) == path: - return letter - return None - - -def find_unused_drive_letter(): - import ctypes - - buffer_len = 256 - blen = ctypes.c_uint(buffer_len) - rv = ctypes.c_uint() - bufs = ctypes.create_string_buffer(buffer_len) - rv = ctypes.windll.kernel32.GetLogicalDriveStringsA(blen, bufs) - if rv > buffer_len: - raise Exception("GetLogicalDriveStringsA result too large for buffer") - nul = "\x00".encode("ascii") - - used = [drive.decode("ascii")[0] for drive in bufs.raw.strip(nul).split(nul)] - possible = [c for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"] - available = sorted(list(set(possible) - set(used))) - if len(available) == 0: - return None - # Prefer to assign later letters rather than earlier letters - return available[-1] - - -def create_subst_path(path: str) -> str: - for _attempt in range(0, 24): - drive = find_existing_win32_subst_for_path( - path, subst_mapping=list_win32_subst_letters() - ) - if drive: - return drive - available = find_unused_drive_letter() - if available is None: - raise Exception( - ( - "unable to make shorter subst mapping for %s; " - "no available drive letters" - ) - % path - ) - - # Try to set up a subst mapping; note that we may be racing with - # other processes on the same host, so this may not succeed. 
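-        # (Illustrative: `subst X: C:\long\scratch\path` then lets us use the
-        # short "X:\" spelling of the scratch dir.)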
- try: - subprocess.check_call(["subst", "%s:" % available, path]) - return "%s:\\" % available - except Exception: - print("Failed to map %s -> %s" % (available, path)) - - raise Exception("failed to set up a subst path for %s" % path) - - -def _check_host_type(args, host_type): - if host_type is None: - host_tuple_string = getattr(args, "host_type", None) - if host_tuple_string: - host_type = HostType.from_tuple_string(host_tuple_string) - else: - host_type = HostType() - - assert isinstance(host_type, HostType) - return host_type - - -def setup_build_options(args, host_type=None) -> BuildOptions: - """Create a BuildOptions object based on the arguments""" - - fbcode_builder_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - scratch_dir = args.scratch_path - if not scratch_dir: - # TODO: `mkscratch` doesn't currently know how best to place things on - # sandcastle, so whip up something reasonable-ish - if "SANDCASTLE" in os.environ: - if "DISK_TEMP" not in os.environ: - raise Exception( - ( - "I need DISK_TEMP to be set in the sandcastle environment " - "so that I can store build products somewhere sane" - ) - ) - scratch_dir = os.path.join( - os.environ["DISK_TEMP"], "fbcode_builder_getdeps" - ) - if not scratch_dir: - try: - scratch_dir = ( - subprocess.check_output( - ["mkscratch", "path", "--subdir", "fbcode_builder_getdeps"] - ) - .strip() - .decode("utf-8") - ) - except OSError as exc: - if exc.errno != errno.ENOENT: - # A legit failure; don't fall back, surface the error - raise - # This system doesn't have mkscratch so we fall back to - # something local. - munged = fbcode_builder_dir.replace("Z", "zZ") - for s in ["/", "\\", ":"]: - munged = munged.replace(s, "Z") - - if is_windows() and os.path.isdir("c:/open"): - temp = "c:/open/scratch" - else: - temp = tempfile.gettempdir() - - scratch_dir = os.path.join(temp, "fbcode_builder_getdeps-%s" % munged) - if not is_windows() and os.geteuid() == 0: - # Running as root; in the case where someone runs - # sudo getdeps.py install-system-deps - # and then runs as build without privs, we want to avoid creating - # a scratch dir that the second stage cannot write to. - # So we generate a different path if we are root. - scratch_dir += "-root" - - if not os.path.exists(scratch_dir): - os.makedirs(scratch_dir) - - if is_windows(): - subst = create_subst_path(scratch_dir) - print( - "Mapping scratch dir %s -> %s" % (scratch_dir, subst), file=sys.stderr - ) - scratch_dir = subst - else: - if not os.path.exists(scratch_dir): - os.makedirs(scratch_dir) - - # Make sure we normalize the scratch path. This path is used as part of the hash - # computation for detecting if projects have been updated, so we need to always - # use the exact same string to refer to a given directory. - # But! realpath in some combinations of Windows/Python3 versions can expand the - # drive substitutions on Windows, so avoid that! - if not is_windows(): - scratch_dir = os.path.realpath(scratch_dir) - - # Save these args passed by the user in an env variable, so it - # can be used while hashing this build. 
- os.environ["GETDEPS_CMAKE_DEFINES"] = getattr(args, "extra_cmake_defines", "") or "" - - host_type = _check_host_type(args, host_type) - - build_args = { - k: v - for (k, v) in vars(args).items() - if k - in { - "num_jobs", - "use_shipit", - "vcvars_path", - "allow_system_packages", - "lfs_path", - "shared_libs", - "free_up_disk", - } - } - - return BuildOptions( - fbcode_builder_dir, - scratch_dir, - host_type, - install_dir=args.install_prefix, - facebook_internal=args.facebook_internal, - **build_args, - ) diff --git a/build/fbcode_builder/getdeps/cache.py b/build/fbcode_builder/getdeps/cache.py deleted file mode 100644 index 4d2786e7e1b5f..0000000000000 --- a/build/fbcode_builder/getdeps/cache.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -class ArtifactCache(object): - """The ArtifactCache is a small abstraction that allows caching - named things in some external storage mechanism. - The primary use case is for storing the build products on CI - systems to accelerate the build""" - - def download_to_file(self, name, dest_file_name) -> bool: - """If `name` exists in the cache, download it and place it - in the specified `dest_file_name` location on the filesystem. - If a transient issue was encountered a TransientFailure shall - be raised. - If `name` doesn't exist in the cache `False` shall be returned. - If `dest_file_name` was successfully updated `True` shall be - returned. - All other conditions shall raise an appropriate exception.""" - return False - - def upload_from_file(self, name, source_file_name) -> None: - """Causes `name` to be populated in the cache by uploading - the contents of `source_file_name` to the storage system. - If a transient issue was encountered a TransientFailure shall - be raised. - If the upload failed for some other reason, an appropriate - exception shall be raised.""" - pass - - -def create_cache() -> None: - """This function is monkey patchable to provide an actual - implementation""" - return None diff --git a/build/fbcode_builder/getdeps/cargo.py b/build/fbcode_builder/getdeps/cargo.py deleted file mode 100644 index bb41cc3e294b1..0000000000000 --- a/build/fbcode_builder/getdeps/cargo.py +++ /dev/null @@ -1,449 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
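-
-# For reference, the ArtifactCache contract in cache.py could be satisfied
-# by something as small as the sketch below; the class name and the
-# directory layout are illustrative assumptions, not part of this tree:
-#
-#   class LocalDirCache(ArtifactCache):
-#       def __init__(self, root) -> None:
-#           self.root = root
-#
-#       def download_to_file(self, name, dest_file_name) -> bool:
-#           src = os.path.join(self.root, name)
-#           if not os.path.exists(src):
-#               return False
-#           shutil.copyfile(src, dest_file_name)
-#           return True
-#
-#       def upload_from_file(self, name, source_file_name) -> None:
-#           shutil.copyfile(source_file_name, os.path.join(self.root, name))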
-
-import os
-import re
-import shutil
-import typing
-
-from .builder import BuilderBase
-
-if typing.TYPE_CHECKING:
-    from .buildopts import BuildOptions
-
-
-class CargoBuilder(BuilderBase):
-    def __init__(
-        self,
-        build_opts: "BuildOptions",
-        ctx,
-        manifest,
-        src_dir,
-        build_dir,
-        inst_dir,
-        build_doc,
-        workspace_dir,
-        manifests_to_build,
-        loader,
-        cargo_config_file,
-    ) -> None:
-        super(CargoBuilder, self).__init__(
-            build_opts, ctx, manifest, src_dir, build_dir, inst_dir
-        )
-        self.build_doc = build_doc
-        self.ws_dir = workspace_dir
-        self.manifests_to_build = manifests_to_build and manifests_to_build.split(",")
-        self.loader = loader
-        self.cargo_config_file_subdir = cargo_config_file
-
-    def run_cargo(self, install_dirs, operation, args=None) -> None:
-        args = args or []
-        env = self._compute_env(install_dirs)
-        # Enable using nightly features with stable compiler
-        env["RUSTC_BOOTSTRAP"] = "1"
-        env["LIBZ_SYS_STATIC"] = "1"
-        cmd = [
-            "cargo",
-            operation,
-            "--workspace",
-            "-j%s" % self.num_jobs,
-        ] + args
-        self._run_cmd(cmd, cwd=self.workspace_dir(), env=env)
-
-    def build_source_dir(self):
-        return os.path.join(self.build_dir, "source")
-
-    def workspace_dir(self):
-        return os.path.join(self.build_source_dir(), self.ws_dir or "")
-
-    def manifest_dir(self, manifest):
-        return os.path.join(self.build_source_dir(), manifest)
-
-    def recreate_dir(self, src, dst) -> None:
-        if os.path.isdir(dst):
-            if os.path.islink(dst):
-                os.remove(dst)
-            else:
-                shutil.rmtree(dst)
-        shutil.copytree(src, dst)
-
-    def cargo_config_file(self):
-        build_source_dir = self.build_dir
-        if self.cargo_config_file_subdir:
-            return os.path.join(build_source_dir, self.cargo_config_file_subdir)
-        else:
-            return os.path.join(build_source_dir, ".cargo", "config")
-
-    def _create_cargo_config(self):
-        cargo_config_file = self.cargo_config_file()
-        cargo_config_dir = os.path.dirname(cargo_config_file)
-        if not os.path.isdir(cargo_config_dir):
-            os.mkdir(cargo_config_dir)
-
-        dep_to_git = self._resolve_dep_to_git()
-
-        if os.path.isfile(cargo_config_file):
-            with open(cargo_config_file, "r") as f:
-                print(f"Reading {cargo_config_file}")
-                cargo_content = f.read()
-        else:
-            cargo_content = ""
-
-        new_content = cargo_content
-        if "# Generated by getdeps.py" not in cargo_content:
-            new_content += """\
-# Generated by getdeps.py
-[build]
-target-dir = '''{}'''
-
-[profile.dev]
-debug = false
-incremental = false
-""".format(
-                self.build_dir.replace("\\", "\\\\")
-            )
-
-        # Point to vendored sources from getdeps manifests
-        for _dep, git_conf in dep_to_git.items():
-            if "cargo_vendored_sources" in git_conf:
-                vendored_dir = git_conf["cargo_vendored_sources"].replace("\\", "\\\\")
-                override = (
-                    f'[source."{git_conf["repo_url"]}"]\ndirectory = "{vendored_dir}"\n'
-                )
-                if override not in cargo_content:
-                    new_content += override
-
-        if new_content != cargo_content:
-            with open(cargo_config_file, "w") as f:
-                print(
-                    f"Writing cargo config for {self.manifest.name} to {cargo_config_file}"
-                )
-                f.write(new_content)
-
-        if self.build_opts.fbsource_dir:
-            # Point to vendored crates.io if possible
-            try:
-                from .facebook.rust import vendored_crates
-
-                vendored_crates(self.build_opts.fbsource_dir, cargo_config_file)
-            except ImportError:
-                # This FB internal module isn't shipped to github,
-                # so just rely on cargo downloading crates on its own
-                pass
-
-        return dep_to_git
-
-    def _prepare(self, install_dirs, reconfigure) -> None:
-        build_source_dir = self.build_source_dir()
-
self.recreate_dir(self.src_dir, build_source_dir)
-
-        dep_to_git = self._create_cargo_config()
-
-        if self.ws_dir is not None:
-            self._patchup_workspace(dep_to_git)
-
-    def _build(self, install_dirs, reconfigure) -> None:
-        # _prepare has been run already. Actually do the build
-        build_source_dir = self.build_source_dir()
-        if self.manifests_to_build is None:
-            self.run_cargo(
-                install_dirs,
-                "build",
-                ["--out-dir", os.path.join(self.inst_dir, "bin"), "-Zunstable-options"],
-            )
-        else:
-            for manifest in self.manifests_to_build:
-                self.run_cargo(
-                    install_dirs,
-                    "build",
-                    [
-                        "--out-dir",
-                        os.path.join(self.inst_dir, "bin"),
-                        "-Zunstable-options",
-                        "--manifest-path",
-                        self.manifest_dir(manifest),
-                    ],
-                )
-
-        self.recreate_dir(build_source_dir, os.path.join(self.inst_dir, "source"))
-
-    def run_tests(
-        self, install_dirs, schedule_type, owner, test_filter, retry, no_testpilot
-    ) -> None:
-        if test_filter:
-            args = ["--", test_filter]
-        else:
-            args = []
-
-        if self.manifests_to_build is None:
-            self.run_cargo(install_dirs, "test", args)
-            if self.build_doc:
-                self.run_cargo(install_dirs, "doc", ["--no-deps"])
-        else:
-            for manifest in self.manifests_to_build:
-                margs = ["--manifest-path", self.manifest_dir(manifest)]
-                self.run_cargo(install_dirs, "test", args + margs)
-                if self.build_doc:
-                    self.run_cargo(install_dirs, "doc", ["--no-deps"] + margs)
-
-    def _patchup_workspace(self, dep_to_git) -> None:
-        """
-        This method makes some assumptions about the state of the project and
-        its cargo dependencies:
-        1. Crates from cargo dependencies can be extracted from Cargo.toml files
-           using _extract_crates function. It is using a heuristic so check its
-           code to understand how it is done.
-        2. The extracted cargo dependencies crates can be found in the
-           dependency's install dir using _resolve_crate_to_path function
-           which again is using a heuristic.
-
-        Notice that many things might go wrong here. E.g. if someone depends
-        on another getdeps crate by writing in their Cargo.toml file:
-
-            my-rename-of-crate = { package = "crate", git = "..." }
-
-        they can count themselves lucky because the code will raise an
-        Exception. There might be more cases where the code will silently pass
-        producing bad results.
-        """
-        workspace_dir = self.workspace_dir()
-        git_url_to_crates_and_paths = self._resolve_config(dep_to_git)
-        if git_url_to_crates_and_paths:
-            patch_cargo = os.path.join(workspace_dir, "Cargo.toml")
-            if os.path.isfile(patch_cargo):
-                with open(patch_cargo, "r") as f:
-                    manifest_content = f.read()
-            else:
-                manifest_content = ""
-
-            new_content = manifest_content
-            if "[package]" not in manifest_content:
-                # A fake manifest has to be created to change the virtual
-                # manifest into a non-virtual. The virtual manifests are limited
-                # in many ways and the inability to define patches on them is
-                # one. Check https://github.com/rust-lang/cargo/issues/4934 to
-                # see if it is resolved.
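-                # e.g. the fragment appended below turns the virtual manifest
-                # into something like this (values are illustrative):
-                #
-                #   [package]
-                #   name = "fake_manifest_of_<project>"
-                #   version = "0.0.0"
-                #
-                #   [lib]
-                #   path = "/dev/null"   # "nul" on Windows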
-                null_file = "/dev/null"
-                if self.build_opts.is_windows():
-                    null_file = "nul"
-                new_content += f"""
-[package]
-name = "fake_manifest_of_{self.manifest.name}"
-version = "0.0.0"
-
-[lib]
-path = "{null_file}"
-"""
-            config = []
-            for git_url, crates_to_patch_path in git_url_to_crates_and_paths.items():
-                crates_patches = [
-                    '{} = {{ path = "{}" }}'.format(
-                        crate,
-                        crates_to_patch_path[crate].replace("\\", "\\\\"),
-                    )
-                    for crate in sorted(crates_to_patch_path.keys())
-                ]
-                patch_key = f'[patch."{git_url}"]'
-                if patch_key not in manifest_content:
-                    config.append(f"\n{patch_key}\n" + "\n".join(crates_patches))
-            new_content += "\n".join(config)
-            if new_content != manifest_content:
-                with open(patch_cargo, "w") as f:
-                    print(f"writing patch to {patch_cargo}")
-                    f.write(new_content)
-
-    def _resolve_config(self, dep_to_git) -> typing.Dict[str, typing.Dict[str, str]]:
-        """
-        Returns a configuration to be put inside root Cargo.toml file which
-        patches the dependencies git code with local getdeps versions.
-        See https://doc.rust-lang.org/cargo/reference/manifest.html#the-patch-section
-        """
-        dep_to_crates = self._resolve_dep_to_crates(self.build_source_dir(), dep_to_git)
-
-        git_url_to_crates_and_paths = {}
-        for dep_name in sorted(dep_to_git.keys()):
-            git_conf = dep_to_git[dep_name]
-            req_crates = sorted(dep_to_crates.get(dep_name, []))
-            if not req_crates:
-                continue  # nothing to patch, move along
-
-            git_url = git_conf.get("repo_url", None)
-            crate_source_map = git_conf["crate_source_map"]
-            if git_url and crate_source_map:
-                crates_to_patch_path = git_url_to_crates_and_paths.get(git_url, {})
-                for c in req_crates:
-                    if c in crate_source_map and c not in crates_to_patch_path:
-                        crates_to_patch_path[c] = crate_source_map[c]
-                        print(
-                            f"{self.manifest.name}: Patching crate {c} via virtual manifest in {self.workspace_dir()}"
-                        )
-                if crates_to_patch_path:
-                    git_url_to_crates_and_paths[git_url] = crates_to_patch_path
-
-        return git_url_to_crates_and_paths
-
-    def _resolve_dep_to_git(self):
-        """
-        For each direct dependency of the currently built manifest, check
-        whether it is also built with cargo; if it is, extract its git config
-        and install dir.
-        """
-        dependencies = self.manifest.get_dependencies(self.ctx)
-        if not dependencies:
-            # Use a dict so that callers can safely iterate .items()
-            return {}
-
-        dep_to_git = {}
-        for dep in dependencies:
-            dep_manifest = self.loader.load_manifest(dep)
-            dep_builder = dep_manifest.get("build", "builder", ctx=self.ctx)
-
-            dep_cargo_conf = dep_manifest.get_section_as_dict("cargo", self.ctx)
-            dep_crate_map = dep_manifest.get_section_as_dict("crate.pathmap", self.ctx)
-
-            if (
-                not (dep_crate_map or dep_cargo_conf)
-                and dep_builder not in ["cargo"]
-                or dep == "rust"
-            ):
-                # This dependency has no cargo rust content so ignore it.
-                # The "rust" dependency is an exception since it contains the
-                # toolchain.
-                continue
-
-            git_conf = dep_manifest.get_section_as_dict("git", self.ctx)
-            if dep != "rust" and "repo_url" not in git_conf:
-                raise Exception(
-                    f"{dep}: A cargo dependency requires git.repo_url to be defined."
-                )
-
-            if dep_builder == "cargo":
-                dep_source_dir = self.loader.get_project_install_dir(dep_manifest)
-                dep_source_dir = os.path.join(dep_source_dir, "source")
-            else:
-                fetcher = self.loader.create_fetcher(dep_manifest)
-                dep_source_dir = fetcher.get_src_dir()
-
-            crate_source_map = {}
-            if dep_crate_map:
-                for (crate, subpath) in dep_crate_map.items():
-                    if crate not in crate_source_map:
-                        if self.build_opts.is_windows():
-                            subpath = subpath.replace("/", "\\")
-                        crate_path = os.path.join(dep_source_dir, subpath)
-                        print(
-                            f"{self.manifest.name}: Mapped crate {crate} to dep {dep} dir {crate_path}"
-                        )
-                        crate_source_map[crate] = crate_path
-            elif dep_cargo_conf:
-                # We don't know what crates are defined by the dep, look for them
-                search_pattern = re.compile('\\[package\\]\nname = "(.*)"')
-                for crate_root, _, files in os.walk(dep_source_dir):
-                    if "Cargo.toml" in files:
-                        with open(os.path.join(crate_root, "Cargo.toml"), "r") as f:
-                            content = f.read()
-                            match = search_pattern.search(content)
-                            if match:
-                                crate = match.group(1)
-                                if crate:
-                                    print(
-                                        f"{self.manifest.name}: Discovered crate {crate} in dep {dep} dir {crate_root}"
-                                    )
-                                    crate_source_map[crate] = crate_root
-
-            git_conf["crate_source_map"] = crate_source_map
-
-            if not dep_crate_map and dep_cargo_conf:
-                dep_cargo_dir = self.loader.get_project_build_dir(dep_manifest)
-                dep_cargo_dir = os.path.join(dep_cargo_dir, "source")
-                dep_ws_dir = dep_cargo_conf.get("workspace_dir", None)
-                if dep_ws_dir:
-                    dep_cargo_dir = os.path.join(dep_cargo_dir, dep_ws_dir)
-                git_conf["cargo_vendored_sources"] = dep_cargo_dir
-
-            dep_to_git[dep] = git_conf
-        return dep_to_git
-
-    def _resolve_dep_to_crates(self, build_source_dir, dep_to_git):
-        """
-        This function traverses the build_source_dir in search of Cargo.toml
-        files, extracts the crate names from them using the _extract_crates
-        function and returns a merged result containing crate names per
-        dependency name from all Cargo.toml files in the project.
-        """
-        if not dep_to_git:
-            return {}  # no deps, so don't waste time traversing files
-
-        dep_to_crates = {}
-
-        # First populate explicit crate paths from dependencies
-        for name, git_conf in dep_to_git.items():
-            crates = git_conf["crate_source_map"].keys()
-            if crates:
-                dep_to_crates.setdefault(name, set()).update(crates)
-
-        # Now find from Cargo.tomls
-        for root, _, files in os.walk(build_source_dir):
-            for f in files:
-                if f == "Cargo.toml":
-                    more_dep_to_crates = CargoBuilder._extract_crates_used(
-                        os.path.join(root, f), dep_to_git
-                    )
-                    for dep_name, crates in more_dep_to_crates.items():
-                        existing_crates = dep_to_crates.get(dep_name, set())
-                        for c in crates:
-                            if c not in existing_crates:
-                                print(
-                                    f"Patch {self.manifest.name} uses {dep_name} crate {c}"
-                                )
-                                existing_crates.add(c)
-                        dep_to_crates.setdefault(dep_name, set()).update(existing_crates)
-        return dep_to_crates
-
-    @staticmethod
-    def _extract_crates_used(cargo_toml_file, dep_to_git):
-        """
-        This function reads the content of the provided Cargo.toml file and
-        extracts the crate names used from each dependency. The extraction is
-        done by a heuristic so it might be incorrect.
- """ - deps_to_crates = {} - with open(cargo_toml_file, "r") as f: - for line in f.readlines(): - if line.startswith("#") or "git = " not in line: - continue # filter out commented lines and ones without git deps - for dep_name, conf in dep_to_git.items(): - # Only redirect deps that point to git URLS - if 'git = "{}"'.format(conf["repo_url"]) in line: - pkg_template = ' package = "' - if pkg_template in line: - crate_name, _, _ = line.partition(pkg_template)[ - 2 - ].partition('"') - else: - crate_name, _, _ = line.partition("=") - deps_to_crates.setdefault(dep_name, set()).add( - crate_name.strip() - ) - return deps_to_crates - - def _resolve_crate_to_path(self, crate, crate_source_map): - """ - Tries to find in source_dir by searching a [package] - keyword followed by name = "". - """ - search_pattern = '[package]\nname = "{}"'.format(crate) - - for (_crate, crate_source_dir) in crate_source_map.items(): - for crate_root, _, files in os.walk(crate_source_dir): - if "Cargo.toml" in files: - with open(os.path.join(crate_root, "Cargo.toml"), "r") as f: - content = f.read() - if search_pattern in content: - return crate_root - - raise Exception( - f"{self.manifest.name}: Failed to find dep crate {crate} in paths {crate_source_map}" - ) diff --git a/build/fbcode_builder/getdeps/copytree.py b/build/fbcode_builder/getdeps/copytree.py deleted file mode 100644 index e7b3971cf8e79..0000000000000 --- a/build/fbcode_builder/getdeps/copytree.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import os -import shutil -import subprocess - -from .platform import is_windows - - -PREFETCHED_DIRS = set() - - -def containing_repo_type(path): - while True: - if os.path.exists(os.path.join(path, ".git")): - return ("git", path) - if os.path.exists(os.path.join(path, ".hg")): - return ("hg", path) - - parent = os.path.dirname(path) - if parent == path: - return None, None - path = parent - - -def find_eden_root(dirpath): - """If the specified directory is inside an EdenFS checkout, returns - the canonical absolute path to the root of that checkout. - - Returns None if the specified directory is not in an EdenFS checkout. - """ - if is_windows(): - repo_type, repo_root = containing_repo_type(dirpath) - if repo_root is not None: - if os.path.exists(os.path.join(repo_root, ".eden", "config")): - return repo_root - return None - - try: - return os.readlink(os.path.join(dirpath, ".eden", "root")) - except OSError: - return None - - -def prefetch_dir_if_eden(dirpath) -> None: - """After an amend/rebase, Eden may need to fetch a large number - of trees from the servers. The simplistic single threaded walk - performed by copytree makes this more expensive than is desirable - so we help accelerate things by performing a prefetch on the - source directory""" - global PREFETCHED_DIRS - if dirpath in PREFETCHED_DIRS: - return - root = find_eden_root(dirpath) - if root is None: - return - glob = f"{os.path.relpath(dirpath, root).replace(os.sep, '/')}/**" - print(f"Prefetching {glob}") - subprocess.call( - ["edenfsctl", "prefetch", "--repo", root, "--silent", glob, "--background"] - ) - PREFETCHED_DIRS.add(dirpath) - - -# pyre-fixme[9]: ignore has type `bool`; used as `None`. -def copytree(src_dir, dest_dir, ignore: bool = None): - """Recursively copy the src_dir to the dest_dir, filtering - out entries using the ignore lambda. 
The behavior of the - ignore lambda must match that described by `shutil.copytree`. - This `copytree` function knows how to prefetch data when - running in an eden repo. - TODO: I'd like to either extend this or add a variant that - uses watchman to mirror src_dir into dest_dir. - """ - prefetch_dir_if_eden(src_dir) - # pyre-fixme[6]: For 3rd param expected - # `Union[typing.Callable[[Union[PathLike[str], str], List[str]], Iterable[str]], - # typing.Callable[[str, List[str]], Iterable[str]], None]` but got `bool`. - return shutil.copytree(src_dir, dest_dir, ignore=ignore) diff --git a/build/fbcode_builder/getdeps/dyndeps.py b/build/fbcode_builder/getdeps/dyndeps.py deleted file mode 100644 index e33db7940c570..0000000000000 --- a/build/fbcode_builder/getdeps/dyndeps.py +++ /dev/null @@ -1,454 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import errno -import glob -import os -import re -import shlex -import shutil -import stat -import subprocess -import sys -from struct import unpack -from typing import List, Optional - -OBJECT_SUBDIRS = ("bin", "lib", "lib64") - - -def copyfile(src, dest) -> None: - shutil.copyfile(src, dest) - shutil.copymode(src, dest) - - -class DepBase(object): - def __init__(self, buildopts, install_dirs, strip) -> None: - self.buildopts = buildopts - self.env = buildopts.compute_env_for_install_dirs(install_dirs) - self.install_dirs = install_dirs - self.strip = strip - - # Deduplicates dependency processing. Keyed on the library - # destination path. - self.processed_deps = set() - - def list_dynamic_deps(self, objfile): - raise RuntimeError("list_dynamic_deps not implemented") - - def interesting_dep(self, d) -> bool: - return True - - # final_install_prefix must be the equivalent path to `destdir` on the - # installed system. For example, if destdir is `/tmp/RANDOM/usr/local' which - # is intended to map to `/usr/local` in the install image, then - # final_install_prefix='/usr/local'. - # If left unspecified, destdir will be used. - def process_deps(self, destdir, final_install_prefix=None) -> None: - if self.buildopts.is_windows(): - lib_dir = "bin" - else: - lib_dir = "lib" - # pyre-fixme[16]: `DepBase` has no attribute `munged_lib_dir`. 
- self.munged_lib_dir = os.path.join(destdir, lib_dir) - - final_lib_dir = os.path.join(final_install_prefix or destdir, lib_dir) - - if not os.path.isdir(self.munged_lib_dir): - os.makedirs(self.munged_lib_dir) - - # Look only at the things that got installed in the leaf package, - # which will be the last entry in the install dirs list - inst_dir = self.install_dirs[-1] - print("Process deps under %s" % inst_dir, file=sys.stderr) - - for dir in OBJECT_SUBDIRS: - src_dir = os.path.join(inst_dir, dir) - if not os.path.isdir(src_dir): - continue - dest_dir = os.path.join(destdir, dir) - if not os.path.exists(dest_dir): - os.makedirs(dest_dir) - - for objfile in self.list_objs_in_dir(src_dir): - print("Consider %s/%s" % (dir, objfile)) - dest_obj = os.path.join(dest_dir, objfile) - copyfile(os.path.join(src_dir, objfile), dest_obj) - self.munge_in_place(dest_obj, final_lib_dir) - - def find_all_dependencies(self, build_dir): - all_deps = set() - for objfile in self.list_objs_in_dir( - build_dir, recurse=True, output_prefix=build_dir - ): - for d in self.list_dynamic_deps(objfile): - all_deps.add(d) - - interesting_deps = {d for d in all_deps if self.interesting_dep(d)} - dep_paths = [] - for dep in interesting_deps: - dep_path = self.resolve_loader_path(dep) - if dep_path: - dep_paths.append(dep_path) - - return dep_paths - - def munge_in_place(self, objfile, final_lib_dir) -> None: - print("Munging %s" % objfile) - for d in self.list_dynamic_deps(objfile): - if not self.interesting_dep(d): - continue - - # Resolve this dep: does it exist in any of our installation - # directories? If so, then it is a candidate for processing - dep = self.resolve_loader_path(d) - if dep: - # pyre-fixme[16]: `DepBase` has no attribute `munged_lib_dir`. - dest_dep = os.path.join(self.munged_lib_dir, os.path.basename(dep)) - print("dep: %s -> %s" % (d, dest_dep)) - if dest_dep in self.processed_deps: - # A previous dependency with the same name has already - # been installed at dest_dep, so there is no need to copy - # or munge the dependency again. 
- # TODO: audit that both source paths have the same inode number - pass - else: - self.processed_deps.add(dest_dep) - copyfile(dep, dest_dep) - self.munge_in_place(dest_dep, final_lib_dir) - - self.rewrite_dep(objfile, d, dep, dest_dep, final_lib_dir) - - if self.strip: - self.strip_debug_info(objfile) - - def rewrite_dep(self, objfile, depname, old_dep, new_dep, final_lib_dir): - raise RuntimeError("rewrite_dep not implemented") - - def resolve_loader_path(self, dep: str) -> Optional[str]: - if os.path.isabs(dep): - return dep - d = os.path.basename(dep) - for inst_dir in self.install_dirs: - for libdir in OBJECT_SUBDIRS: - candidate = os.path.join(inst_dir, libdir, d) - if os.path.exists(candidate): - return candidate - return None - - def list_objs_in_dir(self, dir, recurse: bool = False, output_prefix: str = ""): - for entry in os.listdir(dir): - entry_path = os.path.join(dir, entry) - st = os.lstat(entry_path) - if stat.S_ISREG(st.st_mode): - if self.is_objfile(entry_path): - relative_result = os.path.join(output_prefix, entry) - yield os.path.normcase(relative_result) - elif recurse and stat.S_ISDIR(st.st_mode): - child_prefix = os.path.join(output_prefix, entry) - for result in self.list_objs_in_dir( - entry_path, recurse=recurse, output_prefix=child_prefix - ): - yield result - - def is_objfile(self, objfile) -> bool: - return True - - def strip_debug_info(self, objfile) -> None: - """override this to define how to remove debug information - from an object file""" - pass - - def check_call_verbose(self, args: List[str]) -> None: - print(" ".join(map(shlex.quote, args))) - subprocess.check_call(args) - - -class WinDeps(DepBase): - def __init__(self, buildopts, install_dirs, strip) -> None: - super(WinDeps, self).__init__(buildopts, install_dirs, strip) - self.dumpbin = self.find_dumpbin() - - def find_dumpbin(self) -> str: - # Looking for dumpbin in the following hardcoded paths. - # The registry option to find the install dir doesn't work anymore. - globs = [ - ( - "C:/Program Files (x86)/" - "Microsoft Visual Studio/" - "*/*/VC/Tools/" - "MSVC/*/bin/Hostx64/x64/dumpbin.exe" - ), - ( - "C:/Program Files (x86)/" - "Common Files/" - "Microsoft/Visual C++ for Python/*/" - "VC/bin/dumpbin.exe" - ), - ("c:/Program Files (x86)/Microsoft Visual Studio */VC/bin/dumpbin.exe"), - ( - "C:/Program Files/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/HostX64/x64/dumpbin.exe" - ), - ] - for pattern in globs: - for exe in glob.glob(pattern): - return exe - - raise RuntimeError("could not find dumpbin.exe") - - def list_dynamic_deps(self, exe): - deps = [] - print("Resolve deps for %s" % exe) - output = subprocess.check_output( - [self.dumpbin, "/nologo", "/dependents", exe] - ).decode("utf-8") - - lines = output.split("\n") - for line in lines: - m = re.match("\\s+(\\S+.dll)", line, re.IGNORECASE) - if m: - deps.append(m.group(1).lower()) - - return deps - - def rewrite_dep(self, objfile, depname, old_dep, new_dep, final_lib_dir) -> None: - # We can't rewrite on windows, but we will - # place the deps alongside the exe so that - # they end up in the search path - pass - - # These are the Windows system dll, which we don't want to copy while - # packaging. 
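-    # Note that list_dynamic_deps() lower-cases the names it returns, so
-    # membership tests against this set are effectively case-insensitive.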
- SYSTEM_DLLS = set( # noqa: C405 - [ - "advapi32.dll", - "dbghelp.dll", - "kernel32.dll", - "msvcp140.dll", - "vcruntime140.dll", - "ws2_32.dll", - "ntdll.dll", - "shlwapi.dll", - ] - ) - - def interesting_dep(self, d) -> bool: - if "api-ms-win-crt" in d: - return False - if d in self.SYSTEM_DLLS: - return False - return True - - def is_objfile(self, objfile) -> bool: - if not os.path.isfile(objfile): - return False - if objfile.lower().endswith(".exe"): - return True - return False - - def emit_dev_run_script(self, script_path, dep_dirs) -> None: - """Emit a script that can be used to run build artifacts directly from the - build directory, without installing them. - - The dep_dirs parameter should be a list of paths that need to be added to $PATH. - This can be computed by calling compute_dependency_paths() or - compute_dependency_paths_fast(). - - This is only necessary on Windows, which does not have RPATH, and instead - requires the $PATH environment variable be updated in order to find the proper - library dependencies. - """ - contents = self._get_dev_run_script_contents(dep_dirs) - with open(script_path, "w") as f: - f.write(contents) - - def compute_dependency_paths(self, build_dir): - """Return a list of all directories that need to be added to $PATH to ensure - that library dependencies can be found correctly. This is computed by scanning - binaries to determine exactly the right list of dependencies. - - The compute_dependency_paths_fast() is a alternative function that runs faster - but may return additional extraneous paths. - """ - dep_dirs = set() - # Find paths by scanning the binaries. - for dep in self.find_all_dependencies(build_dir): - dep_dirs.add(os.path.dirname(dep)) - - dep_dirs.update(self.read_custom_dep_dirs(build_dir)) - return sorted(dep_dirs) - - def compute_dependency_paths_fast(self, build_dir): - """Similar to compute_dependency_paths(), but rather than actually scanning - binaries, just add all library paths from the specified installation - directories. This is much faster than scanning the binaries, but may result in - more paths being returned than actually necessary. - """ - dep_dirs = set() - for inst_dir in self.install_dirs: - for subdir in OBJECT_SUBDIRS: - path = os.path.join(inst_dir, subdir) - if os.path.exists(path): - dep_dirs.add(path) - - dep_dirs.update(self.read_custom_dep_dirs(build_dir)) - return sorted(dep_dirs) - - def read_custom_dep_dirs(self, build_dir): - # The build system may also have included libraries from other locations that - # we might not be able to find normally in find_all_dependencies(). - # To handle this situation we support reading additional library paths - # from a LIBRARY_DEP_DIRS.txt file that may have been generated in the build - # output directory. 
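-        # The expected format is one directory per line, e.g. (paths here
-        # are illustrative):
-        #
-        #   C:\scratch\installed\zlib\lib
-        #   C:\scratch\installed\openssl\lib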
- dep_dirs = set() - try: - explicit_dep_dirs_path = os.path.join(build_dir, "LIBRARY_DEP_DIRS.txt") - with open(explicit_dep_dirs_path, "r") as f: - for line in f.read().splitlines(): - dep_dirs.add(line) - except OSError as ex: - if ex.errno != errno.ENOENT: - raise - - return dep_dirs - - def _get_dev_run_script_contents(self, path_dirs) -> str: - path_entries = ["$env:PATH"] + path_dirs - path_str = ";".join(path_entries) - return """\ -$orig_env = $env:PATH -$env:PATH = "{path_str}" - -try {{ - $cmd_args = $args[1..$args.length] - & $args[0] @cmd_args -}} finally {{ - $env:PATH = $orig_env -}} -""".format( - path_str=path_str - ) - - -class ElfDeps(DepBase): - def __init__(self, buildopts, install_dirs, strip) -> None: - super(ElfDeps, self).__init__(buildopts, install_dirs, strip) - - # We need patchelf to rewrite deps, so ensure that it is built... - subprocess.check_call([sys.executable, sys.argv[0], "build", "patchelf"]) - # ... and that we know where it lives - self.patchelf = os.path.join( - os.fsdecode( - subprocess.check_output( - [sys.executable, sys.argv[0], "show-inst-dir", "patchelf"] - ).strip() - ), - "bin/patchelf", - ) - - def list_dynamic_deps(self, objfile): - out = ( - subprocess.check_output( - [self.patchelf, "--print-needed", objfile], env=dict(self.env.items()) - ) - .decode("utf-8") - .strip() - ) - lines = out.split("\n") - return lines - - def rewrite_dep(self, objfile, depname, old_dep, new_dep, final_lib_dir) -> None: - final_dep = os.path.join( - final_lib_dir, - # pyre-fixme[16]: `ElfDeps` has no attribute `munged_lib_dir`. - os.path.relpath(new_dep, self.munged_lib_dir), - ) - self.check_call_verbose( - [self.patchelf, "--replace-needed", depname, final_dep, objfile] - ) - - def is_objfile(self, objfile) -> bool: - if not os.path.isfile(objfile): - return False - with open(objfile, "rb") as f: - # https://en.wikipedia.org/wiki/Executable_and_Linkable_Format#File_header - magic = f.read(4) - return magic == b"\x7fELF" - - def strip_debug_info(self, objfile) -> None: - self.check_call_verbose(["strip", objfile]) - - -# MACH-O magic number -MACH_MAGIC = 0xFEEDFACF - - -class MachDeps(DepBase): - def interesting_dep(self, d) -> bool: - if d.startswith("/usr/lib/") or d.startswith("/System/"): - return False - return True - - def is_objfile(self, objfile): - if not os.path.isfile(objfile): - return False - with open(objfile, "rb") as f: - # mach stores the magic number in native endianness, - # so unpack as native here and compare - header = f.read(4) - if len(header) != 4: - return False - magic = unpack("I", header)[0] - return magic == MACH_MAGIC - - def list_dynamic_deps(self, objfile): - if not self.interesting_dep(objfile): - return - out = ( - subprocess.check_output( - ["otool", "-L", objfile], env=dict(self.env.items()) - ) - .decode("utf-8") - .strip() - ) - lines = out.split("\n") - deps = [] - for line in lines: - m = re.match("\t(\\S+)\\s", line) - if m: - if os.path.basename(m.group(1)) != os.path.basename(objfile): - deps.append(os.path.normcase(m.group(1))) - return deps - - def rewrite_dep(self, objfile, depname, old_dep, new_dep, final_lib_dir) -> None: - if objfile.endswith(".dylib"): - # Erase the original location from the id of the shared - # object. It doesn't appear to hurt to retain it, but - # it does look weird, so let's rewrite it to be sure. 
- self.check_call_verbose( - ["install_name_tool", "-id", os.path.basename(objfile), objfile] - ) - final_dep = os.path.join( - final_lib_dir, - # pyre-fixme[16]: `MachDeps` has no attribute `munged_lib_dir`. - os.path.relpath(new_dep, self.munged_lib_dir), - ) - - self.check_call_verbose( - ["install_name_tool", "-change", depname, final_dep, objfile] - ) - - -def create_dyn_dep_munger( - buildopts, install_dirs, strip: bool = False -) -> Optional[DepBase]: - if buildopts.is_linux(): - return ElfDeps(buildopts, install_dirs, strip) - if buildopts.is_darwin(): - return MachDeps(buildopts, install_dirs, strip) - if buildopts.is_windows(): - return WinDeps(buildopts, install_dirs, strip) - if buildopts.is_freebsd(): - return ElfDeps(buildopts, install_dirs, strip) - return None diff --git a/build/fbcode_builder/getdeps/envfuncs.py b/build/fbcode_builder/getdeps/envfuncs.py deleted file mode 100644 index 60de6b23143e0..0000000000000 --- a/build/fbcode_builder/getdeps/envfuncs.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import os -import shlex -import sys -from typing import Optional - - -class Env(object): - def __init__(self, src=None) -> None: - self._dict = {} - if src is None: - self.update(os.environ) - else: - self.update(src) - - def update(self, src) -> None: - for k, v in src.items(): - self.set(k, v) - - def copy(self) -> "Env": - return Env(self._dict) - - def _key(self, key): - # The `str` cast may not appear to be needed, but without it we run - # into issues when passing the environment to subprocess. The main - # issue is that in python2 `os.environ` (which is the initial source - # of data for the environment) uses byte based strings, but this - # project uses `unicode_literals`. `subprocess` will raise an error - # if the environment that it is passed has a mixture of byte and - # unicode strings. - # It is simplest to force everything to be `str` for the sake of - # consistency. - key = str(key) - if sys.platform.startswith("win"): - # Windows env var names are case insensitive but case preserving. - # An implementation of PAR files on windows gets confused if - # the env block contains keys with conflicting case, so make a - # pass over the contents to remove any. - # While this O(n) scan is technically expensive and gross, it - # is practically not a problem because the volume of calls is - # relatively low and the cost of manipulating the env is dwarfed - # by the cost of spawning a process on windows. In addition, - # since the processes that we run are expensive anyway, this - # overhead is not the worst thing to worry about. 
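-        # e.g. a lookup of env["PATH"] must find an entry stored as "Path".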
-        for k in list(self._dict.keys()):
-            if str(k).lower() == key.lower():
-                return k
-        elif key in self._dict:
-            return key
-        return None
-
-    def get(self, key, defval=None):
-        key = self._key(key)
-        if key is None:
-            return defval
-        return self._dict[key]
-
-    def __getitem__(self, key):
-        val = self.get(key)
-        if val is None:
-            raise KeyError(key)
-        return val
-
-    def unset(self, key) -> None:
-        if key is None:
-            raise KeyError("attempting to unset env[None]")
-
-        key = self._key(key)
-        if key:
-            del self._dict[key]
-
-    def __delitem__(self, key) -> None:
-        self.unset(key)
-
-    def __repr__(self):
-        return repr(self._dict)
-
-    def set(self, key, value) -> None:
-        if key is None:
-            raise KeyError("attempting to assign env[None] = %r" % value)
-
-        if value is None:
-            raise ValueError("attempting to assign env[%s] = None" % key)
-
-        # The `str` conversion is important to avoid triggering errors
-        # with subprocess if we pass in a unicode value; see commentary
-        # in the `_key` method.
-        key = str(key)
-        value = str(value)
-
-        # The `unset` call is necessary on windows where the keys are
-        # case insensitive. Since this dict is case sensitive, simply
-        # assigning the value to the new key is not sufficient to remove
-        # the old value. The `unset` call knows how to match keys and
-        # remove any potential duplicates.
-        self.unset(key)
-        self._dict[key] = value
-
-    def __setitem__(self, key, value) -> None:
-        self.set(key, value)
-
-    def __iter__(self):
-        return self._dict.__iter__()
-
-    def __len__(self) -> int:
-        return len(self._dict)
-
-    def keys(self):
-        return self._dict.keys()
-
-    def values(self):
-        return self._dict.values()
-
-    def items(self):
-        return self._dict.items()
-
-
-def add_path_entry(
-    env, name, item, append: bool = True, separator: str = os.pathsep
-) -> None:
-    """Cause `item` to be added to the path style env var named
-    `name` held in the `env` dict. `append` specifies whether
-    the item is added to the end (the default) or should be
-    prepended if `name` already exists."""
-    val = env.get(name, "")
-    if len(val) > 0:
-        val = val.split(separator)
-    else:
-        val = []
-    if append:
-        val.append(item)
-    else:
-        val.insert(0, item)
-    env.set(name, separator.join(val))
-
-
-def add_flag(env, name, flag: str, append: bool = True) -> None:
-    """Cause `flag` to be added to the CXXFLAGS-style env var named
-    `name` held in the `env` dict. `append` specifies whether the
-    flag is added to the end (the default) or should be prepended if
-    `name` already exists."""
-    val = shlex.split(env.get(name, ""))
-    if append:
-        val.append(flag)
-    else:
-        val.insert(0, flag)
-    env.set(name, " ".join(val))
-
-
-_path_search_cache = {}
-_not_found = object()
-
-
-def tpx_path() -> str:
-    return "xplat/testinfra/tpx/ctp.tpx"
-
-
-def path_search(env, exename: str, defval: Optional[str] = None) -> Optional[str]:
-    """Search for exename in the PATH specified in env.
-    exename is eg: `ninja` and this function knows to append a .exe
-    to the end on windows.
-    Returns the path to the exe if found, or None if either no
-    PATH is set in env or no executable is found."""
-
-    path = env.get("PATH", None)
-    if path is None:
-        return defval
-
-    # The project hash computation code searches for C++ compilers (g++, clang, etc)
-    # repeatedly. Cache the result so we don't end up searching for these over and over
-    # again.
- cache_key = (path, exename) - result = _path_search_cache.get(cache_key, _not_found) - if result is _not_found: - result = _perform_path_search(path, exename) - _path_search_cache[cache_key] = result - return result - - -def _perform_path_search(path, exename: str) -> Optional[str]: - is_win = sys.platform.startswith("win") - if is_win: - exename = "%s.exe" % exename - - for bindir in path.split(os.pathsep): - full_name = os.path.join(bindir, exename) - if os.path.exists(full_name) and os.path.isfile(full_name): - if not is_win and not os.access(full_name, os.X_OK): - continue - return full_name - - return None diff --git a/build/fbcode_builder/getdeps/errors.py b/build/fbcode_builder/getdeps/errors.py deleted file mode 100644 index 92240c9538c21..0000000000000 --- a/build/fbcode_builder/getdeps/errors.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -class TransientFailure(Exception): - """Raising this error causes getdeps to return with an error code - that Sandcastle will consider to be a retryable transient - infrastructure error""" - - pass - - -class ManifestNotFound(Exception): - def __init__(self, manifest_name) -> None: - super(Exception, self).__init__("Unable to find manifest '%s'" % manifest_name) diff --git a/build/fbcode_builder/getdeps/expr.py b/build/fbcode_builder/getdeps/expr.py deleted file mode 100644 index df8c3022732df..0000000000000 --- a/build/fbcode_builder/getdeps/expr.py +++ /dev/null @@ -1,184 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import re -import shlex - - -def parse_expr(expr_text, valid_variables): - """parses the simple criteria expression syntax used in - dependency specifications. - Returns an ExprNode instance that can be evaluated like this: - - ``` - expr = parse_expr("os=windows") - ok = expr.eval({ - "os": "windows" - }) - ``` - - Whitespace is allowed between tokens. The following terms - are recognized: - - KEY = VALUE # Evaluates to True if ctx[KEY] == VALUE - not(EXPR) # Evaluates to True if EXPR evaluates to False - # and vice versa - all(EXPR1, EXPR2, ...) # Evaluates True if all of the supplied - # EXPR's also evaluate True - any(EXPR1, EXPR2, ...) # Evaluates True if any of the supplied - # EXPR's also evaluate True, False if - # none of them evaluated true. 
- """ - - p = Parser(expr_text, valid_variables) - return p.parse() - - -class ExprNode(object): - def eval(self, ctx) -> bool: - return False - - -class TrueExpr(ExprNode): - def eval(self, ctx) -> bool: - return True - - def __str__(self) -> str: - return "true" - - -class NotExpr(ExprNode): - def __init__(self, node) -> None: - self._node = node - - def eval(self, ctx) -> bool: - return not self._node.eval(ctx) - - def __str__(self) -> str: - return "not(%s)" % self._node - - -class AllExpr(ExprNode): - def __init__(self, nodes) -> None: - self._nodes = nodes - - def eval(self, ctx) -> bool: - for node in self._nodes: - if not node.eval(ctx): - return False - return True - - def __str__(self) -> str: - items = [] - for node in self._nodes: - items.append(str(node)) - return "all(%s)" % ",".join(items) - - -class AnyExpr(ExprNode): - def __init__(self, nodes) -> None: - self._nodes = nodes - - def eval(self, ctx) -> bool: - for node in self._nodes: - if node.eval(ctx): - return True - return False - - def __str__(self) -> str: - items = [] - for node in self._nodes: - items.append(str(node)) - return "any(%s)" % ",".join(items) - - -class EqualExpr(ExprNode): - def __init__(self, key, value) -> None: - self._key = key - self._value = value - - def eval(self, ctx): - return ctx.get(self._key) == self._value - - def __str__(self) -> str: - return "%s=%s" % (self._key, self._value) - - -class Parser(object): - def __init__(self, text, valid_variables) -> None: - self.text = text - self.lex = shlex.shlex(text) - self.valid_variables = valid_variables - - def parse(self): - expr = self.top() - garbage = self.lex.get_token() - if garbage != "": - raise Exception( - "Unexpected token %s after EqualExpr in %s" % (garbage, self.text) - ) - return expr - - def top(self): - name = self.ident() - op = self.lex.get_token() - - if op == "(": - parsers = { - "not": self.parse_not, - "any": self.parse_any, - "all": self.parse_all, - } - func = parsers.get(name) - if not func: - raise Exception("invalid term %s in %s" % (name, self.text)) - return func() - - if op == "=": - if name not in self.valid_variables: - raise Exception("unknown variable %r in expression" % (name,)) - # remove shell quote from value so can test things with period in them, e.g "18.04" - unquoted = " ".join(shlex.split(self.lex.get_token())) - return EqualExpr(name, unquoted) - - raise Exception( - "Unexpected token sequence '%s %s' in %s" % (name, op, self.text) - ) - - def ident(self) -> str: - ident = self.lex.get_token() - if not re.match("[a-zA-Z]+", ident): - raise Exception("expected identifier found %s" % ident) - return ident - - def parse_not(self) -> NotExpr: - node = self.top() - expr = NotExpr(node) - tok = self.lex.get_token() - if tok != ")": - raise Exception("expected ')' found %s" % tok) - return expr - - def parse_any(self) -> AnyExpr: - nodes = [] - while True: - nodes.append(self.top()) - tok = self.lex.get_token() - if tok == ")": - break - if tok != ",": - raise Exception("expected ',' or ')' but found %s" % tok) - return AnyExpr(nodes) - - def parse_all(self) -> AllExpr: - nodes = [] - while True: - nodes.append(self.top()) - tok = self.lex.get_token() - if tok == ")": - break - if tok != ",": - raise Exception("expected ',' or ')' but found %s" % tok) - return AllExpr(nodes) diff --git a/build/fbcode_builder/getdeps/fetcher.py b/build/fbcode_builder/getdeps/fetcher.py deleted file mode 100644 index ae96ac133426c..0000000000000 --- a/build/fbcode_builder/getdeps/fetcher.py +++ /dev/null @@ -1,853 +0,0 @@ 
-#!/usr/bin/env python3
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-import errno
-import hashlib
-import os
-import re
-import shutil
-import stat
-import subprocess
-import sys
-import tarfile
-import time
-import zipfile
-from datetime import datetime
-from typing import Dict, NamedTuple
-from urllib.parse import urlparse
-from urllib.request import Request, urlopen
-
-from .copytree import prefetch_dir_if_eden
-from .envfuncs import Env
-from .errors import TransientFailure
-from .platform import is_windows
-from .runcmd import run_cmd
-
-
-def file_name_is_cmake_file(file_name):
-    file_name = file_name.lower()
-    base = os.path.basename(file_name)
-    return (
-        base.endswith(".cmake")
-        or base.endswith(".cmake.in")
-        or base == "cmakelists.txt"
-    )
-
-
-class ChangeStatus(object):
-    """Indicates the nature of changes that happened while updating
-    the source directory. There are two broad uses:
-    * When extracting archives for third party software we want to
-      know that we did something (eg: we either extracted code or
-      we didn't do anything)
-    * For 1st party code where we use shipit to transform the code,
-      we want to know if we changed anything so that we can perform
-      a build, but we generally want to be a little more nuanced
-      and be able to distinguish between just changing a source file
-      and whether we might need to reconfigure the build system.
-    """
-
-    def __init__(self, all_changed: bool = False) -> None:
-        """Construct a ChangeStatus object. The default is to create
-        a status that indicates no changes, but passing all_changed=True
-        will create one that indicates that everything changed"""
-        if all_changed:
-            self.source_files = 1
-            self.make_files = 1
-        else:
-            self.source_files = 0
-            self.make_files = 0
-
-    def record_change(self, file_name) -> None:
-        """Used by the shipit fetcher to record changes as it updates
-        files in the destination. If the file name might be one used
-        in the cmake build system that we use for 1st party code, then
-        record that as a "make file" change. We could broaden this
-        to match any file used by various build systems, but it is
-        only really useful for our internal cmake stuff at this time.
-        If the file isn't a build file and is under the `fbcode_builder`
-        dir then we don't class that as an interesting change that we
-        might need to rebuild, so we ignore it.
-        Otherwise we record the file as a source file change."""
-
-        file_name = file_name.lower()
-        if file_name_is_cmake_file(file_name):
-            self.make_files += 1
-        elif "/fbcode_builder/cmake" in file_name:
-            self.source_files += 1
-        elif "/fbcode_builder/" not in file_name:
-            self.source_files += 1
-
-    def sources_changed(self) -> bool:
-        """Returns true if any source files were changed during
-        an update operation. This will typically be used to decide
-        that the build system should be run on the source dir in
-        incremental mode"""
-        return self.source_files > 0
-
-    def build_changed(self) -> bool:
-        """Returns true if any build files were changed during
-        an update operation. This will typically be used to decide
-        that the build system should be reconfigured and re-run
-        as a full build"""
-        return self.make_files > 0
-
-
-class Fetcher(object):
-    """The Fetcher is responsible for fetching and extracting the
-    sources for a project.
The Fetcher instance defines where the - extracted data resides and reports this to the consumer via - its `get_src_dir` method.""" - - def update(self) -> ChangeStatus: - """Brings the src dir up to date, ideally minimizing - changes so that a subsequent build doesn't over-build. - Returns a ChangeStatus object that helps the caller to - understand the nature of the changes required during - the update.""" - return ChangeStatus() - - def clean(self) -> None: - """Reverts any changes that might have been made to - the src dir""" - pass - - def hash(self) -> None: - """Returns a hash that identifies the version of the code in the - working copy. For a git repo this is commit hash for the working - copy. For other Fetchers this should relate to the version of - the code in the src dir. The intent is that if a manifest - changes the version/rev of a project that the hash be different. - Importantly, this should be computable without actually fetching - the code, as we want this to factor into a hash used to download - a pre-built version of the code, without having to first download - and extract its sources (eg: boost on windows is pretty painful). - """ - pass - - def get_src_dir(self) -> None: - """Returns the source directory that the project was - extracted into""" - pass - - -class LocalDirFetcher(object): - """This class exists to override the normal fetching behavior, and - use an explicit user-specified directory for the project sources. - - This fetcher cannot update or track changes. It always reports that the - project has changed, forcing it to always be built.""" - - def __init__(self, path) -> None: - self.path = os.path.realpath(path) - - def update(self) -> ChangeStatus: - return ChangeStatus(all_changed=True) - - def hash(self) -> str: - return "0" * 40 - - def get_src_dir(self): - return self.path - - -class SystemPackageFetcher(object): - def __init__(self, build_options, packages) -> None: - self.manager = build_options.host_type.get_package_manager() - self.packages = packages.get(self.manager) - self.host_type = build_options.host_type - if self.packages: - self.installed = None - else: - self.installed = False - - def packages_are_installed(self): - if self.installed is not None: - return self.installed - - cmd = None - if self.manager == "rpm": - cmd = ["rpm", "-q"] + sorted(self.packages) - elif self.manager == "deb": - cmd = ["dpkg", "-s"] + sorted(self.packages) - elif self.manager == "homebrew": - cmd = ["brew", "ls", "--versions"] + sorted(self.packages) - - if cmd: - proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if proc.returncode == 0: - # captured as binary as we will hash this later - self.installed = proc.stdout - else: - # Need all packages to be present to consider us installed - self.installed = False - - else: - self.installed = False - - return bool(self.installed) - - def update(self) -> ChangeStatus: - assert self.installed - return ChangeStatus(all_changed=False) - - def hash(self) -> str: - if self.packages_are_installed(): - return hashlib.sha256(self.installed).hexdigest() - else: - return "0" * 40 - - def get_src_dir(self) -> None: - return None - - -class PreinstalledNopFetcher(SystemPackageFetcher): - def __init__(self) -> None: - self.installed = True - - -class GitFetcher(Fetcher): - DEFAULT_DEPTH = 1 - - def __init__(self, build_options, manifest, repo_url, rev, depth) -> None: - # Extract the host/path portions of the URL and generate a flattened - # directory name. 
eg: - # github.com/facebook/folly.git -> github.com-facebook-folly.git - url = urlparse(repo_url) - directory = "%s%s" % (url.netloc, url.path) - for s in ["/", "\\", ":"]: - directory = directory.replace(s, "-") - - # Place it in a repos dir in the scratch space - repos_dir = os.path.join(build_options.scratch_dir, "repos") - if not os.path.exists(repos_dir): - os.makedirs(repos_dir) - self.repo_dir = os.path.join(repos_dir, directory) - - if not rev and build_options.project_hashes: - hash_file = os.path.join( - build_options.project_hashes, - re.sub("\\.git$", "-rev.txt", url.path[1:]), - ) - if os.path.exists(hash_file): - with open(hash_file, "r") as f: - data = f.read() - m = re.match("Subproject commit ([a-fA-F0-9]{40})", data) - if not m: - raise Exception("Failed to parse rev from %s" % hash_file) - rev = m.group(1) - print("Using pinned rev %s for %s" % (rev, repo_url)) - - self.rev = rev or "main" - self.origin_repo = repo_url - self.manifest = manifest - self.depth = depth if depth else GitFetcher.DEFAULT_DEPTH - - def _update(self) -> ChangeStatus: - current_hash = ( - subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=self.repo_dir) - .strip() - .decode("utf-8") - ) - target_hash = ( - subprocess.check_output(["git", "rev-parse", self.rev], cwd=self.repo_dir) - .strip() - .decode("utf-8") - ) - if target_hash == current_hash: - # It's up to date, so there are no changes. This doesn't detect eg: - # if origin/main moved and rev='main', but that's ok for our purposes; - # we should be using explicit hashes or eg: a stable branch for the cases - # that we care about, and it isn't unreasonable to require that the user - # explicitly perform a clean build if those have moved. For the most - # part we prefer that folks build using a release tarball from github - # rather than use the git protocol, as it is generally a bit quicker - # to fetch and easier to hash and verify tarball downloads. - return ChangeStatus() - - print("Updating %s -> %s" % (self.repo_dir, self.rev)) - run_cmd(["git", "fetch", "origin", self.rev], cwd=self.repo_dir) - run_cmd(["git", "checkout", self.rev], cwd=self.repo_dir) - run_cmd(["git", "submodule", "update", "--init"], cwd=self.repo_dir) - - return ChangeStatus(True) - - def update(self) -> ChangeStatus: - if os.path.exists(self.repo_dir): - return self._update() - self._clone() - return ChangeStatus(True) - - def _clone(self) -> None: - print("Cloning %s..." % self.origin_repo) - # The basename/dirname stuff allows us to dance around issues where - # eg: this python process is native win32, but the git.exe is cygwin - # or msys and doesn't like the absolute windows path that we'd otherwise - # pass to it. Careful use of cwd helps avoid headaches with cygpath. 
- run_cmd( - [ - "git", - "clone", - "--depth=" + str(self.depth), - "--", - self.origin_repo, - os.path.basename(self.repo_dir), - ], - cwd=os.path.dirname(self.repo_dir), - ) - self._update() - - def clean(self) -> None: - if os.path.exists(self.repo_dir): - run_cmd(["git", "clean", "-fxd"], cwd=self.repo_dir) - - def hash(self): - return self.rev - - def get_src_dir(self): - return self.repo_dir - - -def does_file_need_update(src_name, src_st, dest_name): - try: - target_st = os.lstat(dest_name) - except OSError as exc: - if exc.errno != errno.ENOENT: - raise - return True - - if src_st.st_size != target_st.st_size: - return True - - if stat.S_IFMT(src_st.st_mode) != stat.S_IFMT(target_st.st_mode): - return True - if stat.S_ISLNK(src_st.st_mode): - return os.readlink(src_name) != os.readlink(dest_name) - if not stat.S_ISREG(src_st.st_mode): - return True - - # They might have the same content; compare. - with open(src_name, "rb") as sf, open(dest_name, "rb") as df: - chunk_size = 8192 - while True: - src_data = sf.read(chunk_size) - dest_data = df.read(chunk_size) - if src_data != dest_data: - return True - if len(src_data) < chunk_size: - # EOF - break - return False - - -def copy_if_different(src_name, dest_name) -> bool: - """Copy src_name -> dest_name, but only touch dest_name - if src_name is different from dest_name, making this a - more build system friendly way to copy.""" - src_st = os.lstat(src_name) - if not does_file_need_update(src_name, src_st, dest_name): - return False - - dest_parent = os.path.dirname(dest_name) - if not os.path.exists(dest_parent): - os.makedirs(dest_parent) - if stat.S_ISLNK(src_st.st_mode): - try: - os.unlink(dest_name) - except OSError as exc: - if exc.errno != errno.ENOENT: - raise - target = os.readlink(src_name) - print("Symlinking %s -> %s" % (dest_name, target)) - os.symlink(target, dest_name) - else: - print("Copying %s -> %s" % (src_name, dest_name)) - shutil.copy2(src_name, dest_name) - - return True - - -def list_files_under_dir_newer_than_timestamp(dir_to_scan, ts): - for root, _dirs, files in os.walk(dir_to_scan): - for src_file in files: - full_name = os.path.join(root, src_file) - st = os.lstat(full_name) - if st.st_mtime > ts: - yield full_name - - -class ShipitPathMap(object): - def __init__(self) -> None: - self.roots = [] - self.mapping = [] - self.exclusion = [] - - def add_mapping(self, fbsource_dir, target_dir) -> None: - """Add a posix path or pattern. We cannot normpath the input - here because that would change the paths from posix to windows - form and break the logic throughout this class.""" - self.roots.append(fbsource_dir) - self.mapping.append((fbsource_dir, target_dir)) - - def add_exclusion(self, pattern) -> None: - self.exclusion.append(re.compile(pattern)) - - def _minimize_roots(self) -> None: - """compute the de-duplicated set of roots within fbsource. 
-        We take the shortest common directory prefix to make this
-        determination"""
-        self.roots.sort(key=len)
-        minimized = []
-
-        for r in self.roots:
-            add_this_entry = True
-            for existing in minimized:
-                if r.startswith(existing + "/"):
-                    add_this_entry = False
-                    break
-            if add_this_entry:
-                minimized.append(r)
-
-        self.roots = minimized
-
-    def _sort_mapping(self) -> None:
-        self.mapping.sort(reverse=True, key=lambda x: len(x[0]))
-
-    def _map_name(self, norm_name, dest_root):
-        if norm_name.endswith(".pyc") or norm_name.endswith(".swp"):
-            # Ignore some incidental garbage while iterating
-            return None
-
-        for excl in self.exclusion:
-            if excl.match(norm_name):
-                return None
-
-        for src_name, dest_name in self.mapping:
-            if norm_name == src_name or norm_name.startswith(src_name + "/"):
-                rel_name = os.path.relpath(norm_name, src_name)
-                # We can have "." as a component of some paths, depending
-                # on the contents of the shipit transformation section.
-                # normpath doesn't always remove `.` as the final component
-                # of the path, which can be problematic when we later mkdir
-                # the dirname of the path that we return. Take care to avoid
-                # returning a path with a `.` in it.
-                rel_name = os.path.normpath(rel_name)
-                if dest_name == ".":
-                    return os.path.normpath(os.path.join(dest_root, rel_name))
-                dest_name = os.path.normpath(dest_name)
-                return os.path.normpath(os.path.join(dest_root, dest_name, rel_name))
-
-        raise Exception("%s did not match any rules" % norm_name)
-
-    def mirror(self, fbsource_root, dest_root) -> ChangeStatus:
-        self._minimize_roots()
-        self._sort_mapping()
-
-        change_status = ChangeStatus()
-
-        # Record the full set of files that should be in the tree
-        full_file_list = set()
-
-        if sys.platform == "win32":
-            # Let's not assume st_dev has a consistent value on Windows.
-            def st_dev(path):
-                return 1
-
-        else:
-
-            def st_dev(path):
-                return os.lstat(path).st_dev
-
-        for fbsource_subdir in self.roots:
-            dir_to_mirror = os.path.join(fbsource_root, fbsource_subdir)
-            root_dev = st_dev(dir_to_mirror)
-            prefetch_dir_if_eden(dir_to_mirror)
-            if not os.path.exists(dir_to_mirror):
-                raise Exception(
-                    "%s doesn't exist; check your sparse profile!" % dir_to_mirror
-                )
-
-            for root, dirs, files in os.walk(dir_to_mirror):
-                dirs[:] = [d for d in dirs if root_dev == st_dev(os.path.join(root, d))]
-
-                for src_file in files:
-                    full_name = os.path.join(root, src_file)
-                    rel_name = os.path.relpath(full_name, fbsource_root)
-                    norm_name = rel_name.replace("\\", "/")
-
-                    target_name = self._map_name(norm_name, dest_root)
-                    if target_name:
-                        full_file_list.add(target_name)
-                        if copy_if_different(full_name, target_name):
-                            change_status.record_change(target_name)
-
-        # Compare the list of previously shipped files; if a file is
-        # in the old list but not the new list then it has been
-        # removed from the source and should be removed from the
-        # destination.
-        # Why don't we simply create this list by walking dest_root?
- # Some builds currently have to be in-source builds and - # may legitimately need to keep some state in the source tree :-/ - installed_name = os.path.join(dest_root, ".shipit_shipped") - if os.path.exists(installed_name): - with open(installed_name, "rb") as f: - for name in f.read().decode("utf-8").splitlines(): - name = name.strip() - if name not in full_file_list: - print("Remove %s" % name) - os.unlink(name) - change_status.record_change(name) - - with open(installed_name, "wb") as f: - for name in sorted(list(full_file_list)): - f.write(("%s\n" % name).encode("utf-8")) - - return change_status - - -class FbsourceRepoData(NamedTuple): - hash: str - date: str - - -FBSOURCE_REPO_DATA: Dict[str, FbsourceRepoData] = {} - - -def get_fbsource_repo_data(build_options) -> FbsourceRepoData: - """Returns the commit metadata for the fbsource repo. - Since we may have multiple first party projects to - hash, and because we don't mutate the repo, we cache - this hash in a global.""" - cached_data = FBSOURCE_REPO_DATA.get(build_options.fbsource_dir) - if cached_data: - return cached_data - - if "GETDEPS_HG_REPO_DATA" in os.environ: - log_data = os.environ["GETDEPS_HG_REPO_DATA"] - else: - cmd = ["hg", "log", "-r.", "-T{node}\n{date|hgdate}"] - env = Env() - env.set("HGPLAIN", "1") - log_data = subprocess.check_output( - cmd, cwd=build_options.fbsource_dir, env=dict(env.items()) - ).decode("ascii") - - (hash, datestr) = log_data.split("\n") - - # datestr is like "seconds fractionalseconds" - # We want "20200324.113140" - (unixtime, _fractional) = datestr.split(" ") - date = datetime.fromtimestamp(int(unixtime)).strftime("%Y%m%d.%H%M%S") - cached_data = FbsourceRepoData(hash=hash, date=date) - - FBSOURCE_REPO_DATA[build_options.fbsource_dir] = cached_data - - return cached_data - - -class SimpleShipitTransformerFetcher(Fetcher): - def __init__(self, build_options, manifest, ctx) -> None: - self.build_options = build_options - self.manifest = manifest - self.repo_dir = os.path.join(build_options.scratch_dir, "shipit", manifest.name) - self.ctx = ctx - - def clean(self) -> None: - if os.path.exists(self.repo_dir): - shutil.rmtree(self.repo_dir) - - def update(self) -> ChangeStatus: - mapping = ShipitPathMap() - for src, dest in self.manifest.get_section_as_ordered_pairs( - "shipit.pathmap", self.ctx - ): - mapping.add_mapping(src, dest) - if self.manifest.shipit_fbcode_builder: - mapping.add_mapping( - "fbcode/opensource/fbcode_builder", "build/fbcode_builder" - ) - for pattern in self.manifest.get_section_as_args("shipit.strip", self.ctx): - mapping.add_exclusion(pattern) - - return mapping.mirror(self.build_options.fbsource_dir, self.repo_dir) - - # pyre-fixme[15]: `hash` overrides method defined in `Fetcher` inconsistently. - def hash(self) -> str: - # We return a fixed non-hash string for in-fbsource builds. - # We're relying on the `update` logic to correctly invalidate - # the build in the case that files have changed. 
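The hgdate parsing in get_fbsource_repo_data above can be sketched standalone. The sample log output below is fabricated, and the rendered date depends on the local timezone:

```
from datetime import datetime

# Fabricated sample in the shape hg emits for -T"{node}\n{date|hgdate}":
# a 40-character node, a newline, then the date field.
log_data = "f00dfeed" * 5 + "\n1585050700 0"

(commit_hash, datestr) = log_data.split("\n")
(unixtime, _fractional) = datestr.split(" ")
date = datetime.fromtimestamp(int(unixtime)).strftime("%Y%m%d.%H%M%S")
print(commit_hash, date)  # e.g. f00dfeed... 20200324.113140
```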
- return "fbsource" - - def get_src_dir(self): - return self.repo_dir - - -class ShipitTransformerFetcher(Fetcher): - SHIPIT = "/var/www/scripts/opensource/shipit/run_shipit.php" - - def __init__(self, build_options, project_name) -> None: - self.build_options = build_options - self.project_name = project_name - self.repo_dir = os.path.join(build_options.scratch_dir, "shipit", project_name) - - def update(self) -> ChangeStatus: - if os.path.exists(self.repo_dir): - return ChangeStatus() - self.run_shipit() - return ChangeStatus(True) - - def clean(self) -> None: - if os.path.exists(self.repo_dir): - shutil.rmtree(self.repo_dir) - - @classmethod - def available(cls): - return os.path.exists(cls.SHIPIT) - - def run_shipit(self) -> None: - tmp_path = self.repo_dir + ".new" - try: - if os.path.exists(tmp_path): - shutil.rmtree(tmp_path) - - # Run shipit - run_cmd( - [ - "php", - ShipitTransformerFetcher.SHIPIT, - "--project=" + self.project_name, - "--create-new-repo", - "--source-repo-dir=" + self.build_options.fbsource_dir, - "--source-branch=.", - "--skip-source-init", - "--skip-source-pull", - "--skip-source-clean", - "--skip-push", - "--skip-reset", - "--destination-use-anonymous-https", - "--create-new-repo-output-path=" + tmp_path, - ] - ) - - # Remove the .git directory from the repository it generated. - # There is no need to commit this. - repo_git_dir = os.path.join(tmp_path, ".git") - shutil.rmtree(repo_git_dir) - os.rename(tmp_path, self.repo_dir) - except Exception: - # Clean up after a failed extraction - if os.path.exists(tmp_path): - shutil.rmtree(tmp_path) - self.clean() - raise - - # pyre-fixme[15]: `hash` overrides method defined in `Fetcher` inconsistently. - def hash(self) -> str: - # We return a fixed non-hash string for in-fbsource builds. - return "fbsource" - - def get_src_dir(self): - return self.repo_dir - - -def download_url_to_file_with_progress(url: str, file_name) -> None: - print("Download with %s -> %s ..." % (url, file_name)) - - class Progress(object): - last_report = 0 - - def write_update(self, total, amount): - if total == -1: - total = "(Unknown)" - - if sys.stdout.isatty(): - sys.stdout.write("\r downloading %s of %s " % (amount, total)) - else: - # When logging to CI logs, avoid spamming the logs and print - # status every few seconds - now = time.time() - if now - self.last_report > 5: - sys.stdout.write(".. 
%s of %s " % (amount, total)) - self.last_report = now - sys.stdout.flush() - - def progress_pycurl(self, total, amount, _uploadtotal, _uploadamount): - self.write_update(total, amount) - - progress = Progress() - start = time.time() - try: - if os.environ.get("GETDEPS_USE_WGET") is not None: - subprocess.run( - [ - "wget", - "-O", - file_name, - url, - ] - ) - - headers = None - - elif os.environ.get("GETDEPS_USE_LIBCURL") is not None: - import pycurl - - with open(file_name, "wb") as f: - c = pycurl.Curl() - c.setopt(pycurl.URL, url) - c.setopt(pycurl.WRITEDATA, f) - # display progress - c.setopt(pycurl.NOPROGRESS, False) - c.setopt(pycurl.XFERINFOFUNCTION, progress.progress_pycurl) - c.perform() - c.close() - headers = None - else: - req_header = {"Accept": "application/*"} - res = urlopen(Request(url, None, req_header)) - chunk_size = 8192 # urlretrieve uses this value - headers = res.headers - content_length = res.headers.get("Content-Length") - total = int(content_length.strip()) if content_length else -1 - amount = 0 - with open(file_name, "wb") as f: - chunk = res.read(chunk_size) - while chunk: - f.write(chunk) - amount += len(chunk) - progress.write_update(total, amount) - chunk = res.read(chunk_size) - except (OSError, IOError) as exc: # noqa: B014 - raise TransientFailure( - "Failed to download %s to %s: %s" % (url, file_name, str(exc)) - ) - - end = time.time() - sys.stdout.write(" [Complete in %f seconds]\n" % (end - start)) - sys.stdout.flush() - if headers is not None: - print(f"{headers}") - - -class ArchiveFetcher(Fetcher): - def __init__(self, build_options, manifest, url, sha256) -> None: - self.manifest = manifest - self.url = url - self.sha256 = sha256 - self.build_options = build_options - - url = urlparse(self.url) - basename = "%s-%s" % (manifest.name, os.path.basename(url.path)) - self.file_name = os.path.join(build_options.scratch_dir, "downloads", basename) - self.src_dir = os.path.join(build_options.scratch_dir, "extracted", basename) - self.hash_file = self.src_dir + ".hash" - - def _verify_hash(self) -> None: - h = hashlib.sha256() - with open(self.file_name, "rb") as f: - while True: - block = f.read(8192) - if not block: - break - h.update(block) - digest = h.hexdigest() - if digest != self.sha256: - os.unlink(self.file_name) - raise Exception( - "%s: expected sha256 %s but got %s" % (self.url, self.sha256, digest) - ) - - def _download_dir(self): - """returns the download dir, creating it if it doesn't already exist""" - download_dir = os.path.dirname(self.file_name) - if not os.path.exists(download_dir): - os.makedirs(download_dir) - return download_dir - - def _download(self) -> None: - self._download_dir() - download_url_to_file_with_progress(self.url, self.file_name) - self._verify_hash() - - def clean(self) -> None: - if os.path.exists(self.src_dir): - shutil.rmtree(self.src_dir) - - def update(self) -> ChangeStatus: - try: - with open(self.hash_file, "r") as f: - saved_hash = f.read().strip() - if saved_hash == self.sha256 and os.path.exists(self.src_dir): - # Everything is up to date - return ChangeStatus() - print( - "saved hash %s doesn't match expected hash %s, re-validating" - % (saved_hash, self.sha256) - ) - os.unlink(self.hash_file) - except EnvironmentError: - pass - - # If we got here we know the contents of src_dir are either missing - # or wrong, so blow away whatever happened to be there first. 
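The verification above streams the archive through the hasher in 8 KiB blocks, so even very large downloads never need to fit in memory. The same pattern as a standalone sketch (the function name is illustrative):

```
import hashlib


def sha256_of_file(path, chunk_size=8192):
    # Feed the file to the hasher one chunk at a time, exactly like
    # _verify_hash above, instead of reading it whole.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while True:
            block = f.read(chunk_size)
            if not block:
                break
            h.update(block)
    return h.hexdigest()


print(sha256_of_file(__file__))
```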
- if os.path.exists(self.src_dir): - shutil.rmtree(self.src_dir) - - # If we already have a file here, make sure it looks legit before - # proceeding: any errors and we just remove it and re-download - if os.path.exists(self.file_name): - try: - self._verify_hash() - except Exception: - if os.path.exists(self.file_name): - os.unlink(self.file_name) - - if not os.path.exists(self.file_name): - self._download() - - if tarfile.is_tarfile(self.file_name): - opener = tarfile.open - elif zipfile.is_zipfile(self.file_name): - opener = zipfile.ZipFile - else: - raise Exception("don't know how to extract %s" % self.file_name) - os.makedirs(self.src_dir) - print("Extract %s -> %s" % (self.file_name, self.src_dir)) - t = opener(self.file_name) - if is_windows(): - # Ensure that we don't fall over when dealing with long paths - # on windows - src = r"\\?\%s" % os.path.normpath(self.src_dir) - else: - src = self.src_dir - # The `str` here is necessary to ensure that we don't pass a unicode - # object down to tarfile.extractall on python2. When extracting - # the boost tarball it makes some assumptions and tries to convert - # a non-ascii path to ascii and throws. - src = str(src) - t.extractall(src) - - with open(self.hash_file, "w") as f: - f.write(self.sha256) - - return ChangeStatus(True) - - def hash(self): - return self.sha256 - - def get_src_dir(self): - return self.src_dir - - -def homebrew_package_prefix(package): - cmd = ["brew", "--prefix", package] - try: - proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - except FileNotFoundError: - return - - if proc.returncode == 0: - return proc.stdout.decode("utf-8").rstrip() diff --git a/build/fbcode_builder/getdeps/load.py b/build/fbcode_builder/getdeps/load.py deleted file mode 100644 index 6390f2fb14b4a..0000000000000 --- a/build/fbcode_builder/getdeps/load.py +++ /dev/null @@ -1,356 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import base64 -import copy -import hashlib -import os - -from . 
import fetcher
-from .envfuncs import path_search
-from .errors import ManifestNotFound
-from .manifest import ManifestParser
-
-
-class Loader(object):
-    """The loader allows our tests to patch the load operation"""
-
-    def _list_manifests(self, build_opts):
-        """Returns a generator that iterates all the available manifests"""
-        for (path, _, files) in os.walk(build_opts.manifests_dir):
-            for name in files:
-                # skip hidden files
-                if name.startswith("."):
-                    continue
-
-                yield os.path.join(path, name)
-
-    def _load_manifest(self, path):
-        return ManifestParser(path)
-
-    def load_project(self, build_opts, project_name):
-        if "/" in project_name or "\\" in project_name:
-            # Assume this is a path already
-            return ManifestParser(project_name)
-
-        for manifest in self._list_manifests(build_opts):
-            if os.path.basename(manifest) == project_name:
-                return ManifestParser(manifest)
-
-        raise ManifestNotFound(project_name)
-
-    def load_all(self, build_opts):
-        manifests_by_name = {}
-
-        for manifest in self._list_manifests(build_opts):
-            m = self._load_manifest(manifest)
-
-            if m.name in manifests_by_name:
-                raise Exception("found duplicate manifest '%s'" % m.name)
-
-            manifests_by_name[m.name] = m
-
-        return manifests_by_name
-
-
-class ResourceLoader(Loader):
-    def __init__(self, namespace, manifests_dir) -> None:
-        self.namespace = namespace
-        self.manifests_dir = manifests_dir
-
-    def _list_manifests(self, _build_opts):
-        import pkg_resources
-
-        dirs = [self.manifests_dir]
-
-        while dirs:
-            current = dirs.pop(0)
-            for name in pkg_resources.resource_listdir(self.namespace, current):
-                path = "%s/%s" % (current, name)
-
-                if pkg_resources.resource_isdir(self.namespace, path):
-                    dirs.append(path)
-                else:
-                    yield "%s/%s" % (current, name)
-
-    def _find_manifest(self, project_name):
-        for name in self._list_manifests(None):
-            if name.endswith("/%s" % project_name):
-                return name
-
-        raise ManifestNotFound(project_name)
-
-    def _load_manifest(self, path: str):
-        import pkg_resources
-
-        contents = pkg_resources.resource_string(self.namespace, path).decode("utf8")
-        return ManifestParser(file_name=path, fp=contents)
-
-    def load_project(self, build_opts, project_name):
-        project_name = self._find_manifest(project_name)
-        return self._load_manifest(project_name)
-
-
-LOADER = Loader()
-
-
-def patch_loader(namespace, manifests_dir: str = "manifests") -> None:
-    global LOADER
-    LOADER = ResourceLoader(namespace, manifests_dir)
-
-
-def load_project(build_opts, project_name):
-    """given the name of a project or a path to a manifest file,
-    load up the ManifestParser instance for it and return it"""
-    return LOADER.load_project(build_opts, project_name)
-
-
-def load_all_manifests(build_opts):
-    return LOADER.load_all(build_opts)
-
-
-class ManifestLoader(object):
-    """ManifestLoader stores information about project manifest relationships for a
-    given set of (build options + platform) configuration.
-
-    The ManifestLoader class primarily serves as a location to cache project dependency
-    relationships and project hash values for this build configuration.
- """ - - def __init__(self, build_opts, ctx_gen=None) -> None: - self._loader = LOADER - self.build_opts = build_opts - if ctx_gen is None: - self.ctx_gen = self.build_opts.get_context_generator() - else: - self.ctx_gen = ctx_gen - - self.manifests_by_name = {} - self._loaded_all = False - self._project_hashes = {} - self._fetcher_overrides = {} - self._build_dir_overrides = {} - self._install_dir_overrides = {} - self._install_prefix_overrides = {} - - def load_manifest(self, name): - manifest = self.manifests_by_name.get(name) - if manifest is None: - manifest = self._loader.load_project(self.build_opts, name) - self.manifests_by_name[name] = manifest - return manifest - - def load_all_manifests(self): - if not self._loaded_all: - all_manifests_by_name = self._loader.load_all(self.build_opts) - if self.manifests_by_name: - # To help ensure that we only ever have a single manifest object for a - # given project, and that it can't change once we have loaded it, - # only update our mapping for projects that weren't already loaded. - for name, manifest in all_manifests_by_name.items(): - self.manifests_by_name.setdefault(name, manifest) - else: - self.manifests_by_name = all_manifests_by_name - self._loaded_all = True - - return self.manifests_by_name - - def manifests_in_dependency_order(self, manifest=None): - """Compute all dependencies of the specified project. Returns a list of the - dependencies plus the project itself, in topologically sorted order. - - Each entry in the returned list only depends on projects that appear before it - in the list. - - If the input manifest is None, the dependencies for all currently loaded - projects will be computed. i.e., if you call load_all_manifests() followed by - manifests_in_dependency_order() this will return a global dependency ordering of - all projects.""" - # The list of deps that have been fully processed - seen = set() - # The list of deps which have yet to be evaluated. This - # can potentially contain duplicates. - if manifest is None: - deps = list(self.manifests_by_name.values()) - else: - assert manifest.name in self.manifests_by_name - deps = [manifest] - # The list of manifests in dependency order - dep_order = [] - system_packages = {} - - while len(deps) > 0: - m = deps.pop(0) - if m.name in seen: - continue - - # Consider its deps, if any. - # We sort them for increased determinism; we'll produce - # a correct order even if they aren't sorted, but we prefer - # to produce the same order regardless of how they are listed - # in the project manifest files. - ctx = self.ctx_gen.get_context(m.name) - dep_list = m.get_dependencies(ctx) - - dep_count = 0 - for dep_name in dep_list: - # If we're not sure whether it is done, queue it up - if dep_name not in seen: - dep = self.manifests_by_name.get(dep_name) - if dep is None: - dep = self._loader.load_project(self.build_opts, dep_name) - self.manifests_by_name[dep.name] = dep - - deps.append(dep) - dep_count += 1 - - if dep_count > 0: - # If we queued anything, re-queue this item, as it depends - # those new item(s) and their transitive deps. 
-                deps.append(m)
-                continue
-
-            # Its deps are done, so we can emit it
-            seen.add(m.name)
-            # Capture system packages as we may need to set PATHs to them later
-            if (
-                self.build_opts.allow_system_packages
-                and self.build_opts.host_type.get_package_manager()
-            ):
-                packages = m.get_required_system_packages(ctx)
-                for pkg_type, v in packages.items():
-                    merged = system_packages.get(pkg_type, [])
-                    if v not in merged:
-                        merged += v
-                    system_packages[pkg_type] = merged
-                # A manifest depends on all system packages in its dependencies as well
-                m.resolved_system_packages = copy.copy(system_packages)
-            dep_order.append(m)
-
-        return dep_order
-
-    def set_project_src_dir(self, project_name, path) -> None:
-        self._fetcher_overrides[project_name] = fetcher.LocalDirFetcher(path)
-
-    def set_project_build_dir(self, project_name, path) -> None:
-        self._build_dir_overrides[project_name] = path
-
-    def set_project_install_dir(self, project_name, path) -> None:
-        self._install_dir_overrides[project_name] = path
-
-    def set_project_install_prefix(self, project_name, path) -> None:
-        self._install_prefix_overrides[project_name] = path
-
-    def create_fetcher(self, manifest):
-        override = self._fetcher_overrides.get(manifest.name)
-        if override is not None:
-            return override
-
-        ctx = self.ctx_gen.get_context(manifest.name)
-        return manifest.create_fetcher(self.build_opts, ctx)
-
-    def get_project_hash(self, manifest):
-        h = self._project_hashes.get(manifest.name)
-        if h is None:
-            h = self._compute_project_hash(manifest)
-            self._project_hashes[manifest.name] = h
-        return h
-
-    def _compute_project_hash(self, manifest) -> str:
-        """This recursive function computes a hash for a given manifest.
-        The hash takes into account some environmental factors on the
-        host machine and includes the hashes of its dependencies.
- No caching of the computation is performed, which is theoretically - wasteful but the computation is fast enough that it is not required - to cache across multiple invocations.""" - ctx = self.ctx_gen.get_context(manifest.name) - - hasher = hashlib.sha256() - # Some environmental and configuration things matter - env = {} - env["install_dir"] = self.build_opts.install_dir - env["scratch_dir"] = self.build_opts.scratch_dir - env["vcvars_path"] = self.build_opts.vcvars_path - env["os"] = self.build_opts.host_type.ostype - env["distro"] = self.build_opts.host_type.distro - env["distro_vers"] = self.build_opts.host_type.distrovers - env["shared_libs"] = str(self.build_opts.shared_libs) - for name in [ - "CXXFLAGS", - "CPPFLAGS", - "LDFLAGS", - "CXX", - "CC", - "GETDEPS_CMAKE_DEFINES", - ]: - env[name] = os.environ.get(name) - for tool in ["cc", "c++", "gcc", "g++", "clang", "clang++"]: - env["tool-%s" % tool] = path_search(os.environ, tool) - for name in manifest.get_section_as_args("depends.environment", ctx): - env[name] = os.environ.get(name) - - fetcher = self.create_fetcher(manifest) - env["fetcher.hash"] = fetcher.hash() - - for name in sorted(env.keys()): - hasher.update(name.encode("utf-8")) - value = env.get(name) - if value is not None: - try: - hasher.update(value.encode("utf-8")) - except AttributeError as exc: - raise AttributeError("name=%r, value=%r: %s" % (name, value, exc)) - - manifest.update_hash(hasher, ctx) - - dep_list = manifest.get_dependencies(ctx) - for dep in dep_list: - dep_manifest = self.load_manifest(dep) - dep_hash = self.get_project_hash(dep_manifest) - hasher.update(dep_hash.encode("utf-8")) - - # Use base64 to represent the hash, rather than the simple hex digest, - # so that the string is shorter. Use the URL-safe encoding so that - # the hash can also be safely used as a filename component. - h = base64.urlsafe_b64encode(hasher.digest()).decode("ascii") - # ... and because cmd.exe is troublesome with `=` signs, nerf those. - # They tend to be padding characters at the end anyway, so we can - # safely discard them. - h = h.replace("=", "") - - return h - - def _get_project_dir_name(self, manifest): - if manifest.is_first_party_project(): - return manifest.name - else: - project_hash = self.get_project_hash(manifest) - return "%s-%s" % (manifest.name, project_hash) - - def get_project_install_dir(self, manifest): - override = self._install_dir_overrides.get(manifest.name) - if override: - return override - - project_dir_name = self._get_project_dir_name(manifest) - return os.path.join(self.build_opts.install_dir, project_dir_name) - - def get_project_build_dir(self, manifest): - override = self._build_dir_overrides.get(manifest.name) - if override: - return override - - project_dir_name = self._get_project_dir_name(manifest) - return os.path.join(self.build_opts.scratch_dir, "build", project_dir_name) - - def get_project_install_prefix(self, manifest): - return self._install_prefix_overrides.get(manifest.name) - - def get_project_install_dir_respecting_install_prefix(self, manifest): - inst_dir = self.get_project_install_dir(manifest) - prefix = self.get_project_install_prefix(manifest) - if prefix: - return inst_dir + prefix - return inst_dir diff --git a/build/fbcode_builder/getdeps/manifest.py b/build/fbcode_builder/getdeps/manifest.py deleted file mode 100644 index 15f69af7d4e86..0000000000000 --- a/build/fbcode_builder/getdeps/manifest.py +++ /dev/null @@ -1,682 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import configparser -import io -import os -from typing import List - -from .builder import ( - AutoconfBuilder, - Boost, - CMakeBootStrapBuilder, - CMakeBuilder, - Iproute2Builder, - MakeBuilder, - NinjaBootstrap, - NopBuilder, - OpenSSLBuilder, - SqliteBuilder, -) -from .cargo import CargoBuilder -from .expr import parse_expr -from .fetcher import ( - ArchiveFetcher, - GitFetcher, - PreinstalledNopFetcher, - ShipitTransformerFetcher, - SimpleShipitTransformerFetcher, - SystemPackageFetcher, -) -from .py_wheel_builder import PythonWheelBuilder - -REQUIRED = "REQUIRED" -OPTIONAL = "OPTIONAL" - -SCHEMA = { - "manifest": { - "optional_section": False, - "fields": { - "name": REQUIRED, - "fbsource_path": OPTIONAL, - "shipit_project": OPTIONAL, - "shipit_fbcode_builder": OPTIONAL, - }, - }, - "dependencies": {"optional_section": True, "allow_values": False}, - "depends.environment": {"optional_section": True}, - "git": { - "optional_section": True, - "fields": {"repo_url": REQUIRED, "rev": OPTIONAL, "depth": OPTIONAL}, - }, - "download": { - "optional_section": True, - "fields": {"url": REQUIRED, "sha256": REQUIRED}, - }, - "build": { - "optional_section": True, - "fields": { - "builder": REQUIRED, - "subdir": OPTIONAL, - "make_binary": OPTIONAL, - "build_in_src_dir": OPTIONAL, - "job_weight_mib": OPTIONAL, - "patchfile": OPTIONAL, - "patchfile_opts": OPTIONAL, - }, - }, - "msbuild": {"optional_section": True, "fields": {"project": REQUIRED}}, - "cargo": { - "optional_section": True, - "fields": { - "build_doc": OPTIONAL, - "workspace_dir": OPTIONAL, - "manifests_to_build": OPTIONAL, - # Where to write cargo config (defaults to build_dir/.cargo/config.toml) - "cargo_config_file": OPTIONAL, - }, - }, - "github.actions": { - "optional_section": True, - "fields": { - "run_tests": OPTIONAL, - }, - }, - "crate.pathmap": {"optional_section": True}, - "cmake.defines": {"optional_section": True}, - "autoconf.args": {"optional_section": True}, - "autoconf.envcmd.LDFLAGS": {"optional_section": True}, - "rpms": {"optional_section": True}, - "debs": {"optional_section": True}, - "homebrew": {"optional_section": True}, - "preinstalled.env": {"optional_section": True}, - "bootstrap.args": {"optional_section": True}, - "b2.args": {"optional_section": True}, - "make.build_args": {"optional_section": True}, - "make.install_args": {"optional_section": True}, - "make.test_args": {"optional_section": True}, - "header-only": {"optional_section": True, "fields": {"includedir": REQUIRED}}, - "shipit.pathmap": {"optional_section": True}, - "shipit.strip": {"optional_section": True}, - "install.files": {"optional_section": True}, - # fb-only - "sandcastle": {"optional_section": True, "fields": {"run_tests": OPTIONAL}}, -} - -# These sections are allowed to vary for different platforms -# using the expression syntax to enable/disable sections -ALLOWED_EXPR_SECTIONS = [ - "autoconf.args", - "autoconf.envcmd.LDFLAGS", - "build", - "cmake.defines", - "dependencies", - "make.build_args", - "make.install_args", - "bootstrap.args", - "b2.args", - "download", - "git", - "install.files", - "rpms", - "debs", - "shipit.pathmap", - "shipit.strip", - "homebrew", - "github.actions", -] - - -def parse_conditional_section_name(name, section_def): - expr = name[len(section_def) + 1 :] - return parse_expr(expr, ManifestContext.ALLOWED_VARIABLES) - - -def validate_allowed_fields(file_name, section, config, 
allowed_fields): - for field in config.options(section): - if not allowed_fields.get(field): - raise Exception( - ("manifest file %s section '%s' contains " "unknown field '%s'") - % (file_name, section, field) - ) - - for field in allowed_fields: - if allowed_fields[field] == REQUIRED and not config.has_option(section, field): - raise Exception( - ("manifest file %s section '%s' is missing " "required field '%s'") - % (file_name, section, field) - ) - - -def validate_allow_values(file_name, section, config): - for field in config.options(section): - value = config.get(section, field) - if value is not None: - raise Exception( - ( - "manifest file %s section '%s' has '%s = %s' but " - "this section doesn't allow specifying values " - "for its entries" - ) - % (file_name, section, field, value) - ) - - -def validate_section(file_name, section, config): - section_def = SCHEMA.get(section) - if not section_def: - for name in ALLOWED_EXPR_SECTIONS: - if section.startswith(name + "."): - # Verify that the conditional parses, but discard it - try: - parse_conditional_section_name(section, name) - except Exception as exc: - raise Exception( - ("manifest file %s section '%s' has invalid " "conditional: %s") - % (file_name, section, str(exc)) - ) - section_def = SCHEMA.get(name) - canonical_section_name = name - break - if not section_def: - raise Exception( - "manifest file %s contains unknown section '%s'" % (file_name, section) - ) - else: - canonical_section_name = section - - allowed_fields = section_def.get("fields") - if allowed_fields: - validate_allowed_fields(file_name, section, config, allowed_fields) - elif not section_def.get("allow_values", True): - validate_allow_values(file_name, section, config) - return canonical_section_name - - -class ManifestParser(object): - def __init__(self, file_name, fp=None): - # allow_no_value enables listing parameters in the - # autoconf.args section one per line - config = configparser.RawConfigParser(allow_no_value=True) - config.optionxform = str # make it case sensitive - if fp is None: - with open(file_name, "r") as fp: - config.read_file(fp) - elif isinstance(fp, type("")): - # For testing purposes, parse from a string (str - # or unicode) - config.read_file(io.StringIO(fp)) - else: - config.read_file(fp) - - # validate against the schema - seen_sections = set() - - for section in config.sections(): - seen_sections.add(validate_section(file_name, section, config)) - - for section in SCHEMA.keys(): - section_def = SCHEMA[section] - if ( - not section_def.get("optional_section", False) - and section not in seen_sections - ): - raise Exception( - "manifest file %s is missing required section %s" - % (file_name, section) - ) - - self._config = config - self.name = config.get("manifest", "name") - self.fbsource_path = self.get("manifest", "fbsource_path") - self.shipit_project = self.get("manifest", "shipit_project") - self.shipit_fbcode_builder = self.get("manifest", "shipit_fbcode_builder") - self.resolved_system_packages = {} - - if self.name != os.path.basename(file_name): - raise Exception( - "filename of the manifest '%s' does not match the manifest name '%s'" - % (file_name, self.name) - ) - - def get(self, section, key, defval=None, ctx=None): - ctx = ctx or {} - - for s in self._config.sections(): - if s == section: - if self._config.has_option(s, key): - return self._config.get(s, key) - return defval - - if s.startswith(section + "."): - expr = parse_conditional_section_name(s, section) - if not expr.eval(ctx): - continue - - if 
self._config.has_option(s, key): - return self._config.get(s, key) - - return defval - - def get_dependencies(self, ctx): - dep_list = list(self.get_section_as_dict("dependencies", ctx).keys()) - dep_list.sort() - builder = self.get("build", "builder", ctx=ctx) - if builder in ("cmake", "python-wheel"): - dep_list.insert(0, "cmake") - elif builder == "autoconf" and self.name not in ( - "autoconf", - "libtool", - "automake", - ): - # they need libtool and its deps (automake, autoconf) so add - # those as deps (but obviously not if we're building those - # projects themselves) - dep_list.insert(0, "libtool") - - return dep_list - - def get_section_as_args(self, section, ctx=None) -> List[str]: - """Intended for use with the make.[build_args/install_args] and - autoconf.args sections, this method collects the entries and returns an - array of strings. - If the manifest contains conditional sections, ctx is used to - evaluate the condition and merge in the values. - """ - args = [] - ctx = ctx or {} - - for s in self._config.sections(): - if s != section: - if not s.startswith(section + "."): - continue - expr = parse_conditional_section_name(s, section) - if not expr.eval(ctx): - continue - for field in self._config.options(s): - value = self._config.get(s, field) - if value is None: - args.append(field) - else: - args.append("%s=%s" % (field, value)) - return args - - def get_section_as_ordered_pairs(self, section, ctx=None): - """Used for eg: shipit.pathmap which has strong - ordering requirements""" - res = [] - ctx = ctx or {} - - for s in self._config.sections(): - if s != section: - if not s.startswith(section + "."): - continue - expr = parse_conditional_section_name(s, section) - if not expr.eval(ctx): - continue - - for key in self._config.options(s): - value = self._config.get(s, key) - res.append((key, value)) - return res - - def get_section_as_dict(self, section, ctx): - d = {} - - for s in self._config.sections(): - if s != section: - if not s.startswith(section + "."): - continue - expr = parse_conditional_section_name(s, section) - if not expr.eval(ctx): - continue - for field in self._config.options(s): - value = self._config.get(s, field) - d[field] = value - return d - - def update_hash(self, hasher, ctx): - """Compute a hash over the configuration for the given - context. The goal is for the hash to change if the config - for that context changes, but not if a change is made to - the config only for a different platform than that expressed - by ctx. The hash is intended to be used to help invalidate - a future cache for the third party build products. - The hasher argument is a hash object returned from hashlib.""" - for section in sorted(SCHEMA.keys()): - hasher.update(section.encode("utf-8")) - - # Note: at the time of writing, nothing in the implementation - # relies on keys in any config section being ordered. - # In theory we could have conflicting flags in different - # config sections and later flags override earlier flags. - # For the purposes of computing a hash we're not super - # concerned about this: manifest changes should be rare - # enough and we'd rather that this trigger an invalidation - # than strive for a cache hit at this time. 
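The conditional-section merging these accessors implement can be demonstrated with a toy manifest. This sketch uses a simplified `key=value` condition in place of the real expression parser (parse_expr supports a richer syntax), so it is an illustration rather than the actual evaluation logic:

```
import configparser
import io

# Toy manifest: a base [dependencies] section plus a conditional variant
# that should only apply when the context has os=linux.
MANIFEST = """\
[dependencies]
fmt

[dependencies.os=linux]
libevent
"""

config = configparser.RawConfigParser(allow_no_value=True)
config.optionxform = str  # keep keys case sensitive, as above
config.read_file(io.StringIO(MANIFEST))

ctx = {"os": "linux"}
deps = []
for s in config.sections():
    if s == "dependencies":
        deps += config.options(s)
    elif s.startswith("dependencies."):
        # Simplified condition check standing in for parse_expr/eval.
        key, _, value = s[len("dependencies."):].partition("=")
        if ctx.get(key) == value:
            deps += config.options(s)

print(sorted(deps))  # ['fmt', 'libevent']
```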
- pairs = self.get_section_as_ordered_pairs(section, ctx) - pairs.sort(key=lambda pair: pair[0]) - for key, value in pairs: - hasher.update(key.encode("utf-8")) - if value is not None: - hasher.update(value.encode("utf-8")) - - def is_first_party_project(self): - """returns true if this is an FB first-party project""" - return self.shipit_project is not None - - def get_required_system_packages(self, ctx): - """Returns dictionary of packager system -> list of packages""" - return { - "rpm": self.get_section_as_args("rpms", ctx), - "deb": self.get_section_as_args("debs", ctx), - "homebrew": self.get_section_as_args("homebrew", ctx), - } - - def _is_satisfied_by_preinstalled_environment(self, ctx): - envs = self.get_section_as_args("preinstalled.env", ctx) - if not envs: - return False - for key in envs: - val = os.environ.get(key, None) - print(f"Testing ENV[{key}]: {repr(val)}") - if val is None: - return False - if len(val) == 0: - return False - - return True - - def get_repo_url(self, ctx): - return self.get("git", "repo_url", ctx=ctx) - - def create_fetcher(self, build_options, ctx): - use_real_shipit = ( - ShipitTransformerFetcher.available() and build_options.use_shipit - ) - if ( - not use_real_shipit - and self.fbsource_path - and build_options.fbsource_dir - and self.shipit_project - ): - return SimpleShipitTransformerFetcher(build_options, self, ctx) - - if ( - self.fbsource_path - and build_options.fbsource_dir - and self.shipit_project - and ShipitTransformerFetcher.available() - ): - # We can use the code from fbsource - return ShipitTransformerFetcher(build_options, self.shipit_project) - - # Can we satisfy this dep with system packages? - if build_options.allow_system_packages: - if self._is_satisfied_by_preinstalled_environment(ctx): - return PreinstalledNopFetcher() - - packages = self.get_required_system_packages(ctx) - package_fetcher = SystemPackageFetcher(build_options, packages) - if package_fetcher.packages_are_installed(): - return package_fetcher - - repo_url = self.get_repo_url(ctx) - if repo_url: - rev = self.get("git", "rev") - depth = self.get("git", "depth") - return GitFetcher(build_options, self, repo_url, rev, depth) - - url = self.get("download", "url", ctx=ctx) - if url: - # We need to defer this import until now to avoid triggering - # a cycle when the facebook/__init__.py is loaded. 
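The preinstalled.env test above simply requires every listed variable to be present and non-empty. A standalone sketch of that check (the variable names are hypothetical):

```
import os


def satisfied_by_env(env_names):
    # Every named variable must exist and be non-empty, mirroring
    # _is_satisfied_by_preinstalled_environment above.
    for key in env_names:
        if not os.environ.get(key):
            return False
    return True


os.environ["HYPOTHETICAL_OPENSSL_DIR"] = "/opt/openssl"
print(satisfied_by_env(["HYPOTHETICAL_OPENSSL_DIR"]))  # True
print(satisfied_by_env(["HYPOTHETICAL_MISSING_VAR"]))  # False
```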
-            try:
-                from .facebook.lfs import LFSCachingArchiveFetcher
-
-                return LFSCachingArchiveFetcher(
-                    build_options, self, url, self.get("download", "sha256", ctx=ctx)
-                )
-            except ImportError:
-                # This FB internal module isn't shipped to github,
-                # so just use its base class
-                return ArchiveFetcher(
-                    build_options, self, url, self.get("download", "sha256", ctx=ctx)
-                )
-
-        raise KeyError(
-            "project %s has no fetcher configuration matching %s" % (self.name, ctx)
-        )
-
-    def get_builder_name(self, ctx):
-        builder = self.get("build", "builder", ctx=ctx)
-        if not builder:
-            raise Exception("project %s has no builder for %r" % (self.name, ctx))
-        return builder
-
-    def create_builder(  # noqa:C901
-        self,
-        build_options,
-        src_dir,
-        build_dir,
-        inst_dir,
-        ctx,
-        loader,
-        final_install_prefix=None,
-        extra_cmake_defines=None,
-        cmake_target=None,
-        extra_b2_args=None,
-    ):
-        builder = self.get_builder_name(ctx)
-        build_in_src_dir = self.get("build", "build_in_src_dir", "false", ctx=ctx)
-        if build_in_src_dir == "true":
-            # Some scripts don't work when they are configured and build in
-            # a different directory than source (or when the build directory
-            # is not a subdir of source).
-            build_dir = src_dir
-            subdir = self.get("build", "subdir", None, ctx=ctx)
-            if subdir is not None:
-                build_dir = os.path.join(build_dir, subdir)
-            print("build_dir is %s" % build_dir)  # just to quiet lint
-
-        if builder == "make" or builder == "cmakebootstrap":
-            build_args = self.get_section_as_args("make.build_args", ctx)
-            install_args = self.get_section_as_args("make.install_args", ctx)
-            test_args = self.get_section_as_args("make.test_args", ctx)
-            if builder == "cmakebootstrap":
-                return CMakeBootStrapBuilder(
-                    build_options,
-                    ctx,
-                    self,
-                    src_dir,
-                    None,
-                    inst_dir,
-                    build_args,
-                    install_args,
-                    test_args,
-                )
-            else:
-                return MakeBuilder(
-                    build_options,
-                    ctx,
-                    self,
-                    src_dir,
-                    None,
-                    inst_dir,
-                    build_args,
-                    install_args,
-                    test_args,
-                )
-
-        if builder == "autoconf":
-            args = self.get_section_as_args("autoconf.args", ctx)
-            conf_env_args = {}
-            ldflags_cmd = self.get_section_as_args("autoconf.envcmd.LDFLAGS", ctx)
-            if ldflags_cmd:
-                conf_env_args["LDFLAGS"] = ldflags_cmd
-            return AutoconfBuilder(
-                build_options,
-                ctx,
-                self,
-                src_dir,
-                build_dir,
-                inst_dir,
-                args,
-                conf_env_args,
-            )
-
-        if builder == "boost":
-            args = self.get_section_as_args("b2.args", ctx)
-            if extra_b2_args is not None:
-                args += extra_b2_args
-            return Boost(build_options, ctx, self, src_dir, build_dir, inst_dir, args)
-
-        if builder == "cmake":
-            defines = self.get_section_as_dict("cmake.defines", ctx)
-            return CMakeBuilder(
-                build_options,
-                ctx,
-                self,
-                src_dir,
-                build_dir,
-                inst_dir,
-                defines,
-                loader,
-                final_install_prefix,
-                extra_cmake_defines,
-                cmake_target,
-            )
-
-        if builder == "python-wheel":
-            return PythonWheelBuilder(
-                build_options, ctx, self, src_dir, build_dir, inst_dir
-            )
-
-        if builder == "sqlite":
-            return SqliteBuilder(build_options, ctx, self, src_dir, build_dir, inst_dir)
-
-        if builder == "ninja_bootstrap":
-            return NinjaBootstrap(
-                build_options, ctx, self, build_dir, src_dir, inst_dir
-            )
-
-        if builder == "nop":
-            return NopBuilder(build_options, ctx, self, src_dir, inst_dir)
-
-        if builder == "openssl":
-            return OpenSSLBuilder(
-                build_options, ctx, self, build_dir, src_dir, inst_dir
-            )
-
-        if builder == "iproute2":
-            return Iproute2Builder(
-                build_options, ctx, self, src_dir, build_dir, inst_dir
-            )
-
-        if builder == "cargo":
-            return self.create_cargo_builder(
build_options, ctx, src_dir, build_dir, inst_dir, loader
-            )
-
-        raise KeyError("project %s has no known builder" % (self.name))
-
-    def create_prepare_builders(
-        self, build_options, ctx, src_dir, build_dir, inst_dir, loader
-    ):
-        """Create builders that have a prepare step run, e.g. to write config files"""
-        prepare_builders = []
-        builder = self.get_builder_name(ctx)
-        cargo = self.get_section_as_dict("cargo", ctx)
-        if not builder == "cargo" and cargo:
-            cargo_builder = self.create_cargo_builder(
-                build_options, ctx, src_dir, build_dir, inst_dir, loader
-            )
-            prepare_builders.append(cargo_builder)
-        return prepare_builders
-
-    def create_cargo_builder(
-        self, build_options, ctx, src_dir, build_dir, inst_dir, loader
-    ):
-        build_doc = self.get("cargo", "build_doc", False, ctx)
-        workspace_dir = self.get("cargo", "workspace_dir", None, ctx)
-        manifests_to_build = self.get("cargo", "manifests_to_build", None, ctx)
-        cargo_config_file = self.get("cargo", "cargo_config_file", None, ctx)
-        return CargoBuilder(
-            build_options,
-            ctx,
-            self,
-            src_dir,
-            build_dir,
-            inst_dir,
-            build_doc,
-            workspace_dir,
-            manifests_to_build,
-            loader,
-            cargo_config_file,
-        )
-
-
-class ManifestContext(object):
-    """ManifestContext contains a dictionary of values to use when evaluating boolean
-    expressions in a project manifest.
-
-    This object should be passed as the `ctx` parameter in ManifestParser.get() calls.
-    """
-
-    ALLOWED_VARIABLES = {
-        "os",
-        "distro",
-        "distro_vers",
-        "fb",
-        "fbsource",
-        "test",
-        "shared_libs",
-    }
-
-    def __init__(self, ctx_dict):
-        assert set(ctx_dict.keys()) == self.ALLOWED_VARIABLES
-        self.ctx_dict = ctx_dict
-
-    def get(self, key):
-        return self.ctx_dict[key]
-
-    def set(self, key, value):
-        assert key in self.ALLOWED_VARIABLES
-        self.ctx_dict[key] = value
-
-    def copy(self):
-        return ManifestContext(dict(self.ctx_dict))
-
-    def __str__(self):
-        s = ", ".join(
-            "%s=%s" % (key, value) for key, value in sorted(self.ctx_dict.items())
-        )
-        return "{" + s + "}"
-
-
-class ContextGenerator(object):
-    """ContextGenerator allows creating ManifestContext objects on a per-project basis.
-    This allows us to evaluate different projects with slightly different contexts.
-
-    For instance, this can be used to only enable tests for some projects."""
-
-    def __init__(self, default_ctx):
-        self.default_ctx = ManifestContext(default_ctx)
-        self.ctx_by_project = {}
-
-    def set_value_for_project(self, project_name, key, value):
-        project_ctx = self.ctx_by_project.get(project_name)
-        if project_ctx is None:
-            project_ctx = self.default_ctx.copy()
-            self.ctx_by_project[project_name] = project_ctx
-        project_ctx.set(key, value)
-
-    def set_value_for_all_projects(self, key, value):
-        self.default_ctx.set(key, value)
-        for ctx in self.ctx_by_project.values():
-            ctx.set(key, value)
-
-    def get_context(self, project_name):
-        return self.ctx_by_project.get(project_name, self.default_ctx)
diff --git a/build/fbcode_builder/getdeps/platform.py b/build/fbcode_builder/getdeps/platform.py
deleted file mode 100644
index d8ac41b46be44..0000000000000
--- a/build/fbcode_builder/getdeps/platform.py
+++ /dev/null
@@ -1,287 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
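Assuming the module is importable as getdeps.manifest, the per-project context machinery above can be exercised like this; the context values are illustrative:

```
from getdeps.manifest import ContextGenerator

# All seven ALLOWED_VARIABLES must be present; values are illustrative.
default = {
    "os": "linux",
    "distro": "ubuntu",
    "distro_vers": "22.04",
    "fb": "off",
    "fbsource": "off",
    "test": "off",
    "shared_libs": "off",
}
gen = ContextGenerator(default)
gen.set_value_for_project("folly", "test", "on")
print(gen.get_context("folly"))  # test=on for folly only
print(gen.get_context("fmt"))    # the shared default context
```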
- -import os - -import platform -import re -import shlex -import sys -from typing import Optional, Tuple - - -def is_windows() -> bool: - """Returns true if the system we are currently running on - is a Windows system""" - return sys.platform.startswith("win") - - -def get_linux_type() -> Tuple[Optional[str], Optional[str], Optional[str]]: - try: - with open("/etc/os-release") as f: - data = f.read() - except EnvironmentError: - return (None, None, None) - - os_vars = {} - for line in data.splitlines(): - parts = line.split("=", 1) - if len(parts) != 2: - continue - key = parts[0].strip() - value_parts = shlex.split(parts[1].strip()) - if not value_parts: - value = "" - else: - value = value_parts[0] - os_vars[key] = value - - name = os_vars.get("NAME") - if name: - name = name.lower() - name = re.sub("linux", "", name) - name = name.strip().replace(" ", "_") - - version_id = os_vars.get("VERSION_ID") - if version_id: - version_id = version_id.lower() - - return "linux", name, version_id - - -# Ideally we'd use a common library like `psutil` to read system information, -# but getdeps can't take third-party dependencies. - - -def _get_available_ram_linux() -> int: - # TODO: Ideally, this function would inspect the current cgroup for any - # limits, rather than solely relying on system RAM. - - meminfo_path = "/proc/meminfo" - try: - with open(meminfo_path) as f: - for line in f: - try: - key, value = line.split(":", 1) - except ValueError: - continue - suffix = " kB\n" - if key == "MemAvailable" and value.endswith(suffix): - value = value[: -len(suffix)] - try: - return int(value) // 1024 - except ValueError: - continue - except OSError: - print("error opening {}".format(meminfo_path), end="", file=sys.stderr) - else: - print( - "{} had no valid MemAvailable".format(meminfo_path), end="", file=sys.stderr - ) - - guess = 8 - print(", guessing {} GiB".format(guess), file=sys.stderr) - return guess * 1024 - - -def _get_available_ram_macos() -> int: - import ctypes.util - - libc = ctypes.CDLL(ctypes.util.find_library("libc"), use_errno=True) - sysctlbyname = libc.sysctlbyname - sysctlbyname.restype = ctypes.c_int - sysctlbyname.argtypes = [ - ctypes.c_char_p, - ctypes.c_void_p, - ctypes.POINTER(ctypes.c_size_t), - ctypes.c_void_p, - ctypes.c_size_t, - ] - # TODO: There may be some way to approximate an availability - # metric, but just use total RAM for now. - memsize = ctypes.c_int64() - memsizesize = ctypes.c_size_t(8) - res = sysctlbyname( - b"hw.memsize", ctypes.byref(memsize), ctypes.byref(memsizesize), None, 0 - ) - if res != 0: - raise NotImplementedError( - f"failed to retrieve hw.memsize sysctl: {ctypes.get_errno()}" - ) - return memsize.value // (1024 * 1024) - - -def _get_available_ram_windows() -> int: - import ctypes - - DWORD = ctypes.c_uint32 - QWORD = ctypes.c_uint64 - - class MEMORYSTATUSEX(ctypes.Structure): - _fields_ = [ - ("dwLength", DWORD), - ("dwMemoryLoad", DWORD), - ("ullTotalPhys", QWORD), - ("ullAvailPhys", QWORD), - ("ullTotalPageFile", QWORD), - ("ullAvailPageFile", QWORD), - ("ullTotalVirtual", QWORD), - ("ullAvailVirtual", QWORD), - ("ullExtendedVirtual", QWORD), - ] - - ms = MEMORYSTATUSEX() - ms.dwLength = ctypes.sizeof(ms) - # pyre-ignore[16] - res = ctypes.windll.kernel32.GlobalMemoryStatusEx(ctypes.byref(ms)) - if res == 0: - raise NotImplementedError("error calling GlobalMemoryStatusEx") - - # This is fuzzy, but AvailPhys is too conservative, and AvailTotal is too - # aggressive, so average the two. It's okay for builds to use some swap. 
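The MemAvailable parsing earlier in this file can be tested against a canned /proc/meminfo snippet; the sample values below are made up:

```
SAMPLE = "MemTotal:       16316412 kB\nMemAvailable:    9874520 kB\n"


def mem_available_mib(meminfo_text):
    # Same parse as _get_available_ram_linux, fed from a string.
    for line in meminfo_text.splitlines(True):
        try:
            key, value = line.split(":", 1)
        except ValueError:
            continue
        suffix = " kB\n"
        if key == "MemAvailable" and value.endswith(suffix):
            return int(value[: -len(suffix)]) // 1024
    return None


print(mem_available_mib(SAMPLE))  # 9643
```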
-    return (ms.ullAvailPhys + ms.ullTotalPhys) // (2 * 1024 * 1024)
-
-
-def _get_available_ram_freebsd() -> int:
-    import ctypes.util
-
-    libc = ctypes.CDLL(ctypes.util.find_library("libc"), use_errno=True)
-    sysctlbyname = libc.sysctlbyname
-    sysctlbyname.restype = ctypes.c_int
-    sysctlbyname.argtypes = [
-        ctypes.c_char_p,
-        ctypes.c_void_p,
-        ctypes.POINTER(ctypes.c_size_t),
-        ctypes.c_void_p,
-        ctypes.c_size_t,
-    ]
-    # hw.usermem is pretty close to what we want.
-    memsize = ctypes.c_int64()
-    memsizesize = ctypes.c_size_t(8)
-    res = sysctlbyname(
-        b"hw.usermem", ctypes.byref(memsize), ctypes.byref(memsizesize), None, 0
-    )
-    if res != 0:
-        raise NotImplementedError(
-            f"failed to retrieve hw.usermem sysctl: {ctypes.get_errno()}"
-        )
-    return memsize.value // (1024 * 1024)
-
-
-def get_available_ram() -> int:
-    """
-    Returns a platform-appropriate available RAM metric in MiB.
-    """
-    if sys.platform == "linux":
-        return _get_available_ram_linux()
-    elif sys.platform == "darwin":
-        return _get_available_ram_macos()
-    elif sys.platform == "win32":
-        return _get_available_ram_windows()
-    elif sys.platform.startswith("freebsd"):
-        return _get_available_ram_freebsd()
-    else:
-        raise NotImplementedError(
-            f"platform {sys.platform} does not have an implementation of get_available_ram"
-        )
-
-
-def is_current_host_arm() -> bool:
-    if sys.platform.startswith("darwin"):
-        # platform.machine() can be fooled by rosetta for python < 3.9.2
-        return "ARM64" in os.uname().version
-    else:
-        machine = platform.machine().lower()
-        return "arm" in machine or "aarch" in machine
-
-
-class HostType(object):
-    def __init__(self, ostype=None, distro=None, distrovers=None) -> None:
-        # Maybe we should allow callers to indicate whether this machine uses
-        # an ARM architecture, but we need to change HostType serialization
-        # and deserialization in that case and hunt down anywhere that is
-        # persisting that serialized data.
-        isarm = False
-
-        if ostype is None:
-            distro = None
-            distrovers = None
-            if sys.platform.startswith("linux"):
-                ostype, distro, distrovers = get_linux_type()
-            elif sys.platform.startswith("darwin"):
-                ostype = "darwin"
-            elif is_windows():
-                ostype = "windows"
-                # pyre-fixme[16]: Module `sys` has no attribute `getwindowsversion`.
-                distrovers = str(sys.getwindowsversion().major)
-            elif sys.platform.startswith("freebsd"):
-                ostype = "freebsd"
-            else:
-                ostype = sys.platform
-
-            isarm = is_current_host_arm()
-
-        # The operating system type
-        self.ostype = ostype
-        # The distribution, if applicable
-        self.distro = distro
-        # The OS/distro version if known
-        self.distrovers = distrovers
-        # Does the CPU use an ARM architecture? ARM includes Apple Silicon
-        # Macs as well as other ARM systems that might be running Linux or
-        # something.
-        self.isarm = isarm
-
-    def is_windows(self):
-        return self.ostype == "windows"
-
-    # is_arm is kinda half implemented at the moment. This method is only
-    # intended to be used when HostType represents information about the
-    # current machine we are running on.
-    # When HostType is being used to enumerate platform types (represent
-    # information about machine types that we may or may not be running on)
-    # the result could be nonsense (under the current implementation it's always
-    # false.)
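The serialization helpers below round-trip cleanly; a small sketch, assuming the module is importable as getdeps.platform:

```
from getdeps.platform import HostType

host = HostType(ostype="linux", distro="ubuntu", distrovers="22.04")
s = host.as_tuple_string()
print(s)  # linux-ubuntu-22.04
assert HostType.from_tuple_string(s) == host
```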
- def is_arm(self): - return self.isarm - - def is_darwin(self): - return self.ostype == "darwin" - - def is_linux(self): - return self.ostype == "linux" - - def is_freebsd(self): - return self.ostype == "freebsd" - - def as_tuple_string(self) -> str: - return "%s-%s-%s" % ( - self.ostype, - self.distro or "none", - self.distrovers or "none", - ) - - def get_package_manager(self): - if not self.is_linux() and not self.is_darwin(): - return None - if self.is_darwin(): - return "homebrew" - if self.distro in ("fedora", "centos", "centos_stream"): - return "rpm" - if self.distro.startswith(("debian", "ubuntu")): - return "deb" - return None - - @staticmethod - def from_tuple_string(s) -> "HostType": - ostype, distro, distrovers = s.split("-") - return HostType(ostype=ostype, distro=distro, distrovers=distrovers) - - def __eq__(self, b): - return ( - self.ostype == b.ostype - and self.distro == b.distro - and self.distrovers == b.distrovers - ) diff --git a/build/fbcode_builder/getdeps/py_wheel_builder.py b/build/fbcode_builder/getdeps/py_wheel_builder.py deleted file mode 100644 index 53e807927f7c8..0000000000000 --- a/build/fbcode_builder/getdeps/py_wheel_builder.py +++ /dev/null @@ -1,285 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import codecs -import collections -import email -import os -import re -import stat -from typing import Dict, List - -from .builder import BuilderBase, CMakeBuilder - - -WheelNameInfo = collections.namedtuple( - "WheelNameInfo", ("distribution", "version", "build", "python", "abi", "platform") -) - -CMAKE_HEADER = """ -cmake_minimum_required(VERSION 3.8) - -project("{manifest_name}" LANGUAGES C) - -set(CMAKE_MODULE_PATH - "{cmake_dir}" - ${{CMAKE_MODULE_PATH}} -) -include(FBPythonBinary) - -set(CMAKE_INSTALL_DIR lib/cmake/{manifest_name} CACHE STRING - "The subdirectory where CMake package config files should be installed") -""" - -CMAKE_FOOTER = """ -install_fb_python_library({lib_name} EXPORT all) -install( - EXPORT all - FILE {manifest_name}-targets.cmake - NAMESPACE {namespace}:: - DESTINATION ${{CMAKE_INSTALL_DIR}} -) - -include(CMakePackageConfigHelpers) -configure_package_config_file( - ${{CMAKE_BINARY_DIR}}/{manifest_name}-config.cmake.in - {manifest_name}-config.cmake - INSTALL_DESTINATION ${{CMAKE_INSTALL_DIR}} - PATH_VARS - CMAKE_INSTALL_DIR -) -install( - FILES ${{CMAKE_CURRENT_BINARY_DIR}}/{manifest_name}-config.cmake - DESTINATION ${{CMAKE_INSTALL_DIR}} -) -""" - -CMAKE_CONFIG_FILE = """ -@PACKAGE_INIT@ - -include(CMakeFindDependencyMacro) - -set_and_check({upper_name}_CMAKE_DIR "@PACKAGE_CMAKE_INSTALL_DIR@") - -if (NOT TARGET {namespace}::{lib_name}) - include("${{{upper_name}_CMAKE_DIR}}/{manifest_name}-targets.cmake") -endif() - -set({upper_name}_LIBRARIES {namespace}::{lib_name}) - -{find_dependency_lines} - -if (NOT {manifest_name}_FIND_QUIETLY) - message(STATUS "Found {manifest_name}: ${{PACKAGE_PREFIX_DIR}}") -endif() -""" - - -# Note: for now we are manually manipulating the wheel packet contents. -# The wheel format is documented here: -# https://www.python.org/dev/peps/pep-0491/#file-format -# -# We currently aren't particularly smart about correctly handling the full wheel -# functionality, but this is good enough to handle simple pure-python wheels, -# which is the main thing we care about right now. 
-# -# We could potentially use pip to install the wheel to a temporary location and -# then copy its "installed" files, but this has its own set of complications. -# This would require pip to already be installed and available, and we would -# need to correctly find the right version of pip or pip3 to use. -# If we did ever want to go down that path, we would probably want to use -# something like the following pip3 command: -# pip3 --isolated install --no-cache-dir --no-index --system \ -# --target -# pyre-fixme[13] fields initialized in _build -class PythonWheelBuilder(BuilderBase): - """This Builder can take Python wheel archives and install them as python libraries - that can be used by add_fb_python_library()/add_fb_python_executable() CMake rules. - """ - - dist_info_dir: str - template_format_dict: Dict[str, str] - - def _build(self, install_dirs: List[str], reconfigure: bool) -> None: - # When we are invoked, self.src_dir contains the unpacked wheel contents. - # - # Since a wheel file is just a zip file, the Fetcher code recognizes it as such - # and goes ahead and unpacks it. (We could disable that Fetcher behavior in the - # future if we ever wanted to, say if we wanted to call pip here.) - wheel_name = self._parse_wheel_name() - name_version_prefix = "-".join((wheel_name.distribution, wheel_name.version)) - dist_info_name = name_version_prefix + ".dist-info" - data_dir_name = name_version_prefix + ".data" - self.dist_info_dir = os.path.join(self.src_dir, dist_info_name) - wheel_metadata = self._read_wheel_metadata(wheel_name) - - # Check that we can understand the wheel version. - # We don't really care about wheel_metadata["Root-Is-Purelib"] since - # we are generating our own standalone python archives rather than installing - # into site-packages. - version = wheel_metadata["Wheel-Version"] - if not version.startswith("1."): - raise Exception("unsupported wheel version %s" % (version,)) - - # Add a find_dependency() call for each of our dependencies. - # The dependencies are also listed in the wheel METADATA file, but it is simpler - # to pull this directly from the getdeps manifest. - dep_list = sorted( - self.manifest.get_section_as_dict("dependencies", self.ctx).keys() - ) - find_dependency_lines = ["find_dependency({})".format(dep) for dep in dep_list] - - getdeps_cmake_dir = os.path.join( - os.path.dirname(os.path.dirname(__file__)), "CMake" - ) - self.template_format_dict = { - # Note that CMake files always uses forward slash separators in path names, - # even on Windows. Therefore replace path separators here. - "cmake_dir": _to_cmake_path(getdeps_cmake_dir), - "lib_name": self.manifest.name, - "manifest_name": self.manifest.name, - "namespace": self.manifest.name, - "upper_name": self.manifest.name.upper().replace("-", "_"), - "find_dependency_lines": "\n".join(find_dependency_lines), - } - - # Find sources from the root directory - path_mapping = {} - for entry in os.listdir(self.src_dir): - if entry in (dist_info_name, data_dir_name): - continue - self._add_sources(path_mapping, os.path.join(self.src_dir, entry), entry) - - # Files under the .data directory also need to be installed in the correct - # locations - if os.path.exists(data_dir_name): - # TODO: process the subdirectories of data_dir_name - # This isn't implemented yet since for now we have only needed dependencies - # on some simple pure Python wheels, so I haven't tested against wheels with - # additional files in the .data directory. 
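The WHEEL metadata check performed in `_build` is small enough to exercise on its own. A sketch assuming an already-unpacked `.dist-info` directory (the path is illustrative); WHEEL uses RFC 822 style headers, which is why the stdlib `email` parser suffices:

```python
import email
import os

def wheel_format_version(dist_info_dir: str) -> str:
    # Parse the WHEEL file and validate the format version, as _build does.
    with open(os.path.join(dist_info_dir, "WHEEL"), encoding="utf-8") as f:
        metadata = email.message_from_file(f)
    version = metadata["Wheel-Version"]
    if not version.startswith("1."):
        raise Exception("unsupported wheel version %s" % (version,))
    return version

# e.g. wheel_format_version("example_pkg-1.0.dist-info") -> "1.0"
```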
-            raise Exception(
-                "handling of the subdirectories inside %s is not implemented yet"
-                % data_dir_name
-            )
-
-        # Emit CMake files
-        self._write_cmakelists(path_mapping, dep_list)
-        self._write_cmake_config_template()
-
-        # Run the build
-        self._run_cmake_build(install_dirs, reconfigure)
-
-    def _run_cmake_build(self, install_dirs: List[str], reconfigure: bool) -> None:
-        cmake_builder = CMakeBuilder(
-            build_opts=self.build_opts,
-            ctx=self.ctx,
-            manifest=self.manifest,
-            # Note that we intentionally supply src_dir=build_dir,
-            # since we wrote out our generated CMakeLists.txt in the build directory
-            src_dir=self.build_dir,
-            build_dir=self.build_dir,
-            inst_dir=self.inst_dir,
-            loader=None,
-            defines={},
-            final_install_prefix=None,
-        )
-        cmake_builder.build(install_dirs=install_dirs, reconfigure=reconfigure)
-
-    def _write_cmakelists(self, path_mapping: Dict[str, str], dependencies) -> None:
-        cmake_path = os.path.join(self.build_dir, "CMakeLists.txt")
-        with open(cmake_path, "w") as f:
-            f.write(CMAKE_HEADER.format(**self.template_format_dict))
-            for dep in dependencies:
-                f.write("find_package({0} REQUIRED)\n".format(dep))
-
-            f.write(
-                "add_fb_python_library({lib_name}\n".format(**self.template_format_dict)
-            )
-            f.write('  BASE_DIR "%s"\n' % _to_cmake_path(self.src_dir))
-            f.write("  SOURCES\n")
-            for src_path, install_path in path_mapping.items():
-                f.write(
-                    '    "%s=%s"\n'
-                    % (_to_cmake_path(src_path), _to_cmake_path(install_path))
-                )
-            if dependencies:
-                f.write("  DEPENDS\n")
-                for dep in dependencies:
-                    f.write('    "{0}::{0}"\n'.format(dep))
-            f.write(")\n")
-
-            f.write(CMAKE_FOOTER.format(**self.template_format_dict))
-
-    def _write_cmake_config_template(self) -> None:
-        config_path_name = self.manifest.name + "-config.cmake.in"
-        output_path = os.path.join(self.build_dir, config_path_name)
-
-        with open(output_path, "w") as f:
-            f.write(CMAKE_CONFIG_FILE.format(**self.template_format_dict))
-
-    def _add_sources(
-        self, path_mapping: Dict[str, str], src_path: str, install_path: str
-    ) -> None:
-        s = os.lstat(src_path)
-        if not stat.S_ISDIR(s.st_mode):
-            path_mapping[src_path] = install_path
-            return
-
-        for entry in os.listdir(src_path):
-            self._add_sources(
-                path_mapping,
-                os.path.join(src_path, entry),
-                os.path.join(install_path, entry),
-            )
-
-    def _parse_wheel_name(self) -> WheelNameInfo:
-        # The ArchiveFetcher prepends "manifest_name-", so strip that off first.
-        wheel_name = os.path.basename(self.src_dir)
-        prefix = self.manifest.name + "-"
-        if not wheel_name.startswith(prefix):
-            raise Exception(
-                "expected wheel source directory to be of the form %s-NAME.whl"
-                % (prefix,)
-            )
-        wheel_name = wheel_name[len(prefix) :]
-
-        wheel_name_re = re.compile(
-            r"(?P<distribution>[^-]+)"
-            r"-(?P<version>\d+[^-]*)"
-            r"(-(?P<build>\d+[^-]*))?"
-            r"-(?P<python>\w+\d+(\.\w+\d+)*)"
-            r"-(?P<abi>\w+)"
-            r"-(?P<platform>\w+(\.\w+)*)"
-            r"\.whl"
-        )
-        match = wheel_name_re.match(wheel_name)
-        if not match:
-            raise Exception(
-                "bad python wheel name %s: expected to have the form "
-                "DISTRIBUTION-VERSION[-BUILD]-PYTAG-ABI-PLATFORM" % (wheel_name,)
-            )
-
-        return WheelNameInfo(
-            distribution=match.group("distribution"),
-            version=match.group("version"),
-            build=match.group("build"),
-            python=match.group("python"),
-            abi=match.group("abi"),
-            platform=match.group("platform"),
-        )
-
-    def _read_wheel_metadata(self, wheel_name):
-        metadata_path = os.path.join(self.dist_info_dir, "WHEEL")
-        with codecs.open(metadata_path, "r", encoding="utf-8") as f:
-            return email.message_from_file(f)
-
-
-def _to_cmake_path(path):
-    # CMake always uses forward slashes to separate paths in CMakeLists.txt files,
-    # even on Windows. It treats backslashes as character escapes, so using
-    # backslashes in the path will cause problems. Therefore replace all path
-    # separators with forward slashes to make sure the paths are correct on Windows.
-    # e.g. "C:\foo\bar.txt" becomes "C:/foo/bar.txt"
-    return path.replace(os.path.sep, "/")
diff --git a/build/fbcode_builder/getdeps/runcmd.py b/build/fbcode_builder/getdeps/runcmd.py
deleted file mode 100644
index dc35b6f026600..0000000000000
--- a/build/fbcode_builder/getdeps/runcmd.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-import os
-import select
-import subprocess
-import sys
-
-from .envfuncs import Env
-from .platform import is_windows
-
-
-try:
-    from shlex import quote as shellquote
-except ImportError:
-    from pipes import quote as shellquote
-
-
-class RunCommandError(Exception):
-    pass
-
-
-def _print_env_diff(env, log_fn) -> None:
-    current_keys = set(os.environ.keys())
-    wanted_env = set(env.keys())
-
-    unset_keys = current_keys.difference(wanted_env)
-    for k in sorted(unset_keys):
-        log_fn("+ unset %s\n" % k)
-
-    added_keys = wanted_env.difference(current_keys)
-    for k in wanted_env.intersection(current_keys):
-        if os.environ[k] != env[k]:
-            added_keys.add(k)
-
-    for k in sorted(added_keys):
-        if ("PATH" in k) and (os.pathsep in env[k]):
-            log_fn("+ %s=\\\n" % k)
-            for elem in env[k].split(os.pathsep):
-                log_fn("+ %s%s\\\n" % (shellquote(elem), os.pathsep))
-        else:
-            log_fn("+ %s=%s \\\n" % (k, shellquote(env[k])))
-
-
-def run_cmd(cmd, env=None, cwd=None, allow_fail: bool = False, log_file=None) -> int:
-    def log_to_stdout(msg):
-        sys.stdout.buffer.write(msg.encode(errors="surrogateescape"))
-
-    if log_file is not None:
-        with open(log_file, "a", encoding="utf-8", errors="surrogateescape") as log:
-
-            def log_function(msg):
-                log.write(msg)
-                log_to_stdout(msg)
-
-            return _run_cmd(
-                cmd, env=env, cwd=cwd, allow_fail=allow_fail, log_fn=log_function
-            )
-    else:
-        return _run_cmd(
-            cmd, env=env, cwd=cwd, allow_fail=allow_fail, log_fn=log_to_stdout
-        )
-
-
-def _run_cmd(cmd, env, cwd, allow_fail, log_fn) -> int:
-    log_fn("---\n")
-    try:
-        cmd_str = " \\\n+ ".join(shellquote(arg) for arg in cmd)
-    except TypeError:
-        # e.g.: one of the elements is None
-        raise RunCommandError("problem quoting cmd: %r" % cmd)
-
-    if env:
-        assert isinstance(env, Env)
-        _print_env_diff(env, log_fn)
-
-        # Convert from our Env type to a regular dict.
-        # This is needed because python3 looks up b'PATH' and 'PATH'
-        # and emits an error if both are present.
In our Env type - # we'll return the same value for both requests, but we don't - # have duplicate potentially conflicting values which is the - # spirit of the check. - env = dict(env.items()) - - if cwd: - log_fn("+ cd %s && \\\n" % shellquote(cwd)) - # Our long path escape sequence may confuse cmd.exe, so if the cwd - # is short enough, strip that off. - if is_windows() and (len(cwd) < 250) and cwd.startswith("\\\\?\\"): - cwd = cwd[4:] - - log_fn("+ %s\n" % cmd_str) - - isinteractive = os.isatty(sys.stdout.fileno()) - if isinteractive: - stdout = None - sys.stdout.buffer.flush() - else: - stdout = subprocess.PIPE - - try: - p = subprocess.Popen( - cmd, env=env, cwd=cwd, stdout=stdout, stderr=subprocess.STDOUT - ) - except (TypeError, ValueError, OSError) as exc: - log_fn("error running `%s`: %s" % (cmd_str, exc)) - raise RunCommandError( - "%s while running `%s` with env=%r\nos.environ=%r" - % (str(exc), cmd_str, env, os.environ) - ) - - if not isinteractive: - _pipe_output(p, log_fn) - - p.wait() - if p.returncode != 0 and not allow_fail: - raise subprocess.CalledProcessError(p.returncode, cmd) - - return p.returncode - - -if hasattr(select, "poll"): - - def _pipe_output(p, log_fn): - """Read output from p.stdout and call log_fn() with each chunk of data as it - becomes available.""" - # Perform non-blocking reads - import fcntl - - fcntl.fcntl(p.stdout.fileno(), fcntl.F_SETFL, os.O_NONBLOCK) - poll = select.poll() - poll.register(p.stdout.fileno(), select.POLLIN) - - buffer_size = 4096 - while True: - poll.poll() - data = p.stdout.read(buffer_size) - if not data: - break - # log_fn() accepts arguments as str (binary in Python 2, unicode in - # Python 3). In Python 3 the subprocess output will be plain bytes, - # and need to be decoded. - if not isinstance(data, str): - data = data.decode("utf-8", errors="surrogateescape") - log_fn(data) - -else: - - def _pipe_output(p, log_fn): - """Read output from p.stdout and call log_fn() with each chunk of data as it - becomes available.""" - # Perform blocking reads. Use a smaller buffer size to avoid blocking - # for very long when data is available. - buffer_size = 64 - while True: - data = p.stdout.read(buffer_size) - if not data: - break - # log_fn() accepts arguments as str (binary in Python 2, unicode in - # Python 3). In Python 3 the subprocess output will be plain bytes, - # and need to be decoded. - if not isinstance(data, str): - data = data.decode("utf-8", errors="surrogateescape") - log_fn(data) diff --git a/build/fbcode_builder/getdeps/subcmd.py b/build/fbcode_builder/getdeps/subcmd.py deleted file mode 100644 index 3c338642d8402..0000000000000 --- a/build/fbcode_builder/getdeps/subcmd.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -class SubCmd(object): - NAME = None - HELP = None - - def run(self, args) -> int: - """perform the command""" - return 0 - - def setup_parser(self, parser) -> None: - # Subclasses should override setup_parser() if they have any - # command line options or arguments. 
- pass - - -CmdTable = [] - - -def add_subcommands(parser, common_args, cmd_table=CmdTable) -> None: - """Register parsers for the defined commands with the provided parser""" - for cls in cmd_table: - command = cls() - command_parser = parser.add_parser( - command.NAME, help=command.HELP, parents=[common_args] - ) - command.setup_parser(command_parser) - command_parser.set_defaults(func=command.run) - - -def cmd(name, help=None, cmd_table=CmdTable): - """ - @cmd() is a decorator that can be used to help define Subcmd instances - - Example usage: - - @subcmd('list', 'Show the result list') - class ListCmd(Subcmd): - def run(self, args): - # Perform the command actions here... - pass - """ - - def wrapper(cls): - class SubclassedCmd(cls): - NAME = name - HELP = help - - cmd_table.append(SubclassedCmd) - return SubclassedCmd - - return wrapper diff --git a/build/fbcode_builder/getdeps/test/expr_test.py b/build/fbcode_builder/getdeps/test/expr_test.py deleted file mode 100644 index f12f68985ed89..0000000000000 --- a/build/fbcode_builder/getdeps/test/expr_test.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -import unittest - -from ..expr import parse_expr - - -class ExprTest(unittest.TestCase): - def test_equal(self) -> None: - valid_variables = {"foo", "some_var", "another_var"} - e = parse_expr("foo=bar", valid_variables) - self.assertTrue(e.eval({"foo": "bar"})) - self.assertFalse(e.eval({"foo": "not-bar"})) - self.assertFalse(e.eval({"not-foo": "bar"})) - - def test_not_equal(self) -> None: - valid_variables = {"foo"} - e = parse_expr("not(foo=bar)", valid_variables) - self.assertFalse(e.eval({"foo": "bar"})) - self.assertTrue(e.eval({"foo": "not-bar"})) - - def test_bad_not(self) -> None: - valid_variables = {"foo"} - with self.assertRaises(Exception): - parse_expr("foo=not(bar)", valid_variables) - - def test_bad_variable(self) -> None: - valid_variables = {"bar"} - with self.assertRaises(Exception): - parse_expr("foo=bar", valid_variables) - - def test_all(self) -> None: - valid_variables = {"foo", "baz"} - e = parse_expr("all(foo = bar, baz = qux)", valid_variables) - self.assertTrue(e.eval({"foo": "bar", "baz": "qux"})) - self.assertFalse(e.eval({"foo": "bar", "baz": "nope"})) - self.assertFalse(e.eval({"foo": "nope", "baz": "nope"})) - - def test_any(self) -> None: - valid_variables = {"foo", "baz"} - e = parse_expr("any(foo = bar, baz = qux)", valid_variables) - self.assertTrue(e.eval({"foo": "bar", "baz": "qux"})) - self.assertTrue(e.eval({"foo": "bar", "baz": "nope"})) - self.assertFalse(e.eval({"foo": "nope", "baz": "nope"})) diff --git a/build/fbcode_builder/getdeps/test/fixtures/duplicate/foo b/build/fbcode_builder/getdeps/test/fixtures/duplicate/foo deleted file mode 100644 index a0384ee3b33fd..0000000000000 --- a/build/fbcode_builder/getdeps/test/fixtures/duplicate/foo +++ /dev/null @@ -1,2 +0,0 @@ -[manifest] -name = foo diff --git a/build/fbcode_builder/getdeps/test/fixtures/duplicate/subdir/foo b/build/fbcode_builder/getdeps/test/fixtures/duplicate/subdir/foo deleted file mode 100644 index a0384ee3b33fd..0000000000000 --- a/build/fbcode_builder/getdeps/test/fixtures/duplicate/subdir/foo +++ /dev/null @@ -1,2 +0,0 @@ -[manifest] -name = foo diff --git a/build/fbcode_builder/getdeps/test/manifest_test.py b/build/fbcode_builder/getdeps/test/manifest_test.py deleted file mode 100644 index 
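A hedged sketch of wiring the registry above into argparse (`SubCmd`, `cmd`, and `add_subcommands` are the definitions above; the `--verbose` flag and the `ListCmd` body are hypothetical):

```python
import argparse

common = argparse.ArgumentParser(add_help=False)
common.add_argument("--verbose", action="store_true")  # hypothetical shared flag

@cmd("list", "Show the result list")
class ListCmd(SubCmd):
    def run(self, args) -> int:
        print("verbose =", args.verbose)
        return 0

ap = argparse.ArgumentParser()
add_subcommands(ap.add_subparsers(), common)
args = ap.parse_args(["list"])
raise SystemExit(args.func(args))
```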
e48b05f321d04..0000000000000 --- a/build/fbcode_builder/getdeps/test/manifest_test.py +++ /dev/null @@ -1,232 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -import sys -import unittest - -from ..load import load_all_manifests, patch_loader -from ..manifest import ManifestParser - - -class ManifestTest(unittest.TestCase): - def test_missing_section(self) -> None: - with self.assertRaisesRegex( - Exception, "manifest file test is missing required section manifest" - ): - ManifestParser("test", "") - - def test_missing_name(self) -> None: - with self.assertRaisesRegex( - Exception, - "manifest file test section 'manifest' is missing required field 'name'", - ): - ManifestParser( - "test", - """ -[manifest] -""", - ) - - def test_minimal(self) -> None: - p = ManifestParser( - "test", - """ -[manifest] -name = test -""", - ) - self.assertEqual(p.name, "test") - self.assertEqual(p.fbsource_path, None) - - def test_minimal_with_fbsource_path(self) -> None: - p = ManifestParser( - "test", - """ -[manifest] -name = test -fbsource_path = fbcode/wat -""", - ) - self.assertEqual(p.name, "test") - self.assertEqual(p.fbsource_path, "fbcode/wat") - - def test_unknown_field(self) -> None: - with self.assertRaisesRegex( - Exception, - ( - "manifest file test section 'manifest' contains " - "unknown field 'invalid.field'" - ), - ): - ManifestParser( - "test", - """ -[manifest] -name = test -invalid.field = woot -""", - ) - - def test_invalid_section_name(self) -> None: - with self.assertRaisesRegex( - Exception, "manifest file test contains unknown section 'invalid.section'" - ): - ManifestParser( - "test", - """ -[manifest] -name = test - -[invalid.section] -foo = bar -""", - ) - - def test_value_in_dependencies_section(self) -> None: - with self.assertRaisesRegex( - Exception, - ( - "manifest file test section 'dependencies' has " - "'foo = bar' but this section doesn't allow " - "specifying values for its entries" - ), - ): - ManifestParser( - "test", - """ -[manifest] -name = test - -[dependencies] -foo = bar -""", - ) - - def test_invalid_conditional_section_name(self) -> None: - with self.assertRaisesRegex( - Exception, - ( - "manifest file test section 'dependencies.=' " - "has invalid conditional: expected " - "identifier found =" - ), - ): - ManifestParser( - "test", - """ -[manifest] -name = test - -[dependencies.=] -""", - ) - - def test_section_as_args(self) -> None: - p = ManifestParser( - "test", - """ -[manifest] -name = test - -[dependencies] -a -b -c - -[dependencies.test=on] -foo -""", - ) - self.assertEqual(p.get_section_as_args("dependencies"), ["a", "b", "c"]) - self.assertEqual( - p.get_section_as_args("dependencies", {"test": "off"}), ["a", "b", "c"] - ) - self.assertEqual( - p.get_section_as_args("dependencies", {"test": "on"}), - ["a", "b", "c", "foo"], - ) - - p2 = ManifestParser( - "test", - """ -[manifest] -name = test - -[autoconf.args] ---prefix=/foo ---with-woot -""", - ) - self.assertEqual( - p2.get_section_as_args("autoconf.args"), ["--prefix=/foo", "--with-woot"] - ) - - def test_section_as_dict(self) -> None: - p = ManifestParser( - "test", - """ -[manifest] -name = test - -[cmake.defines] -foo = bar - -[cmake.defines.test=on] -foo = baz -""", - ) - self.assertEqual(p.get_section_as_dict("cmake.defines", {}), {"foo": "bar"}) - self.assertEqual( - p.get_section_as_dict("cmake.defines", {"test": "on"}), {"foo": "baz"} - ) - - p2 
= ManifestParser( - "test", - """ -[manifest] -name = test - -[cmake.defines.test=on] -foo = baz - -[cmake.defines] -foo = bar -""", - ) - self.assertEqual( - p2.get_section_as_dict("cmake.defines", {"test": "on"}), - {"foo": "bar"}, - msg="sections cascade in the order they appear in the manifest", - ) - - def test_parse_common_manifests(self) -> None: - patch_loader(__name__) - manifests = load_all_manifests(None) - self.assertNotEqual(0, len(manifests), msg="parsed some number of manifests") - - def test_mismatch_name(self) -> None: - with self.assertRaisesRegex( - Exception, - "filename of the manifest 'foo' does not match the manifest name 'bar'", - ): - ManifestParser( - "foo", - """ -[manifest] -name = bar -""", - ) - - def test_duplicate_manifest(self) -> None: - patch_loader(__name__, "fixtures/duplicate") - - with self.assertRaisesRegex(Exception, "found duplicate manifest 'foo'"): - load_all_manifests(None) - - if sys.version_info < (3, 2): - - def assertRaisesRegex(self, *args, **kwargs): - return self.assertRaisesRegexp(*args, **kwargs) diff --git a/build/fbcode_builder/getdeps/test/platform_test.py b/build/fbcode_builder/getdeps/test/platform_test.py deleted file mode 100644 index ce0de7a67f5ed..0000000000000 --- a/build/fbcode_builder/getdeps/test/platform_test.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -import unittest - -from ..platform import HostType - - -class PlatformTest(unittest.TestCase): - def test_create(self) -> None: - p = HostType() - self.assertNotEqual(p.ostype, None, msg="probed and returned something") - - tuple_string = p.as_tuple_string() - round_trip = HostType.from_tuple_string(tuple_string) - self.assertEqual(round_trip, p) - - def test_rendering_of_none(self) -> None: - p = HostType(ostype="foo") - self.assertEqual(p.as_tuple_string(), "foo-none-none") - - def test_is_methods(self) -> None: - p = HostType(ostype="windows") - self.assertTrue(p.is_windows()) - self.assertFalse(p.is_darwin()) - self.assertFalse(p.is_linux()) - - p = HostType(ostype="darwin") - self.assertFalse(p.is_windows()) - self.assertTrue(p.is_darwin()) - self.assertFalse(p.is_linux()) - - p = HostType(ostype="linux") - self.assertFalse(p.is_windows()) - self.assertFalse(p.is_darwin()) - self.assertTrue(p.is_linux()) diff --git a/build/fbcode_builder/getdeps/test/scratch_test.py b/build/fbcode_builder/getdeps/test/scratch_test.py deleted file mode 100644 index b57d8b583fe40..0000000000000 --- a/build/fbcode_builder/getdeps/test/scratch_test.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
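End to end, the format those tests pin down looks like this. A hedged sketch (the import path mirrors the tests' relative imports; the manifest text is illustrative):

```python
from getdeps.manifest import ManifestParser

p = ManifestParser(
    "example",
    """
[manifest]
name = example

[dependencies]
fmt

[cmake.defines.test=on]
BUILD_TESTS = ON
""",
)
print(p.get_section_as_args("dependencies"))                   # ['fmt']
print(p.get_section_as_dict("cmake.defines", {"test": "on"}))  # {'BUILD_TESTS': 'ON'}
```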
- - -import unittest - -from ..buildopts import find_existing_win32_subst_for_path - - -class Win32SubstTest(unittest.TestCase): - def test_no_existing_subst(self) -> None: - self.assertIsNone( - find_existing_win32_subst_for_path( - r"C:\users\alice\appdata\local\temp\fbcode_builder_getdeps", - subst_mapping={}, - ) - ) - self.assertIsNone( - find_existing_win32_subst_for_path( - r"C:\users\alice\appdata\local\temp\fbcode_builder_getdeps", - subst_mapping={"X:\\": r"C:\users\alice\appdata\local\temp\other"}, - ) - ) - - def test_exact_match_returns_drive_path(self) -> None: - self.assertEqual( - find_existing_win32_subst_for_path( - r"C:\temp\fbcode_builder_getdeps", - subst_mapping={"X:\\": r"C:\temp\fbcode_builder_getdeps"}, - ), - "X:\\", - ) - self.assertEqual( - find_existing_win32_subst_for_path( - r"C:/temp/fbcode_builder_getdeps", - subst_mapping={"X:\\": r"C:/temp/fbcode_builder_getdeps"}, - ), - "X:\\", - ) - - def test_multiple_exact_matches_returns_arbitrary_drive_path(self) -> None: - self.assertIn( - find_existing_win32_subst_for_path( - r"C:\temp\fbcode_builder_getdeps", - subst_mapping={ - "X:\\": r"C:\temp\fbcode_builder_getdeps", - "Y:\\": r"C:\temp\fbcode_builder_getdeps", - "Z:\\": r"C:\temp\fbcode_builder_getdeps", - }, - ), - ("X:\\", "Y:\\", "Z:\\"), - ) - - def test_drive_letter_is_case_insensitive(self) -> None: - self.assertEqual( - find_existing_win32_subst_for_path( - r"C:\temp\fbcode_builder_getdeps", - subst_mapping={"X:\\": r"c:\temp\fbcode_builder_getdeps"}, - ), - "X:\\", - ) - - def test_path_components_are_case_insensitive(self) -> None: - self.assertEqual( - find_existing_win32_subst_for_path( - r"C:\TEMP\FBCODE_builder_getdeps", - subst_mapping={"X:\\": r"C:\temp\fbcode_builder_getdeps"}, - ), - "X:\\", - ) - self.assertEqual( - find_existing_win32_subst_for_path( - r"C:\temp\fbcode_builder_getdeps", - subst_mapping={"X:\\": r"C:\TEMP\FBCODE_builder_getdeps"}, - ), - "X:\\", - ) diff --git a/build/fbcode_builder/manifests/CLI11 b/build/fbcode_builder/manifests/CLI11 deleted file mode 100644 index 14cb2332af457..0000000000000 --- a/build/fbcode_builder/manifests/CLI11 +++ /dev/null @@ -1,14 +0,0 @@ -[manifest] -name = CLI11 - -[download] -url = https://github.com/CLIUtils/CLI11/archive/v2.0.0.tar.gz -sha256 = 2c672f17bf56e8e6223a3bfb74055a946fa7b1ff376510371902adb9cb0ab6a3 - -[build] -builder = cmake -subdir = CLI11-2.0.0 - -[cmake.defines] -CLI11_BUILD_TESTS = OFF -CLI11_BUILD_EXAMPLES = OFF diff --git a/build/fbcode_builder/manifests/airstore b/build/fbcode_builder/manifests/airstore deleted file mode 100644 index 91c38c53cc7f0..0000000000000 --- a/build/fbcode_builder/manifests/airstore +++ /dev/null @@ -1,38 +0,0 @@ -[manifest] -name = airstore -fbsource_path = fbcode/fair_infra/data/airstore/ -shipit_project = AIRStore -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/fairinternal/AIRStore.git - -[build.os=linux] -builder = cmake - -[build.not(os=linux)] -# We only support Linux -builder = nop - -[dependencies] -boost -libcurl -fizz -fmt -folly -googletest -libsodium -libevent -double-conversion -proxygen -wangle -zstd -zlib -xz - -[shipit.pathmap] -fbcode/fair_infra/data/airstore = . 
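A hedged usage sketch of the subst lookup exercised by Win32SubstTest above (the import path matches that test file's relative import):

```python
from getdeps.buildopts import find_existing_win32_subst_for_path

drive = find_existing_win32_subst_for_path(
    r"C:\temp\fbcode_builder_getdeps",
    subst_mapping={"X:\\": r"C:\temp\fbcode_builder_getdeps"},
)
print(drive)  # X:\  (returns None when no existing subst matches)
```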
-fbcode/deeplearning/projects/fairstore/cpp = deeplearning/projects/fairstore/cpp -fbcode/proxygen/lib/utils = proxygen/lib/utils - -[shipit.strip] diff --git a/build/fbcode_builder/manifests/autoconf b/build/fbcode_builder/manifests/autoconf deleted file mode 100644 index 8c8b883974c9a..0000000000000 --- a/build/fbcode_builder/manifests/autoconf +++ /dev/null @@ -1,19 +0,0 @@ -[manifest] -name = autoconf - -[debs] -autoconf - -[homebrew] -autoconf - -[rpms] -autoconf - -[download] -url = http://ftp.gnu.org/gnu/autoconf/autoconf-2.69.tar.gz -sha256 = 954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969 - -[build] -builder = autoconf -subdir = autoconf-2.69 diff --git a/build/fbcode_builder/manifests/automake b/build/fbcode_builder/manifests/automake deleted file mode 100644 index 37ffb95d21e8b..0000000000000 --- a/build/fbcode_builder/manifests/automake +++ /dev/null @@ -1,22 +0,0 @@ -[manifest] -name = automake - -[homebrew] -automake - -[debs] -automake - -[rpms] -automake - -[download] -url = http://ftp.gnu.org/gnu/automake/automake-1.16.1.tar.gz -sha256 = 608a97523f97db32f1f5d5615c98ca69326ced2054c9f82e65bade7fc4c9dea8 - -[build] -builder = autoconf -subdir = automake-1.16.1 - -[dependencies] -autoconf diff --git a/build/fbcode_builder/manifests/benchmark b/build/fbcode_builder/manifests/benchmark deleted file mode 100644 index 25d621184cea2..0000000000000 --- a/build/fbcode_builder/manifests/benchmark +++ /dev/null @@ -1,13 +0,0 @@ -[manifest] -name = benchmark - -[download] -url = https://github.com/google/benchmark/archive/refs/tags/v1.8.0.tar.gz -sha256 = ea2e94c24ddf6594d15c711c06ccd4486434d9cf3eca954e2af8a20c88f9f172 - -[build] -builder = cmake -subdir = benchmark-1.8.0/ - -[cmake.defines] -BENCHMARK_ENABLE_TESTING=OFF diff --git a/build/fbcode_builder/manifests/blake3 b/build/fbcode_builder/manifests/blake3 deleted file mode 100644 index 27a5509e7c4b0..0000000000000 --- a/build/fbcode_builder/manifests/blake3 +++ /dev/null @@ -1,11 +0,0 @@ -[manifest] -name = blake3 - -[download] -url = https://github.com/BLAKE3-team/BLAKE3/archive/refs/tags/1.3.3.tar.gz -sha256 = 27d2bc4ee5945ba75434859521042c949463ee7514ff17aaef328e23ef83fec0 - -[build] -builder = cmake -subdir = BLAKE3-1.3.3/c -patchfile = blake3_CMakeLists_txt.patch diff --git a/build/fbcode_builder/manifests/boost b/build/fbcode_builder/manifests/boost deleted file mode 100644 index 89b4ee4e293bf..0000000000000 --- a/build/fbcode_builder/manifests/boost +++ /dev/null @@ -1,110 +0,0 @@ -[manifest] -name = boost - -[download.not(os=windows)] -url = https://boostorg.jfrog.io/artifactory/main/release/1.83.0/source/boost_1_83_0.tar.gz -sha256 = c0685b68dd44cc46574cce86c4e17c0f611b15e195be9848dfd0769a0a207628 - -[download.os=windows] -url = https://boostorg.jfrog.io/artifactory/main/release/1.83.0/source/boost_1_83_0.zip -sha256 = c86bd9d9eef795b4b0d3802279419fde5221922805b073b9bd822edecb1ca28e - -[preinstalled.env] -# Here we list the acceptable versions that cmake needs a hint to find -BOOST_ROOT_1_69_0 -BOOST_ROOT_1_83_0 - -[debs] -libboost-all-dev - -[homebrew] -boost -# Boost cmake detection on homebrew adds this as requirement: https://github.com/Homebrew/homebrew-core/issues/67427#issuecomment-754187345 -icu4c - -[rpms.all(distro=centos_stream,distro_vers=8)] -boost169 -boost169-math -boost169-test -boost169-fiber -boost169-graph -boost169-log -boost169-openmpi -boost169-timer -boost169-chrono -boost169-locale -boost169-thread -boost169-atomic -boost169-random -boost169-static -boost169-contract 
-boost169-date-time -boost169-iostreams -boost169-container -boost169-coroutine -boost169-filesystem -boost169-system -boost169-stacktrace -boost169-regex -boost169-devel -boost169-context -boost169-python3-devel -boost169-type_erasure -boost169-wave -boost169-python3 -boost169-serialization -boost169-program-options - -[rpms.not(all(distro=centos_stream,distro_vers=8))] -boost-devel -boost-static - -[build] -builder = boost -job_weight_mib = 512 - -[b2.args] ---with-atomic ---with-chrono ---with-container ---with-context ---with-contract ---with-coroutine ---with-date_time ---with-exception ---with-fiber ---with-filesystem ---with-graph ---with-graph_parallel ---with-iostreams ---with-locale ---with-log ---with-math ---with-mpi ---with-program_options ---with-python ---with-random ---with-regex ---with-serialization ---with-stacktrace ---with-system ---with-test ---with-thread ---with-timer ---with-type_erasure - -[bootstrap.args.os=darwin] -# Not really gcc, but CI puts a broken clang in the PATH, and saying gcc -# here selects the correct one from Xcode. ---with-toolset=gcc - -[b2.args.os=linux] -# RHEL hardened gcc is not compatible with PCH -# https://bugzilla.redhat.com/show_bug.cgi?id=1806545 -pch=off - -[b2.args.os=darwin] -toolset=clang - -[b2.args.all(os=windows,fb=on)] -toolset=msvc-14.2 diff --git a/build/fbcode_builder/manifests/bz2 b/build/fbcode_builder/manifests/bz2 deleted file mode 100644 index af2f357d5dbe8..0000000000000 --- a/build/fbcode_builder/manifests/bz2 +++ /dev/null @@ -1,30 +0,0 @@ -[manifest] -name = bz2 - -[debs] -libbz2-dev - -[homebrew] -bzip2 - -[rpms] -bzip2-devel - -[download] -url = https://sourceware.org/pub/bzip2/bzip2-1.0.8.tar.gz -sha256 = ab5a03176ee106d3f0fa90e381da478ddae405918153cca248e682cd0c4a2269 - -[build.not(os=windows)] -builder = make -subdir = bzip2-1.0.8 - -[make.build_args.os=linux] -# python bz2 support on linux needs dynamic library --f -Makefile-libbz2_so - -[make.install_args] -install - -[build.os=windows] -builder = nop diff --git a/build/fbcode_builder/manifests/cmake b/build/fbcode_builder/manifests/cmake deleted file mode 100644 index 71548f119cba5..0000000000000 --- a/build/fbcode_builder/manifests/cmake +++ /dev/null @@ -1,46 +0,0 @@ -[manifest] -name = cmake - -[homebrew] -cmake - -# 18.04 cmake is too old -[debs.not(all(distro=ubuntu,distro_vers="18.04"))] -cmake - -[rpms] -cmake - -[dependencies] -ninja - -[download.os=windows] -url = https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2-windows-x86_64.zip -sha256 = 15a49e2ab81c1822d75b1b1a92f7863f58e31f6d6aac1c4103eef2b071be3112 - -[download.os=darwin] -url = https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2-macos-universal.tar.gz -sha256 = 0100663380a3bd977b001183cd487412db7aad9de6859927bde97e1e6e44e645 - -[download.any(os=linux,os=freebsd)] -url = https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2.tar.gz -sha256 = aecf6ecb975179eb3bb6a4a50cae192d41e92b9372b02300f9e8f1d5f559544e - -[build.os=windows] -builder = nop -subdir = cmake-3.20.2-windows-x86_64 - -[build.os=darwin] -builder = nop -subdir = cmake-3.20.2-macos-universal - -[install.files.os=darwin] -CMake.app/Contents/bin = bin -CMake.app/Contents/share = share - -[build.any(os=linux,os=freebsd)] -builder = cmakebootstrap -subdir = cmake-3.20.2 - -[make.install_args.any(os=linux,os=freebsd)] -install diff --git a/build/fbcode_builder/manifests/cpptoml b/build/fbcode_builder/manifests/cpptoml deleted file mode 100644 index c4d6d8d9c5656..0000000000000 
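Every `[download]` section pairs a url with a sha256 so a fetched archive can be verified before unpacking. A hedged sketch of that check, reusing the CLI11 manifest's values from above:

```python
import hashlib
import urllib.request

url = "https://github.com/CLIUtils/CLI11/archive/v2.0.0.tar.gz"
expected_sha256 = "2c672f17bf56e8e6223a3bfb74055a946fa7b1ff376510371902adb9cb0ab6a3"

data = urllib.request.urlopen(url).read()
actual = hashlib.sha256(data).hexdigest()
if actual != expected_sha256:
    raise Exception("checksum mismatch: %s != %s" % (actual, expected_sha256))
```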
--- a/build/fbcode_builder/manifests/cpptoml +++ /dev/null @@ -1,16 +0,0 @@ -[manifest] -name = cpptoml - -[homebrew] -cpptoml - -[download] -url = https://github.com/chadaustin/cpptoml/archive/refs/tags/v0.1.2.tar.gz -sha256 = beda37e94f9746874436c8090c045fd80ae6f8a51f7c668c932a2b110a4fc277 - -[build] -builder = cmake -subdir = cpptoml-0.1.2 - -[cmake.defines.os=freebsd] -ENABLE_LIBCXX=NO diff --git a/build/fbcode_builder/manifests/date b/build/fbcode_builder/manifests/date deleted file mode 100644 index 8a4e255c88fc0..0000000000000 --- a/build/fbcode_builder/manifests/date +++ /dev/null @@ -1,10 +0,0 @@ -[manifest] -name = date - -[download] -url = https://github.com/HowardHinnant/date/archive/refs/tags/v3.0.1.tar.gz -sha256 = 7a390f200f0ccd207e8cff6757e04817c1a0aec3e327b006b7eb451c57ee3538 - -[build] -builder = cmake -subdir = date-3.0.1 diff --git a/build/fbcode_builder/manifests/delos_core b/build/fbcode_builder/manifests/delos_core deleted file mode 100644 index 1de6c3342df48..0000000000000 --- a/build/fbcode_builder/manifests/delos_core +++ /dev/null @@ -1,25 +0,0 @@ -[manifest] -name = delos_core -fbsource_path = fbcode/delos_core -shipit_project = delos_core -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebookincubator/delos_core.git - -[build.os=linux] -builder = cmake - -[build.not(os=linux)] -builder = nop - -[dependencies] -glog -googletest -folly -fbthrift -fb303 -re2 - -[shipit.pathmap] -fbcode/delos_core = . diff --git a/build/fbcode_builder/manifests/double-conversion b/build/fbcode_builder/manifests/double-conversion deleted file mode 100644 index 2d7265e8d39c8..0000000000000 --- a/build/fbcode_builder/manifests/double-conversion +++ /dev/null @@ -1,20 +0,0 @@ -[manifest] -name = double-conversion - -[download] -url = https://github.com/google/double-conversion/archive/v3.1.4.tar.gz -sha256 = 95004b65e43fefc6100f337a25da27bb99b9ef8d4071a36a33b5e83eb1f82021 - -[homebrew] -double-conversion - -[debs] -libdouble-conversion-dev - -[rpms] -double-conversion -double-conversion-devel - -[build] -builder = cmake -subdir = double-conversion-3.1.4 diff --git a/build/fbcode_builder/manifests/eden b/build/fbcode_builder/manifests/eden deleted file mode 100644 index 4c32bf698a1c0..0000000000000 --- a/build/fbcode_builder/manifests/eden +++ /dev/null @@ -1,112 +0,0 @@ -[manifest] -name = eden -fbsource_path = fbcode/eden -shipit_project = eden -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebook/sapling.git - -[github.actions] -run_tests = off - -[sandcastle] -run_tests = off - -[build] -builder = cmake - -[dependencies] -blake3 -googletest -folly -fbthrift -fb303 -cpptoml -rocksdb -re2 -libgit2 -pexpect -python-toml -python-filelock -edencommon -rust-shed - -[dependencies.fbsource=on] -rust - -# macOS ships with sqlite3, and some of the core system -# frameworks require that that version be linked rather -# than the one we might build for ourselves here, so we -# skip building it on macos. -[dependencies.not(os=darwin)] -sqlite3 - -[dependencies.os=darwin] -osxfuse - -[dependencies.not(os=windows)] -# TODO: teach getdeps to compile curl on Windows. -# Enabling curl on Windows requires us to find a way to compile libcurl with -# msvc. 
-libcurl -# Added so that OSS doesn't see system "python" which is python 2 on darwin and some linux -python - -[shipit.pathmap.fb=on] -# for internal builds that use getdeps -fbcode/fb303 = fb303 -fbcode/common/rust/fbwhoami = common/rust/fbwhoami -fbcode/common/rust/shed = common/rust/shed -fbcode/thrift/lib/rust = thrift/lib/rust - -[shipit.pathmap] -# Map hostcaps for now as eden C++ includes its .h. Rust-shed should install it -fbcode/common/rust/shed/hostcaps = common/rust/shed/hostcaps -fbcode/configerator/structs/scm/hg = configerator/structs/scm/hg -fbcode/eden/oss = . -fbcode/eden = eden -fbcode/tools/lfs = tools/lfs - -[shipit.pathmap.fb=off] -fbcode/eden/fs/public_autocargo = eden/fs -fbcode/eden/scm/public_autocargo = eden/scm -fbcode/common/rust/shed/hostcaps/public_cargo = common/rust/shed/hostcaps -fbcode/configerator/structs/scm/hg/public_autocargo = configerator/structs/scm/hg - -[shipit.strip] -^fbcode/eden/addons/.*$ -^fbcode/eden/fs/eden-config\.h$ -^fbcode/eden/fs/py/eden/config\.py$ -^fbcode/eden/hg-server/.*$ -^fbcode/eden/mononoke/(?!lfs_protocol) -^fbcode/eden/scm/build/.*$ -^fbcode/eden/scm/lib/third-party/rust/.*/Cargo.toml$ -^fbcode/eden/website/.*$ -^fbcode/eden/.*/\.cargo/.*$ -/Cargo\.lock$ -\.pyc$ - -[shipit.strip.fb=off] -^fbcode/common/rust/shed(?!/public_autocargo).*/Cargo\.toml$ -^fbcode/configerator/structs/scm/hg(?!/public_autocargo).*/Cargo\.toml$ -^fbcode/eden/fs(?!/public_autocargo).*/Cargo\.toml$ -^fbcode/eden/scm(?!/public_autocargo|/saplingnative).*/Cargo\.toml$ -^.*/facebook/.*$ -^.*/fb/.*$ - -[cmake.defines.all(fb=on,os=windows)] -ENABLE_GIT=OFF -INSTALL_PYTHON_LIB=ON - -[cmake.defines.all(not(fb=on),os=windows)] -ENABLE_GIT=OFF - -[cmake.defines.fbsource=on] -USE_CARGO_VENDOR=ON - -[cmake.defines.fb=on] -IS_FB_BUILD=ON - -[depends.environment] -EDEN_VERSION_OVERRIDE diff --git a/build/fbcode_builder/manifests/edencommon b/build/fbcode_builder/manifests/edencommon deleted file mode 100644 index 772c9407ed9f2..0000000000000 --- a/build/fbcode_builder/manifests/edencommon +++ /dev/null @@ -1,30 +0,0 @@ -[manifest] -name = edencommon -fbsource_path = fbcode/eden/common -shipit_project = edencommon -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebookexperimental/edencommon.git - -[build] -builder = cmake - -[dependencies] -fmt -folly -gflags -glog - -[cmake.defines.test=on] -BUILD_TESTS=ON - -[cmake.defines.test=off] -BUILD_TESTS=OFF - -[shipit.pathmap] -fbcode/eden/common = eden/common -fbcode/eden/common/oss = . 
- -[shipit.strip] -@README.facebook@ diff --git a/build/fbcode_builder/manifests/exprtk b/build/fbcode_builder/manifests/exprtk deleted file mode 100644 index c0dfc1afbf67f..0000000000000 --- a/build/fbcode_builder/manifests/exprtk +++ /dev/null @@ -1,15 +0,0 @@ -[manifest] -name = exprtk - -[download] -url = https://github.com/ArashPartow/exprtk/archive/refs/tags/0.0.1.tar.gz -sha256 = fb72791c88ae3b3426e14fdad630027715682584daf56b973569718c56e33f28 - -[build.not(os=windows)] -builder = nop -subdir = exprtk-0.0.1 - -[install.files] -exprtk.hpp = exprtk.hpp - -[dependencies] diff --git a/build/fbcode_builder/manifests/f4d b/build/fbcode_builder/manifests/f4d deleted file mode 100644 index 2f3db2595acbe..0000000000000 --- a/build/fbcode_builder/manifests/f4d +++ /dev/null @@ -1,30 +0,0 @@ -[manifest] -name = f4d -fbsource_path = fbcode/f4d -shipit_project = f4d -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebookexternal/f4d.git -rev = master - -[build.os=windows] -builder = nop - -[build.not(os=windows)] -builder = cmake - -[dependencies] -double-conversion -folly -glog -googletest -boost -protobuf -lzo -libicu -re2 - -[shipit.pathmap] -fbcode/f4d/public_tld = . -fbcode/f4d = f4d diff --git a/build/fbcode_builder/manifests/fatal b/build/fbcode_builder/manifests/fatal deleted file mode 100644 index b516d765f7ed7..0000000000000 --- a/build/fbcode_builder/manifests/fatal +++ /dev/null @@ -1,24 +0,0 @@ -[manifest] -name = fatal -fbsource_path = fbcode/fatal -shipit_project = fatal - -[git] -repo_url = https://github.com/facebook/fatal.git - -[shipit.pathmap] -fbcode/fatal = fatal -fbcode/fatal/public_tld = . - -[build] -builder = nop -subdir = . - -[install.files] -fatal/portability.h = fatal/portability.h -fatal/preprocessor.h = fatal/preprocessor.h -fatal/container = fatal/container -fatal/functional = fatal/functional -fatal/math = fatal/math -fatal/string = fatal/string -fatal/type = fatal/type diff --git a/build/fbcode_builder/manifests/fb303 b/build/fbcode_builder/manifests/fb303 deleted file mode 100644 index ad398a8c281f4..0000000000000 --- a/build/fbcode_builder/manifests/fb303 +++ /dev/null @@ -1,37 +0,0 @@ -[manifest] -name = fb303 -fbsource_path = fbcode/fb303 -shipit_project = fb303 -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebook/fb303.git - -[cargo] -cargo_config_file = source/fb303/thrift/.cargo/config.toml - -[crate.pathmap] -fb303_core = fb303/thrift - -[build] -builder = cmake - -[dependencies] -folly -gflags -glog -fbthrift - -[cmake.defines.test=on] -BUILD_TESTS=ON - -[cmake.defines.test=off] -BUILD_TESTS=OFF - -[shipit.pathmap] -fbcode/fb303/github = . -fbcode/fb303/public_autocargo = fb303 -fbcode/fb303 = fb303 - -[shipit.strip] -^fbcode/fb303/(?!public_autocargo).+/Cargo\.toml$ diff --git a/build/fbcode_builder/manifests/fb303-source b/build/fbcode_builder/manifests/fb303-source deleted file mode 100644 index d62ce3b003254..0000000000000 --- a/build/fbcode_builder/manifests/fb303-source +++ /dev/null @@ -1,19 +0,0 @@ -[manifest] -name = fb303-source -fbsource_path = fbcode/fb303 -shipit_project = fb303 -shipit_fbcode_builder = false - -[git] -repo_url = https://github.com/facebook/fb303.git - -[build] -builder = nop - -[shipit.pathmap] -fbcode/fb303/github = . 
-fbcode/fb303/public_autocargo = fb303 -fbcode/fb303 = fb303 - -[shipit.strip] -^fbcode/fb303/(?!public_autocargo).+/Cargo\.toml$ diff --git a/build/fbcode_builder/manifests/fboss b/build/fbcode_builder/manifests/fboss deleted file mode 100644 index 8bcd1cb110c97..0000000000000 --- a/build/fbcode_builder/manifests/fboss +++ /dev/null @@ -1,48 +0,0 @@ -[manifest] -name = fboss -fbsource_path = fbcode/fboss -shipit_project = fboss -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebook/fboss.git - -[build.os=linux] -builder = cmake -# fboss files take a lot of RAM to compile. -job_weight_mib = 3072 - -[build.not(os=linux)] -builder = nop - -[dependencies] -folly -fb303 -wangle -fizz -fmt -libsodium -googletest -zstd -fatal -fbthrift -iproute2 -libusb -libcurl -libnl -libsai -re2 -python -yaml-cpp -libyaml -CLI11 -exprtk -nlohmann-json - -[shipit.pathmap] -fbcode/fboss/github = . -fbcode/fboss/common = common -fbcode/fboss = fboss - -[sandcastle] -run_tests = off diff --git a/build/fbcode_builder/manifests/fbthrift b/build/fbcode_builder/manifests/fbthrift deleted file mode 100644 index 3d852d8d1c56b..0000000000000 --- a/build/fbcode_builder/manifests/fbthrift +++ /dev/null @@ -1,47 +0,0 @@ -[manifest] -name = fbthrift -fbsource_path = fbcode/thrift -shipit_project = fbthrift -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebook/fbthrift.git - -[cargo] -cargo_config_file = source/thrift/lib/rust/.cargo/config.toml - -[crate.pathmap] -fbthrift = thrift/lib/rust - -[build] -builder = cmake -job_weight_mib = 2048 - -[dependencies] -fizz -fmt -folly -googletest -libsodium -python-six -wangle -zstd -mvfst -# Thrift also depends on openssl but since the latter requires a platform- -# specific configuration we rely on the folly manifest to provide this -# dependency to avoid duplication. - -[dependencies.os=linux] -# python doesn't build on Windows yet and this causes python3 shebangs to -# expand to a non-portable path on macOS -python - -[shipit.pathmap] -fbcode/thrift/public_tld = . -fbcode/thrift = thrift - -[shipit.strip] -^fbcode/thrift/thrift-config\.h$ -^fbcode/thrift/perf/canary.py$ -^fbcode/thrift/perf/loadtest.py$ -^fbcode/thrift/.castle/.* diff --git a/build/fbcode_builder/manifests/fbthrift-source b/build/fbcode_builder/manifests/fbthrift-source deleted file mode 100644 index 7af0d6ddac0e3..0000000000000 --- a/build/fbcode_builder/manifests/fbthrift-source +++ /dev/null @@ -1,21 +0,0 @@ -[manifest] -name = fbthrift-source -fbsource_path = fbcode/thrift -shipit_project = fbthrift -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebook/fbthrift.git - -[build] -builder = nop - -[shipit.pathmap] -fbcode/thrift/public_tld = . -fbcode/thrift = thrift - -[shipit.strip] -^fbcode/thrift/thrift-config\.h$ -^fbcode/thrift/perf/canary.py$ -^fbcode/thrift/perf/loadtest.py$ -^fbcode/thrift/.castle/.* diff --git a/build/fbcode_builder/manifests/fbzmq b/build/fbcode_builder/manifests/fbzmq deleted file mode 100644 index 5739016c84ac6..0000000000000 --- a/build/fbcode_builder/manifests/fbzmq +++ /dev/null @@ -1,29 +0,0 @@ -[manifest] -name = fbzmq -fbsource_path = facebook/fbzmq -shipit_project = fbzmq -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebook/fbzmq.git - -[build.os=linux] -builder = cmake - -[build.not(os=linux)] -# boost.fiber is required and that is not available on macos. -# libzmq doesn't currently build on windows. 
-builder = nop - -[dependencies] -boost -folly -fbthrift -googletest -libzmq - -[shipit.pathmap] -fbcode/fbzmq = fbzmq -fbcode/fbzmq/public_tld = . - -[shipit.strip] diff --git a/build/fbcode_builder/manifests/fizz b/build/fbcode_builder/manifests/fizz deleted file mode 100644 index 15e14ec608b3c..0000000000000 --- a/build/fbcode_builder/manifests/fizz +++ /dev/null @@ -1,37 +0,0 @@ -[manifest] -name = fizz -fbsource_path = fbcode/fizz -shipit_project = fizz -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebookincubator/fizz.git - -[build] -builder = cmake -subdir = fizz - -[cmake.defines] -BUILD_EXAMPLES = OFF - -[cmake.defines.test=on] -BUILD_TESTS = ON - -[cmake.defines.all(os=windows, test=on)] -BUILD_TESTS = OFF - -[cmake.defines.test=off] -BUILD_TESTS = OFF - -[dependencies] -folly -libsodium -zlib -zstd - -[dependencies.all(test=on, not(os=windows))] -googletest - -[shipit.pathmap] -fbcode/fizz/public_tld = . -fbcode/fizz = fizz diff --git a/build/fbcode_builder/manifests/fmt b/build/fbcode_builder/manifests/fmt deleted file mode 100644 index eb79496e3d991..0000000000000 --- a/build/fbcode_builder/manifests/fmt +++ /dev/null @@ -1,14 +0,0 @@ -[manifest] -name = fmt - -[download] -url = https://github.com/fmtlib/fmt/archive/refs/tags/9.1.0.tar.gz -sha256 = 5dea48d1fcddc3ec571ce2058e13910a0d4a6bab4cc09a809d8b1dd1c88ae6f2 - -[build] -builder = cmake -subdir = fmt-9.1.0 - -[cmake.defines] -FMT_TEST = OFF -FMT_DOC = OFF diff --git a/build/fbcode_builder/manifests/folly b/build/fbcode_builder/manifests/folly deleted file mode 100644 index 3c17c184e053f..0000000000000 --- a/build/fbcode_builder/manifests/folly +++ /dev/null @@ -1,71 +0,0 @@ -[manifest] -name = folly -fbsource_path = fbcode/folly -shipit_project = folly -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebook/folly.git - -[build] -builder = cmake -job_weight_mib = 1024 - -[dependencies] -gflags -glog -googletest -boost -libevent -libsodium -double-conversion -fmt -lz4 -snappy -zstd -# no openssl or zlib in the linux case, why? -# these are usually installed on the system -# and are the easiest system deps to pull in. -# In the future we want to be able to express -# that a system dep is sufficient in the manifest -# for eg: openssl and zlib, but for now we don't -# have it. - -# macOS doesn't expose the openssl api so we need -# to build our own. -[dependencies.os=darwin] -openssl - -# Windows has neither openssl nor zlib, so we get -# to provide both -[dependencies.os=windows] -openssl -zlib - -# xz depends on autoconf which does not build on -# Windows -[dependencies.not(os=windows)] -xz - -[shipit.pathmap] -fbcode/folly/public_tld = . 
-fbcode/folly = folly - -[shipit.strip] -^fbcode/folly/folly-config\.h$ -^fbcode/folly/public_tld/build/facebook_.* - -[cmake.defines] -BUILD_SHARED_LIBS=OFF -BOOST_LINK_STATIC=ON - -[cmake.defines.os=freebsd] -LIBDWARF_FOUND=NO - -[cmake.defines.test=on] -BUILD_TESTS=ON -BUILD_BENCHMARKS=OFF - -[cmake.defines.test=off] -BUILD_TESTS=OFF -BUILD_BENCHMARKS=OFF diff --git a/build/fbcode_builder/manifests/gflags b/build/fbcode_builder/manifests/gflags deleted file mode 100644 index a2704180eb3b5..0000000000000 --- a/build/fbcode_builder/manifests/gflags +++ /dev/null @@ -1,22 +0,0 @@ -[manifest] -name = gflags - -[download] -url = https://github.com/gflags/gflags/archive/v2.2.2.tar.gz -sha256 = 34af2f15cf7367513b352bdcd2493ab14ce43692d2dcd9dfc499492966c64dcf - -[build] -builder = cmake -subdir = gflags-2.2.2 - -[cmake.defines] -BUILD_SHARED_LIBS = ON -BUILD_STATIC_LIBS = ON -#BUILD_gflags_nothreads_LIB = OFF -BUILD_gflags_LIB = ON - -[debs] -libgflags-dev - -[rpms] -gflags-devel diff --git a/build/fbcode_builder/manifests/git-lfs b/build/fbcode_builder/manifests/git-lfs deleted file mode 100644 index 38a5e6aeba581..0000000000000 --- a/build/fbcode_builder/manifests/git-lfs +++ /dev/null @@ -1,12 +0,0 @@ -[manifest] -name = git-lfs - -[download.os=linux] -url = https://github.com/git-lfs/git-lfs/releases/download/v2.9.1/git-lfs-linux-amd64-v2.9.1.tar.gz -sha256 = 2a8e60cf51ec45aa0f4332aa0521d60ec75c76e485d13ebaeea915b9d70ea466 - -[build] -builder = nop - -[install.files] -git-lfs = bin/git-lfs diff --git a/build/fbcode_builder/manifests/glog b/build/fbcode_builder/manifests/glog deleted file mode 100644 index 946d3e359bc53..0000000000000 --- a/build/fbcode_builder/manifests/glog +++ /dev/null @@ -1,28 +0,0 @@ -[manifest] -name = glog - -[download] -url = https://github.com/google/glog/archive/v0.5.0.tar.gz -sha256 = eede71f28371bf39aa69b45de23b329d37214016e2055269b3b5e7cfd40b59f5 - -[build] -builder = cmake -subdir = glog-0.5.0 - -[dependencies] -gflags - -[cmake.defines] -BUILD_SHARED_LIBS=ON -BUILD_TESTING=NO -WITH_PKGCONFIG=ON - -[cmake.defines.os=freebsd] -HAVE_TR1_UNORDERED_MAP=OFF -HAVE_TR1_UNORDERED_SET=OFF - -[debs] -libgoogle-glog-dev - -[rpms] -glog-devel diff --git a/build/fbcode_builder/manifests/gnu-bash b/build/fbcode_builder/manifests/gnu-bash deleted file mode 100644 index 89da77ca2b70b..0000000000000 --- a/build/fbcode_builder/manifests/gnu-bash +++ /dev/null @@ -1,20 +0,0 @@ -[manifest] -name = gnu-bash - -[download.os=darwin] -url = https://ftp.gnu.org/gnu/bash/bash-5.1-rc1.tar.gz -sha256 = 0b2684eb1990329d499c96decfe2459f3e150deb915b0a9d03cf1be692b1d6d3 - -[build.os=darwin] -# The buildin FreeBSD bash on OSX is both outdated and incompatible with the -# modern GNU bash, so for the sake of being cross-platform friendly this -# manifest provides GNU bash. -# NOTE: This is the 5.1-rc1 version, which is almost the same as what Homebrew -# uses (Homebrew installs 5.0 with the 18 patches that in fact make the 5.1-rc1 -# version). 
-builder = autoconf -subdir = bash-5.1-rc1 -build_in_src_dir = true - -[build.not(os=darwin)] -builder = nop diff --git a/build/fbcode_builder/manifests/gnu-coreutils b/build/fbcode_builder/manifests/gnu-coreutils deleted file mode 100644 index 1ab4d9d4a5a5b..0000000000000 --- a/build/fbcode_builder/manifests/gnu-coreutils +++ /dev/null @@ -1,15 +0,0 @@ -[manifest] -name = gnu-coreutils - -[download.os=darwin] -url = https://ftp.gnu.org/gnu/coreutils/coreutils-8.32.tar.gz -sha256 = d5ab07435a74058ab69a2007e838be4f6a90b5635d812c2e26671e3972fca1b8 - -[build.os=darwin] -# The buildin FreeBSD version incompatible with the GNU one, so for the sake of -# being cross-platform friendly this manifest provides the GNU version. -builder = autoconf -subdir = coreutils-8.32 - -[build.not(os=darwin)] -builder = nop diff --git a/build/fbcode_builder/manifests/gnu-grep b/build/fbcode_builder/manifests/gnu-grep deleted file mode 100644 index e6a163d37a84f..0000000000000 --- a/build/fbcode_builder/manifests/gnu-grep +++ /dev/null @@ -1,15 +0,0 @@ -[manifest] -name = gnu-grep - -[download.os=darwin] -url = https://ftp.gnu.org/gnu/grep/grep-3.5.tar.gz -sha256 = 9897220992a8fd38a80b70731462defa95f7ff2709b235fb54864ddd011141dd - -[build.os=darwin] -# The buildin FreeBSD version incompatible with the GNU one, so for the sake of -# being cross-platform friendly this manifest provides the GNU version. -builder = autoconf -subdir = grep-3.5 - -[build.not(os=darwin)] -builder = nop diff --git a/build/fbcode_builder/manifests/gnu-sed b/build/fbcode_builder/manifests/gnu-sed deleted file mode 100644 index 9b458df6ef987..0000000000000 --- a/build/fbcode_builder/manifests/gnu-sed +++ /dev/null @@ -1,15 +0,0 @@ -[manifest] -name = gnu-sed - -[download.os=darwin] -url = https://ftp.gnu.org/gnu/sed/sed-4.8.tar.gz -sha256 = 53cf3e14c71f3a149f29d13a0da64120b3c1d3334fba39c4af3e520be053982a - -[build.os=darwin] -# The buildin FreeBSD version incompatible with the GNU one, so for the sake of -# being cross-platform friendly this manifest provides the GNU version. 
-builder = autoconf -subdir = sed-4.8 - -[build.not(os=darwin)] -builder = nop diff --git a/build/fbcode_builder/manifests/googletest b/build/fbcode_builder/manifests/googletest deleted file mode 100644 index 90b05c635fbce..0000000000000 --- a/build/fbcode_builder/manifests/googletest +++ /dev/null @@ -1,23 +0,0 @@ -[manifest] -name = googletest - -[download] -url = https://github.com/google/googletest/archive/refs/tags/release-1.12.1.tar.gz -sha256 = 81964fe578e9bd7c94dfdb09c8e4d6e6759e19967e397dbea48d1c10e45d0df2 - -[build] -builder = cmake -subdir = googletest-release-1.12.1 - -[cmake.defines] -# Everything else defaults to the shared runtime, so tell gtest that -# it should not use its choice of the static runtime -gtest_force_shared_crt=ON - -[cmake.defines.os=windows] -BUILD_SHARED_LIBS=ON - -# packaged googletest is too old -[debs.not(all(distro=ubuntu,any(distro_vers="18.04",distro_vers="20.04",distro_vers="22.04")))] -libgtest-dev -libgmock-dev diff --git a/build/fbcode_builder/manifests/googletest_1_8 b/build/fbcode_builder/manifests/googletest_1_8 deleted file mode 100644 index 76c0ce51f9eb2..0000000000000 --- a/build/fbcode_builder/manifests/googletest_1_8 +++ /dev/null @@ -1,18 +0,0 @@ -[manifest] -name = googletest_1_8 - -[download] -url = https://github.com/google/googletest/archive/release-1.8.0.tar.gz -sha256 = 58a6f4277ca2bc8565222b3bbd58a177609e9c488e8a72649359ba51450db7d8 - -[build] -builder = cmake -subdir = googletest-release-1.8.0 - -[cmake.defines] -# Everything else defaults to the shared runtime, so tell gtest that -# it should not use its choice of the static runtime -gtest_force_shared_crt=ON - -[cmake.defines.os=windows] -BUILD_SHARED_LIBS=ON diff --git a/build/fbcode_builder/manifests/gperf b/build/fbcode_builder/manifests/gperf deleted file mode 100644 index 13d7a890fdede..0000000000000 --- a/build/fbcode_builder/manifests/gperf +++ /dev/null @@ -1,14 +0,0 @@ -[manifest] -name = gperf - -[download] -url = http://ftp.gnu.org/pub/gnu/gperf/gperf-3.1.tar.gz -sha256 = 588546b945bba4b70b6a3a616e80b4ab466e3f33024a352fc2198112cdbb3ae2 - -[build.not(os=windows)] -builder = autoconf -subdir = gperf-3.1 - -[build.os=windows] -builder = nop - diff --git a/build/fbcode_builder/manifests/iproute2 b/build/fbcode_builder/manifests/iproute2 deleted file mode 100644 index 6fb7f77ed9c2e..0000000000000 --- a/build/fbcode_builder/manifests/iproute2 +++ /dev/null @@ -1,13 +0,0 @@ -[manifest] -name = iproute2 - -[download] -url = https://mirrors.edge.kernel.org/pub/linux/utils/net/iproute2/iproute2-4.12.0.tar.gz -sha256 = 46612a1e2d01bb31932557bccdb1b8618cae9a439dfffc08ef35ed8e197f14ce - -[build.os=linux] -builder = iproute2 -subdir = iproute2-4.12.0 - -[build.not(os=linux)] -builder = nop diff --git a/build/fbcode_builder/manifests/jq b/build/fbcode_builder/manifests/jq deleted file mode 100644 index 231818f343e97..0000000000000 --- a/build/fbcode_builder/manifests/jq +++ /dev/null @@ -1,24 +0,0 @@ -[manifest] -name = jq - -[rpms] -jq - -[debs] -jq - -[download.not(os=windows)] -url = https://github.com/stedolan/jq/releases/download/jq-1.5/jq-1.5.tar.gz -sha256 = c4d2bfec6436341113419debf479d833692cc5cdab7eb0326b5a4d4fbe9f493c - -[build.not(os=windows)] -builder = autoconf -subdir = jq-1.5 - -[build.os=windows] -builder = nop - -[autoconf.args] -# This argument turns off some developers tool and it is recommended in jq's -# README ---disable-maintainer-mode diff --git a/build/fbcode_builder/manifests/katran b/build/fbcode_builder/manifests/katran deleted file mode 100644 index 
c4f2c74f40f5a..0000000000000 --- a/build/fbcode_builder/manifests/katran +++ /dev/null @@ -1,41 +0,0 @@ -[manifest] -name = katran -fbsource_path = fbcode/katran -shipit_project = katran -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebookincubator/katran.git - -[build.not(os=linux)] -builder = nop - -[build.os=linux] -builder = cmake -subdir = . - -[cmake.defines.test=on] -BUILD_TESTS=ON - -[cmake.defines.test=off] -BUILD_TESTS=OFF - -[dependencies] -folly -fizz -libbpf -libmnl -zlib -googletest -fmt - -[debs] -libssl-dev - -[shipit.pathmap] -fbcode/katran/public_root = . -fbcode/katran = katran - -[shipit.strip] -^fbcode/katran/facebook -^fbcode/katran/OSS_SYNC diff --git a/build/fbcode_builder/manifests/libbpf b/build/fbcode_builder/manifests/libbpf deleted file mode 100644 index 9ab3a26bee069..0000000000000 --- a/build/fbcode_builder/manifests/libbpf +++ /dev/null @@ -1,26 +0,0 @@ -[manifest] -name = libbpf - -[download] -url = https://github.com/libbpf/libbpf/archive/refs/tags/v0.7.0.tar.gz -sha256 = 5083588ce5a3a620e395ee1e596af77b4ec5771ffc71cff2af49dfee38c06361 - -# BPF only builds on linux, so make it a NOP on other platforms -[build.not(os=linux)] -builder = nop - -[build.os=linux] -builder = make -subdir = libbpf-0.7.0/src - -[make.build_args] -BUILD_STATIC_ONLY=y - -# libbpf-0.3 requires uapi headers >= 5.8 -[make.install_args] -install -install_uapi_headers -BUILD_STATIC_ONLY=y - -[dependencies] -libelf diff --git a/build/fbcode_builder/manifests/libbpf_0_2_0_beta b/build/fbcode_builder/manifests/libbpf_0_2_0_beta deleted file mode 100644 index 072639817d769..0000000000000 --- a/build/fbcode_builder/manifests/libbpf_0_2_0_beta +++ /dev/null @@ -1,26 +0,0 @@ -[manifest] -name = libbpf_0_2_0_beta - -[download] -url = https://github.com/libbpf/libbpf/archive/b6dd2f2.tar.gz -sha256 = 8db9dca90f5c445ef2362e3c6a00f3d6c4bf36e8782f8e27704109c78e541497 - -# BPF only builds on linux, so make it a NOP on other platforms -[build.not(os=linux)] -builder = nop - -[build.os=linux] -builder = make -subdir = libbpf-b6dd2f2b7df4d3bd35d64aaf521d9ad18d766f53/src - -[make.build_args] -BUILD_STATIC_ONLY=y - -# libbpf now requires uapi headers >= 5.8 -[make.install_args] -install -install_uapi_headers -BUILD_STATIC_ONLY=y - -[dependencies] -libelf diff --git a/build/fbcode_builder/manifests/libcurl b/build/fbcode_builder/manifests/libcurl deleted file mode 100644 index 466b4497c35d0..0000000000000 --- a/build/fbcode_builder/manifests/libcurl +++ /dev/null @@ -1,39 +0,0 @@ -[manifest] -name = libcurl - -[rpms] -libcurl-devel -libcurl - -[debs] -libcurl4-openssl-dev - -[download] -url = https://curl.haxx.se/download/curl-7.65.1.tar.gz -sha256 = 821aeb78421375f70e55381c9ad2474bf279fc454b791b7e95fc83562951c690 - -[dependencies] -nghttp2 - -# We use system OpenSSL on Linux (see folly's manifest for details) -[dependencies.not(os=linux)] -openssl - -[build.not(os=windows)] -builder = autoconf -subdir = curl-7.65.1 - -[autoconf.args] -# fboss (which added the libcurl dep) doesn't need ldap so it is disabled here. -# if someone in the future wants to add ldap for something else, it won't hurt -# fboss. However, that would require adding an ldap manifest. -# -# For the same reason, we disable libssh2 and libidn2 which aren't really used -# but would require adding manifests if we don't disable them. 
---disable-ldap ---without-libssh2 ---without-libidn2 - -[build.os=windows] -builder = cmake -subdir = curl-7.65.1 diff --git a/build/fbcode_builder/manifests/libelf b/build/fbcode_builder/manifests/libelf deleted file mode 100644 index a46aab8796eaa..0000000000000 --- a/build/fbcode_builder/manifests/libelf +++ /dev/null @@ -1,20 +0,0 @@ -[manifest] -name = libelf - -[rpms] -elfutils-libelf-devel-static - -[debs] -libelf-dev - -[download] -url = https://ftp.osuosl.org/pub/blfs/conglomeration/libelf/libelf-0.8.13.tar.gz -sha256 = 591a9b4ec81c1f2042a97aa60564e0cb79d041c52faa7416acb38bc95bd2c76d - -# libelf only makes sense on linux, so make it a NOP on other platforms -[build.not(os=linux)] -builder = nop - -[build.os=linux] -builder = autoconf -subdir = libelf-0.8.13 diff --git a/build/fbcode_builder/manifests/libevent b/build/fbcode_builder/manifests/libevent deleted file mode 100644 index 1c073333fc826..0000000000000 --- a/build/fbcode_builder/manifests/libevent +++ /dev/null @@ -1,38 +0,0 @@ -[manifest] -name = libevent - -[debs] -libevent-dev - -[homebrew] -libevent - -[rpms] -libevent-devel - -# Note that the CMakeLists.txt file is present only in the -# git repo and not in the release tarball, so take care -# to use the github generated source tarball rather than -# the explicitly uploaded source tarball -[download] -url = https://github.com/libevent/libevent/releases/download/release-2.1.12-stable/libevent-2.1.12-stable.tar.gz -sha256 = 92e6de1be9ec176428fd2367677e61ceffc2ee1cb119035037a27d346b0403bb - -[build] -builder = cmake -subdir = libevent-2.1.12-stable - -[cmake.defines] -EVENT__DISABLE_TESTS = ON -EVENT__DISABLE_BENCHMARK = ON -EVENT__DISABLE_SAMPLES = ON -EVENT__DISABLE_REGRESS = ON - -[cmake.defines.shared_libs=on] -EVENT__BUILD_SHARED_LIBRARIES = ON - -[cmake.defines.os=windows] -EVENT__LIBRARY_TYPE = STATIC - -[dependencies.not(any(os=linux, os=freebsd))] -openssl diff --git a/build/fbcode_builder/manifests/libffi b/build/fbcode_builder/manifests/libffi deleted file mode 100644 index 0511287c28923..0000000000000 --- a/build/fbcode_builder/manifests/libffi +++ /dev/null @@ -1,20 +0,0 @@ -[manifest] -name = libffi - -[debs] -libffi-dev - -[homebrew] -libffi - -[rpms] -libffi-devel -libffi - -[download] -url = https://github.com/libffi/libffi/releases/download/v3.4.2/libffi-3.4.2.tar.gz -sha256 = 540fb721619a6aba3bdeef7d940d8e9e0e6d2c193595bc243241b77ff9e93620 - -[build] -builder = autoconf -subdir = libffi-3.4.2 diff --git a/build/fbcode_builder/manifests/libgit2 b/build/fbcode_builder/manifests/libgit2 deleted file mode 100644 index 33e6b506f98f4..0000000000000 --- a/build/fbcode_builder/manifests/libgit2 +++ /dev/null @@ -1,30 +0,0 @@ -[manifest] -name = libgit2 - -[homebrew] -libgit2 - -[rpms] -libgit2-devel - -# Ubuntu 18.04 libgit2 clashes with libcurl4-openssl-dev as it depends on -# libcurl4-gnutls-dev. 
Should be OK again from 20.04 -# There is a description at https://github.com/r-hub/sysreqsdb/issues/77 -[debs.not(all(distro=ubuntu,distro_vers="18.04"))] -libgit2-dev - -[download] -url = https://github.com/libgit2/libgit2/archive/v0.28.1.tar.gz -sha256 = 0ca11048795b0d6338f2e57717370208c2c97ad66c6d5eac0c97a8827d13936b - -[build] -builder = cmake -subdir = libgit2-0.28.1 - -[cmake.defines] -# Could turn this on if we also wanted to add a manifest for libssh2 -USE_SSH = OFF -BUILD_CLAR = OFF -# Have to build shared to work around annoying problems with cmake -# mis-parsing the frameworks required to link this on macos :-/ -BUILD_SHARED_LIBS = ON diff --git a/build/fbcode_builder/manifests/libicu b/build/fbcode_builder/manifests/libicu deleted file mode 100644 index c1deda503760f..0000000000000 --- a/build/fbcode_builder/manifests/libicu +++ /dev/null @@ -1,19 +0,0 @@ -[manifest] -name = libicu - -[rpms] -libicu-devel - -[debs] -libicu-dev - -[download] -url = https://github.com/unicode-org/icu/releases/download/release-68-2/icu4c-68_2-src.tgz -sha256 = c79193dee3907a2199b8296a93b52c5cb74332c26f3d167269487680d479d625 - -[build.not(os=windows)] -builder = autoconf -subdir = icu/source - -[build.os=windows] -builder = nop diff --git a/build/fbcode_builder/manifests/libmnl b/build/fbcode_builder/manifests/libmnl deleted file mode 100644 index 1f8d609c1392a..0000000000000 --- a/build/fbcode_builder/manifests/libmnl +++ /dev/null @@ -1,21 +0,0 @@ -[manifest] -name = libmnl - -[rpms] -libmnl-devel - -# all centos 8 distros are missing this, -# but it's in fedora so it may be back in a later version -[rpms.not(all(any(distro=centos_stream,distro=centos),distro_vers=8))] -libmnl-static - -[debs] -libmnl-dev - -[download] -url = http://www.netfilter.org/pub/libmnl/libmnl-1.0.4.tar.bz2 -sha256 = 171f89699f286a5854b72b91d06e8f8e3683064c5901fb09d954a9ab6f551f81 - -[build.os=linux] -builder = autoconf -subdir = libmnl-1.0.4 diff --git a/build/fbcode_builder/manifests/libnl b/build/fbcode_builder/manifests/libnl deleted file mode 100644 index 560885c2e331a..0000000000000 --- a/build/fbcode_builder/manifests/libnl +++ /dev/null @@ -1,18 +0,0 @@ -[manifest] -name = libnl - -[rpms] -libnl3-devel -libnl3 - -[debs] -libnl-3-dev -libnl-route-3-dev - -[download] -url = https://www.infradead.org/~tgr/libnl/files/libnl-3.2.25.tar.gz -sha256 = 8beb7590674957b931de6b7f81c530b85dc7c1ad8fbda015398bc1e8d1ce8ec5 - -[build.os=linux] -builder = autoconf -subdir = libnl-3.2.25 diff --git a/build/fbcode_builder/manifests/libsai b/build/fbcode_builder/manifests/libsai deleted file mode 100644 index 4f772d826e113..0000000000000 --- a/build/fbcode_builder/manifests/libsai +++ /dev/null @@ -1,14 +0,0 @@ -[manifest] -name = libsai - -[download] -url = https://github.com/opencomputeproject/SAI/archive/v1.13.0.tar.gz -sha256 = bb8c5d6cb0c7897422875d0da7b903708d1a15557ad07c6d6266dff83cb8c78d - -[build] -builder = nop -subdir = SAI-1.13.0 - -[install.files] -inc = include -experimental = experimental diff --git a/build/fbcode_builder/manifests/libsodium b/build/fbcode_builder/manifests/libsodium deleted file mode 100644 index 0c9941c3fee76..0000000000000 --- a/build/fbcode_builder/manifests/libsodium +++ /dev/null @@ -1,36 +0,0 @@ -[manifest] -name = libsodium - -[debs] -libsodium-dev - -[homebrew] -libsodium - -[rpms] -libsodium-devel -libsodium-static - -[download.not(os=windows)] -url = https://github.com/jedisct1/libsodium/releases/download/1.0.17/libsodium-1.0.17.tar.gz -sha256 = 
0cc3dae33e642cc187b5ceb467e0ad0e1b51dcba577de1190e9ffa17766ac2b1 - -[build.not(os=windows)] -builder = autoconf -subdir = libsodium-1.0.17 - -[download.os=windows] -url = https://download.libsodium.org/libsodium/releases/old/libsodium-1.0.17-msvc.zip -sha256 = f0f32ad8ebd76eee99bb039f843f583f2babca5288a8c26a7261db9694c11467 - -[build.os=windows] -builder = nop - -[install.files.os=windows] -x64/Release/v141/dynamic/libsodium.dll = bin/libsodium.dll -x64/Release/v141/dynamic/libsodium.lib = lib/libsodium.lib -x64/Release/v141/dynamic/libsodium.exp = lib/libsodium.exp -x64/Release/v141/dynamic/libsodium.pdb = lib/libsodium.pdb -include = include - -[autoconf.args] diff --git a/build/fbcode_builder/manifests/libtool b/build/fbcode_builder/manifests/libtool deleted file mode 100644 index 887a23cdfba7a..0000000000000 --- a/build/fbcode_builder/manifests/libtool +++ /dev/null @@ -1,25 +0,0 @@ -[manifest] -name = libtool - -[homebrew] -libtool - -[rpms] -libtool - -[debs] -libtool - -[download] -url = http://ftp.gnu.org/gnu/libtool/libtool-2.4.6.tar.gz -sha256 = e3bd4d5d3d025a36c21dd6af7ea818a2afcd4dfc1ea5a17b39d7854bcd0c06e3 - -[build] -builder = autoconf -subdir = libtool-2.4.6 - -[dependencies] -automake - -[autoconf.args] ---enable-ltdl-install diff --git a/build/fbcode_builder/manifests/libusb b/build/fbcode_builder/manifests/libusb deleted file mode 100644 index 9b97c3a597629..0000000000000 --- a/build/fbcode_builder/manifests/libusb +++ /dev/null @@ -1,26 +0,0 @@ -[manifest] -name = libusb - -[debs] -libusb-1.0-0-dev - -[homebrew] -libusb - -[rpms] -libusb-devel -libusb - -[download] -url = https://github.com/libusb/libusb/releases/download/v1.0.22/libusb-1.0.22.tar.bz2 -sha256 = 75aeb9d59a4fdb800d329a545c2e6799f732362193b465ea198f2aa275518157 - -[build.os=linux] -builder = autoconf -subdir = libusb-1.0.22 - -[autoconf.args] -# fboss (which added the libusb dep) doesn't need udev so it is disabled here. -# if someone in the future wants to add udev for something else, it won't hurt -# fboss. 
---disable-udev diff --git a/build/fbcode_builder/manifests/libyaml b/build/fbcode_builder/manifests/libyaml deleted file mode 100644 index a7ff57316fe10..0000000000000 --- a/build/fbcode_builder/manifests/libyaml +++ /dev/null @@ -1,13 +0,0 @@ -[manifest] -name = libyaml - -[download] -url = http://pyyaml.org/download/libyaml/yaml-0.1.7.tar.gz -sha256 = 8088e457264a98ba451a90b8661fcb4f9d6f478f7265d48322a196cec2480729 - -[build.os=linux] -builder = autoconf -subdir = yaml-0.1.7 - -[build.not(os=linux)] -builder = nop diff --git a/build/fbcode_builder/manifests/libzmq b/build/fbcode_builder/manifests/libzmq deleted file mode 100644 index a36121d67fd72..0000000000000 --- a/build/fbcode_builder/manifests/libzmq +++ /dev/null @@ -1,27 +0,0 @@ -[manifest] -name = libzmq - -[debs] -libzmq3-dev - -[homebrew] -zeromq - -[rpms] -zeromq-devel -zeromq - -[download] -url = https://github.com/zeromq/libzmq/releases/download/v4.3.1/zeromq-4.3.1.tar.gz -sha256 = bcbabe1e2c7d0eec4ed612e10b94b112dd5f06fcefa994a0c79a45d835cd21eb - - -[build] -builder = autoconf -subdir = zeromq-4.3.1 - -[autoconf.args] - -[dependencies] -autoconf -libtool diff --git a/build/fbcode_builder/manifests/lld b/build/fbcode_builder/manifests/lld deleted file mode 100644 index 39f5b095213ef..0000000000000 --- a/build/fbcode_builder/manifests/lld +++ /dev/null @@ -1,13 +0,0 @@ -[manifest] -name = lld - -[debs] -lld - -[rpms] -lld - -# We use the system lld where needed on linux and the default linker elsewhere -[build] -builder = nop - diff --git a/build/fbcode_builder/manifests/lz4 b/build/fbcode_builder/manifests/lz4 deleted file mode 100644 index 084d6a4aecd8b..0000000000000 --- a/build/fbcode_builder/manifests/lz4 +++ /dev/null @@ -1,22 +0,0 @@ -[manifest] -name = lz4 - -[homebrew] -lz4 - -[rpms] -lz4-devel -# centos 8 and centos_stream 9 are missing this rpm -[rpms.not(any(all(distro=centos,distro_vers=8),all(distro=centos_stream,distro_vers=9)))] -lz4-static - -[debs] -liblz4-dev - -[download] -url = https://github.com/lz4/lz4/archive/v1.8.3.tar.gz -sha256 = 33af5936ac06536805f9745e0b6d61da606a1f8b4cc5c04dd3cbaca3b9b4fc43 - -[build] -builder = cmake -subdir = lz4-1.8.3/contrib/cmake_unofficial diff --git a/build/fbcode_builder/manifests/lzo b/build/fbcode_builder/manifests/lzo deleted file mode 100644 index fd474127bbb12..0000000000000 --- a/build/fbcode_builder/manifests/lzo +++ /dev/null @@ -1,22 +0,0 @@ -[manifest] -name = lzo - -[debs] -liblzo2-dev - -[homebrew] -lzo - -[rpms] -lzo-devel - -[download] -url = http://www.oberhumer.com/opensource/lzo/download/lzo-2.10.tar.gz -sha256 = c0f892943208266f9b6543b3ae308fab6284c5c90e627931446fb49b4221a072 - -[build.not(os=windows)] -builder = autoconf -subdir = lzo-2.10 - -[build.os=windows] -builder = nop diff --git a/build/fbcode_builder/manifests/mononoke b/build/fbcode_builder/manifests/mononoke deleted file mode 100644 index 9df5bbd484e0f..0000000000000 --- a/build/fbcode_builder/manifests/mononoke +++ /dev/null @@ -1,55 +0,0 @@ -[manifest] -name = mononoke -fbsource_path = fbcode/eden -shipit_project = eden -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebook/sapling.git - -[build.not(os=windows)] -builder = cargo - -[build.os=windows] -# building Mononoke on windows is not supported -builder = nop - -[cargo] -build_doc = true -workspace_dir = eden/mononoke -cargo_config_file = source/eden/mononoke/.cargo/config - -[shipit.pathmap] -fbcode/configerator/structs/scm/hg = configerator/structs/scm/hg -fbcode/configerator/structs/scm/hg/public_autocargo = 
configerator/structs/scm/hg -fbcode/configerator/structs/scm/mononoke/public_autocargo = configerator/structs/scm/mononoke -fbcode/configerator/structs/scm/mononoke = configerator/structs/scm/mononoke -fbcode/eden/oss = . -fbcode/eden = eden -fbcode/eden/fs/public_autocargo = eden/fs -fbcode/eden/mononoke/public_autocargo = eden/mononoke -fbcode/eden/scm/public_autocargo = eden/scm -fbcode/tools/lfs = tools/lfs -tools/rust/ossconfigs = . - -[shipit.strip] -^fbcode/configerator/structs/scm/hg(?!/public_autocargo).*/Cargo\.toml$ -^fbcode/configerator/structs/scm/mononoke(?!/public_autocargo).*/Cargo\.toml$ -^fbcode/eden/fs(?!/public_autocargo).*/Cargo\.toml$ -^fbcode/eden/scm/lib/third-party/rust/.*/Cargo\.toml$ -^fbcode/eden/mononoke(?!/public_autocargo).*/Cargo\.toml$ -# strip other scm code unrelated to mononoke to prevent triggering unnecessary checks -^fbcode/eden(?!/mononoke|/scm/(lib|public_autocargo))/.*$ -^.*/facebook/.*$ -^.*/fb/.*$ - -[dependencies] -fb303 -fbthrift -rust-shed - -[dependencies.os=linux] -lld - -[dependencies.fb=on] -rust diff --git a/build/fbcode_builder/manifests/mvfst b/build/fbcode_builder/manifests/mvfst deleted file mode 100644 index c2a797be23478..0000000000000 --- a/build/fbcode_builder/manifests/mvfst +++ /dev/null @@ -1,32 +0,0 @@ -[manifest] -name = mvfst -fbsource_path = fbcode/quic -shipit_project = mvfst -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebook/mvfst.git - -[build] -builder = cmake -subdir = . - -[cmake.defines.test=on] -BUILD_TESTS = ON - -[cmake.defines.all(os=windows, test=on)] -BUILD_TESTS = OFF - -[cmake.defines.test=off] -BUILD_TESTS = OFF - -[dependencies] -folly -fizz - -[dependencies.all(test=on, not(os=windows))] -googletest - -[shipit.pathmap] -fbcode/quic/public_root = . 
-fbcode/quic = quic diff --git a/build/fbcode_builder/manifests/ncurses b/build/fbcode_builder/manifests/ncurses deleted file mode 100644 index 1bb5e8a45396a..0000000000000 --- a/build/fbcode_builder/manifests/ncurses +++ /dev/null @@ -1,30 +0,0 @@ -[manifest] -name = ncurses - -[debs] -libncurses-dev - -[homebrew] -ncurses - -[rpms] -ncurses-devel - -[download] -url = https://ftp.gnu.org/pub/gnu/ncurses/ncurses-6.3.tar.gz -sha256 = 97fc51ac2b085d4cde31ef4d2c3122c21abc217e9090a43a30fc5ec21684e059 - -[build.not(os=windows)] -builder = autoconf -subdir = ncurses-6.3 - -[autoconf.args] ---without-cxx-binding ---without-ada - -[autoconf.args.os=linux] ---enable-shared ---with-shared - -[build.os=windows] -builder = nop diff --git a/build/fbcode_builder/manifests/nghttp2 b/build/fbcode_builder/manifests/nghttp2 deleted file mode 100644 index 5ebdce0a42fed..0000000000000 --- a/build/fbcode_builder/manifests/nghttp2 +++ /dev/null @@ -1,21 +0,0 @@ -[manifest] -name = nghttp2 - -[rpms] -libnghttp2-devel -libnghttp2 - -[debs] -libnghttp2-dev - -[download] -url = https://github.com/nghttp2/nghttp2/releases/download/v1.47.0/nghttp2-1.47.0.tar.gz -sha256 = 62f50f0e9fc479e48b34e1526df8dd2e94136de4c426b7680048181606832b7c - -[build] -builder = autoconf -subdir = nghttp2-1.47.0 - -[autoconf.args] ---enable-lib-only ---disable-dependency-tracking diff --git a/build/fbcode_builder/manifests/ninja b/build/fbcode_builder/manifests/ninja deleted file mode 100644 index 713c59d69f98d..0000000000000 --- a/build/fbcode_builder/manifests/ninja +++ /dev/null @@ -1,29 +0,0 @@ -[manifest] -name = ninja - -[debs] -ninja-build - -[homebrew] -ninja - -[rpms] -ninja-build - -[download.os=windows] -url = https://github.com/ninja-build/ninja/releases/download/v1.10.2/ninja-win.zip -sha256 = bbde850d247d2737c5764c927d1071cbb1f1957dcabda4a130fa8547c12c695f - -[build.os=windows] -builder = nop - -[install.files.os=windows] -ninja.exe = bin/ninja.exe - -[download.not(os=windows)] -url = https://github.com/ninja-build/ninja/archive/v1.10.2.tar.gz -sha256 = ce35865411f0490368a8fc383f29071de6690cbadc27704734978221f25e2bed - -[build.not(os=windows)] -builder = ninja_bootstrap -subdir = ninja-1.10.2 diff --git a/build/fbcode_builder/manifests/nlohmann-json b/build/fbcode_builder/manifests/nlohmann-json deleted file mode 100644 index 7d552d95faccf..0000000000000 --- a/build/fbcode_builder/manifests/nlohmann-json +++ /dev/null @@ -1,12 +0,0 @@ -[manifest] -name = nlohmann-json - -[download] -url = https://github.com/nlohmann/json/archive/refs/tags/v3.10.5.tar.gz -sha256 = 5daca6ca216495edf89d167f808d1d03c4a4d929cef7da5e10f135ae1540c7e4 - -[dependencies] - -[build] -builder = cmake -subdir = json-3.10.5 diff --git a/build/fbcode_builder/manifests/nmap b/build/fbcode_builder/manifests/nmap deleted file mode 100644 index c245e12417be6..0000000000000 --- a/build/fbcode_builder/manifests/nmap +++ /dev/null @@ -1,25 +0,0 @@ -[manifest] -name = nmap - -[rpms] -nmap - -[debs] -nmap - -[download.not(os=windows)] -url = https://api.github.com/repos/nmap/nmap/tarball/ef8213a36c2e89233c806753a57b5cd473605408 -sha256 = eda39e5a8ef4964fac7db16abf91cc11ff568eac0fa2d680b0bfa33b0ed71f4a - -[build.not(os=windows)] -builder = autoconf -subdir = nmap-nmap-ef8213a -build_in_src_dir = true - -[build.os=windows] -builder = nop - -[autoconf.args] -# Without this option the build was failing to find some third party libraries -# that we don't need -enable_rdma=no diff --git a/build/fbcode_builder/manifests/openr b/build/fbcode_builder/manifests/openr deleted 
file mode 100644 index 913d81f379ee1..0000000000000 --- a/build/fbcode_builder/manifests/openr +++ /dev/null @@ -1,38 +0,0 @@ -[manifest] -name = openr -fbsource_path = facebook/openr -shipit_project = openr -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebook/openr.git - -[build.os=linux] -builder = cmake -# openr files take a lot of RAM to compile. -job_weight_mib = 3072 - -[build.not(os=linux)] -# boost.fiber is required and that is not available on macos. -builder = nop - -[dependencies] -boost -fb303 -fbthrift -folly -googletest -re2 -range-v3 - -[cmake.defines.test=on] -BUILD_TESTS=ON -ADD_ROOT_TESTS=OFF - -[cmake.defines.test=off] -BUILD_TESTS=OFF - - -[shipit.pathmap] -fbcode/openr = openr -fbcode/openr/public_tld = . diff --git a/build/fbcode_builder/manifests/openssl b/build/fbcode_builder/manifests/openssl deleted file mode 100644 index b806b701ae720..0000000000000 --- a/build/fbcode_builder/manifests/openssl +++ /dev/null @@ -1,26 +0,0 @@ -[manifest] -name = openssl - -[debs] -libssl-dev - -[homebrew] -openssl@1.1 -# on homebrew need the matching curl and ca- - -[rpms] -openssl -openssl-devel -openssl-libs - -[download] -url = https://www.openssl.org/source/openssl-1.1.1l.tar.gz -sha256 = 0b7a3e5e59c34827fe0c3a74b7ec8baef302b98fa80088d7f9153aa16fa76bd1 - -# We use the system openssl on linux -[build.not(any(os=linux, os=freebsd))] -builder = openssl -subdir = openssl-1.1.1l - -[dependencies.os=windows] -perl diff --git a/build/fbcode_builder/manifests/osxfuse b/build/fbcode_builder/manifests/osxfuse deleted file mode 100644 index b6c6c551f118d..0000000000000 --- a/build/fbcode_builder/manifests/osxfuse +++ /dev/null @@ -1,12 +0,0 @@ -[manifest] -name = osxfuse - -[download] -url = https://github.com/osxfuse/osxfuse/archive/osxfuse-3.8.3.tar.gz -sha256 = 93bab6731bdfe8dc1ef069483437270ce7fe5a370f933d40d8d0ef09ba846c0c - -[build] -builder = nop - -[install.files] -osxfuse-osxfuse-3.8.3/common = include diff --git a/build/fbcode_builder/manifests/patchelf b/build/fbcode_builder/manifests/patchelf deleted file mode 100644 index f9d050424a293..0000000000000 --- a/build/fbcode_builder/manifests/patchelf +++ /dev/null @@ -1,17 +0,0 @@ -[manifest] -name = patchelf - -[rpms] -patchelf - -[debs] -patchelf - -[download] -url = https://github.com/NixOS/patchelf/archive/0.10.tar.gz -sha256 = b3cb6bdedcef5607ce34a350cf0b182eb979f8f7bc31eae55a93a70a3f020d13 - -[build] -builder = autoconf -subdir = patchelf-0.10 - diff --git a/build/fbcode_builder/manifests/pcre b/build/fbcode_builder/manifests/pcre deleted file mode 100644 index 047f6352b2715..0000000000000 --- a/build/fbcode_builder/manifests/pcre +++ /dev/null @@ -1,20 +0,0 @@ -[manifest] -name = pcre - -[homebrew] -pcre - -[rpms] -pcre-devel -pcre-static - -[debs] -libpcre3-dev - -[download] -url = https://versaweb.dl.sourceforge.net/project/pcre/pcre/8.43/pcre-8.43.tar.gz -sha256 = 0b8e7465dc5e98c757cc3650a20a7843ee4c3edf50aaf60bb33fd879690d2c73 - -[build] -builder = cmake -subdir = pcre-8.43 diff --git a/build/fbcode_builder/manifests/pcre2 b/build/fbcode_builder/manifests/pcre2 deleted file mode 100644 index 9ba119a78f570..0000000000000 --- a/build/fbcode_builder/manifests/pcre2 +++ /dev/null @@ -1,20 +0,0 @@ -[manifest] -name = pcre2 - -[homebrew] -pcre2 - -[rpms] -pcre2-devel -pcre-static - -[debs] -libpcre2-dev - -[download] -url = https://github.com/PCRE2Project/pcre2/releases/download/pcre2-10.40/pcre2-10.40.tar.bz2 -sha256 = 14e4b83c4783933dc17e964318e6324f7cae1bc75d8f3c79bc6969f00c159d68 - -[build] 
-builder = cmake -subdir = pcre2-10.40 diff --git a/build/fbcode_builder/manifests/perl b/build/fbcode_builder/manifests/perl deleted file mode 100644 index 32bddc51ca69e..0000000000000 --- a/build/fbcode_builder/manifests/perl +++ /dev/null @@ -1,11 +0,0 @@ -[manifest] -name = perl - -[download.os=windows] -url = http://strawberryperl.com/download/5.28.1.1/strawberry-perl-5.28.1.1-64bit-portable.zip -sha256 = 935c95ba096fa11c4e1b5188732e3832d330a2a79e9882ab7ba8460ddbca810d - -[build.os=windows] -builder = nop -subdir = perl - diff --git a/build/fbcode_builder/manifests/pexpect b/build/fbcode_builder/manifests/pexpect deleted file mode 100644 index 682e66a540c12..0000000000000 --- a/build/fbcode_builder/manifests/pexpect +++ /dev/null @@ -1,12 +0,0 @@ -[manifest] -name = pexpect - -[download] -url = https://files.pythonhosted.org/packages/0e/3e/377007e3f36ec42f1b84ec322ee12141a9e10d808312e5738f52f80a232c/pexpect-4.7.0-py2.py3-none-any.whl -sha256 = 2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1 - -[build] -builder = python-wheel - -[dependencies] -python-ptyprocess diff --git a/build/fbcode_builder/manifests/protobuf b/build/fbcode_builder/manifests/protobuf deleted file mode 100644 index 311775b98179f..0000000000000 --- a/build/fbcode_builder/manifests/protobuf +++ /dev/null @@ -1,18 +0,0 @@ -[manifest] -name = protobuf - -[rpms] -protobuf-devel - -[debs] -libprotobuf-dev - -[git] -repo_url = https://github.com/protocolbuffers/protobuf.git -rev = master - -[build.not(os=windows)] -builder = autoconf - -[build.os=windows] -builder = nop diff --git a/build/fbcode_builder/manifests/proxygen b/build/fbcode_builder/manifests/proxygen deleted file mode 100644 index a7b48043f8fb9..0000000000000 --- a/build/fbcode_builder/manifests/proxygen +++ /dev/null @@ -1,37 +0,0 @@ -[manifest] -name = proxygen -fbsource_path = fbcode/proxygen -shipit_project = proxygen -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebook/proxygen.git - -[build.os=windows] -builder = nop - -[build] -builder = cmake -subdir = . -job_weight_mib = 3072 - -[cmake.defines.test=on] -BUILD_TESTS = ON - -[cmake.defines.test=off] -BUILD_TESTS = OFF - -[dependencies] -zlib -gperf -folly -fizz -wangle -mvfst - -[dependencies.test=on] -googletest - -[shipit.pathmap] -fbcode/proxygen/public_tld = . 
-fbcode/proxygen = proxygen diff --git a/build/fbcode_builder/manifests/python b/build/fbcode_builder/manifests/python deleted file mode 100644 index c2e98e5711c4a..0000000000000 --- a/build/fbcode_builder/manifests/python +++ /dev/null @@ -1,45 +0,0 @@ -[manifest] -name = python - -[homebrew] -python@3.8 - -[rpms] -python3 -python3-devel - -# sapling needs dataclasses, which arrived in 3.7, and the bionic python is 3.6 -[debs.all(distro=ubuntu,distro_vers="18.04")] -python3.8-dev - -[debs.not(all(distro=ubuntu,distro_vers="18.04"))] -python3-all-dev - -[download] -url = https://www.python.org/ftp/python/3.8.13/Python-3.8.13.tgz -sha256 = 903b92d76354366b1d9c4434d0c81643345cef87c1600adfa36095d7b00eede4 - -[build] -builder = autoconf -subdir = Python-3.8.13 - -[autoconf.args] ---enable-shared ---with-ensurepip=install - -# python's pkg-config libffi detection is broken -# See https://bugs.python.org/issue34823 for the clearest description -# and pending PR https://github.com/python/cpython/pull/20451 -# The documented workaround requires an environment variable derived from -# pkg-config to be passed into its configure step -[autoconf.envcmd.LDFLAGS] -pkg-config ---libs-only-L -libffi - -[dependencies] -libffi -# eden tests expect the python bz2 support -bz2 -# eden tests expect the python curses support -ncurses diff --git a/build/fbcode_builder/manifests/python-click b/build/fbcode_builder/manifests/python-click deleted file mode 100644 index ea9a9d2d3dc38..0000000000000 --- a/build/fbcode_builder/manifests/python-click +++ /dev/null @@ -1,9 +0,0 @@ -[manifest] -name = python-click - -[download] -url = https://files.pythonhosted.org/packages/d2/3d/fa76db83bf75c4f8d338c2fd15c8d33fdd7ad23a9b5e57eb6c5de26b430e/click-7.1.2-py2.py3-none-any.whl -sha256 = dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc - -[build] -builder = python-wheel diff --git a/build/fbcode_builder/manifests/python-dulwich b/build/fbcode_builder/manifests/python-dulwich deleted file mode 100644 index 0d995e12f4b36..0000000000000 --- a/build/fbcode_builder/manifests/python-dulwich +++ /dev/null @@ -1,19 +0,0 @@ -[manifest] -name = python-dulwich - -# The links below point to custom GitHub forks of the dulwich project, because the -# 0.18.6 version didn't have an official rollout of wheel packages. 
- -[download.os=linux] -url = https://github.com/lukaspiatkowski/dulwich/releases/download/dulwich-0.18.6-wheel/dulwich-0.18.6-cp36-cp36m-linux_x86_64.whl -sha256 = e96f545f3d003e67236785473caaba2c368e531ea85fd508a3bd016ebac3a6d8 - -[download.os=darwin] -url = https://github.com/lukaspiatkowski/dulwich/releases/download/dulwich-0.18.6-wheel/dulwich-0.18.6-cp37-cp37m-macosx_10_14_x86_64.whl -sha256 = 8373652056284ad40ea5220b659b3489b0a91f25536322345a3e4b5d29069308 - -[build.not(os=windows)] -builder = python-wheel - -[build.os=windows] -builder = nop diff --git a/build/fbcode_builder/manifests/python-filelock b/build/fbcode_builder/manifests/python-filelock deleted file mode 100644 index 40502de7c2159..0000000000000 --- a/build/fbcode_builder/manifests/python-filelock +++ /dev/null @@ -1,9 +0,0 @@ -[manifest] -name = python-filelock - -[download] -url = https://files.pythonhosted.org/packages/31/24/ee722b92f23b9ebd87783e893a75352c048bbbc1f67dce0d63b58b46cb48/filelock-3.3.2-py3-none-any.whl -sha256 = bb2a1c717df74c48a2d00ed625e5a66f8572a3a30baacb7657add1d7bac4097b - -[build] -builder = python-wheel diff --git a/build/fbcode_builder/manifests/python-ptyprocess b/build/fbcode_builder/manifests/python-ptyprocess deleted file mode 100644 index adc60e048ed17..0000000000000 --- a/build/fbcode_builder/manifests/python-ptyprocess +++ /dev/null @@ -1,9 +0,0 @@ -[manifest] -name = python-ptyprocess - -[download] -url = https://files.pythonhosted.org/packages/d1/29/605c2cc68a9992d18dada28206eeada56ea4bd07a239669da41674648b6f/ptyprocess-0.6.0-py2.py3-none-any.whl -sha256 = d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f - -[build] -builder = python-wheel diff --git a/build/fbcode_builder/manifests/python-six b/build/fbcode_builder/manifests/python-six deleted file mode 100644 index a712188dc2909..0000000000000 --- a/build/fbcode_builder/manifests/python-six +++ /dev/null @@ -1,9 +0,0 @@ -[manifest] -name = python-six - -[download] -url = https://files.pythonhosted.org/packages/73/fb/00a976f728d0d1fecfe898238ce23f502a721c0ac0ecfedb80e0d88c64e9/six-1.12.0-py2.py3-none-any.whl -sha256 = 3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c - -[build] -builder = python-wheel diff --git a/build/fbcode_builder/manifests/python-toml b/build/fbcode_builder/manifests/python-toml deleted file mode 100644 index b49a3b8fb8604..0000000000000 --- a/build/fbcode_builder/manifests/python-toml +++ /dev/null @@ -1,9 +0,0 @@ -[manifest] -name = python-toml - -[download] -url = https://files.pythonhosted.org/packages/a2/12/ced7105d2de62fa7c8fb5fce92cc4ce66b57c95fb875e9318dba7f8c5db0/toml-0.10.0-py2.py3-none-any.whl -sha256 = 235682dd292d5899d361a811df37e04a8828a5b1da3115886b73cf81ebc9100e - -[build] -builder = python-wheel diff --git a/build/fbcode_builder/manifests/range-v3 b/build/fbcode_builder/manifests/range-v3 deleted file mode 100644 index f96403c83f8c7..0000000000000 --- a/build/fbcode_builder/manifests/range-v3 +++ /dev/null @@ -1,11 +0,0 @@ -[manifest] -name = range-v3 - -[download] -url = https://github.com/ericniebler/range-v3/archive/refs/tags/0.11.0.tar.gz -sha256 = 376376615dbba43d3bef75aa590931431ecb49eb36d07bb726a19f680c75e20c - - -[build] -builder = cmake -subdir = range-v3-0.11.0 diff --git a/build/fbcode_builder/manifests/re2 b/build/fbcode_builder/manifests/re2 deleted file mode 100644 index 945750afd6cba..0000000000000 --- a/build/fbcode_builder/manifests/re2 +++ /dev/null @@ -1,20 +0,0 @@ -[manifest] -name = re2 - -[homebrew] -re2 - -[debs] -libre2-dev - -[rpms] -re2 
-re2-devel - -[download] -url = https://github.com/google/re2/archive/2019-06-01.tar.gz -sha256 = 02b7d73126bd18e9fbfe5d6375a8bb13fadaf8e99e48cbb062e4500fc18e8e2e - -[build] -builder = cmake -subdir = re2-2019-06-01 diff --git a/build/fbcode_builder/manifests/rocksdb b/build/fbcode_builder/manifests/rocksdb deleted file mode 100644 index e1e3e717318fc..0000000000000 --- a/build/fbcode_builder/manifests/rocksdb +++ /dev/null @@ -1,42 +0,0 @@ -[manifest] -name = rocksdb - -[download] -url = https://github.com/facebook/rocksdb/archive/refs/tags/v7.7.3.tar.gz -sha256 = b8ac9784a342b2e314c821f6d701148912215666ac5e9bdbccd93cf3767cb611 - -[dependencies] -lz4 -snappy - -[build] -builder = cmake -subdir = rocksdb-7.7.3 - -[cmake.defines] -WITH_SNAPPY=ON -WITH_LZ4=ON -WITH_TESTS=OFF -WITH_BENCHMARK_TOOLS=OFF -# We get relocation errors with the static gflags lib, -# and there's no clear way to make it pick the shared gflags -# so just turn it off. -WITH_GFLAGS=OFF -# mac pro machines don't have some of the newer features that -# rocksdb enables by default; ask it to disable their use even -# when building on new hardware -PORTABLE = ON -# Disable the use of -Werror -FAIL_ON_WARNINGS = OFF - -[cmake.defines.os=windows] -ROCKSDB_INSTALL_ON_WINDOWS=ON -# RocksDB hard codes the paths to the snappy libs to something -# that doesn't exist; ignoring the usual cmake rules. As a result, -# we can't build it with snappy without either patching rocksdb or -# without introducing more complex logic to the build system to -# connect the snappy build outputs to rocksdb's custom logic here. -# Let's just turn it off on windows. -WITH_SNAPPY=OFF -WITH_LZ4=ON -ROCKSDB_SKIP_THIRDPARTY=ON diff --git a/build/fbcode_builder/manifests/rust-shed b/build/fbcode_builder/manifests/rust-shed deleted file mode 100644 index 31e2b61d91313..0000000000000 --- a/build/fbcode_builder/manifests/rust-shed +++ /dev/null @@ -1,35 +0,0 @@ -[manifest] -name = rust-shed -fbsource_path = fbcode/common/rust/shed -shipit_project = rust-shed -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebookexperimental/rust-shed.git - -[build] -builder = cargo - -[cargo] -build_doc = true -workspace_dir = - -[shipit.pathmap] -fbcode/common/rust/shed = shed -fbcode/common/rust/shed/public_autocargo = shed -fbcode/common/rust/shed/public_tld = . -tools/rust/ossconfigs = . 
- -[shipit.strip] -^fbcode/common/rust/shed/(?!public_autocargo|public_tld).+/Cargo\.toml$ - -[dependencies] -fbthrift -fb303 - -# We use the system openssl on linux -[dependencies.not(os=linux)] -openssl - -[dependencies.fbsource=on] -rust diff --git a/build/fbcode_builder/manifests/snappy b/build/fbcode_builder/manifests/snappy deleted file mode 100644 index c4517efa24d04..0000000000000 --- a/build/fbcode_builder/manifests/snappy +++ /dev/null @@ -1,24 +0,0 @@ -[manifest] -name = snappy - -[debs] -libsnappy-dev - -[rpms] -snappy-devel - -[download] -url = https://github.com/google/snappy/archive/1.1.7.tar.gz -sha256 = 3dfa02e873ff51a11ee02b9ca391807f0c8ea0529a4924afa645fbf97163f9d4 - -[build] -builder = cmake -subdir = snappy-1.1.7 - -[cmake.defines] -SNAPPY_BUILD_TESTS = OFF - -# Avoid problems like `relocation R_X86_64_PC32 against symbol` on ELF systems -# when linking rocksdb, which builds PIC even when building a static lib -[cmake.defines.os=linux] -BUILD_SHARED_LIBS = ON diff --git a/build/fbcode_builder/manifests/sqlite3 b/build/fbcode_builder/manifests/sqlite3 deleted file mode 100644 index c87d4cf932c19..0000000000000 --- a/build/fbcode_builder/manifests/sqlite3 +++ /dev/null @@ -1,24 +0,0 @@ -[manifest] -name = sqlite3 - -[debs] -libsqlite3-dev - -[homebrew] -sqlite - -[rpms] -sqlite-devel -sqlite-libs - -[download] -url = https://sqlite.org/2019/sqlite-amalgamation-3280000.zip -sha256 = d02fc4e95cfef672b45052e221617a050b7f2e20103661cda88387349a9b1327 - -[dependencies] -cmake -ninja - -[build] -builder = sqlite -subdir = sqlite-amalgamation-3280000 diff --git a/build/fbcode_builder/manifests/sqlite3-bin b/build/fbcode_builder/manifests/sqlite3-bin deleted file mode 100644 index aa138d499d6b6..0000000000000 --- a/build/fbcode_builder/manifests/sqlite3-bin +++ /dev/null @@ -1,28 +0,0 @@ -[manifest] -name = sqlite3-bin - -[rpms] -sqlite - -[debs] -sqlite3 - -[download.os=linux] -url = https://github.com/sqlite/sqlite/archive/version-3.33.0.tar.gz -sha256 = 48e5f989eefe9af0ac758096f82ead0f3c7b58118ac17cc5810495bd5084a331 - -[build.os=linux] -builder = autoconf -subdir = sqlite-version-3.33.0 - -[build.not(os=linux)] -# MacOS comes with sqlite3 preinstalled and we don't need Windows here -builder = nop - -[dependencies.os=linux] -tcl - -[autoconf.args] -# This flag disables tcl as a runtime library used for some functionality, -# but tcl is still a required dependency as it is used by the build files ---disable-tcl diff --git a/build/fbcode_builder/manifests/tcl b/build/fbcode_builder/manifests/tcl deleted file mode 100644 index 5e9892f37a6d3..0000000000000 --- a/build/fbcode_builder/manifests/tcl +++ /dev/null @@ -1,20 +0,0 @@ -[manifest] -name = tcl - -[rpms] -tcl - -[debs] -tcl - -[download] -url = https://github.com/tcltk/tcl/archive/core-8-7a3.tar.gz -sha256 = 22d748f0c9652f3ecc195fed3f24a1b6eea8d449003085e6651197951528982e - -[build.os=linux] -builder = autoconf -subdir = tcl-core-8-7a3/unix - -[build.not(os=linux)] -# This is for sqlite3 on Linux for now -builder = nop diff --git a/build/fbcode_builder/manifests/tree b/build/fbcode_builder/manifests/tree deleted file mode 100644 index ccd0180a74c0d..0000000000000 --- a/build/fbcode_builder/manifests/tree +++ /dev/null @@ -1,37 +0,0 @@ -[manifest] -name = tree - -[debs] -tree - -[homebrew] -tree - -[rpms] -tree - -[download.os=linux] -url = https://salsa.debian.org/debian/tree-packaging/-/archive/debian/1.8.0-1/tree-packaging-debian-1.8.0-1.tar.gz -sha256 = a841eee1d52bfd64a48f54caab9937b9bd92935055c48885c4ab1ae4dab7fae5 - 
-[download.os=darwin] -# The official package of tree source requires users of non-Linux platforms to -# comment/uncomment certain lines in the Makefile to build for their platform. -# Because getdeps.py doesn't have that functionality, we just use this custom -# fork of tree which has the proper lines uncommented for an OSX build -url = https://github.com/lukaspiatkowski/tree-command/archive/debian/1.8.0-1-macos.tar.gz -sha256 = 9cbe889553d95cf5a2791dd0743795d46a3c092c5bba691769c0e5c52e11229e - -[build.os=linux] -builder = make -subdir = tree-packaging-debian-1.8.0-1 - -[build.os=darwin] -builder = make -subdir = tree-command-debian-1.8.0-1-macos - -[build.os=windows] -builder = nop - -[make.install_args] -install diff --git a/build/fbcode_builder/manifests/wangle b/build/fbcode_builder/manifests/wangle deleted file mode 100644 index 6b330d620f466..0000000000000 --- a/build/fbcode_builder/manifests/wangle +++ /dev/null @@ -1,27 +0,0 @@ -[manifest] -name = wangle -fbsource_path = fbcode/wangle -shipit_project = wangle -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebook/wangle.git - -[build] -builder = cmake -subdir = wangle - -[cmake.defines.test=on] -BUILD_TESTS=ON - -[cmake.defines.test=off] -BUILD_TESTS=OFF - -[dependencies] -folly -googletest -fizz - -[shipit.pathmap] -fbcode/wangle/public_tld = . -fbcode/wangle = wangle diff --git a/build/fbcode_builder/manifests/watchman b/build/fbcode_builder/manifests/watchman deleted file mode 100644 index b733a0c09c372..0000000000000 --- a/build/fbcode_builder/manifests/watchman +++ /dev/null @@ -1,47 +0,0 @@ -[manifest] -name = watchman -fbsource_path = fbcode/watchman -shipit_project = watchman -shipit_fbcode_builder = true - -[git] -repo_url = https://github.com/facebook/watchman.git - -[build] -builder = cmake - -[dependencies] -boost -cpptoml -edencommon -fb303 -fbthrift -folly -pcre2 -googletest - -[dependencies.fbsource=on] -rust - -[shipit.pathmap] -fbcode/watchman = watchman -fbcode/watchman/oss = . -fbcode/eden/fs = eden/fs - -[shipit.strip] -^fbcode/eden/fs/(?!.*\.thrift|service/shipit_test_file\.txt) - -[cmake.defines.fb=on] -ENABLE_EDEN_SUPPORT=ON -IS_FB_BUILD=ON - -# FB macos specific settings -[cmake.defines.all(fb=on,os=darwin)] -# this path is coupled with the FB internal watchman-osx.spec -WATCHMAN_STATE_DIR=/opt/facebook/watchman/var/run/watchman -# tell cmake not to try to create /opt/facebook/... -INSTALL_WATCHMAN_STATE_DIR=OFF -USE_SYS_PYTHON=OFF - -[depends.environment] -WATCHMAN_VERSION_OVERRIDE diff --git a/build/fbcode_builder/manifests/ws_airstore b/build/fbcode_builder/manifests/ws_airstore deleted file mode 100644 index 3e5daa72a17d2..0000000000000 --- a/build/fbcode_builder/manifests/ws_airstore +++ /dev/null @@ -1,32 +0,0 @@ -[manifest] -name = ws_airstore -fbsource_path = fbcode/warm_storage/experimental/ws_airstore/ -shipit_project = WS_AIRStore -shipit_fbcode_builder = true - -[build.os=linux] -builder = cmake - -[build.not(os=linux)] -# We only support Linux -builder = nop - -[dependencies] -boost -libcurl -fizz -fmt -folly -googletest -libsodium -libevent -double-conversion -wangle -zstd -zlib -xz - -[shipit.pathmap] -fbcode/warm_storage/experimental/ws_airstore = . 
- -[shipit.strip] diff --git a/build/fbcode_builder/manifests/xz b/build/fbcode_builder/manifests/xz deleted file mode 100644 index 0b27ad63cc91c..0000000000000 --- a/build/fbcode_builder/manifests/xz +++ /dev/null @@ -1,22 +0,0 @@ -[manifest] -name = xz - -[debs] -liblzma-dev - -[homebrew] -xz - -[rpms] -xz-devel - -[download] -url = https://tukaani.org/xz/xz-5.2.5.tar.gz -sha256 = f6f4910fd033078738bd82bfba4f49219d03b17eb0794eb91efbae419f4aba10 - -[build] -builder = autoconf -subdir = xz-5.2.5 - -[autoconf.args] ---disable-shared diff --git a/build/fbcode_builder/manifests/yaml-cpp b/build/fbcode_builder/manifests/yaml-cpp deleted file mode 100644 index bffa540fe78ea..0000000000000 --- a/build/fbcode_builder/manifests/yaml-cpp +++ /dev/null @@ -1,20 +0,0 @@ -[manifest] -name = yaml-cpp - -[download] -url = https://github.com/jbeder/yaml-cpp/archive/yaml-cpp-0.6.2.tar.gz -sha256 = e4d8560e163c3d875fd5d9e5542b5fd5bec810febdcba61481fe5fc4e6b1fd05 - -[build.os=linux] -builder = cmake -subdir = yaml-cpp-yaml-cpp-0.6.2 - -[build.not(os=linux)] -builder = nop - -[dependencies] -boost -googletest - -[cmake.defines] -YAML_CPP_BUILD_TESTS=OFF diff --git a/build/fbcode_builder/manifests/zlib b/build/fbcode_builder/manifests/zlib deleted file mode 100644 index 86647fc9275ed..0000000000000 --- a/build/fbcode_builder/manifests/zlib +++ /dev/null @@ -1,21 +0,0 @@ -[manifest] -name = zlib - -[debs] -zlib1g-dev - -[homebrew] -zlib - -[rpms] -zlib-devel -zlib-static - -[download] -url = https://zlib.net/zlib-1.3.tar.gz -sha256 = ff0ba4c292013dbc27530b3a81e1f9a813cd39de01ca5e0f8bf355702efa593e - -[build] -builder = cmake -subdir = zlib-1.3 -patchfile = zlib_dont_build_more_than_needed.patch diff --git a/build/fbcode_builder/manifests/zstd b/build/fbcode_builder/manifests/zstd deleted file mode 100644 index 18484f4b1ed15..0000000000000 --- a/build/fbcode_builder/manifests/zstd +++ /dev/null @@ -1,31 +0,0 @@ -[manifest] -name = zstd - -[homebrew] -zstd - -# 18.04 zstd is too old -[debs.not(all(distro=ubuntu,distro_vers="18.04"))] -libzstd-dev - -[rpms] -libzstd-devel -libzstd - -[download] -url = https://github.com/facebook/zstd/releases/download/v1.5.5/zstd-1.5.5.tar.gz -sha256 = 9c4396cc829cfae319a6e2615202e82aad41372073482fce286fac78646d3ee4 - -[build] -builder = cmake -subdir = zstd-1.5.5/build/cmake - -# The zstd cmake build explicitly sets the install name -# for the shared library in such a way that cmake discards -# the path to the library from the install_name, rendering -# the library non-resolvable during the build. The short -# term solution for this is just to link static on macos. -# -# And while we're at it, let's just always link statically. -[cmake.defines] -ZSTD_BUILD_SHARED = OFF diff --git a/build/fbcode_builder/manifests/zstrong b/build/fbcode_builder/manifests/zstrong deleted file mode 100644 index 5205c2c6708f7..0000000000000 --- a/build/fbcode_builder/manifests/zstrong +++ /dev/null @@ -1,33 +0,0 @@ -[manifest] -name = zstrong - -[git] -repo_url = https://github.com/facebookincubator/zstrong.git - -[build] -builder = cmake - -[dependencies] -zstd - -[dependencies.test=on] -benchmark -fmt -googletest - -[shipit.pathmap] -fbcode/data_compression/experimental/zstrong = . 
- -[shipit.strip] -^fbcode/data_compression/experimental/zstrong/zstrong/zs2_config\.h$ - -[cmake.defines] -BUILD_SHARED_LIBS=OFF - -[cmake.defines.test=on] -BUILD_TESTS=ON -BUILD_BENCHMARKS=ON - -[cmake.defines.test=off] -BUILD_TESTS=OFF -BUILD_BENCHMARKS=OFF diff --git a/build/fbcode_builder/patches/blake3_CMakeLists_txt.patch b/build/fbcode_builder/patches/blake3_CMakeLists_txt.patch deleted file mode 100644 index 9b1c828fafe62..0000000000000 --- a/build/fbcode_builder/patches/blake3_CMakeLists_txt.patch +++ /dev/null @@ -1,97 +0,0 @@ -diff --git a/BLAKE3Config.cmake.in b/BLAKE3Config.cmake.in -new file mode 100644 -index 0000000..5a8919d ---- /dev/null -+++ b/BLAKE3Config.cmake.in -@@ -0,0 +1,4 @@ -+@PACKAGE_INIT@ -+ -+set_and_check(BLAKE3_INCLUDE_DIR "@PACKAGE_INCLUDE_DIR@") -+include("${CMAKE_CURRENT_LIST_DIR}/BLAKE3Targets.cmake") -diff --git a/CMakeLists.txt b/CMakeLists.txt -new file mode 100644 -index 0000000..171554b ---- /dev/null -+++ b/CMakeLists.txt -@@ -0,0 +1,81 @@ -+cmake_minimum_required(VERSION 3.12) -+cmake_policy(VERSION ${CMAKE_VERSION}) -+ -+project(BLAKE3 -+ VERSION 1.3.3 -+ DESCRIPTION "BLAKE3 C implementation" -+ HOMEPAGE_URL "https://github.com/BLAKE3-team/BLAKE3" -+ LANGUAGES C) -+ -+include(GNUInstallDirs) -+ -+add_library(blake3) -+ -+set(INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH -+ "The subdirectory where header files should be installed") -+ -+set_target_properties(blake3 PROPERTIES -+ PUBLIC_HEADER "blake3.h" -+ SOVERSION ${PROJECT_VERSION_MAJOR} -+ VERSION ${PROJECT_VERSION}) -+ -+target_sources(blake3 PRIVATE -+ blake3.c -+ blake3_dispatch.c -+ blake3_portable.c) -+ -+if((CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64) OR (CMAKE_SYSTEM_PROCESSOR STREQUAL AMD64)) -+ enable_language(ASM) -+ if(MSVC) -+ enable_language(ASM_MASM) -+ set(SUFFIX "windows_msvc.asm") -+ elseif(WIN32) -+ enable_language(ASM) -+ set(SUFFIX "windows_gnu.S") -+ else() -+ enable_language(ASM) -+ set(SUFFIX "unix.S") -+ endif() -+ target_sources(blake3 PRIVATE -+ blake3_avx2_x86-64_${SUFFIX} -+ blake3_avx512_x86-64_${SUFFIX} -+ blake3_sse2_x86-64_${SUFFIX} -+ blake3_sse41_x86-64_${SUFFIX}) -+elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL i686) -+ target_compile_options(blake3 PRIVATE -+ -mavx2 -+ -mavx512f -mavx512vl -+ -msse2 -+ -msse4.1) -+ target_sources(blake3 PRIVATE -+ blake3_avx2.c -+ blake3_avx512.c -+ blake3_sse2.c -+ blake3_sse41.c) -+elseif((CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64) OR -+ (ANDROID_ABI STREQUAL armeabi-v7a)) -+ target_compile_definitions(blake3 PRIVATE BLAKE3_USE_NEON) -+ target_compile_options(blake3 PRIVATE -mfpu=neon) -+ target_sources(blake3 PRIVATE blake3_neon.c) -+endif() -+ -+install(TARGETS blake3 -+ EXPORT blake3_targets -+ PUBLIC_HEADER) -+install(EXPORT blake3_targets -+ DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" -+ FILE "${PROJECT_NAME}Targets.cmake" -+ NAMESPACE "${PROJECT_NAME}::") -+ -+include(CMakePackageConfigHelpers) -+write_basic_package_version_file( -+ "${PROJECT_NAME}ConfigVersion.cmake" -+ VERSION ${PROJECT_VERSION} -+ COMPATIBILITY AnyNewerVersion) -+configure_package_config_file( -+ ${PROJECT_NAME}Config.cmake.in -+ ${PROJECT_NAME}Config.cmake -+ INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" -+ PATH_VARS INCLUDE_DIR) -+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" -+ DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}") diff --git a/build/fbcode_builder/patches/zlib_dont_build_more_than_needed.patch 
b/build/fbcode_builder/patches/zlib_dont_build_more_than_needed.patch deleted file mode 100644 index 919f4ec8adc0f..0000000000000 --- a/build/fbcode_builder/patches/zlib_dont_build_more_than_needed.patch +++ /dev/null @@ -1,31 +0,0 @@ -diff -Naur ../zlib-1.2.13/CMakeLists.txt ./CMakeLists.txt ---- ../zlib-1.2.13/CMakeLists.txt 2022-10-12 22:06:55.000000000 -0700 -+++ ./CMakeLists.txt 2022-10-14 14:50:28.000000000 -0700 -@@ -147,8 +147,7 @@ - set(ZLIB_DLL_SRCS ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj) - endif(MINGW) - --add_library(zlib SHARED ${ZLIB_SRCS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) --add_library(zlibstatic STATIC ${ZLIB_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) -+add_library(zlib ${ZLIB_SRCS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) - set_target_properties(zlib PROPERTIES DEFINE_SYMBOL ZLIB_DLL) - set_target_properties(zlib PROPERTIES SOVERSION 1) - -@@ -165,7 +164,7 @@ - - if(UNIX) - # On unix-like platforms the library is almost always called libz -- set_target_properties(zlib zlibstatic PROPERTIES OUTPUT_NAME z) -+ set_target_properties(zlib PROPERTIES OUTPUT_NAME z) - if(NOT APPLE) - set_target_properties(zlib PROPERTIES LINK_FLAGS "-Wl,--version-script,\"${CMAKE_CURRENT_SOURCE_DIR}/zlib.map\"") - endif() -@@ -175,7 +174,7 @@ - endif() - - if(NOT SKIP_INSTALL_LIBRARIES AND NOT SKIP_INSTALL_ALL ) -- install(TARGETS zlib zlibstatic -+ install(TARGETS zlib - RUNTIME DESTINATION "${INSTALL_BIN_DIR}" - ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" - LIBRARY DESTINATION "${INSTALL_LIB_DIR}" ) diff --git a/docker-compose.yml b/docker-compose.yml index a8fe32b71706f..6904baa543eea 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -21,7 +21,7 @@ services: # or # docker-compose run -e NUM_THREADS= --rm ubuntu-cpp # to set the number of threads used during compilation - image: ghcr.io/facebookincubator/velox-dev:amd64-ubuntu-22.04-avx + image: ghcr.io/facebookincubator/velox-dev:ubuntu-22.04 build: context: . dockerfile: scripts/ubuntu-22.04-cpp.dockerfile @@ -33,6 +33,32 @@ services: - .:/velox:delegated command: scripts/docker-command.sh + adapters-cpp: + # Usage: + # docker-compose pull adapters-cpp or docker-compose build adapters-cpp + # or + # docker-compose run --rm adapters-cpp + # or + # docker-compose run -e NUM_THREADS= --rm adapters-cpp + # to set the number of threads used during compilation + # scripts/adapters.dockerfile uses SHELL which is not supported for OCI image format. + # podman users must specify "--podman-build-args='--format docker'" argument. + image: ghcr.io/facebookincubator/velox-dev:adapters + build: + context: . + dockerfile: scripts/adapters.dockerfile + args: + image: ghcr.io/facebookincubator/velox-dev:centos9 + environment: + NUM_THREADS: 8 # default value for NUM_THREADS + CCACHE_DIR: "/velox/.ccache" + EXTRA_CMAKE_FLAGS: -DVELOX_ENABLE_PARQUET=ON + -DVELOX_ENABLE_S3=ON + volumes: + - .:/velox:delegated + working_dir: /velox + command: /velox/scripts/docker-command.sh + centos-cpp: # Usage: # docker-compose pull centos-cpp or docker-compose build centos-cpp @@ -40,32 +66,58 @@ services: # or # docker-compose run -e NUM_THREADS= --rm centos-cpp # to set the number of threads used during compilation - image: ghcr.io/facebookincubator/velox-dev:amd64-centos-8-avx + image: ghcr.io/facebookincubator/velox-dev:centos9 build: context: . 
- dockerfile: scripts/centos-8-stream.dockerfile + dockerfile: scripts/centos.dockerfile + args: + image: quay.io/centos/centos:stream9 environment: NUM_THREADS: 8 # default value for NUM_THREADS CCACHE_DIR: "/velox/.ccache" volumes: - .:/velox:delegated - command: /bin/bash -c "scl enable gcc-toolset-9 '/velox/scripts/docker-command.sh'" + working_dir: /velox + command: /velox/scripts/docker-command.sh - python: - # Usage: - # docker-compose pull ubuntu-cpp or docker-compose build ubuntu-cpp - # docker-compose run --rm ubuntu-cpp - # or - # docker-compose run -e NUM_THREADS= --rm ubuntu-cpp - # to set the number of threads used during compilation - image: ghcr.io/facebookincubator/velox-dev:torcharrow-avx + presto-java: + # Usage: + # docker-compose pull presto-java or docker-compose build presto-java + # docker-compose run --rm presto-java + # or + # docker-compose run -e NUM_THREADS= --rm presto-java + # to set the number of threads used during compilation + image: ghcr.io/facebookincubator/velox-dev:presto-java + build: + args: + - PRESTO_VERSION=0.288 + context: . + dockerfile: scripts/prestojava-container.dockerfile + environment: + NUM_THREADS: 8 # default value for NUM_THREADS + CCACHE_DIR: "/velox/.ccache" + volumes: + - .:/velox:delegated + working_dir: /velox + command: /velox/scripts/docker-command.sh + + spark-server: + # Usage: + # docker-compose pull spark-server or docker-compose build spark-server + # docker-compose run --rm spark-server + # or + # docker-compose run -e NUM_THREADS= --rm spark-server + # to set the number of threads used during compilation + image: ghcr.io/facebookincubator/velox-dev:spark-server build: + args: + - SPARK_VERSION=3.5.1 context: . - dockerfile: scripts/velox-torcharrow-container.dockfile + dockerfile: scripts/spark-container.dockerfile environment: - PYTHON_EXECUTABLE: python3.7 NUM_THREADS: 8 # default value for NUM_THREADS CCACHE_DIR: "/velox/.ccache" volumes: - .:/velox:delegated - command: cd /velox && make python-test + working_dir: /velox + command: /velox/scripts/docker-command.sh diff --git a/pyvelox/CMakeLists.txt b/pyvelox/CMakeLists.txt index 92d4adbef213f..f6fb59151cffe 100644 --- a/pyvelox/CMakeLists.txt +++ b/pyvelox/CMakeLists.txt @@ -14,28 +14,30 @@ if(VELOX_BUILD_PYTHON_PACKAGE) message("Creating pyvelox module") - include_directories(SYSTEM ${CMAKE_SOURCE_DIR}) - add_definitions(-DCREATE_PYVELOX_MODULE -DVELOX_DISABLE_GOOGLETEST) # Define our Python module: - pybind11_add_module(pyvelox MODULE pyvelox.cpp serde.cpp signatures.cpp - conversion.cpp) - # Link with Velox: + pybind11_add_module( + pyvelox + MODULE + complex.cpp + conversion.cpp + pyvelox.cpp + serde.cpp + signatures.cpp) + target_link_libraries( pyvelox PRIVATE velox_type velox_vector velox_core velox_exec - velox_functions_prestosql velox_parse_parser velox_functions_prestosql - velox_functions_spark) + velox_functions_spark + velox_aggregates + velox_functions_spark_aggregates) + target_include_directories(pyvelox SYSTEM + PRIVATE ${CMAKE_CURRENT_LIST_DIR}/..) + target_compile_definitions(pyvelox PRIVATE -DCREATE_PYVELOX_MODULE) install(TARGETS pyvelox LIBRARY DESTINATION .) -else() - # Torcharrow will not use pyvelox as an extension module for compatibility - # reasons. 
- message("Creating pyvelox library") - add_library(pyvelox pyvelox.cpp pyvelox.h) - target_link_libraries(pyvelox velox_type pybind11::module) endif() diff --git a/pyvelox/complex.cpp b/pyvelox/complex.cpp new file mode 100644 index 0000000000000..cd401e09cd46b --- /dev/null +++ b/pyvelox/complex.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "complex.h" +#include "velox/vector/ComplexVector.h" + +#include + +namespace facebook::velox::py { + +using namespace velox; +namespace py = pybind11; + +namespace { +// Structure used to keep check on the number +// of constituent elements. Attributes totalElements +// and insertedElements keeps the length of the vector, and +// the number of elements inserted during the operation +// respectively. +struct ElementCounter { + vector_size_t insertedElements = + 0; // to track the elements already in the vector + vector_size_t totalElements = 0; + std::vector children; +}; +} // namespace + +void checkOrAssignType(TypePtr& type, const TypePtr& expected_type) { + if (type->kind() == TypeKind::UNKNOWN) { + type = expected_type; + } else if (!(type->kindEquals(expected_type))) { + throw py::type_error( + "Cannot construct type tree, invalid variant for complex type"); + } +} + +template +void setElementInFlatVector( + vector_size_t idx, + const variant& v, + VectorPtr& vector) { + using NativeType = typename TypeTraits::NativeType; + auto asFlat = vector->asFlatVector(); + asFlat->set(idx, NativeType{v.value()}); +} + +// This function determines the type and the number of elements for a variant. +// Takes reference to Type and ElementCounter which will be set after the run. +// It is supposed to run a recursive call with a pre-instantiated TypePtr, +// the target variant and the counter. The passed variant is checked for its +// data type, and for any complex type involved, the function is called again. +// The counter here is used to keep in track of the number of elements inserted +// and the number of types of elements allowed if a complex vector is involved +// in the variant. 
+
+// Determines the type and the number of elements for a variant. Takes
+// references to a Type and an ElementCounter, which are set as the function
+// runs. It is meant to be called recursively with a pre-instantiated TypePtr,
+// the target variant, and the counter. The variant is checked for its data
+// type, and for any complex type involved the function is called again. The
+// counter keeps track of the number of elements inserted, and of the number
+// and types of elements allowed when a complex vector is involved in the
+// variant.
+void constructType(const variant& v, TypePtr& type, ElementCounter& counter) {
+  ++counter.totalElements;
+
+  if (v.isNull()) {
+    // Since the variant is null, we can't infer its data type; it may be
+    // UNKNOWN or INVALID at this stage, which implies further investigation
+    // is required.
+    if (v.kind() != TypeKind::UNKNOWN && v.kind() != TypeKind::INVALID &&
+        v.kind() != type->kind()) {
+      throw std::invalid_argument("Variant was of an unexpected kind");
+    }
+    return;
+  } else {
+    // If a non-null variant's type is unknown or not one of the valid,
+    // supported types, then the type tree cannot be constructed.
+    if (v.kind() == TypeKind::UNKNOWN || v.kind() == TypeKind::INVALID) {
+      throw std::invalid_argument(
+          "Non-null variant has unknown or invalid kind");
+    }
+
+    switch (v.kind()) {
+      case TypeKind::ARRAY: {
+        counter.children.resize(1);
+        auto asArray = v.array();
+        TypePtr childType = createType(TypeKind::UNKNOWN, {});
+        for (const auto& element : asArray) {
+          constructType(element, childType, counter.children[0]);
+        }
+
+        // If the child's type still remains UNKNOWN, all the elements
+        // in the array are actually NULL.
+        if (childType->kind() == TypeKind::UNKNOWN) {
+          throw py::value_error("Cannot construct array with all None values");
+        }
+        checkOrAssignType(type, createType<TypeKind::ARRAY>({childType}));
+        break;
+      }
+
+      default: {
+        checkOrAssignType(type, createScalarType(v.kind()));
+        break;
+      }
+    }
+  }
+}
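+
+// For example, inserting the variants [[1], [1, 2, None]] (with the type
+// inferred above as ARRAY(BIGINT)) produces an ArrayVector whose flat
+// elements vector is {1, 1, 2, null}, with offsets {0, 1} and sizes {1, 3};
+// insertVariantIntoVector below maintains those offsets and sizes through
+// its previous_offset and previous_size parameters.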
+
+// This function is called with the variant to be added, the target vector,
+// and the element counter. The counter tracks the number of elements already
+// inserted, which gives the index at which to insert the next element. For
+// an array vector, the required offset and size are first set into the
+// vector, then the function is called recursively for the contained
+// elements. In the default case, where the variant is of a scalar type,
+// setElementInFlatVector is called without any further recursion.
+static void insertVariantIntoVector(
+    const variant& v,
+    VectorPtr& vector,
+    ElementCounter& counter,
+    vector_size_t previous_size,
+    vector_size_t previous_offset) {
+  if (v.isNull()) {
+    vector->setNull(counter.insertedElements, true);
+  } else {
+    switch (v.kind()) {
+      case TypeKind::ARRAY: {
+        auto asArray = vector->as<ArrayVector>();
+        asArray->elements()->resize(counter.children[0].totalElements);
+        const std::vector<variant>& elements = v.array();
+        vector_size_t offset = previous_offset + previous_size;
+        vector_size_t size = elements.size();
+        asArray->setOffsetAndSize(counter.insertedElements, offset, size);
+        for (const variant& elt : elements) {
+          insertVariantIntoVector(
+              elt, asArray->elements(), counter.children[0], offset, size);
+        }
+
+        break;
+      }
+      default: {
+        VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(
+            setElementInFlatVector,
+            v.kind(),
+            counter.insertedElements,
+            v,
+            vector);
+        break;
+      }
+    }
+  }
+  counter.insertedElements += 1;
+}
+
+VectorPtr variantsToVector(
+    const std::vector<variant>& variants,
+    velox::memory::MemoryPool* pool) {
+  ElementCounter counter;
+  TypePtr type = createType(TypeKind::UNKNOWN, {});
+  for (const auto& variant : variants) {
+    constructType(variant, type, counter);
+  }
+  VectorPtr resultVector =
+      BaseVector::create(std::move(type), variants.size(), pool);
+  for (const variant& v : variants) {
+    insertVariantIntoVector(
+        v,
+        resultVector,
+        counter,
+        /*previous_size*/ 0,
+        /*previous_offset*/ 0);
+  }
+  return resultVector;
+}
+
+} // namespace facebook::velox::py
diff --git a/pyvelox/complex.h b/pyvelox/complex.h
new file mode 100644
index 0000000000000..e8c0217d993f4
--- /dev/null
+++ b/pyvelox/complex.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include + +#include "velox/vector/FlatVector.h" + +namespace facebook::velox::py { + +VectorPtr variantsToVector( + const std::vector& variants, + velox::memory::MemoryPool* pool); + +} diff --git a/pyvelox/context.h b/pyvelox/context.h index bc154f039f486..ca3c320af4c49 100644 --- a/pyvelox/context.h +++ b/pyvelox/context.h @@ -56,9 +56,9 @@ struct PyVeloxContext { PyVeloxContext& operator=(const PyVeloxContext&&) = delete; std::shared_ptr pool_ = - facebook::velox::memory::addDefaultLeafMemoryPool(); + facebook::velox::memory::deprecatedAddDefaultLeafMemoryPool(); std::shared_ptr queryCtx_ = - std::make_shared(); + facebook::velox::core::QueryCtx::create(); std::unique_ptr execCtx_ = std::make_unique( pool_.get(), diff --git a/pyvelox/pyvelox.cpp b/pyvelox/pyvelox.cpp index 93957c098eebb..8f39fb4b904ef 100644 --- a/pyvelox/pyvelox.cpp +++ b/pyvelox/pyvelox.cpp @@ -15,6 +15,7 @@ */ #include "pyvelox.h" +#include "complex.h" #include "conversion.h" #include "serde.h" #include "signatures.h" @@ -36,7 +37,7 @@ static VectorPtr variantToConstantVector( facebook::velox::memory::MemoryPool* pool) { using NativeType = typename TypeTraits::NativeType; - TypePtr typePtr = fromKindToScalerType(T); + TypePtr typePtr = createScalarType(T); if (!variant.hasValue()) { return std::make_shared>( pool, @@ -90,7 +91,7 @@ static VectorPtr variantsToFlatVector( constexpr bool kNeedsHolder = (T == TypeKind::VARCHAR || T == TypeKind::VARBINARY); - TypePtr type = fromKindToScalerType(T); + TypePtr type = createScalarType(T); auto result = BaseVector::create>(type, variants.size(), pool); @@ -143,6 +144,8 @@ static VectorPtr pyListToVector( if (first_kind == velox::TypeKind::INVALID) { throw py::value_error( "Can't create a Velox vector consisting of only None"); + } else if (first_kind == velox::TypeKind::ARRAY) { + return variantsToVector(variants, pool); } return VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH( @@ -164,7 +167,7 @@ static VectorPtr pyListToVector( } template -static py::object getItemFromSimpleVector( +inline py::object getItemFromSimpleVector( SimpleVectorPtr& vector, vector_size_t idx) { checkBounds(vector, idx); diff --git a/pyvelox/pyvelox.h b/pyvelox/pyvelox.h index eb1de36ee9976..975f09c967c1f 100644 --- a/pyvelox/pyvelox.h +++ b/pyvelox/pyvelox.h @@ -16,6 +16,9 @@ #pragma once +#include + +#include #include #include #include @@ -27,9 +30,11 @@ #include #include #include +#include #include #include #include "folly/json.h" +#include "velox/vector/VariantToVector.h" #include "context.h" @@ -64,6 +69,13 @@ inline velox::variant pyToVariant(const py::handle& obj) { return pyToVariant(obj); } else if (py::isinstance(obj)) { return pyToVariant(obj); + } else if (py::isinstance(obj)) { + py::list objAsList = py::cast(obj); + std::vector result; + for (auto& item : objAsList) { + result.push_back(pyToVariant(item)); + } + return velox::variant::array(std::move(result)); } else { throw py::type_error("Invalid type of object"); } @@ -109,6 +121,32 @@ inline velox::variant pyToVariant(const py::handle& obj, const Type& dtype) { } } +inline void checkRowVectorBounds(const RowVectorPtr& v, vector_size_t idx) { + if (idx < 0 || size_t(idx) >= v->childrenSize()) { + throw std::out_of_range("Index out of range"); + } +} + +bool compareRowVector(const RowVectorPtr& u, const RowVectorPtr& v) { + CompareFlags compFlags = + CompareFlags::equality(CompareFlags::NullHandlingMode::kNullAsValue); + + if (u->size() != v->size()) { + return false; + } + for (size_t i = 0; i < u->size(); i++) { + if 
(u->compare(v.get(), i, i, compFlags) != 0) { + return false; + } + } + + return true; +} + +inline std::string rowVectorToString(const RowVectorPtr& vector) { + return vector->toString(0, vector->size()); +} + static VectorPtr pyToConstantVector( const py::handle& obj, vector_size_t length, @@ -145,13 +183,13 @@ static VectorPtr createDictionaryVector( } template -static py::object getItemFromSimpleVector( - SimpleVectorPtr& vector, +inline py::object getItemFromSimpleVector( + SimpleVectorPtr& v, vector_size_t idx); template inline void setItemInFlatVector( - FlatVectorPtr& vector, + FlatVectorPtr& v, vector_size_t idx, py::handle& obj); @@ -371,7 +409,9 @@ static void registerTypedVectors( [](DictionaryVectorPtr vec) { return DictionaryIndices{vec->indices()}; }) - .def("values", &DictionaryVector::valueVector); + .def("values", [](DictionaryVectorPtr vec) { + return vec->valueVector(); + }); } static void addVectorBindings( @@ -434,6 +474,12 @@ static void addVectorBindings( py::arg("stop"), py::arg("step") = 1); + py::class_( + m, "ArrayVector", py::module_local(asModuleLocalDefinitions)) + .def("elements", [](ArrayVectorPtr vec) -> VectorPtr { + return vec->elements(); + }); + constexpr TypeKind supportedTypes[] = { TypeKind::BOOLEAN, TypeKind::TINYINT, @@ -504,6 +550,75 @@ static void addVectorBindings( std::move(baseVector), PyVeloxContext::getSingletonInstance().pool()); }); + + m.def( + "row_vector", + [](std::vector& names, + std::vector& children, + const std::optional& nullabilityDict) { + if (children.size() == 0 || names.size() == 0) { + throw py::value_error("RowVector must have children."); + } + std::vector> childTypes; + childTypes.reserve(children.size()); + + size_t vectorSize = children[0]->size(); + for (int i = 0; i < children.size(); i++) { + if (i > 0 && children[i]->size() != vectorSize) { + PyErr_SetString(PyExc_ValueError, "Each child must have same size"); + throw py::error_already_set(); + } + childTypes.push_back(children[i]->type()); + } + auto rowType = ROW(std::move(names), std::move(childTypes)); + + BufferPtr nullabilityBuffer = nullptr; + if (nullabilityDict.has_value()) { + auto nullabilityValues = nullabilityDict.value(); + nullabilityBuffer = AlignedBuffer::allocate( + vectorSize, PyVeloxContext::getSingletonInstance().pool(), true); + for (const auto&& item : nullabilityValues) { + auto row = item.first; + auto nullability = item.second; + if (!py::isinstance(row) || + !py::isinstance(nullability)) { + throw py::type_error( + "Nullability must be a dictionary, rowId in int and nullability in boolean."); + } + int rowId = py::cast(row); + if (!(rowId >= 0 && rowId < vectorSize)) { + throw py::type_error("Nullability index out of bounds."); + } + bool nullabilityVal = py::cast(nullability); + bits::setBit( + nullabilityBuffer->asMutable(), + rowId, + bits::kNull ? nullabilityVal : !nullabilityVal); + } + } + + return std::make_shared( + PyVeloxContext::getSingletonInstance().pool(), + rowType, + nullabilityBuffer, + vectorSize, + children); + }, + py::arg("names"), + py::arg("children"), + py::arg("nullability") = std::nullopt); + + py::class_( + m, "RowVector", py::module_local(asModuleLocalDefinitions)) + .def( + "__len__", + [](RowVectorPtr& v) { + return v->childrenSize() > 0 ? 
v->childAt(0)->size() : 0; + }) + .def("__str__", [](RowVectorPtr& v) { return rowVectorToString(v); }) + .def("__eq__", [](RowVectorPtr& u, RowVectorPtr& v) { + return compareRowVector(u, v); + }); } static void addExpressionBindings( diff --git a/pyvelox/serde.cpp b/pyvelox/serde.cpp index 19ffde942dd91..64f6ab0be1023 100644 --- a/pyvelox/serde.cpp +++ b/pyvelox/serde.cpp @@ -24,7 +24,7 @@ namespace facebook::velox::py { namespace py = pybind11; namespace { -VectorPtr pyRestoreVectorFromFileHelper(const char* FOLLY_NONNULL filePath) { +VectorPtr pyRestoreVectorFromFileHelper(const char* filePath) { using namespace facebook::velox; memory::MemoryPool* pool = PyVeloxContext::getSingletonInstance().pool(); return restoreVectorFromFile(filePath, pool); diff --git a/pyvelox/signatures.cpp b/pyvelox/signatures.cpp index c12912f759040..27b5674f6557f 100644 --- a/pyvelox/signatures.cpp +++ b/pyvelox/signatures.cpp @@ -15,9 +15,12 @@ */ #include "signatures.h" // @manual +#include "velox/exec/Aggregate.h" #include "velox/functions/FunctionRegistry.h" +#include "velox/functions/prestosql/aggregates/RegisterAggregateFunctions.h" #include "velox/functions/prestosql/registration/RegistrationFunctions.h" #include "velox/functions/sparksql/Register.h" +#include "velox/functions/sparksql/aggregates/Register.h" namespace facebook::velox::py { @@ -31,6 +34,24 @@ void registerSparkFunctions(const std::string& prefix) { facebook::velox::functions::sparksql::registerFunctions(prefix); } +void registerPrestoAggregateFunctions(const std::string& prefix) { + facebook::velox::aggregate::prestosql::registerAllAggregateFunctions(prefix); +} + +void registerSparkAggregateFunctions(const std::string& prefix) { + facebook::velox::functions::aggregate::sparksql::registerAggregateFunctions( + prefix); +} + +exec::AggregateFunctionSignatureMap getAggregateSignatures() { + return exec::getAggregateFunctionSignatures(); +} + +void clearAggregateSignatures() { + exec::aggregateFunctions().withWLock( + [&](auto& aggregateFunctions) { aggregateFunctions.clear(); }); +} + void addSignatureBindings(py::module& m, bool asModuleLocalDefinitions) { // TypeSignature py::class_ typeSignature( @@ -53,6 +74,19 @@ void addSignatureBindings(py::module& m, bool asModuleLocalDefinitions) { functionSignature.def( "constant_arguments", &exec::FunctionSignature::constantArguments); + // AggregateFunctionSignature + py::class_< + exec::AggregateFunctionSignature, + std::unique_ptr> + aggregateFunctionSignature( + m, + "AggregateFunctionSignature", + py::module_local(asModuleLocalDefinitions)); + aggregateFunctionSignature.def( + "__str__", &exec::AggregateFunctionSignature::toString); + aggregateFunctionSignature.def( + "intermediate_type", &exec::AggregateFunctionSignature::intermediateType); + m.def( "clear_signatures", &clearFunctionRegistry, @@ -75,5 +109,28 @@ void addSignatureBindings(py::module& m, bool asModuleLocalDefinitions) { &getFunctionSignatures, py::return_value_policy::reference, "Returns a dictionary of the current signatures."); + + m.def( + "register_presto_aggregate_signatures", + ®isterPrestoAggregateFunctions, + "Adds Presto Aggregate signatures to the function registry.", + py::arg("prefix") = ""); + + m.def( + "register_spark_aggregate_signatures", + ®isterSparkAggregateFunctions, + "Adds Spark Aggregate signatures to the function registry.", + py::arg("prefix") = ""); + + m.def( + "get_aggregate_function_signatures", + &getAggregateSignatures, + py::return_value_policy::reference, + "Returns a dictionary of the 
current aggregate signatures."); + + m.def( + "clear_aggregate_signatures", + &clearAggregateSignatures, + "Clears the Aggregate function registry."); } } // namespace facebook::velox::py diff --git a/pyvelox/test/test_signatures.py b/pyvelox/test/test_signatures.py index 8cefcdd196544..dddb2869537ae 100644 --- a/pyvelox/test/test_signatures.py +++ b/pyvelox/test/test_signatures.py @@ -41,9 +41,11 @@ def test_function_signature(self): concat_signatures = presto_signatures["concat"] self.assertTrue(len(concat_signatures) > 0) # Array functions are registered first, then string functions. + concat_signatures.sort(key=lambda sig: sig.__str__()) self.assertEqual(str(concat_signatures[0].return_type()), "array(__user_T1)") self.assertEqual( - str(concat_signatures[0]), "(array(__user_T1)...) -> array(__user_T1)" + str(concat_signatures[0]), + "(__user_T1,array(__user_T1)) -> array(__user_T1)", ) self.assertEqual(str(concat_signatures[-1].return_type()), "varchar") self.assertEqual(str(concat_signatures[-1]), "(varchar,varchar...) -> varchar") @@ -62,3 +64,21 @@ def test_function_prefix(self): concat_signatures = spark_signatures["barconcat"] self.assertTrue(len(concat_signatures) > 0) + + def test_aggregate_signatures(self): + pv.clear_aggregate_signatures() + + pv.register_presto_aggregate_signatures() + presto_agg_signatures = pv.get_aggregate_function_signatures() + + min_signatures = presto_agg_signatures["min"] + self.assertTrue(len(min_signatures) > 0) + + max_signatures = presto_agg_signatures["max"] + self.assertTrue(len(max_signatures) > 0) + + pv.clear_aggregate_signatures() + + pv.register_spark_aggregate_signatures() + spark_agg_signatures = pv.get_aggregate_function_signatures() + self.assertTrue(len(spark_agg_signatures) > 0) diff --git a/pyvelox/test/test_vector.py b/pyvelox/test/test_vector.py index 482449ed060ca..2e15d4bfca0e9 100644 --- a/pyvelox/test/test_vector.py +++ b/pyvelox/test/test_vector.py @@ -156,6 +156,46 @@ def test_dictionary_encoding(self): pv.dictionary_vector(pv.from_list([1, 2, 3]), [1, 2, 1000000]) pv.dictionary_vector(pv.from_list([1, 2, 3]), [0, -1, -2]) + def test_array_vector(self): + v1 = pv.from_list([[1, 2, 3], [1, 2, 3]]) + self.assertTrue(isinstance(v1, pv.ArrayVector)) + self.assertTrue(isinstance(v1.elements(), pv.FlatVector_BIGINT)) + self.assertEqual(len(v1), 2) + expected_flat = [1, 2, 3, 1, 2, 3] + self.assertEqual(len(expected_flat), len(v1.elements())) + for i in range(len(expected_flat)): + self.assertEqual(expected_flat[i], v1.elements()[i]) + + v2 = pv.from_list([[1], [1, 2, None]]) + self.assertTrue(isinstance(v2, pv.ArrayVector)) + self.assertTrue(isinstance(v2.elements(), pv.FlatVector_BIGINT)) + self.assertEqual(len(v2), 2) + expected_flat = [1, 1, 2, None] + self.assertEqual(len(v2.elements()), len(expected_flat)) + for i in range(len(expected_flat)): + self.assertEqual(expected_flat[i], v2.elements()[i]) + + doubleNested = pv.from_list([[[1, 2], [3, None]], [[1], [2]]]) + self.assertTrue(isinstance(doubleNested, pv.ArrayVector)) + self.assertTrue(isinstance(doubleNested.elements(), pv.ArrayVector)) + self.assertEqual(len(doubleNested), 2) + elements = doubleNested.elements().elements() + self.assertTrue(isinstance(elements, pv.FlatVector_BIGINT)) + self.assertEqual(len(elements), 6) + expected_firstElements = [1, 2, 3, None, 1, 2] + self.assertEqual(len(elements), len(expected_firstElements)) + for i in range(len(expected_firstElements)): + self.assertEqual(expected_firstElements[i], elements[i]) + + with 
self.assertRaises(TypeError): + a = pv.from_list([[[1, 2], [3, 4]], [[1.1], [2.3]]]) + + with self.assertRaises(ValueError): + v = pv.from_list([[None], [None, None, None]]) + + with self.assertRaises(TypeError): + a = pv.from_list([[[1, 2], [3, 4]], [["hello"], ["world"]]]) + def test_to_string(self): self.assertEqual( str(pv.from_list([1, 2, 3])), @@ -351,3 +391,65 @@ def test_roundtrip_conversion(self): self.assertTrue(velox_vector.dtype(), expected_type) for i in range(0, len(data)): self.assertEqual(velox_vector[i], data[i]) + + def test_row_vector_basic(self): + vals = [ + pv.from_list([1, 2, 3]), + pv.from_list([4.0, 5.0, 6.0]), + pv.from_list(["a", "b", "c"]), + ] + + col_names = ["x", "y", "z"] + rw = pv.row_vector(col_names, vals) + rw_str = str(rw) + expected_str = "0: {1, 4, a}\n1: {2, 5, b}\n2: {3, 6, c}" + assert expected_str == rw_str + + def test_row_vector_with_nulls(self): + vals = [ + pv.from_list([1, 2, 3, 1, 2]), + pv.from_list([4, 5, 6, 4, 5]), + pv.from_list([7, 8, 9, 7, 8]), + pv.from_list([10, 11, 12, 10, 11]), + ] + + col_names = ["a", "b", "c", "d"] + rw = pv.row_vector(col_names, vals, {0: True, 2: True}) + rw_str = str(rw) + expected_str = ( + "0: null\n1: {2, 5, 8, 11}\n2: null\n3: {1, 4, 7, 10}\n4: {2, 5, 8, 11}" + ) + assert expected_str == rw_str + + def test_row_vector_comparison(self): + u = [ + pv.from_list([1, 2, 3]), + pv.from_list([7, 4, 9]), + pv.from_list([10, 11, 12]), + ] + + v = [ + pv.from_list([1, 2, 3]), + pv.from_list([7, 8, 9]), + pv.from_list([10, 11, 12]), + ] + + w = [ + pv.from_list([1, 2, 3]), + pv.from_list([7, 8, 9]), + ] + + u_names = ["a", "b", "c"] + w_names = ["x", "y"] + u_rw = pv.row_vector(u_names, u) + v_rw = pv.row_vector(u_names, v) + w_rw = pv.row_vector(w_names, w) + y_rw = pv.row_vector(u_names, u) + x1_rw = pv.row_vector(u_names, u, {0: True, 2: True}) + x2_rw = pv.row_vector(u_names, u, {0: True, 2: True}) + + assert u_rw != w_rw # num of children doesn't match + assert u_rw != v_rw # data doesn't match + assert u_rw == y_rw # data match + assert x1_rw == x2_rw # with null + assert x1_rw != u_rw # with and without null diff --git a/scripts/adapters.dockerfile b/scripts/adapters.dockerfile new file mode 100644 index 0000000000000..6f8ebc1c95808 --- /dev/null +++ b/scripts/adapters.dockerfile @@ -0,0 +1,46 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
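+#
+# This image layers the S3, GCS, Azure, and HDFS test dependencies on top of
+# the centos9 dev image (see setup-adapters.sh).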
+# Build the test and build container for presto_cpp +ARG image=ghcr.io/facebookincubator/velox-dev:centos9 +FROM $image + +COPY scripts/setup-helper-functions.sh / +COPY scripts/setup-adapters.sh / +RUN mkdir build && ( cd build && source /opt/rh/gcc-toolset-12/enable && \ + bash /setup-adapters.sh ) && rm -rf build && dnf remove -y conda && dnf clean all + +# install miniforge +RUN curl -L -o /tmp/miniforge.sh https://github.com/conda-forge/miniforge/releases/download/23.11.0-0/Mambaforge-23.11.0-0-Linux-x86_64.sh && \ + bash /tmp/miniforge.sh -b -p /opt/miniforge && \ + rm /tmp/miniforge.sh +ENV PATH=/opt/miniforge/condabin:${PATH} + +# install test dependencies +RUN mamba create -y --name adapters python=3.8 +SHELL ["mamba", "run", "-n", "adapters", "/bin/bash", "-c"] + +RUN pip install https://github.com/googleapis/storage-testbench/archive/refs/tags/v0.36.0.tar.gz +RUN mamba install -y nodejs +RUN npm install -g azurite + +ENV HADOOP_HOME=/usr/local/hadoop \ + HADOOP_ROOT_LOGGER="WARN,DRFA" \ + LC_ALL=C \ + PATH=/usr/local/hadoop/bin:${PATH} \ + JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk \ + PATH=/usr/lib/jvm/java-1.8.0-openjdk/bin:${PATH} + +COPY scripts/setup-classpath.sh / +ENTRYPOINT ["/bin/bash", "-c", "source /set_classpath.sh && source /opt/rh/gcc-toolset-12/enable && exec \"$@\"", "--"] +CMD ["/bin/bash"] diff --git a/scripts/benchmark-requirements.txt b/scripts/benchmark-requirements.txt index bb0bfc7203ff7..3df0a15d0bd45 100644 --- a/scripts/benchmark-requirements.txt +++ b/scripts/benchmark-requirements.txt @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -benchadapt@git+https://github.com/conbench/conbench.git@7599305#subdirectory=benchadapt/python -benchalerts@git+https://github.com/conbench/conbench.git@7599305#subdirectory=benchalerts -benchclients@git+https://github.com/conbench/conbench.git@7599305#subdirectory=benchclients/python +benchadapt==2024.3.20 +benchalerts==2024.1.10.1 +benchclients==2024.3.29.1 diff --git a/scripts/bm-report/default.nix b/scripts/bm-report/default.nix new file mode 100644 index 0000000000000..4267be1044dd4 --- /dev/null +++ b/scripts/bm-report/default.nix @@ -0,0 +1,61 @@ +# This file was generated by the {rix} R package v0.6.0 on 2024-05-15 +# with following call: +# >rix::rix(r_ver = "abd6d48f8c77bea7dc51beb2adfa6ed3950d2585", +# > r_pkgs = c("dplyr", +# > "prettyunits", +# > "ggplot2", +# > "gh", +# > "gt", +# > "hms", +# > "jqr", +# > "jsonlite", +# > "lubridate", +# > "memoise", +# > "plotly", +# > "purrr", +# > "remotes"), +# > system_pkgs = c("quarto"), +# > git_pkgs = list(package_name = "conbenchcoms", +# > repo_url = "https://github.com/conbench/conbenchcoms", +# > branch_name = "main", +# > commit = "55cdb120bbe2c668d3cf8ae543f4922131653645"), +# > ide = "other", +# > project_path = path_default_nix, +# > overwrite = TRUE, +# > print = TRUE) +# It uses nixpkgs' revision abd6d48f8c77bea7dc51beb2adfa6ed3950d2585 for reproducibility purposes +# which will install R version latest +# Report any issues to https://github.com/b-rodrigues/rix +let + pkgs = import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/abd6d48f8c77bea7dc51beb2adfa6ed3950d2585.tar.gz") {}; + rpkgs = builtins.attrValues { + inherit (pkgs.rPackages) dplyr prettyunits ggplot2 gh gt hms jqr jsonlite lubridate memoise plotly quarto purrr remotes; +}; + git_archive_pkgs = [(pkgs.rPackages.buildRPackage { + name = "conbenchcoms"; + src = pkgs.fetchgit { + url = 
"https://github.com/conbench/conbenchcoms"; + branchName = "main"; + rev = "55cdb120bbe2c668d3cf8ae543f4922131653645"; + sha256 = "sha256-XR5+grCUKyvSxsqiOkKd3gMUsWDJblZDmF4O+Jehq6U="; + }; + propagatedBuildInputs = builtins.attrValues { + inherit (pkgs.rPackages) dplyr glue httr2 yaml; + }; + }) ]; + system_packages = builtins.attrValues { + inherit (pkgs) R glibcLocales nix quartoMinimal; +}; + in + pkgs.mkShell { + LOCALE_ARCHIVE = if pkgs.system == "x86_64-linux" then "${pkgs.glibcLocales}/lib/locale/locale-archive" else ""; + LANG = "en_US.UTF-8"; + LC_ALL = "en_US.UTF-8"; + LC_TIME = "en_US.UTF-8"; + LC_MONETARY = "en_US.UTF-8"; + LC_PAPER = "en_US.UTF-8"; + LC_MEASUREMENT = "en_US.UTF-8"; + + buildInputs = [ git_archive_pkgs rpkgs system_packages ]; + + } diff --git a/scripts/bm-report/report.qmd b/scripts/bm-report/report.qmd new file mode 100644 index 0000000000000..bca541e498411 --- /dev/null +++ b/scripts/bm-report/report.qmd @@ -0,0 +1,333 @@ +--- +title: "Velox Build Metrics" +execute: + echo: false + warning: false +format: + html: + grid: + sidebar-width: 0px + body-width: 1800px + margin-width: 150px + gutter-width: 1.5rem + self-contained: true + page-layout: full + toc: false + margin-left: 30px + link-external-newwindow: true + theme: cosmo +--- + + + + +```{r setup} +library(gt) +library(ggplot2) +library(plotly) +library(dplyr) +library(purrr) + +# Cache conbench and gh api results for local development +cd <- cachem::cache_disk(rappdirs::user_cache_dir("velox-bm-report")) +mgh <- memoise::memoise(gh::gh, cache = cd) +mruns <- memoise::memoise(conbenchcoms::runs, cache = cd) +mresults <- memoise::memoise(conbenchcoms::benchmark_results, cache = cd) + +# Get latest runs of build-metric job +runs <- mgh( + "GET /repos/facebookincubator/velox/actions/workflows/build-metrics.yml/runs", + status = "success", + branch = "main" +) |> jsonlite::toJSON() + +# Extract the commit sha of the most recent run. The results of the latest +# run are displayed in the tables. 
+newest_sha <- runs |>
+  jqr::jq(".workflow_runs | max_by(.updated_at) | .head_sha") |>
+  jsonlite::fromJSON()
+
+run_shas <- runs |>
+  jqr::jq("[.workflow_runs[].head_sha]") |>
+  jsonlite::fromJSON()
+run_ids <- mruns(run_shas) |>
+  filter(commit.branch == "facebookincubator:main", substr(id, 1, 2) == "BM") |>
+  pull(id)
+
+# Fetch the results and clean/format the data
+results <- run_ids |>
+  purrr::map_df(mresults) |>
+  mutate(
+    timestamp = lubridate::as_datetime(timestamp),
+    stats.data = unlist(stats.data),
+    type = case_when(
+      startsWith(run_id, "BM-debug") ~ "debug",
+      .default = "release"
+    )
+  )
+```
+
+```{r ggplot2-specs}
+theme_set(theme_minimal(base_size = 12) %+replace%
+  theme(
+    plot.title.position = "plot",
+    strip.text = element_text(size = 12)
+  ))
+
+format_tags <- function(x) {
+  x |>
+    stringr::str_replace_all("_", " ") |>
+    stringr::str_to_title()
+}
+```
+
+::::: {.panel-tabset}
+
+## Times
+```{r total-graphs}
+# Filter the data and lay out the overview plots
+times_plot <- results |>
+  filter(tags.suite == "total", endsWith(tags.source, "time"), tags.name != "wall_time") |>
+  mutate(
+    stats.data = lubridate::dseconds(stats.data),
+    tags.name = format_tags(tags.name)
+  ) |>
+  ggplot(aes(
+    x = timestamp,
+    y = stats.data,
+    group = interaction(tags.name, type), color = tags.name
+  )) +
+  facet_wrap(~type) +
+  geom_line() +
+  geom_point() +
+  scale_y_time() +
+  scale_x_datetime() +
+  labs(
+    title = "Velox Build Times",
+    x = "Date",
+    y = "Time in Minutes"
+  ) +
+  scale_color_viridis_d()
+ggplotly(times_plot) |>
+  layout(legend = list(title = list(text = "Tags Name<br>"))) ## needed because theme legend specs don't work with ggplotly
"))) ## needed because theme legend specs don't work with ggplotly +``` + +```{r expensive-objects-compile} +# Format compile time data +compile_times <- results |> + filter(tags.suite == "compiling", commit.sha == newest_sha) |> + mutate( + stats.data = lubridate::dseconds(stats.data), + tags.name = glue::glue("`{tags.name}`") + ) +``` + +### Compile Times + +:::: {.columns} + +::: {.column width="49%"} + +```{r compile-times-release} +# Select and format the data to be displayed in the release compile time table +compile_times |> + filter(type == "release") |> + select(tags.name, stats.data) |> + arrange(desc(stats.data)) |> + gt() |> + cols_label( + `tags.name` = "Object", + `stats.data` = "Time" + ) |> + cols_align(align = "left", columns = everything()) |> + tab_header(title = "Release") |> + fmt_markdown(columns = "tags.name") |> + opt_interactive(use_page_size_select = TRUE, use_search = TRUE) +``` + +::: + +::: {.column width="2%"} + +::: + +::: {.column width="49%"} + +```{r compile-times-debug} +# Select and format the data to be displayed in the debug compile time table +compile_times |> + filter(type == "debug") |> + select(tags.name, stats.data) |> + arrange(desc(stats.data)) |> + gt() |> + cols_label( + `tags.name` = "Object", + `stats.data` = "Time" + ) |> + cols_align(align = "left", columns = everything()) |> + tab_header(title = "Debug") |> + fmt_markdown(columns = "tags.name") |> + opt_interactive(use_page_size_select = TRUE, use_search = TRUE) +``` + +::: + +:::: + +```{r expensive-objects-link} +# Format linke time data +link_times <- results |> + filter(tags.suite == "linking", commit.sha == newest_sha) |> + mutate( + stats.data = lubridate::dseconds(stats.data), + tags.name = glue::glue("`{tags.name}`") + ) + +``` + +### Link Times + +:::: {.columns} + +::: {.column width="49%"} + +```{r link-times-release} +# Select and format the data to be displayed in the release link time table +link_times |> + filter(type == "release") |> + select(tags.name, stats.data) |> + arrange(desc(stats.data)) |> + gt() |> + cols_label( + `tags.name` = "Object", + `stats.data` = "Time" + ) |> + cols_align(align = "left", columns = everything()) |> + tab_header(title = "Release") |> + fmt_markdown(columns = "tags.name") |> + opt_interactive(use_page_size_select = TRUE, use_search = TRUE) +``` + +::: + +::: {.column width="2%"} + +::: + +::: {.column width="49%"} + +```{r link-times-debug} +# Select and format the data to be displayed in the debug link time table +link_times |> + filter(type == "debug") |> + select(tags.name, stats.data) |> + arrange(desc(stats.data)) |> + gt() |> + cols_label( + `tags.name` = "Object", + `stats.data` = "Time" + ) |> + cols_align(align = "left", columns = everything()) |> + tab_header(title = "Link Times - Debug") |> + fmt_markdown(columns = "tags.name") |> + opt_interactive(use_page_size_select = TRUE, use_search = TRUE) +``` + +::: + +:::: + + +## Sizes +```{r big-objects} +# This is converts byte values into human-readable values in the tables +size_formatter <- function(x) { + function(x) { + prettyunits::pretty_bytes(x) + } +} + +# Prepare object size data +object_sizes <- results |> + filter(endsWith(tags.source, "size"), commit.sha == newest_sha) |> + mutate( + tags.name = glue::glue("`{tags.name}`") + ) + +# Filter the data and layout the size overview plots +sizes_plot <- results |> + filter(tags.suite == "executable", startsWith(tags.name, "total_")) |> + ggplot(aes( + x = timestamp, + y = stats.data, + group = interaction(tags.name, type), color 
+  )) +
+  facet_wrap(~type) +
+  geom_line() +
+  geom_point() +
+  scale_y_continuous(labels = size_formatter()) +
+  scale_x_datetime() +
+  labs(
+    title = "Velox Object Sizes",
+    x = "Date",
+    y = "Size"
+  ) +
+  scale_color_viridis_d()
+ggplotly(sizes_plot) |>
+  layout(legend = list(title = list(text = "Tags Name<br>"))) ## needed because theme legend specs don't work with ggplotly
"))) ## needed because theme legend specs don't work with ggplotly +``` + +### Object Sizes +:::: {.columns} + +::: {.column width="49%"} + +```{r object-sizes-release} +# Select and format the data to be displayed in the release size table +object_sizes |> + filter(type == "release") |> + select(tags.name, stats.data) |> + arrange(desc(stats.data)) |> + gt() |> + cols_label( + `tags.name` = "Object", + `stats.data` = "Size" + ) |> + fmt(columns = `stats.data`, fn = size_formatter()) |> + fmt_markdown(columns = "tags.name") |> + cols_align(align = "left", columns = everything()) |> + tab_header(title = "Release") |> + opt_interactive(use_page_size_select = TRUE, use_search = TRUE) +``` + +::: + +::: {.column width="2%"} + +::: + +::: {.column width="49%"} + +```{r object-sizes-debug} +# Select and format the data to be displayed in the debug size table +object_sizes |> + filter(type == "debug") |> + select(tags.name, stats.data) |> + arrange(desc(stats.data)) |> + gt() |> + fmt(columns = `stats.data`, fn = size_formatter()) |> + fmt_markdown(columns = "tags.name") |> + cols_label( + `tags.name` = "Object", + `stats.data` = "Time" + ) |> + cols_align(align = "left", columns = everything()) |> + tab_header(title = "Debug") |> + opt_interactive(use_page_size_select = TRUE, use_search = TRUE) +``` + +::: + +:::: + +::::: diff --git a/scripts/build-metrics.py b/scripts/build-metrics.py new file mode 100755 index 0000000000000..707d81eb6a31b --- /dev/null +++ b/scripts/build-metrics.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import sys +import uuid +from os.path import join, splitext +from pathlib import Path +from typing import Any, Dict, List + +from benchadapt import BenchmarkResult +from benchadapt.adapters import BenchmarkAdapter + + +class BinarySizeAdapter(BenchmarkAdapter): + """ + Adapter to track build artifact sizes in conbench. + Expects the `size_file` to be formatted like this: + + + Suite meta data will be library, object or executable + based on file ending. 
+ """ + + size_file: Path + + def __init__( + self, + command: List[str], + size_file: str, + build_type: str, + result_fields_override: Dict[str, Any] = {}, + result_fields_append: Dict[str, Any] = {}, + ) -> None: + self.size_file = Path(size_file) + if build_type not in ["debug", "release"]: + raise ValueError(f"Build type '{build_type}' is not valid!") + self.build_type = build_type + super().__init__(command, result_fields_override, result_fields_append) + + def _transform_results(self) -> List[BenchmarkResult]: + results = [] + + batch_id = uuid.uuid4().hex + with open(self.size_file, "r") as file: + sizes = [line.strip() for line in file] + + if not sizes: + raise ValueError("'size_file' is empty!") + + for line in sizes: + size, path = line.split(maxsplit=1) + path = path.strip() + _, ext = splitext(path) + if ext in [".so", ".a"]: + suite = "library" + elif ext == ".o": + suite = "object" + else: + suite = "executable" + + parsed_size = BenchmarkResult( + run_reason="merge", + batch_id=batch_id, + stats={ + "data": [size], + "unit": "B", + "iterations": 1, + }, + tags={ + "name": path, + "suite": suite, + "source": f"{self.build_type}_build_metrics_size", + }, + info={}, + context={"benchmark_language": "C++"}, + ) + results.append(parsed_size) + + return results + + +class NinjaLogAdapter(BenchmarkAdapter): + """ + Adapter to extract compile and link times from a .ninja_log. + Will calculate aggregates for total, compile and link time. + Suite metadata will be set based on binary ending to object, library or executable. + + Only files in paths beginning with velox/ will be tracked to avoid dependencies. + """ + + ninja_log: Path + + def __init__( + self, + command: List[str], + ninja_log: str, + build_type: str, + result_fields_override: Dict[str, Any] = {}, + result_fields_append: Dict[str, Any] = {}, + ) -> None: + self.ninja_log = Path(ninja_log) + if build_type not in ["debug", "release"]: + raise ValueError(f"Build type '{build_type}' is not valid!") + self.build_type = build_type + super().__init__(command, result_fields_override, result_fields_append) + + def _transform_results(self) -> List[BenchmarkResult]: + results = [] + + batch_id = uuid.uuid4().hex + with open(self.ninja_log, "r") as file: + log_lines = [line.strip() for line in file] + + if not log_lines[0].startswith("# ninja log v"): + raise ValueError("Malformed Ninja log found!") + else: + del log_lines[0] + + ms2sec = lambda x: x / 1000 + get_epoch = lambda l: int(l.split()[2]) + totals = { + "link_time": 0, + "compile_time": 0, + "total_time": 0, + } + + for line in log_lines: + start, end, epoch, object_path, _ = line.split() + start = int(start) + end = int(end) + duration = ms2sec(end - start) + + # Don't track dependency times (refine check potentially?) 
+            if not object_path.startswith("velox"):
+                continue
+
+            _, ext = splitext(object_path)
+            if ext in [".so", ".a"] or not ext:
+                totals["link_time"] += duration
+                suite = "linking"
+            elif ext == ".o":
+                totals["compile_time"] += duration
+                suite = "compiling"
+            else:
+                print(f"Unknown file type found: {object_path}")
+                print("Skipping...")
+                continue
+
+            time_result = BenchmarkResult(
+                run_reason="merge",
+                batch_id=batch_id,
+                stats={
+                    "data": [duration],
+                    "unit": "s",
+                    "iterations": 1,
+                },
+                tags={
+                    "name": object_path,
+                    "suite": suite,
+                    "source": f"{self.build_type}_build_metrics_time",
+                },
+                info={},
+                context={"benchmark_language": "C++"},
+            )
+            results.append(time_result)
+
+        totals["total_time"] = totals["link_time"] + totals["compile_time"]
+        for total_name, total in totals.items():
+            total_result = BenchmarkResult(
+                run_reason="merge",
+                batch_id=batch_id,
+                stats={
+                    "data": [total],
+                    "unit": "s",
+                    "iterations": 1,
+                },
+                tags={
+                    "name": total_name,
+                    "suite": "total",
+                    "source": f"{self.build_type}_build_metrics_time",
+                },
+                info={},
+                context={"benchmark_language": "C++"},
+            )
+            results.append(total_result)
+
+        return results
+
+
+# find velox -type f -name '*.o' -exec ls -l -BB {} \; | awk '{print $5, $9}' | sed 's|CMakeFiles/.*dir/||g' > /tmp/object-size
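+
+# A hypothetical invocation (run ID, sha, and path are illustrative; the
+# `upload` subcommand and its flags are defined in parse_args() below):
+#
+#   ./scripts/build-metrics.py upload --build_type release \
+#       --run_id BM-release-1234 --sha abc123 /path/to/build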
+
+
+def upload(args):
+    print("Uploading Build Metrics")
+    pr_number = int(args.pr_number) if args.pr_number else None
+    run_reason = "pull request" if pr_number else "commit"
+    run_name = f"{run_reason}: {args.sha}"
+    sizes = BinarySizeAdapter(
+        command=["true"],
+        size_file=join(args.base_path, args.size_file),
+        build_type=args.build_type,
+        result_fields_override={
+            "run_id": args.run_id,
+            "run_name": run_name,
+            "run_reason": run_reason,
+            "github": {
+                "repository": "https://github.com/facebookincubator/velox",
+                "pr_number": pr_number,
+                "commit": args.sha,
+            },
+        },
+    )
+    sizes()
+
+    times = NinjaLogAdapter(
+        command=["true"],
+        ninja_log=join(args.base_path, args.ninja_log),
+        build_type=args.build_type,
+        result_fields_override={
+            "run_id": args.run_id,
+            "run_name": run_name,
+            "run_reason": run_reason,
+            "github": {
+                "repository": "https://github.com/facebookincubator/velox",
+                "pr_number": pr_number,
+                "commit": args.sha,
+            },
+        },
+    )
+    times()
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Velox Build Metric Utility.")
+    parser.set_defaults(func=lambda _: parser.print_help())
+
+    subparsers = parser.add_subparsers(help="Please specify one of the subcommands.")
+
+    upload_parser = subparsers.add_parser(
+        "upload", help="Parse and upload build metrics"
+    )
+    upload_parser.set_defaults(func=upload)
+    upload_parser.add_argument(
+        "--ninja_log", default=".ninja_log", help="Name of the ninja log file."
+    )
+    upload_parser.add_argument(
+        "--size_file",
+        default="object_sizes",
+        help="Name of the file containing size information.",
+    )
+    upload_parser.add_argument(
+        "--build_type",
+        required=True,
+        help="Type of build the results come from, e.g. debug or release",
+    )
+    upload_parser.add_argument(
+        "--run_id",
+        required=True,
+        help="A Conbench run ID unique to this build.",
+    )
+    upload_parser.add_argument(
+        "--sha",
+        required=True,
+        help="HEAD sha for the result upload to conbench.",
+    )
+    upload_parser.add_argument(
+        "--pr_number",
+        default=0,
+        help="PR number for the result upload to conbench.",
+    )
+    upload_parser.add_argument(
+        "base_path",
+        help="Path in which the .ninja_log and size_file are found.",
+    )
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    args.func(args)
diff --git a/scripts/centos-8-stream.dockerfile b/scripts/centos-8-stream.dockerfile
deleted file mode 100644
index a6a4531c8ef8c..0000000000000
--- a/scripts/centos-8-stream.dockerfile
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Build the test and build container for presto_cpp
-#
-FROM quay.io/centos/centos:stream8
-
-ADD scripts /velox/scripts/
-RUN /velox/scripts/setup-centos8.sh
-
-WORKDIR /velox
-
diff --git a/scripts/centos.dockerfile b/scripts/centos.dockerfile
new file mode 100644
index 0000000000000..50a18e9a7243e
--- /dev/null
+++ b/scripts/centos.dockerfile
@@ -0,0 +1,32 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Build the test and build container for presto_cpp
+ARG image=quay.io/centos/centos:stream9
+FROM $image
+
+COPY scripts/setup-helper-functions.sh /
+COPY scripts/setup-centos9.sh /
+# The removal of the build dir has to happen in the same layer as the build
+# to minimize the image size. gh & jq are required for CI.
+RUN mkdir build && ( cd build && bash /setup-centos9.sh ) && rm -rf build && \
+    dnf install -y -q 'dnf-command(config-manager)' && \
+    dnf config-manager --add-repo 'https://cli.github.com/packages/rpm/gh-cli.repo' && \
+    dnf install -y -q gh jq && \
+    dnf clean all
+
+ENV CC=/opt/rh/gcc-toolset-12/root/bin/gcc \
+    CXX=/opt/rh/gcc-toolset-12/root/bin/g++
+
+ENTRYPOINT ["/bin/bash", "-c", "source /opt/rh/gcc-toolset-12/enable && exec \"$@\"", "--"]
+CMD ["/bin/bash"]
diff --git a/scripts/check-container.dockfile b/scripts/check-container.dockfile
index d5f1ba4c873e4..9240a97dcd8c1 100644
--- a/scripts/check-container.dockfile
+++ b/scripts/check-container.dockfile
@@ -11,8 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # limitations under the License. -FROM amd64/ubuntu:22.04 -ARG cpu_target -COPY setup-check.sh /root -COPY setup-helper-functions.sh / -RUN CPU_TARGET="$cpu_target" bash /root/setup-check.sh +FROM amd64/ubuntu:24.04 +COPY scripts/setup-check.sh /root +COPY scripts/setup-helper-functions.sh / +RUN bash /root/setup-check.sh diff --git a/scripts/check.py b/scripts/check.py index f76f130533f77..bd47ecd5324f1 100755 --- a/scripts/check.py +++ b/scripts/check.py @@ -63,6 +63,15 @@ def fix(self, commit): class CMakeFormatter(str): + def __init__(self, commit) -> None: + super().__init__() + try: + import yaml + except ModuleNotFoundError: + # We need pyyaml so cmake-format can read '.cmake-format.yml' + # otherwise it will run with default + raise SystemExit("Please install 'pyyaml' for the CMake formatter.") + def diff(self, commit): return get_diff( self, util.run(f"cmake-format --first-comment-is-literal True {self}")[1] @@ -185,13 +194,16 @@ def get_files(commit, path): if commit != "": status, stdout, stderr = util.run( - f"git diff --relative --name-only --diff-filter='ACM' {commit}" + f"git diff --relative --name-only --diff-filter='ACMR' {commit}" ) filelist = stdout.splitlines() else: - for root, dirs, files in os.walk(path): - for name in files: - filelist.append(os.path.join(root, name)) + if os.path.isfile(path): + filelist.append(path) + else: + for root, dirs, files in os.walk(path): + for name in files: + filelist.append(os.path.join(root, name)) return [ file @@ -201,6 +213,7 @@ def get_files(commit, path): and "build/fbcode_builder" not in file and "build/deps" not in file and "cmake-build-debug" not in file + and "NOTICE.txt" != file ] diff --git a/scripts/circleci-container.dockfile b/scripts/circleci-container.dockfile deleted file mode 100644 index 3c9152d7e7eb9..0000000000000 --- a/scripts/circleci-container.dockfile +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Build the test and build container for presto_cpp -# -FROM quay.io/centos/centos:stream8 -ARG cpu_target -COPY setup-circleci.sh / -COPY setup-helper-functions.sh / -RUN mkdir build && ( cd build && CPU_TARGET="$cpu_target" bash /setup-circleci.sh ) && rm -rf build diff --git a/scripts/docker-command.sh b/scripts/docker-command.sh index 4c340351313b8..8abc2af301885 100755 --- a/scripts/docker-command.sh +++ b/scripts/docker-command.sh @@ -16,4 +16,4 @@ set -eu # Compilation and testing make -cd _build/release && ctest -j${NUM_THREADS} -VV --output-on-failure +cd _build/release && ctest -j${NUM_THREADS} --output-on-failure --no-tests=error diff --git a/scripts/gen-docs.sh b/scripts/gen-docs.sh deleted file mode 100755 index 4b40dda01a7c6..0000000000000 --- a/scripts/gen-docs.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Generate Documentation -# **NOTE** -# ******** -# scripts assume there is a conda environment already -# created with the name as an argument to the script -# Example -# Create Environment -# `conda create -y --name pyveloxenv-docs python=3.7` -DOCS_CONDA_ENV=$1 -ENVS=$(conda env list | grep $DOCS_CONDA_ENV) -if [ -z "$ENVS" ] -then - echo "conda environment for documentation not available" -else - echo "Installing doc generation dependencies..." - source ${CONDA_PREFIX}/etc/profile.d/conda.sh - conda activate ${DOCS_CONDA_ENV} - conda install -y -c anaconda sphinx - conda install -y -c conda-forge pandoc - ## install Pyvelox - LD_LIBRARY_PATH=/usr/local/lib make python-build - ## generate the Python README - cd velox/docs \ - && make clean \ - && mkdir -p bindings/python \ - && pandoc ../../pyvelox/README.md --from markdown --to rst -s -o bindings/python/README_generated_pyvelox.rst \ - && make html -fi diff --git a/.circleci/hdfs-client.xml b/scripts/hdfs-client.xml similarity index 100% rename from .circleci/hdfs-client.xml rename to scripts/hdfs-client.xml diff --git a/scripts/presto/etc/config.properties.example b/scripts/presto/etc/config.properties.example new file mode 100644 index 0000000000000..1639cec2fe979 --- /dev/null +++ b/scripts/presto/etc/config.properties.example @@ -0,0 +1,5 @@ +coordinator=true +node-scheduler.include-coordinator=true +http-server.http.port=8080 +discovery-server.enabled=true +discovery.uri=http://localhost:8080 diff --git a/scripts/presto/etc/hive.properties b/scripts/presto/etc/hive.properties new file mode 100644 index 0000000000000..e9a0d05c76a77 --- /dev/null +++ b/scripts/presto/etc/hive.properties @@ -0,0 +1,4 @@ +connector.name=hive-hadoop2 +hive.metastore=file +hive.metastore.catalog.dir=file:/opt/presto-server/etc/data +hive.allow-drop-table=true \ No newline at end of file diff --git a/scripts/presto/etc/jvm.config.example b/scripts/presto/etc/jvm.config.example new file mode 100644 index 0000000000000..3005c82637f8d --- /dev/null +++ b/scripts/presto/etc/jvm.config.example @@ -0,0 +1,10 @@ +-server +-Xmx10G +-XX:+UseG1GC +-XX:G1HeapRegionSize=32M +-XX:+UseGCOverheadLimit +-XX:+ExplicitGCInvokesConcurrent +-XX:+HeapDumpOnOutOfMemoryError +-XX:+ExitOnOutOfMemoryError +-Djdk.attach.allowAttachSelf=true +-Duser.timezone=America/Los_Angeles diff --git a/scripts/presto/etc/node.properties b/scripts/presto/etc/node.properties new file mode 100644 index 0000000000000..9aeda0213159b --- /dev/null +++ b/scripts/presto/etc/node.properties @@ -0,0 +1,3 @@ +node.environment=production +node.id=ffffffff-ffff-ffff-ffff-ffffffffffff +node.data-dir=/tmp/presto/data diff --git a/scripts/presto/start-prestojava.sh b/scripts/presto/start-prestojava.sh new file mode 100755 index 0000000000000..290e43af8afe7 --- /dev/null +++ b/scripts/presto/start-prestojava.sh @@ -0,0 +1,19 @@ +#!/bin/sh +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +"$PRESTO_HOME"/bin/launcher --pid-file=/tmp/pidfile run + diff --git a/scripts/prestojava-container.dockerfile b/scripts/prestojava-container.dockerfile new file mode 100644 index 0000000000000..7ef27c6f68a51 --- /dev/null +++ b/scripts/prestojava-container.dockerfile @@ -0,0 +1,51 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Build the test and build container for presto_cpp +# +FROM ghcr.io/facebookincubator/velox-dev:centos9 + +ARG PRESTO_VERSION=0.288 + +ADD scripts /velox/scripts/ +RUN wget https://repo1.maven.org/maven2/com/facebook/presto/presto-server/${PRESTO_VERSION}/presto-server-${PRESTO_VERSION}.tar.gz +RUN wget https://repo1.maven.org/maven2/com/facebook/presto/presto-cli/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar + +ARG PRESTO_PKG=presto-server-$PRESTO_VERSION.tar.gz +ARG PRESTO_CLI_JAR=presto-cli-$PRESTO_VERSION-executable.jar + +ENV PRESTO_HOME="/opt/presto-server" +RUN cp $PRESTO_CLI_JAR /opt/presto-cli + +RUN dnf install -y java-11-openjdk less procps python3 tzdata \ + && ln -s $(which python3) /usr/bin/python \ + && tar -zxf $PRESTO_PKG \ + && mv ./presto-server-$PRESTO_VERSION $PRESTO_HOME \ + && chmod +x /opt/presto-cli \ + && ln -s /opt/presto-cli /usr/local/bin/ \ + && mkdir -p $PRESTO_HOME/etc \ + && mkdir -p $PRESTO_HOME/etc/catalog \ + && mkdir -p $PRESTO_HOME/etc/data \ + && mkdir -p /usr/lib/presto/utils + +# We set the timezone to America/Los_Angeles due to issue +# detailed here : https://github.com/facebookincubator/velox/issues/8127 +ENV TZ=America/Los_Angeles + +COPY scripts/presto/etc/config.properties.example $PRESTO_HOME/etc/config.properties +COPY scripts/presto/etc/jvm.config.example $PRESTO_HOME/etc/jvm.config +COPY scripts/presto/etc/node.properties $PRESTO_HOME/etc/node.properties +COPY scripts/presto/etc/hive.properties $PRESTO_HOME/etc/catalog +COPY scripts/presto/start-prestojava.sh /opt + +WORKDIR /velox diff --git a/scripts/setup-adapters.sh b/scripts/setup-adapters.sh index 5bc9c81e4605f..4fed3cdced9a1 100755 --- a/scripts/setup-adapters.sh +++ b/scripts/setup-adapters.sh @@ -21,14 +21,39 @@ set -eufx -o pipefail SCRIPTDIR=$(dirname "${BASH_SOURCE[0]}") source $SCRIPTDIR/setup-helper-functions.sh -DEPENDENCY_DIR=${DEPENDENCY_DIR:-$(pwd)} +DEPENDENCY_DIR=${DEPENDENCY_DIR:-$(pwd)/deps-download} +CMAKE_BUILD_TYPE="${BUILD_TYPE:-Release}" +MACHINE=$(uname -m) -function install_aws-sdk-cpp { +if [[ "$OSTYPE" == 
darwin* ]]; then + export INSTALL_PREFIX=${INSTALL_PREFIX:-"$(pwd)/deps-install"} +fi + +function install_aws_deps { local AWS_REPO_NAME="aws/aws-sdk-cpp" - local AWS_SDK_VERSION="1.10.57" + local AWS_SDK_VERSION="1.11.321" github_checkout $AWS_REPO_NAME $AWS_SDK_VERSION --depth 1 --recurse-submodules - cmake_install -DCMAKE_BUILD_TYPE=Debug -DBUILD_SHARED_LIBS:BOOL=OFF -DMINIMIZE_SIZE:BOOL=ON -DENABLE_TESTING:BOOL=OFF -DBUILD_ONLY:STRING="s3;identity-management" + cmake_install -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DBUILD_SHARED_LIBS:BOOL=OFF -DMINIMIZE_SIZE:BOOL=ON -DENABLE_TESTING:BOOL=OFF -DBUILD_ONLY:STRING="s3;identity-management" + # Dependencies for S3 testing + # We need this specific version of Minio for testing. + local MINIO_ARCH=$MACHINE + if [[ $MACHINE == aarch64 ]]; then + MINIO_ARCH="arm64" + elif [[ $MACHINE == x86_64 ]]; then + MINIO_ARCH="amd64" + fi + local MINIO_BINARY="minio-2022-05-26" + if [ ! -f /usr/local/bin/${MINIO_BINARY} ]; then + local MINIO_OS="linux" + if [[ "$OSTYPE" == darwin* ]]; then + # minio will have to approved under the Privacy & Security on MacOS on first use. + MINIO_OS="darwin" + fi + wget https://dl.min.io/server/minio/release/${MINIO_OS}-${MINIO_ARCH}/archive/minio.RELEASE.2022-05-26T05-48-41Z -O ${MINIO_BINARY} + chmod +x ./${MINIO_BINARY} + mv ./${MINIO_BINARY} /usr/local/bin/ + fi } function install_gcs-sdk-cpp { @@ -36,95 +61,132 @@ function install_gcs-sdk-cpp { # https://github.com/googleapis/google-cloud-cpp/blob/main/doc/packaging.md#required-libraries # abseil-cpp - github_checkout abseil/abseil-cpp 20230125.3 --depth 1 - sed -i 's/^#define ABSL_OPTION_USE_\(.*\) 2/#define ABSL_OPTION_USE_\1 0/' "absl/base/options.h" - cmake_install -DBUILD_SHARED_LIBS=OFF \ - -DABSL_BUILD_TESTING=OFF + github_checkout abseil/abseil-cpp 20240116.2 --depth 1 + cmake_install \ + -DABSL_BUILD_TESTING=OFF \ + -DCMAKE_CXX_STANDARD=17 \ + -DABSL_PROPAGATE_CXX_STD=ON \ + -DABSL_ENABLE_INSTALL=ON + + # protobuf + github_checkout protocolbuffers/protobuf v21.8 --depth 1 + cmake_install \ + -Dprotobuf_BUILD_TESTS=OFF \ + -Dprotobuf_ABSL_PROVIDER=package + + # grpc + github_checkout grpc/grpc v1.48.1 --depth 1 + cmake_install \ + -DgRPC_BUILD_TESTS=OFF \ + -DgRPC_ABSL_PROVIDER=package \ + -DgRPC_ZLIB_PROVIDER=package \ + -DgRPC_CARES_PROVIDER=package \ + -DgRPC_RE2_PROVIDER=package \ + -DgRPC_SSL_PROVIDER=package \ + -DgRPC_PROTOBUF_PROVIDER=package \ + -DgRPC_INSTALL=ON # crc32 github_checkout google/crc32c 1.1.2 --depth 1 - cmake_install -DBUILD_SHARED_LIBS=OFF \ + cmake_install \ -DCRC32C_BUILD_TESTS=OFF \ -DCRC32C_BUILD_BENCHMARKS=OFF \ -DCRC32C_USE_GLOG=OFF # nlohmann json - github_checkout nlohmann/json v3.11.2 --depth 1 - cmake_install -DBUILD_SHARED_LIBS=OFF \ + github_checkout nlohmann/json v3.11.3 --depth 1 + cmake_install \ -DJSON_BuildTests=OFF # google-cloud-cpp - github_checkout googleapis/google-cloud-cpp v2.10.1 --depth 1 - cmake_install -DBUILD_SHARED_LIBS=OFF \ - -DCMAKE_INSTALL_MESSAGE=NEVER \ + github_checkout googleapis/google-cloud-cpp v2.22.0 --depth 1 + cmake_install \ -DGOOGLE_CLOUD_CPP_ENABLE_EXAMPLES=OFF \ -DGOOGLE_CLOUD_CPP_ENABLE=storage } function install_azure-storage-sdk-cpp { - github_checkout azure/azure-sdk-for-cpp azure-storage-blobs_12.8.0 - - cd sdk/core/azure-core - if ! grep -q "baseline" vcpkg.json; then + # Disable VCPKG to install additional static dependencies under the VCPKG installed path + # instead of using system pre-installed dependencies. 
+ export AZURE_SDK_DISABLE_AUTO_VCPKG=ON + vcpkg_commit_id=7a6f366cefd27210f6a8309aed10c31104436509 + github_checkout azure/azure-sdk-for-cpp azure-storage-files-datalake_12.8.0 + sed -i "s/set(VCPKG_COMMIT_STRING .*)/set(VCPKG_COMMIT_STRING $vcpkg_commit_id)/" cmake-modules/AzureVcpkg.cmake + + azure_core_dir="sdk/core/azure-core" + if ! grep -q "baseline" $azure_core_dir/vcpkg.json; then # build and install azure-core with the version compatible with system pre-installed openssl openssl_version=$(openssl version -v | awk '{print $2}') if [[ "$openssl_version" == 1.1.1* ]]; then openssl_version="1.1.1n" fi - sed -i 's/"version-string"/"builtin-baseline": "dafef74af53669ef1cc9015f55e0ce809ead62aa","version-string"/' vcpkg.json - sed -i "s/\"version-string\"/\"overrides\": [{ \"name\": \"openssl\", \"version-string\": \"$openssl_version\" }],\"version-string\"/" vcpkg.json + sed -i "s/\"version-string\"/\"builtin-baseline\": \"$vcpkg_commit_id\",\"version-string\"/" $azure_core_dir/vcpkg.json + sed -i "s/\"version-string\"/\"overrides\": [{ \"name\": \"openssl\", \"version-string\": \"$openssl_version\" }],\"version-string\"/" $azure_core_dir/vcpkg.json fi - cmake_install -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF - - cd - + ( + cd $azure_core_dir + cmake_install -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DBUILD_SHARED_LIBS=OFF + ) # install azure-storage-common - cd sdk/storage/azure-storage-common - cmake_install -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF + ( + cd sdk/storage/azure-storage-common + cmake_install -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DBUILD_SHARED_LIBS=OFF + ) - cd - # install azure-storage-blobs - cd sdk/storage/azure-storage-blobs - if ! grep -q "baseline" vcpkg.json; then - sed -i 's/"version-semver"/"builtin-baseline": "dafef74af53669ef1cc9015f55e0ce809ead62aa","version-semver"/' vcpkg.json - fi - cmake_install -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF + ( + cd sdk/storage/azure-storage-blobs + cmake_install -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DBUILD_SHARED_LIBS=OFF + ) + # install azure-storage-files-datalake + ( + cd sdk/storage/azure-storage-files-datalake + cmake_install -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DBUILD_SHARED_LIBS=OFF + ) } -function install_libhdfs3 { +function install_hdfs_deps { github_checkout apache/hawq master - cd $DEPENDENCY_DIR/hawq/depends/libhdfs3 + libhdfs3_dir=hawq/depends/libhdfs3 if [[ "$OSTYPE" == darwin* ]]; then - sed -i '' -e "/FIND_PACKAGE(GoogleTest REQUIRED)/d" ./CMakeLists.txt - sed -i '' -e "s/dumpversion/dumpfullversion/" ./CMakeLists.txt + sed -i '' -e "/FIND_PACKAGE(GoogleTest REQUIRED)/d" $DEPENDENCY_DIR/$libhdfs3_dir/CMakeLists.txt + sed -i '' -e "s/dumpversion/dumpfullversion/" $DEPENDENCY_DIR/$libhdfs3_dir/CMakeLists.txt fi if [[ "$OSTYPE" == linux-gnu* ]]; then - sed -i "/FIND_PACKAGE(GoogleTest REQUIRED)/d" ./CMakeLists.txt - sed -i "s/dumpversion/dumpfullversion/" ./CMake/Platform.cmake + sed -i "/FIND_PACKAGE(GoogleTest REQUIRED)/d" $DEPENDENCY_DIR/$libhdfs3_dir/CMakeLists.txt + sed -i "s/dumpversion/dumpfullversion/" $DEPENDENCY_DIR/$libhdfs3_dir/CMake/Platform.cmake + # Dependencies for Hadoop testing + wget_and_untar https://archive.apache.org/dist/hadoop/common/hadoop-3.3.0/hadoop-3.3.0.tar.gz hadoop + cp -a ${DEPENDENCY_DIR}/hadoop /usr/local/ + wget -P /usr/local/hadoop/share/hadoop/common/lib/ https://repo1.maven.org/maven2/junit/junit/4.11/junit-4.11.jar + + yum install -y java-1.8.0-openjdk-devel + fi - cmake_install + cmake_install_dir $libhdfs3_dir } -cd "${DEPENDENCY_DIR}" || 
exit +(mkdir -p "${DEPENDENCY_DIR}") || exit # aws-sdk-cpp missing dependencies if [[ "$OSTYPE" == "linux-gnu"* ]]; then # /etc/os-release is a standard way to query various distribution # information and is available everywhere LINUX_DISTRIBUTION=$(. /etc/os-release && echo ${ID}) - if [[ "$LINUX_DISTRIBUTION" == "ubuntu" ]]; then + if [[ "$LINUX_DISTRIBUTION" == "ubuntu" || "$LINUX_DISTRIBUTION" == "debian" ]]; then apt install -y --no-install-recommends libxml2-dev libgsasl7-dev uuid-dev # Dependencies of GCS, probably a workaround until the docker image is rebuilt apt install -y --no-install-recommends libc-ares-dev libcurl4-openssl-dev # Dependencies of Azure Storage Blob cpp apt install -y openssl else # Assume Fedora/CentOS - yum -y install libxml2-devel libgsasl-devel libuuid-devel + dnf -y install libxml2-devel libgsasl-devel libuuid-devel krb5-devel # Dependencies of GCS, probably a workaround until the docker image is rebuilt - yum -y install curl-devel c-ares-devel + dnf -y install npm curl-devel c-ares-devel # Dependencies of Azure Storage Blob Cpp - yum -y install perl-IPC-Cmd - yum -y install openssl + dnf -y install perl-IPC-Cmd + dnf -y install openssl fi fi @@ -174,10 +236,10 @@ if [ $install_gcs -eq 1 ]; then install_gcs-sdk-cpp fi if [ $install_aws -eq 1 ]; then - install_aws-sdk-cpp + install_aws_deps fi if [ $install_hdfs -eq 1 ]; then - install_libhdfs3 + install_hdfs_deps fi if [ $install_abfs -eq 1 ]; then install_azure-storage-sdk-cpp diff --git a/scripts/setup-centos8.sh b/scripts/setup-centos8.sh deleted file mode 100755 index fe6042301d8fb..0000000000000 --- a/scripts/setup-centos8.sh +++ /dev/null @@ -1,94 +0,0 @@ -#!/bin/bash -# Copyright (c) Facebook, Inc. and its affiliates. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -efx -o pipefail -# Some of the packages must be build with the same compiler flags -# so that some low level types are the same size. Also, disable warnings. -SCRIPTDIR=$(dirname "${BASH_SOURCE[0]}") -source $SCRIPTDIR/setup-helper-functions.sh -CPU_TARGET="${CPU_TARGET:-avx}" -NPROC=$(getconf _NPROCESSORS_ONLN) -export CFLAGS=$(get_cxx_flags $CPU_TARGET) # Used by LZO. -export CXXFLAGS=$CFLAGS # Used by boost. -export CPPFLAGS=$CFLAGS # Used by LZO. - -function dnf_install { - dnf install -y -q --setopt=install_weak_deps=False "$@" -} - -dnf_install epel-release dnf-plugins-core # For ccache, ninja -dnf config-manager --set-enabled powertools -dnf_install ninja-build ccache gcc-toolset-9 git wget which libevent-devel \ - openssl-devel re2-devel libzstd-devel lz4-devel double-conversion-devel \ - libdwarf-devel curl-devel cmake libicu-devel - -dnf remove -y gflags - -# Required for Thrift -dnf_install autoconf automake libtool bison flex python3 - -# Required for google-cloud-storage -dnf_install curl-devel c-ares-devel - -dnf_install conda - -# Activate gcc9; enable errors on unset variables afterwards. 
-source /opt/rh/gcc-toolset-9/enable || exit 1 -set -u - -function cmake_install_deps { - cmake -B "$1-build" -GNinja -DCMAKE_CXX_STANDARD=17 \ - -DCMAKE_CXX_FLAGS="${CFLAGS}" -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=Release -Wno-dev "$@" - ninja -C "$1-build" install -} - -function wget_and_untar { - local URL=$1 - local DIR=$2 - mkdir -p "${DIR}" - wget -q --max-redirect 3 -O - "${URL}" | tar -xz -C "${DIR}" --strip-components=1 -} - - -# Fetch sources. -wget_and_untar https://github.com/gflags/gflags/archive/v2.2.2.tar.gz gflags & -wget_and_untar https://github.com/google/glog/archive/v0.4.0.tar.gz glog & -wget_and_untar http://www.oberhumer.com/opensource/lzo/download/lzo-2.10.tar.gz lzo & -wget_and_untar https://boostorg.jfrog.io/artifactory/main/release/1.72.0/source/boost_1_72_0.tar.gz boost & -wget_and_untar https://github.com/google/snappy/archive/1.1.8.tar.gz snappy & -wget_and_untar https://github.com/fmtlib/fmt/archive/8.0.1.tar.gz fmt & - -wait # For cmake and source downloads to complete. - -# Build & install. -( - cd lzo - ./configure --prefix=/usr --enable-shared --disable-static --docdir=/usr/share/doc/lzo-2.10 - make "-j$(nproc)" - make install -) - -( - cd boost - ./bootstrap.sh --prefix=/usr/local - ./b2 "-j$(nproc)" -d0 install threading=multi -) - -cmake_install_deps gflags -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON -DBUILD_gflags_LIB=ON -DLIB_SUFFIX=64 -DCMAKE_INSTALL_PREFIX:PATH=/usr -cmake_install_deps glog -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX:PATH=/usr -cmake_install_deps snappy -DSNAPPY_BUILD_TESTS=OFF -cmake_install_deps fmt -DFMT_TEST=OFF - -dnf clean all diff --git a/scripts/setup-centos9.sh b/scripts/setup-centos9.sh new file mode 100755 index 0000000000000..1efb8e53ec34e --- /dev/null +++ b/scripts/setup-centos9.sh @@ -0,0 +1,281 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script documents setting up a Centos9 host for Velox +# development. Running it should make you ready to compile. +# +# Environment variables: +# * INSTALL_PREREQUISITES="N": Skip installation of packages for build. +# * PROMPT_ALWAYS_RESPOND="n": Automatically respond to interactive prompts. +# Use "n" to never wipe directories. +# +# You can also run individual functions below by specifying them as arguments: +# $ scripts/setup-centos9.sh install_googletest install_fmt +# + +set -efx -o pipefail +# Some of the packages must be build with the same compiler flags +# so that some low level types are the same size. Also, disable warnings. +SCRIPTDIR=$(dirname "${BASH_SOURCE[0]}") +source $SCRIPTDIR/setup-helper-functions.sh +NPROC=$(getconf _NPROCESSORS_ONLN) +export CXXFLAGS=$(get_cxx_flags) # Used by boost. +export CFLAGS=${CXXFLAGS//"-std=c++17"/} # Used by LZO. 
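+# For illustration, on an AVX-capable x86_64 Linux host get_cxx_flags echoes
+# "-mavx2 -mfma -mavx -mf16c -mlzcnt -std=c++17 -mbmi2"; CFLAGS above is that
+# same string with -std=c++17 stripped, since a C++ -std flag does not apply
+# to C sources such as LZO.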
+CMAKE_BUILD_TYPE="${BUILD_TYPE:-Release}" +BUILD_DUCKDB="${BUILD_DUCKDB:-true}" +USE_CLANG="${USE_CLANG:-false}" +export INSTALL_PREFIX=${INSTALL_PREFIX:-"/usr/local"} +DEPENDENCY_DIR=${DEPENDENCY_DIR:-$(pwd)/deps-download} + +FB_OS_VERSION="v2024.09.16.00" +FMT_VERSION="10.1.1" +BOOST_VERSION="boost-1.84.0" +ARROW_VERSION="15.0.0" +FAST_FLOAT_VERSION="v6.1.6" + +function dnf_install { + dnf install -y -q --setopt=install_weak_deps=False "$@" +} + +function install_clang15 { + dnf_install clang15 gcc-toolset-13-libatomic-devel +} + +# Install packages required for build. +function install_build_prerequisites { + dnf update -y + dnf_install epel-release dnf-plugins-core # For ccache, ninja + dnf config-manager --set-enabled crb + dnf update -y + dnf_install ninja-build cmake ccache gcc-toolset-12 git wget which + dnf_install autoconf automake python3-devel pip libtool + + pip install cmake==3.28.3 + + if [[ ${USE_CLANG} != "false" ]]; then + install_clang15 + fi +} + +# Install dependencies from the package managers. +function install_velox_deps_from_dnf { + dnf_install libevent-devel \ + openssl-devel re2-devel libzstd-devel lz4-devel double-conversion-devel \ + libdwarf-devel elfutils-libelf-devel curl-devel libicu-devel bison flex \ + libsodium-devel zlib-devel + + # install sphinx for doc gen + pip install sphinx sphinx-tabs breathe sphinx_rtd_theme +} + +function install_conda { + dnf_install conda +} + +function install_gflags { + # Remove an older version if present. + dnf remove -y gflags + wget_and_untar https://github.com/gflags/gflags/archive/v2.2.2.tar.gz gflags + cmake_install_dir gflags -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON -DBUILD_gflags_LIB=ON -DLIB_SUFFIX=64 +} + +function install_glog { + wget_and_untar https://github.com/google/glog/archive/v0.6.0.tar.gz glog + cmake_install_dir glog -DBUILD_SHARED_LIBS=ON +} + +function install_lzo { + wget_and_untar http://www.oberhumer.com/opensource/lzo/download/lzo-2.10.tar.gz lzo + ( + cd ${DEPENDENCY_DIR}/lzo + ./configure --prefix=${INSTALL_PREFIX} --enable-shared --disable-static --docdir=/usr/share/doc/lzo-2.10 + make "-j$(nproc)" + make install + ) +} + +function install_boost { + wget_and_untar https://github.com/boostorg/boost/releases/download/${BOOST_VERSION}/${BOOST_VERSION}.tar.gz boost + ( + cd ${DEPENDENCY_DIR}/boost + if [[ ${USE_CLANG} != "false" ]]; then + ./bootstrap.sh --prefix=${INSTALL_PREFIX} --with-toolset="clang-15" + # Switch the compiler from the clang-15 toolset which doesn't exist (clang-15.jam) to + # clang of version 15 when toolset clang-15 is used. + # This reconciles the project-config.jam generation with what the b2 build system allows for customization. 
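+    # For example, a generated "using clang-15 ;" line becomes
+    # "using clang : 15 ;", which b2 parses as toolset "clang", version "15".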
+ sed -i 's/using clang-15/using clang : 15/g' project-config.jam + ${SUDO} ./b2 "-j$(nproc)" -d0 install threading=multi toolset=clang-15 --without-python + else + ./bootstrap.sh --prefix=${INSTALL_PREFIX} + ${SUDO} ./b2 "-j$(nproc)" -d0 install threading=multi --without-python + fi + ) +} + +function install_snappy { + wget_and_untar https://github.com/google/snappy/archive/1.1.8.tar.gz snappy + cmake_install_dir snappy -DSNAPPY_BUILD_TESTS=OFF +} + +function install_fmt { + wget_and_untar https://github.com/fmtlib/fmt/archive/${FMT_VERSION}.tar.gz fmt + cmake_install_dir fmt -DFMT_TEST=OFF +} + +function install_protobuf { + wget_and_untar https://github.com/protocolbuffers/protobuf/releases/download/v21.8/protobuf-all-21.8.tar.gz protobuf + ( + cd ${DEPENDENCY_DIR}/protobuf + ./configure --prefix=${INSTALL_PREFIX} + make "-j${NPROC}" + make install + ldconfig + ) +} + +function install_fizz { + wget_and_untar https://github.com/facebookincubator/fizz/archive/refs/tags/${FB_OS_VERSION}.tar.gz fizz + cmake_install_dir fizz/fizz -DBUILD_TESTS=OFF +} + +function install_folly { + wget_and_untar https://github.com/facebook/folly/archive/refs/tags/${FB_OS_VERSION}.tar.gz folly + cmake_install_dir folly -DBUILD_TESTS=OFF -DFOLLY_HAVE_INT128_T=ON +} + +function install_wangle { + wget_and_untar https://github.com/facebook/wangle/archive/refs/tags/${FB_OS_VERSION}.tar.gz wangle + cmake_install_dir wangle/wangle -DBUILD_TESTS=OFF +} + +function install_fbthrift { + wget_and_untar https://github.com/facebook/fbthrift/archive/refs/tags/${FB_OS_VERSION}.tar.gz fbthrift + cmake_install_dir fbthrift -Denable_tests=OFF -DBUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF +} + +function install_mvfst { + wget_and_untar https://github.com/facebook/mvfst/archive/refs/tags/${FB_OS_VERSION}.tar.gz mvfst + cmake_install_dir mvfst -DBUILD_TESTS=OFF +} + +function install_duckdb { + if $BUILD_DUCKDB ; then + echo 'Building DuckDB' + wget_and_untar https://github.com/duckdb/duckdb/archive/refs/tags/v0.8.1.tar.gz duckdb + cmake_install_dir duckdb -DBUILD_UNITTESTS=OFF -DENABLE_SANITIZER=OFF -DENABLE_UBSAN=OFF -DBUILD_SHELL=OFF -DEXPORT_DLL_SYMBOLS=OFF -DCMAKE_BUILD_TYPE=Release + fi +} + +function install_arrow { + wget_and_untar https://archive.apache.org/dist/arrow/arrow-${ARROW_VERSION}/apache-arrow-${ARROW_VERSION}.tar.gz arrow + cmake_install_dir arrow/cpp \ + -DARROW_PARQUET=OFF \ + -DARROW_WITH_THRIFT=ON \ + -DARROW_WITH_LZ4=ON \ + -DARROW_WITH_SNAPPY=ON \ + -DARROW_WITH_ZLIB=ON \ + -DARROW_WITH_ZSTD=ON \ + -DARROW_JEMALLOC=OFF \ + -DARROW_SIMD_LEVEL=NONE \ + -DARROW_RUNTIME_SIMD_LEVEL=NONE \ + -DARROW_WITH_UTF8PROC=OFF \ + -DARROW_TESTING=ON \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DCMAKE_BUILD_TYPE=Release \ + -DARROW_BUILD_STATIC=ON \ + -DThrift_SOURCE=BUNDLED + + ( + # Install thrift. + cd ${DEPENDENCY_DIR}/arrow/cpp/_build/thrift_ep-prefix/src/thrift_ep-build + cmake --install ./ --prefix ${INSTALL_PREFIX} + ) +} + +function install_cuda { + # See https://developer.nvidia.com/cuda-downloads + dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo + dnf install -y cuda-nvcc-$(echo $1 | tr '.' '-') cuda-cudart-devel-$(echo $1 | tr '.' '-') +} + +function install_fast_float { + # Dependency of folly. 
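+  # fast_float is a header-only library, so the plain cmake_install_dir below
+  # should only stage headers and a CMake package config into ${INSTALL_PREFIX}.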
+ wget_and_untar https://github.com/fastfloat/fast_float/archive/refs/tags/${FAST_FLOAT_VERSION}.tar.gz fast_float + cmake_install_dir fast_float +} + +function install_velox_deps { + run_and_time install_velox_deps_from_dnf + run_and_time install_conda + run_and_time install_gflags + run_and_time install_glog + run_and_time install_lzo + run_and_time install_snappy + run_and_time install_boost + run_and_time install_protobuf + run_and_time install_fmt + run_and_time install_fast_float + run_and_time install_folly + run_and_time install_fizz + run_and_time install_wangle + run_and_time install_mvfst + run_and_time install_fbthrift + run_and_time install_duckdb + run_and_time install_arrow +} + +(return 2> /dev/null) && return # If script was sourced, don't run commands. + +( + if [[ $# -ne 0 ]]; then + if [[ ${USE_CLANG} != "false" ]]; then + export CC=/usr/bin/clang-15 + export CXX=/usr/bin/clang++-15 + else + # Activate gcc12; enable errors on unset variables afterwards. + source /opt/rh/gcc-toolset-12/enable || exit 1 + set -u + fi + + for cmd in "$@"; do + run_and_time "${cmd}" + done + echo "All specified dependencies installed!" + else + if [ "${INSTALL_PREREQUISITES:-Y}" == "Y" ]; then + echo "Installing build dependencies" + run_and_time install_build_prerequisites + else + echo "Skipping installation of build dependencies since INSTALL_PREREQUISITES is not set" + fi + if [[ ${USE_CLANG} != "false" ]]; then + export CC=/usr/bin/clang-15 + export CXX=/usr/bin/clang++-15 + else + # Activate gcc12; enable errors on unset variables afterwards. + source /opt/rh/gcc-toolset-12/enable || exit 1 + set -u + fi + install_velox_deps + echo "All dependencies for Velox installed!" + if [[ ${USE_CLANG} != "false" ]]; then + echo "To use clang for the Velox build set the CC and CXX environment variables in your session." + echo " export CC=/usr/bin/clang-15" + echo " export CXX=/usr/bin/clang++-15" + fi + dnf clean all + fi +) + diff --git a/scripts/setup-check.sh b/scripts/setup-check.sh index 59b7baece99d5..d3d6573a8eda9 100644 --- a/scripts/setup-check.sh +++ b/scripts/setup-check.sh @@ -18,9 +18,9 @@ set -x export DEBIAN_FRONTEND=noninteractive apt update -apt install --no-install-recommends -y clang-format-12 python3-pip git make ssh -pip3 install cmake_format black regex +apt install --no-install-recommends -y clang-format-18 python3-pip git make ssh +pip3 install --break-system-packages cmake==3.28.3 cmake_format black pyyaml regex pip3 cache purge apt purge --auto-remove -y python3-pip -update-alternatives --install /usr/bin/clang-format clang-format "$(command -v clang-format-12)" 12 +update-alternatives --install /usr/bin/clang-format clang-format "$(command -v clang-format-18)" 18 apt clean diff --git a/scripts/setup-circleci.sh b/scripts/setup-circleci.sh deleted file mode 100755 index bd275f48869ef..0000000000000 --- a/scripts/setup-circleci.sh +++ /dev/null @@ -1,122 +0,0 @@ -#!/bin/bash -# Copyright (c) Facebook, Inc. and its affiliates. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -set -efx -o pipefail -# Some of the packages must be build with the same compiler flags -# so that some low level types are the same size. Also, disable warnings. -SCRIPTDIR=$(dirname "${BASH_SOURCE[0]}") -source $SCRIPTDIR/setup-helper-functions.sh -CPU_TARGET="${CPU_TARGET:-avx}" -NPROC=$(getconf _NPROCESSORS_ONLN) -export CFLAGS=$(get_cxx_flags $CPU_TARGET) # Used by LZO. -export CXXFLAGS=$CFLAGS # Used by boost. -export CPPFLAGS=$CFLAGS # Used by LZO. - -function dnf_install { - dnf install -y -q --setopt=install_weak_deps=False "$@" -} - -dnf_install epel-release dnf-plugins-core # For ccache, ninja -dnf config-manager --set-enabled powertools -dnf_install ninja-build ccache gcc-toolset-9 git wget which libevent-devel \ - openssl-devel re2-devel libzstd-devel lz4-devel double-conversion-devel \ - libdwarf-devel curl-devel libicu-devel - -dnf remove -y gflags - -# Required for Thrift -dnf_install autoconf automake libtool bison flex python3 libsodium-devel - -dnf_install conda - -# install sphinx for doc gen -pip3 install sphinx sphinx-tabs breathe sphinx_rtd_theme - -# Activate gcc9; enable errors on unset variables afterwards. -source /opt/rh/gcc-toolset-9/enable || exit 1 -set -u - -function cmake_install { - cmake -B "$1-build" -GNinja -DCMAKE_CXX_STANDARD=17 \ - -DCMAKE_CXX_FLAGS="${CFLAGS}" -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=Release -Wno-dev "$@" - ninja -C "$1-build" install -} - -function wget_and_untar { - local URL=$1 - local DIR=$2 - mkdir -p "${DIR}" - wget -q --max-redirect 3 -O - "${URL}" | tar -xz -C "${DIR}" --strip-components=1 -} - -# untar cmake binary release directly to /usr. -wget_and_untar https://github.com/Kitware/CMake/releases/download/v3.17.5/cmake-3.17.5-Linux-x86_64.tar.gz /usr & - -# Fetch sources. -wget_and_untar https://github.com/gflags/gflags/archive/v2.2.2.tar.gz gflags & -wget_and_untar https://github.com/google/glog/archive/v0.4.0.tar.gz glog & -wget_and_untar http://www.oberhumer.com/opensource/lzo/download/lzo-2.10.tar.gz lzo & -wget_and_untar https://boostorg.jfrog.io/artifactory/main/release/1.72.0/source/boost_1_72_0.tar.gz boost & -wget_and_untar https://github.com/google/snappy/archive/1.1.8.tar.gz snappy & -wget_and_untar https://github.com/fmtlib/fmt/archive/8.0.1.tar.gz fmt & -# wget_and_untar https://github.com/ericniebler/range-v3/archive/0.11.0.tar.gz ranges-v3 & -wget_and_untar https://archive.apache.org/dist/hadoop/common/hadoop-2.10.1/hadoop-2.10.1.tar.gz hadoop -wget_and_untar https://github.com/protocolbuffers/protobuf/releases/download/v21.4/protobuf-all-21.4.tar.gz protobuf & - -FB_OS_VERSION="v2022.11.14.00" - -wget_and_untar https://github.com/facebook/folly/archive/${FB_OS_VERSION}.tar.gz folly & -wget_and_untar https://github.com/facebookincubator/fizz/archive/refs/tags/${FB_OS_VERSION}.tar.gz fizz & -wget_and_untar https://github.com/facebook/wangle/archive/refs/tags/${FB_OS_VERSION}.tar.gz wangle & -wget_and_untar https://github.com/facebook/fbthrift/archive/refs/tags/${FB_OS_VERSION}.tar.gz fbthrift & - -wait # For cmake and source downloads to complete. - -cp -a hadoop /usr/local/ - -# Build & install. 
-( - cd lzo - ./configure --prefix=/usr --enable-shared --disable-static --docdir=/usr/share/doc/lzo-2.10 - make "-j$(nproc)" - make install -) - -( - cd boost - ./bootstrap.sh --prefix=/usr/local - ./b2 "-j$(nproc)" -d0 install threading=multi -) - -( - cd protobuf - ./configure --prefix=/usr - make "-j${NPROC}" - make install - ldconfig -) - -cmake_install gflags -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON -DBUILD_gflags_LIB=ON -DLIB_SUFFIX=64 -DCMAKE_INSTALL_PREFIX:PATH=/usr -cmake_install glog -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX:PATH=/usr -cmake_install snappy -DSNAPPY_BUILD_TESTS=OFF -cmake_install fmt -DFMT_TEST=OFF -cmake_install folly -DFOLLY_HAVE_INT128_T=ON - -cmake_install fizz/fizz -DBUILD_TESTS=OFF -cmake_install wangle/wangle -DBUILD_TESTS=OFF -cmake_install fbthrift -Denable_tests=OFF -# cmake_install ranges-v3 - -dnf clean all diff --git a/scripts/setup-classpath.sh b/scripts/setup-classpath.sh new file mode 100644 index 0000000000000..e52184d92138a --- /dev/null +++ b/scripts/setup-classpath.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +export CLASSPATH=`/usr/local/hadoop/bin/hdfs classpath --glob` diff --git a/scripts/setup-helper-functions.sh b/scripts/setup-helper-functions.sh old mode 100644 new mode 100755 index 8076edc4a1850..4c332bb305f42 --- a/scripts/setup-helper-functions.sh +++ b/scripts/setup-helper-functions.sh @@ -15,6 +15,30 @@ # github_checkout $REPO $VERSION $GIT_CLONE_PARAMS clones or re-uses an existing clone of the # specified repo, checking out the requested version. + +DEPENDENCY_DIR=${DEPENDENCY_DIR:-$(pwd)/deps-download} +OS_CXXFLAGS="" + +function run_and_time { + time "$@" || (echo "Failed to run $* ." ; exit 1 ) + { echo "+ Finished running $*"; } 2> /dev/null +} + +function prompt { + ( + while true; do + local input="${PROMPT_ALWAYS_RESPOND:-}" + echo -n "$(tput bold)$* [Y, n]$(tput sgr0) " + [[ -z "${input}" ]] && read input + if [[ "${input}" == "Y" || "${input}" == "y" || "${input}" == "" ]]; then + return 0 + elif [[ "${input}" == "N" || "${input}" == "n" ]]; then + return 1 + fi + done + ) 2> /dev/null +} + function github_checkout { local REPO=$1 shift @@ -22,13 +46,14 @@ function github_checkout { shift local GIT_CLONE_PARAMS=$@ local DIRNAME=$(basename $REPO) + SUDO="${SUDO:-""}" cd "${DEPENDENCY_DIR}" if [ -z "${DIRNAME}" ]; then echo "Failed to get repo name from ${REPO}" exit 1 fi if [ -d "${DIRNAME}" ] && prompt "${DIRNAME} already exists. Delete?"; then - rm -rf "${DIRNAME}" + ${SUDO} rm -rf "${DIRNAME}" fi if [ ! -d "${DIRNAME}" ]; then git clone -q -b $VERSION $GIT_CLONE_PARAMS "https://github.com/${REPO}.git" @@ -36,9 +61,8 @@ function github_checkout { cd "${DIRNAME}" } - # get_cxx_flags [$CPU_ARCH] -# Sets and exports the variable VELOX_CXX_FLAGS with appropriate compiler flags. +# Echos appropriate compiler flags. 
# If $CPU_ARCH is set then we use that else we determine best possible set of flags # to use based on current cpu architecture. # The goal of this function is to consolidate all architecture specific flags to one @@ -53,65 +77,78 @@ function github_checkout { # CXX_FLAGS=$(get_cxx_flags "avx") function get_cxx_flags { - local CPU_ARCH=$1 - - local OS - OS=$(uname) - local MACHINE - MACHINE=$(uname -m) - ADDITIONAL_FLAGS="" - - if [[ -z "$CPU_ARCH" ]] || [[ $CPU_ARCH == "unknown" ]]; then - if [ "$OS" = "Darwin" ]; then - - if [ "$MACHINE" = "x86_64" ]; then - local CPU_CAPABILITIES - CPU_CAPABILITIES=$(sysctl -a | grep machdep.cpu.features | awk '{print tolower($0)}') - - if [[ $CPU_CAPABILITIES =~ "avx" ]]; then - CPU_ARCH="avx" - else - CPU_ARCH="sse" - fi - - elif [[ $(sysctl -a | grep machdep.cpu.brand_string) =~ "Apple" ]]; then - # Apple silicon. - CPU_ARCH="arm64" - fi - - # On MacOs prevent the flood of translation visibility settings warnings. - ADDITIONAL_FLAGS="-fvisibility=hidden -fvisibility-inlines-hidden" - else [ "$OS" = "Linux" ]; - - local CPU_CAPABILITIES - CPU_CAPABILITIES=$(cat /proc/cpuinfo | grep flags | head -n 1| awk '{print tolower($0)}') - - if [[ "$CPU_CAPABILITIES" =~ "avx" ]]; then - CPU_ARCH="avx" - elif [[ "$CPU_CAPABILITIES" =~ "sse" ]]; then - CPU_ARCH="sse" - elif [ "$MACHINE" = "aarch64" ]; then - CPU_ARCH="aarch64" - fi - fi + local CPU_ARCH=${1:-""} + local OS=$(uname) + local MACHINE=$(uname -m) + + if [[ -z "$CPU_ARCH" ]]; then + if [ "$OS" = "Darwin" ]; then + if [ "$MACHINE" = "arm64" ]; then + CPU_ARCH="arm64" + else # x86_64 + local CPU_CAPABILITIES=$(sysctl -a | grep machdep.cpu.features | awk '{print tolower($0)}') + if [[ $CPU_CAPABILITIES =~ "avx" ]]; then + CPU_ARCH="avx" + else + CPU_ARCH="sse" + fi + fi + elif [ "$OS" = "Linux" ]; then + if [ "$MACHINE" = "aarch64" ]; then + CPU_ARCH="aarch64" + else # x86_64 + local CPU_CAPABILITIES=$(cat /proc/cpuinfo | grep flags | head -n 1| awk '{print tolower($0)}') + if [[ $CPU_CAPABILITIES =~ "avx" ]]; then + CPU_ARCH="avx" + elif [[ $CPU_CAPABILITIES =~ "sse" ]]; then + CPU_ARCH="sse" + fi + fi + else + echo "Unsupported platform $OS"; exit 1; + fi fi - case $CPU_ARCH in "arm64") - echo -n "-mcpu=apple-m1+crc -std=c++17 -fvisibility=hidden $ADDITIONAL_FLAGS" + echo -n "-mcpu=apple-m1+crc -std=c++17 -fvisibility=hidden" ;; "avx") - echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -std=c++17 -mbmi2 $ADDITIONAL_FLAGS" + echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -std=c++17 -mbmi2" ;; "sse") - echo -n "-msse4.2 -std=c++17 $ADDITIONAL_FLAGS" + echo -n "-msse4.2 -std=c++17" ;; "aarch64") - echo -n "-mcpu=neoverse-n1 -std=c++17 $ADDITIONAL_FLAGS" + # Read Arm MIDR_EL1 register to detect Arm cpu. + # https://developer.arm.com/documentation/100616/0301/register-descriptions/aarch64-system-registers/midr-el1--main-id-register--el1 + ARM_CPU_FILE="/sys/devices/system/cpu/cpu0/regs/identification/midr_el1" + + # https://gitlab.arm.com/telemetry-solution/telemetry-solution/-/blob/main/data/pmu/cpu/neoverse/neoverse-n1.json#L13 + # N1:d0c; N2:d49; V1:d40; + Neoverse_N1="d0c" + Neoverse_N2="d49" + Neoverse_V1="d40" + if [ -f "$ARM_CPU_FILE" ]; then + hex_ARM_CPU_DETECT=`cat $ARM_CPU_FILE` + # PartNum, [15:4]: The primary part number such as Neoverse N1/N2 core. 
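+      # Illustration with a hypothetical register value: midr_el1 = 0x413fd0c1
+      # ends in "d0c1", so the [15:4] slice below keeps "d0c", the Neoverse N1
+      # part number.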
+ ARM_CPU_PRODUCT=${hex_ARM_CPU_DETECT: -4:3} + + if [ "$ARM_CPU_PRODUCT" = "$Neoverse_N1" ]; then + echo -n "-mcpu=neoverse-n1 -std=c++17" + elif [ "$ARM_CPU_PRODUCT" = "$Neoverse_N2" ]; then + echo -n "-mcpu=neoverse-n2 -std=c++17" + elif [ "$ARM_CPU_PRODUCT" = "$Neoverse_V1" ]; then + echo -n "-mcpu=neoverse-v1 -std=c++17" + else + echo -n "-march=armv8-a+crc+crypto -std=c++17" + fi + else + echo -n "-std=c++17" + fi ;; *) echo -n "Architecture not supported!" @@ -119,15 +156,52 @@ function get_cxx_flags { } +function wget_and_untar { + local URL=$1 + local DIR=$2 + mkdir -p "${DEPENDENCY_DIR}" + pushd "${DEPENDENCY_DIR}" + SUDO="${SUDO:-""}" + if [ -d "${DIR}" ]; then + if prompt "${DIR} already exists. Delete?"; then + ${SUDO} rm -rf "${DIR}" + else + popd + return + fi + fi + mkdir -p "${DIR}" + pushd "${DIR}" + curl -L "${URL}" > $2.tar.gz + tar -xz --strip-components=1 -f $2.tar.gz + popd + popd +} + +function cmake_install_dir { + pushd "${DEPENDENCY_DIR}/$1" + # remove the directory argument + shift + cmake_install $@ + popd +} + function cmake_install { local NAME=$(basename "$(pwd)") local BINARY_DIR=_build - if [ -d "${BINARY_DIR}" ] && prompt "Do you want to rebuild ${NAME}?"; then - rm -rf "${BINARY_DIR}" + SUDO="${SUDO:-""}" + if [ -d "${BINARY_DIR}" ]; then + if prompt "Do you want to rebuild ${NAME}?"; then + ${SUDO} rm -rf "${BINARY_DIR}" + else + return + fi fi + mkdir -p "${BINARY_DIR}" - CPU_TARGET="${CPU_TARGET:-unknown}" - COMPILER_FLAGS=$(get_cxx_flags $CPU_TARGET) + COMPILER_FLAGS=$(get_cxx_flags) + # Add platform specific CXX flags if any + COMPILER_FLAGS+=${OS_CXXFLAGS} # CMAKE_POSITION_INDEPENDENT_CODE is required so that Velox can be built into dynamic libraries \ cmake -Wno-dev -B"${BINARY_DIR}" \ @@ -139,6 +213,8 @@ function cmake_install { -DCMAKE_CXX_FLAGS="$COMPILER_FLAGS" \ -DBUILD_TESTING=OFF \ "$@" - ninja -C "${BINARY_DIR}" install + # Exit if the build fails. + cmake --build "${BINARY_DIR}" || { echo 'build failed' ; exit 1; } + ${SUDO} cmake --install "${BINARY_DIR}" } diff --git a/scripts/setup-macos.sh b/scripts/setup-macos.sh index 83d8990603eae..6eb600c34aa0e 100755 --- a/scripts/setup-macos.sh +++ b/scripts/setup-macos.sh @@ -29,32 +29,20 @@ set -e # Exit on error. set -x # Print commands that are executed. SCRIPTDIR=$(dirname "${BASH_SOURCE[0]}") +export INSTALL_PREFIX=${INSTALL_PREFIX:-"$(pwd)/deps-install"} source $SCRIPTDIR/setup-helper-functions.sh - +PYTHON_VENV=${PYHTON_VENV:-"${SCRIPTDIR}/../.venv"} +# Allow installed package headers to be picked up before brew package headers +# by tagging the brew packages to be system packages. +# This is used during package builds. +export OS_CXXFLAGS=" -isystem $(brew --prefix)/include " NPROC=$(getconf _NPROCESSORS_ONLN) DEPENDENCY_DIR=${DEPENDENCY_DIR:-$(pwd)} -MACOS_DEPS="ninja flex bison cmake ccache protobuf@21 icu4c boost gflags glog libevent lz4 lzo snappy xz zstd openssl@1.1" - -function run_and_time { - time "$@" || (echo "Failed to run $* ." 
; exit 1 ) - { echo "+ Finished running $*"; } 2> /dev/null -} - -function prompt { - ( - while true; do - local input="${PROMPT_ALWAYS_RESPOND:-}" - echo -n "$(tput bold)$* [Y, n]$(tput sgr0) " - [[ -z "${input}" ]] && read input - if [[ "${input}" == "Y" || "${input}" == "y" || "${input}" == "" ]]; then - return 0 - elif [[ "${input}" == "N" || "${input}" == "n" ]]; then - return 1 - fi - done - ) 2> /dev/null -} +MACOS_VELOX_DEPS="bison flex gflags glog googletest icu4c libevent libsodium lz4 lzo openssl protobuf@21 snappy xz zstd" +MACOS_BUILD_DEPS="ninja cmake" +FB_OS_VERSION="v2024.09.16.00" +FMT_VERSION="10.1.1" function update_brew { DEFAULT_BREW_PATH=/usr/local/bin/brew @@ -67,76 +55,132 @@ function update_brew { $BREW_PATH developer off } +function install_from_brew { + pkg=$1 + if [[ "${pkg}" =~ ^([0-9a-z-]*):([0-9](\.[0-9\])*)$ ]]; + then + pkg=${BASH_REMATCH[1]} + ver=${BASH_REMATCH[2]} + echo "Installing '${pkg}' at '${ver}'" + tap="velox/local-${pkg}" + brew tap-new "${tap}" + brew extract "--version=${ver}" "${pkg}" "${tap}" + brew install "${tap}/${pkg}@${ver}" || ( echo "Failed to install ${tap}/${pkg}@${ver}" ; exit 1 ) + else + ( brew install --formula "${pkg}" && echo "Installation of ${pkg} is successful" || brew upgrade --formula "$pkg" ) || ( echo "Failed to install ${pkg}" ; exit 1 ) + fi +} + function install_build_prerequisites { - for pkg in ${MACOS_DEPS} + for pkg in ${MACOS_BUILD_DEPS} do - if [[ "${pkg}" =~ ^([0-9a-z-]*):([0-9](\.[0-9\])*)$ ]]; - then - pkg=${BASH_REMATCH[1]} - ver=${BASH_REMATCH[2]} - echo "Installing '${pkg}' at '${ver}'" - tap="velox/local-${pkg}" - brew tap-new "${tap}" - brew extract "--version=${ver}" "${pkg}" "${tap}" - brew install "${tap}/${pkg}@${ver}" || ( echo "Failed to install ${tap}/${pkg}@${ver}" ; exit 1 ) - else - ( brew install --formula "${pkg}" && echo "Installation of ${pkg} is successful" || brew upgrade --formula "$pkg" ) || ( echo "Failed to install ${pkg}" ; exit 1 ) - fi + install_from_brew ${pkg} done + if [ ! -f ${PYTHON_VENV}/pyvenv.cfg ]; then + echo "Creating Python Virtual Environment at ${PYTHON_VENV}" + python3 -m venv ${PYTHON_VENV} + fi + source ${PYTHON_VENV}/bin/activate; pip3 install cmake-format regex pyyaml + if [ ! 
-f /usr/local/bin/ccache ]; then + curl -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-darwin.tar.gz > ccache.tar.gz + tar -xf ccache.tar.gz + mv ccache-4.10.2-darwin/ccache /usr/local/bin/ + rm -rf ccache-4.10.2-darwin ccache.tar.gz + fi +} - pip3 install --user cmake-format regex +function install_velox_deps_from_brew { + for pkg in ${MACOS_VELOX_DEPS} + do + install_from_brew ${pkg} + done } function install_fmt { - github_checkout fmtlib/fmt 8.0.1 - cmake_install -DFMT_TEST=OFF + wget_and_untar https://github.com/fmtlib/fmt/archive/${FMT_VERSION}.tar.gz fmt + cmake_install_dir fmt -DFMT_TEST=OFF } function install_folly { - github_checkout facebook/folly "v2022.11.14.00" - OPENSSL_ROOT_DIR=$(brew --prefix openssl@1.1) \ - cmake_install -DBUILD_TESTS=OFF -DFOLLY_HAVE_INT128_T=ON + wget_and_untar https://github.com/facebook/folly/archive/refs/tags/${FB_OS_VERSION}.tar.gz folly + cmake_install_dir folly -DBUILD_TESTS=OFF -DFOLLY_HAVE_INT128_T=ON +} + +function install_fizz { + wget_and_untar https://github.com/facebookincubator/fizz/archive/refs/tags/${FB_OS_VERSION}.tar.gz fizz + cmake_install_dir fizz/fizz -DBUILD_TESTS=OFF +} + +function install_wangle { + wget_and_untar https://github.com/facebook/wangle/archive/refs/tags/${FB_OS_VERSION}.tar.gz wangle + cmake_install_dir wangle/wangle -DBUILD_TESTS=OFF +} + +function install_mvfst { + wget_and_untar https://github.com/facebook/mvfst/archive/refs/tags/${FB_OS_VERSION}.tar.gz mvfst + cmake_install_dir mvfst -DBUILD_TESTS=OFF +} + +function install_fbthrift { + wget_and_untar https://github.com/facebook/fbthrift/archive/refs/tags/${FB_OS_VERSION}.tar.gz fbthrift + cmake_install_dir fbthrift -Denable_tests=OFF -DBUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF } function install_double_conversion { - github_checkout google/double-conversion v3.1.5 - cmake_install -DBUILD_TESTING=OFF + wget_and_untar https://github.com/google/double-conversion/archive/refs/tags/v3.1.5.tar.gz double-conversion + cmake_install_dir double-conversion -DBUILD_TESTING=OFF } function install_ranges_v3 { - github_checkout ericniebler/range-v3 0.12.0 - cmake_install -DRANGES_ENABLE_WERROR=OFF -DRANGE_V3_TESTS=OFF -DRANGE_V3_EXAMPLES=OFF + wget_and_untar https://github.com/ericniebler/range-v3/archive/refs/tags/0.12.0.tar.gz ranges_v3 + cmake_install_dir ranges_v3 -DRANGES_ENABLE_WERROR=OFF -DRANGE_V3_TESTS=OFF -DRANGE_V3_EXAMPLES=OFF } function install_re2 { - github_checkout google/re2 2021-04-01 - cmake_install -DRE2_BUILD_TESTING=OFF + wget_and_untar https://github.com/google/re2/archive/refs/tags/2022-02-01.tar.gz re2 + cmake_install_dir re2 -DRE2_BUILD_TESTING=OFF +} + +function install_fast_float { + # Dependency of folly. + wget_and_untar https://github.com/fastfloat/fast_float/archive/refs/tags/${FAST_FLOAT_VERSION}.tar.gz fast_float + cmake_install_dir fast_float } function install_velox_deps { - if [ "${INSTALL_PREREQUISITES:-Y}" == "Y" ]; then - run_and_time install_build_prerequisites - fi + run_and_time install_velox_deps_from_brew run_and_time install_ranges_v3 - run_and_time install_fmt run_and_time install_double_conversion run_and_time install_re2 + run_and_time install_fmt + run_and_time install_fast_float + run_and_time install_folly + run_and_time install_fizz + run_and_time install_wangle + run_and_time install_mvfst + run_and_time install_fbthrift } (return 2> /dev/null) && return # If script was sourced, don't run commands. 
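+# The guard above makes this file dual-purpose: `source scripts/setup-macos.sh`
+# stops here after defining the helper functions, while executing the script
+# directly continues into the driver below, e.g.
+#   scripts/setup-macos.sh install_fmt install_folly   # run selected steps only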
( - echo "Installing mac dependencies" update_brew if [[ $# -ne 0 ]]; then for cmd in "$@"; do run_and_time "${cmd}" done + echo "All specified dependencies installed!" else + if [ "${INSTALL_PREREQUISITES:-Y}" == "Y" ]; then + echo "Installing build dependencies" + run_and_time install_build_prerequisites + else + echo "Skipping installation of build dependencies since INSTALL_PREREQUISITES is not set" + fi install_velox_deps + echo "All deps for Velox installed! Now try \"make\"" fi ) -echo "All deps for Velox installed! Now try \"make\"" -echo 'To add cmake-format bin to your $PATH, consider adding this to your ~/.profile:' -echo 'export PATH=$HOME/bin:$HOME/Library/Python/3.7/bin:$PATH' +echo "To reuse the installed dependencies for subsequent builds, consider adding this to your ~/.zshrc" +echo "export INSTALL_PREFIX=$INSTALL_PREFIX" diff --git a/scripts/setup-ubuntu.sh b/scripts/setup-ubuntu.sh index a1dda4b6c29ce..c4785af00fe5d 100755 --- a/scripts/setup-ubuntu.sh +++ b/scripts/setup-ubuntu.sh @@ -13,127 +13,268 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Minimal setup for Ubuntu 20.04. +# This script documents setting up a Ubuntu host for Velox +# development. Running it should make you ready to compile. +# +# Environment variables: +# * INSTALL_PREREQUISITES="N": Skip installation of packages for build. +# * PROMPT_ALWAYS_RESPOND="n": Automatically respond to interactive prompts. +# Use "n" to never wipe directories. +# +# You can also run individual functions below by specifying them as arguments: +# $ scripts/setup-ubuntu.sh install_googletest install_fmt +# + +# Minimal setup for Ubuntu 22.04. set -eufx -o pipefail SCRIPTDIR=$(dirname "${BASH_SOURCE[0]}") source $SCRIPTDIR/setup-helper-functions.sh # Folly must be built with the same compiler flags so that some low level types # are the same size. -CPU_TARGET="${CPU_TARGET:-avx}" -COMPILER_FLAGS=$(get_cxx_flags "$CPU_TARGET") +COMPILER_FLAGS=$(get_cxx_flags) export COMPILER_FLAGS -FB_OS_VERSION=v2022.11.14.00 NPROC=$(getconf _NPROCESSORS_ONLN) -DEPENDENCY_DIR=${DEPENDENCY_DIR:-$(pwd)} +BUILD_DUCKDB="${BUILD_DUCKDB:-true}" export CMAKE_BUILD_TYPE=Release +SUDO="${SUDO:-"sudo --preserve-env"}" +USE_CLANG="${USE_CLANG:-false}" +export INSTALL_PREFIX=${INSTALL_PREFIX:-"/usr/local"} +DEPENDENCY_DIR=${DEPENDENCY_DIR:-$(pwd)/deps-download} -# Install all velox and folly dependencies. -# The is an issue on 22.04 where a version conflict prevents glog install, -# installing libunwind first fixes this. 
-sudo --preserve-env apt update && sudo --preserve-env apt install -y libunwind-dev && \ - sudo --preserve-env apt install -y \ - g++ \ - cmake \ - ccache \ - ninja-build \ - checkinstall \ - git \ - libc-ares-dev \ - libcurl4-openssl-dev \ - libssl-dev \ - libboost-all-dev \ - libicu-dev \ - libdouble-conversion-dev \ - libgoogle-glog-dev \ - libbz2-dev \ - libgflags-dev \ - libgmock-dev \ - libevent-dev \ - liblz4-dev \ - libzstd-dev \ - libre2-dev \ - libsnappy-dev \ - libsodium-dev \ - libthrift-dev \ - liblzo2-dev \ - bison \ - flex \ - libfl-dev \ - tzdata \ - wget - -function run_and_time { - time "$@" - { echo "+ Finished running $*"; } 2> /dev/null -} - -function prompt { - ( - while true; do - local input="${PROMPT_ALWAYS_RESPOND:-}" - echo -n "$(tput bold)$* [Y, n]$(tput sgr0) " - [[ -z "${input}" ]] && read input - if [[ "${input}" == "Y" || "${input}" == "y" || "${input}" == "" ]]; then - return 0 - elif [[ "${input}" == "N" || "${input}" == "n" ]]; then - return 1 - fi - done - ) 2> /dev/null +function install_clang15 { + VERSION=`cat /etc/os-release | grep VERSION_ID` + if [[ ! ${VERSION} =~ "22.04" && ! ${VERSION} =~ "24.04" ]]; then + echo "Warning: using the Clang configuration is for Ubuntu 22.04 and 24.04. Errors might occur." + fi + CLANG_PACKAGE_LIST=clang-15 + if [[ ${VERSION} =~ "22.04" ]]; then + CLANG_PACKAGE_LIST=${CLANG_PACKAGE_LIST} gcc-12 g++-12 libc++-12-dev + fi + ${SUDO} apt install ${CLANG_PACKAGE_LIST} -y +} + +FB_OS_VERSION="v2024.09.16.00" +FMT_VERSION="10.1.1" +BOOST_VERSION="boost-1.84.0" +ARROW_VERSION="15.0.0" +FAST_FLOAT_VERSION="v6.1.6" + +# Install packages required for build. +function install_build_prerequisites { + ${SUDO} apt update + # The is an issue on 22.04 where a version conflict prevents glog install, + # installing libunwind first fixes this. + ${SUDO} apt install -y libunwind-dev + ${SUDO} apt install -y \ + build-essential \ + python3-pip \ + ccache \ + curl \ + ninja-build \ + checkinstall \ + git \ + wget + + # Install to /usr/local to make it available to all users. + ${SUDO} pip3 install cmake==3.28.3 + + if [[ ${USE_CLANG} != "false" ]]; then + install_clang15 + fi +} + +# Install packages required for build. +function install_velox_deps_from_apt { + ${SUDO} apt update + ${SUDO} apt install -y \ + libc-ares-dev \ + libcurl4-openssl-dev \ + libssl-dev \ + libicu-dev \ + libdouble-conversion-dev \ + libgoogle-glog-dev \ + libbz2-dev \ + libgflags-dev \ + libgmock-dev \ + libevent-dev \ + liblz4-dev \ + libzstd-dev \ + libre2-dev \ + libsnappy-dev \ + libsodium-dev \ + liblzo2-dev \ + libelf-dev \ + libdwarf-dev \ + bison \ + flex \ + libfl-dev \ + tzdata } function install_fmt { - github_checkout fmtlib/fmt 8.0.1 - cmake_install -DFMT_TEST=OFF + wget_and_untar https://github.com/fmtlib/fmt/archive/${FMT_VERSION}.tar.gz fmt + cmake_install_dir fmt -DFMT_TEST=OFF +} + +function install_boost { + wget_and_untar https://github.com/boostorg/boost/releases/download/${BOOST_VERSION}/${BOOST_VERSION}.tar.gz boost + ( + cd ${DEPENDENCY_DIR}/boost + if [[ ${USE_CLANG} != "false" ]]; then + ./bootstrap.sh --prefix=${INSTALL_PREFIX} --with-toolset="clang-15" + # Switch the compiler from the clang-15 toolset which doesn't exist (clang-15.jam) to + # clang of version 15 when toolset clang-15 is used. + # This reconciles the project-config.jam generation with what the b2 build system allows for customization. 
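+    # Note that b2 installs into ${INSTALL_PREFIX} (/usr/local unless
+    # overridden), which is why the install step runs under ${SUDO}
+    # (defaulting to `sudo --preserve-env` in this script).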
+ sed -i 's/using clang-15/using clang : 15/g' project-config.jam + ${SUDO} ./b2 "-j$(nproc)" -d0 install threading=multi toolset=clang-15 --without-python + else + ./bootstrap.sh --prefix=${INSTALL_PREFIX} + ${SUDO} ./b2 "-j$(nproc)" -d0 install threading=multi --without-python + fi + ) } function install_folly { - github_checkout facebook/folly "${FB_OS_VERSION}" - cmake_install -DBUILD_TESTS=OFF -DFOLLY_HAVE_INT128_T=ON + wget_and_untar https://github.com/facebook/folly/archive/refs/tags/${FB_OS_VERSION}.tar.gz folly + cmake_install_dir folly -DBUILD_TESTS=OFF -DFOLLY_HAVE_INT128_T=ON } function install_fizz { - github_checkout facebookincubator/fizz "${FB_OS_VERSION}" - cmake_install -DBUILD_TESTS=OFF -S fizz + wget_and_untar https://github.com/facebookincubator/fizz/archive/refs/tags/${FB_OS_VERSION}.tar.gz fizz + cmake_install_dir fizz/fizz -DBUILD_TESTS=OFF } function install_wangle { - github_checkout facebook/wangle "${FB_OS_VERSION}" - cmake_install -DBUILD_TESTS=OFF -S wangle + wget_and_untar https://github.com/facebook/wangle/archive/refs/tags/${FB_OS_VERSION}.tar.gz wangle + cmake_install_dir wangle/wangle -DBUILD_TESTS=OFF +} + +function install_mvfst { + wget_and_untar https://github.com/facebook/mvfst/archive/refs/tags/${FB_OS_VERSION}.tar.gz mvfst + cmake_install_dir mvfst -DBUILD_TESTS=OFF } function install_fbthrift { - github_checkout facebook/fbthrift "${FB_OS_VERSION}" - cmake_install -DBUILD_TESTS=OFF + wget_and_untar https://github.com/facebook/fbthrift/archive/refs/tags/${FB_OS_VERSION}.tar.gz fbthrift + cmake_install_dir fbthrift -Denable_tests=OFF -DBUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF } function install_conda { - mkdir -p conda && cd conda - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - MINICONDA_PATH=/opt/miniconda-for-velox - bash Miniconda3-latest-Linux-x86_64.sh -b -p $MINICONDA_PATH + MINICONDA_PATH="${HOME:-/opt}/miniconda-for-velox" + if [ -e ${MINICONDA_PATH} ]; then + echo "File or directory already exists: ${MINICONDA_PATH}" + return + fi + ARCH=$(uname -m) + if [ "$ARCH" != "x86_64" ] && [ "$ARCH" != "aarch64" ]; then + echo "Unsupported architecture: $ARCH" + exit 1 + fi + ( + mkdir -p conda && cd conda + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-$ARCH.sh -O Miniconda3-latest-Linux-$ARCH.sh + bash Miniconda3-latest-Linux-$ARCH.sh -b -p $MINICONDA_PATH + ) +} + +function install_duckdb { + if $BUILD_DUCKDB ; then + echo 'Building DuckDB' + wget_and_untar https://github.com/duckdb/duckdb/archive/refs/tags/v0.8.1.tar.gz duckdb + cmake_install_dir duckdb -DBUILD_UNITTESTS=OFF -DENABLE_SANITIZER=OFF -DENABLE_UBSAN=OFF -DBUILD_SHELL=OFF -DEXPORT_DLL_SYMBOLS=OFF -DCMAKE_BUILD_TYPE=Release + fi +} + +function install_arrow { + wget_and_untar https://archive.apache.org/dist/arrow/arrow-${ARROW_VERSION}/apache-arrow-${ARROW_VERSION}.tar.gz arrow + cmake_install_dir arrow/cpp \ + -DARROW_PARQUET=OFF \ + -DARROW_WITH_THRIFT=ON \ + -DARROW_WITH_LZ4=ON \ + -DARROW_WITH_SNAPPY=ON \ + -DARROW_WITH_ZLIB=ON \ + -DARROW_WITH_ZSTD=ON \ + -DARROW_JEMALLOC=OFF \ + -DARROW_SIMD_LEVEL=NONE \ + -DARROW_RUNTIME_SIMD_LEVEL=NONE \ + -DARROW_WITH_UTF8PROC=OFF \ + -DARROW_TESTING=ON \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DCMAKE_BUILD_TYPE=Release \ + -DARROW_BUILD_STATIC=ON \ + -DThrift_SOURCE=BUNDLED + + ( + # Install thrift. 
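+    # -DThrift_SOURCE=BUNDLED above makes Arrow build its own thrift under
+    # _build/thrift_ep-prefix; this step promotes that bundled build into
+    # ${INSTALL_PREFIX}, presumably so later builds can link against it.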
+ cd ${DEPENDENCY_DIR}/arrow/cpp/_build/thrift_ep-prefix/src/thrift_ep-build + $SUDO cmake --install ./ --prefix ${INSTALL_PREFIX} + ) +} + +function install_cuda { + # See https://developer.nvidia.com/cuda-downloads + if ! dpkg -l cuda-keyring 1>/dev/null; then + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb + $SUDO dpkg -i cuda-keyring_1.1-1_all.deb + rm cuda-keyring_1.1-1_all.deb + $SUDO apt update + fi + $SUDO apt install -y cuda-nvcc-$(echo $1 | tr '.' '-') cuda-cudart-dev-$(echo $1 | tr '.' '-') +} + +function install_fast_float { + # Dependency of folly. + wget_and_untar https://github.com/fastfloat/fast_float/archive/refs/tags/${FAST_FLOAT_VERSION}.tar.gz fast_float + cmake_install_dir fast_float } function install_velox_deps { + run_and_time install_velox_deps_from_apt run_and_time install_fmt + run_and_time install_boost + run_and_time install_fast_float run_and_time install_folly run_and_time install_fizz run_and_time install_wangle + run_and_time install_mvfst run_and_time install_fbthrift run_and_time install_conda + run_and_time install_duckdb + run_and_time install_arrow +} + +function install_apt_deps { + install_build_prerequisites + install_velox_deps_from_apt } (return 2> /dev/null) && return # If script was sourced, don't run commands. ( + if [[ ${USE_CLANG} != "false" ]]; then + export CC=/usr/bin/clang-15 + export CXX=/usr/bin/clang++-15 + fi if [[ $# -ne 0 ]]; then for cmd in "$@"; do run_and_time "${cmd}" done + echo "All specified dependencies installed!" else + if [ "${INSTALL_PREREQUISITES:-Y}" == "Y" ]; then + echo "Installing build dependencies" + run_and_time install_build_prerequisites + else + echo "Skipping installation of build dependencies since INSTALL_PREREQUISITES is not set" + fi install_velox_deps + echo "All dependencies for Velox installed!" + if [[ ${USE_CLANG} != "false" ]]; then + echo "To use clang for the Velox build set the CC and CXX environment variables in your session." + echo " export CC=/usr/bin/clang-15" + echo " export CXX=/usr/bin/clang++-15" + fi fi ) -echo "All deps for Velox installed! Now try \"make\"" diff --git a/scripts/setup-velox-torcharrow.sh b/scripts/setup-velox-torcharrow.sh deleted file mode 100755 index a31e35515d214..0000000000000 --- a/scripts/setup-velox-torcharrow.sh +++ /dev/null @@ -1,102 +0,0 @@ -#!/bin/bash -# Copyright (c) Facebook, Inc. and its affiliates. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -efx -o pipefail - -# Some of the packages must be build with the same compiler flags -# so that some low level types are the same size. Also, disable warnings. - -SCRIPTDIR=$(dirname "${BASH_SOURCE[0]}") -source $SCRIPTDIR/setup-helper-functions.sh -CPU_TARGET="${CPU_TARGET:-avx}" -CONDA=${1:-true} -export CFLAGS=$(get_cxx_flags $CPU_TARGET) -export CXXFLAGS=$CFLAGS # Used by boost. 
- -yum -y install bzip2-devel \ - ccache \ - double-conversion-devel \ - flex \ - gflags-devel \ - git \ - glog-devel \ - libevent-devel \ - libicu-devel \ - libzstd-devel \ - lz4-devel \ - lzo-devel \ - ninja-build \ - pcre-devel \ - perl-core \ - python3-devel.x86_64 \ - re2-devel \ - snappy-devel \ - wget \ - zlib-devel - -if [ "$CONDA" = true ]; then - #Install conda - rpm --import https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc - - # Add the Anaconda repository - cat < /etc/yum.repos.d/conda.repo -[conda] -name=Conda -baseurl=https://repo.anaconda.com/pkgs/misc/rpmrepo/conda -enabled=1 -gpgcheck=1 -gpgkey=https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc -EOF - - yum -y install conda -fi - -function cmake_install { - cmake -B "$1-build" -GNinja -DCMAKE_CXX_STANDARD=17 \ - -DCMAKE_CXX_FLAGS="${CFLAGS}" -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=Release -Wno-dev "$@" - ninja -C "$1-build" install -} - -function wget_and_untar { - local URL=$1 - local DIR=$2 - mkdir -p "${DIR}" - wget --no-check-certificate -q --max-redirect 3 -O - "${URL}" | tar -xz -C "${DIR}" --strip-components=1 -} - -wget_and_untar https://github.com/gflags/gflags/archive/refs/tags/v2.2.2.tar.gz gflags -wget_and_untar https://ftp.openssl.org/source/openssl-1.1.1k.tar.gz openssl & -wget_and_untar https://boostorg.jfrog.io/artifactory/main/release/1.69.0/source/boost_1_69_0.tar.gz boost & -wget_and_untar https://github.com/facebook/folly/archive/v2022.11.14.00.tar.gz folly & -wget_and_untar https://github.com/fmtlib/fmt/archive/refs/tags/8.0.1.tar.gz fmt & - -wait - -( - cd openssl - ./config --prefix=/usr --openssldir=/etc/ssl --libdir=lib no-shared zlib-dynamic - make install -) - -( - cd boost - ls - ./bootstrap.sh --prefix=/usr/local - CPLUS_INCLUDE_PATH=/usr/include/python3.6m ./b2 "-j$(nproc)" -d0 install threading=multi -) - -cmake_install gflags -DBUILD_SHARED_LIBS=ON -cmake_install fmt -DFMT_TEST=OFF -cmake_install folly -DFOLLY_HAVE_INT128_T=ON diff --git a/scripts/signature.py b/scripts/signature.py index 95a51b11371b5..a619d4fe05a1b 100644 --- a/scripts/signature.py +++ b/scripts/signature.py @@ -13,11 +13,14 @@ # limitations under the License. import argparse import json +import os +import re import sys +from typing import Any -import pyvelox.pyvelox as pv from deepdiff import DeepDiff +import pyvelox.pyvelox as pv # Utility to export and diff function signatures. @@ -31,8 +34,44 @@ class bcolors: BOLD = "\033[1m" +aggregate_pattern = re.compile("(.*)(_merge|_merge_extract|_partial)") + + +def get_error_string(error_message): + return f""" +Incompatible changes in function signatures have been detected. + +{error_message} + +Changing or removing function signatures breaks backwards compatibility as some users may rely on function signatures that no longer exist. + +""" + + +def set_gh_output(name: str, value: Any): + """Sets a Github Actions output variable. Only single line values are supported. 
+ value will be converted to a lower case string.""" + value = str(value).lower() + + if "\n" in value: + raise ValueError("Only single line values are supported.") + + with open(os.environ["GITHUB_OUTPUT"], "a") as f: + f.write(f"{name}={value}\n") + + +def show_error(error_message, error_path): + if error_path: + with open(error_path, "a+") as f: + f.writelines(get_error_string(error_message)) + + print(get_error_string(error_message)) + + def export(args): """Exports Velox function signatures.""" + pv.clear_signatures() + if args.spark: pv.register_spark_signatures() @@ -41,6 +80,30 @@ def export(args): signatures = pv.get_function_signatures() + # Convert signatures to json + jsoned_signatures = {} + for key in signatures.keys(): + jsoned_signatures[key] = [str(value) for value in signatures[key]] + + # Persist to file + with open(args.output_file, "w") as f: + json.dump(jsoned_signatures, f) + + return 0 + + +def export_aggregates(args): + """Exports Velox Aggregate function signatures.""" + pv.clear_aggregate_signatures() + + if args.spark: + pv.register_spark_aggregate_signatures() + + if args.presto: + pv.register_presto_aggregate_signatures() + + signatures = pv.get_aggregate_function_signatures() + # Convert signatures to json jsoned_signatures = {} for key in signatures.keys(): @@ -51,104 +114,190 @@ def export(args): return 0 -def diff_signatures(base_signatures, contender_signatures): +def diff_signatures(base_signatures, contender_signatures, error_path=""): """Diffs Velox function signatures. Returns a tuple of the delta diff and exit status""" delta = DeepDiff( base_signatures, contender_signatures, ignore_order=True, + cutoff_distance_for_pairs=0.9, report_repetition=True, view="tree", ) exit_status = 0 if delta: if "dictionary_item_removed" in delta: - print( - f"Signature removed: {bcolors.FAIL}{delta['dictionary_item_removed']}" - ) + error_message = "" + for dic_removed in delta["dictionary_item_removed"]: + error_message += ( + f"""Function '{dic_removed.get_root_key()}' has been removed.\n""" + ) + show_error(error_message, error_path) exit_status = 1 if "values_changed" in delta: - print(f"Signature changed: {bcolors.FAIL}{delta['values_changed']}") + error_message = "" + for value_change in delta["values_changed"]: + error_message += f"""'{value_change.get_root_key()}{value_change.t1}' is changed to '{value_change.get_root_key()}{value_change.t2}'.\n""" + show_error(error_message, error_path) exit_status = 1 if "repetition_change" in delta: - print(f"Signature repeated: {bcolors.FAIL}{delta['repetition_change']}") + error_message = "" + for rep_change in delta["repetition_change"]: + error_message += f"""'{rep_change.get_root_key()}{rep_change.t1}' is repeated {rep_change.repetition['new_repeat']} times.\n""" + show_error(error_message, error_path) exit_status = 1 if "iterable_item_removed" in delta: - print( - f"Iterable item removed: {bcolors.FAIL}{delta['iterable_item_removed']}" - ) + error_message = "" + for iter_change in delta["iterable_item_removed"]: + error_message += f"""{iter_change.get_root_key()} has its function signature '{iter_change.t1}' removed.\n""" + show_error(error_message, error_path) exit_status = 1 - print(f"Found differences: {bcolors.OKGREEN}{delta}") - else: print(f"{bcolors.BOLD}No differences found.") - if exit_status: - print( - f""" - {bcolors.BOLD}Incompatible changes in function signatures have been detected. - This means your changes have modified function signatures and possibly broken backwards compatibility. 
-    """
-        )
-
     return delta, exit_status
 
 
 def diff(args):
     """Diffs Velox function signatures."""
-    base_signatures = json.load(args.base)
-    contender_signatures = json.load(args.contender)
+    with open(args.base) as f:
+        base_signatures = json.load(f)
+
+    with open(args.contender) as f:
+        contender_signatures = json.load(f)
 
     return diff_signatures(base_signatures, contender_signatures)[1]
 
 
 def bias(args):
-    base_signatures = json.load(args.base)
-    contender_signatures = json.load(args.contender)
+    with open(args.base) as f:
+        base_signatures = json.load(f)
+
+    with open(args.contender) as f:
+        contender_signatures = json.load(f)
+
     tickets = args.ticket_value
     bias_output, status = bias_signatures(
-        base_signatures, contender_signatures, tickets
+        base_signatures, contender_signatures, tickets, args.error_path
     )
-    if status:
-        return status
 
     if bias_output:
         with open(args.output_path, "w") as f:
             print(f"{bias_output}", file=f, end="")
 
-    return 0
+    return status
 
 
-def bias_signatures(base_signatures, contender_signatures, tickets):
+def bias_signatures(base_signatures, contender_signatures, tickets, error_path):
     """Returns newly added functions as string and a status flag.
 
     Newly added functions are biased like so `fn_name1=,fn_name2=`.
-    If it detects incompatible changes returns 1 in the status and empty string.
+    If it detects incompatible changes returns 1 in the status.
     """
-    delta, status = diff_signatures(base_signatures, contender_signatures)
-
-    # Return if the signature check call flags incompatible changes.
-    if status:
-        return "", status
+    delta, status = diff_signatures(base_signatures, contender_signatures, error_path)
 
     if not delta:
         print(f"{bcolors.BOLD} No changes detected: Nothing to do!")
-        return "", 0
+        return "", status
 
     function_set = set()
     for items in delta.values():
         for item in items:
             function_set.add(item.get_root_key())
 
     print(f"{bcolors.BOLD}Functions to be biased: {function_set}")
 
+    if function_set:
+        return f"{f'={tickets},'.join(sorted(function_set)) + f'={tickets}'}", status
+
+    return "", status
+
+
+def bias_aggregates(args):
+    """
+    Finds and exports aggregates whose signatures have been modified against a baseline.
+    Saves the results to a file and sets a Github Actions Output.
+    Currently this is hardcoded to presto aggregates.
+    """
+    with open(args.base) as f:
+        base_signatures = json.load(f)
+
+    with open(args.contender) as f:
+        contender_signatures = json.load(f)
+
+    delta, status = diff_signatures(
+        base_signatures, contender_signatures, args.error_path
+    )
+
+    set_gh_output("presto_aggregate_error", status == 1)
+
+    if not delta:
+        print(f"{bcolors.BOLD} No changes detected: Nothing to do!")
+        return status
+
+    function_set = set()
+    for items in delta.values():
+        for item in items:
+            fn_name = item.get_root_key()
+            pattern = aggregate_pattern.match(fn_name)
+            if pattern:
+                function_set.add(pattern.group(1))
+            else:
+                function_set.add(fn_name)
 
     if function_set:
-        return f"{f'={tickets},'.join(sorted(function_set)) + f'={tickets}'}", 0
+        biased_functions = ",".join(function_set)
+        with open(args.output_path, "w") as f:
+            print(f"{biased_functions}", file=f, end="")
+
+        set_gh_output("presto_aggregate_functions", True)
+
+    return 0
+
+
+def gh_bias_check(args):
+    """
+    Exports signatures for the given group(s) and checks them for changes compared to a baseline.
+    Saves the results to a file and sets a Github Actions Output for each group.
+ """ + if not os.getenv("GITHUB_ACTIONS"): + print("This command is meant to be run in a Github Actions environment.") + return 1 + + # export signatures for each group + for group in args.group: + print(f"Exporting {group} signatures...") + export_args = parse_args( + [ + "export", + f"--{group}", + os.path.join(args.signature_dir, group + args.contender_postfix), + ] + ) + export(export_args) + + # compare signatures for each group + for group in args.group: + print(f"Comparing {group} signatures...") + bias_args = parse_args( + [ + "bias", + os.path.join(args.signature_dir, group + args.base_postfix), + os.path.join(args.signature_dir, group + args.contender_postfix), + os.path.join(args.signature_dir, group + args.output_postfix), + os.path.join(args.signature_dir, group + "_errors"), + ] + ) - return "", 0 + bias_status = bias(bias_args) + set_gh_output(f"{group}_error", bias_status == 1) + + # check if there are any changes that require the bias fuzzer to run + has_tickets = os.path.isfile( + os.path.join(args.signature_dir, group + args.output_postfix) + ) + set_gh_output(f"{group}_functions", has_tickets) def get_tickets(val): @@ -169,20 +318,55 @@ def parse_args(args): command = parser.add_subparsers(dest="command") export_command_parser = command.add_parser("export") export_command_parser.add_argument("--spark", action="store_true") - export_command_parser.add_argument("--presto", action="store_false") - export_command_parser.add_argument("output_file", type=argparse.FileType("w")) + export_command_parser.add_argument("--presto", action="store_true") + export_command_parser.add_argument("output_file", type=str) + + export_aggregates_command_parser = command.add_parser("export_aggregates") + export_aggregates_command_parser.add_argument("--spark", action="store_true") + export_aggregates_command_parser.add_argument("--presto", action="store_true") + export_aggregates_command_parser.add_argument( + "output_file", type=argparse.FileType("w") + ) diff_command_parser = command.add_parser("diff") - diff_command_parser.add_argument("base", type=argparse.FileType("r")) - diff_command_parser.add_argument("contender", type=argparse.FileType("r")) + diff_command_parser.add_argument("base", type=str) + diff_command_parser.add_argument("contender", type=str) bias_command_parser = command.add_parser("bias") - bias_command_parser.add_argument("base", type=argparse.FileType("r")) - bias_command_parser.add_argument("contender", type=argparse.FileType("r")) - bias_command_parser.add_argument("output_path") + bias_command_parser.add_argument("base", type=str) + bias_command_parser.add_argument("contender", type=str) + bias_command_parser.add_argument("output_path", type=str) bias_command_parser.add_argument( "ticket_value", type=get_tickets, default=10, nargs="?" ) + bias_command_parser.add_argument("error_path", type=str, default="") + + gh_command_parser = command.add_parser("gh_bias_check") + gh_command_parser.add_argument( + "group", + nargs="+", + help='One or more group names to check for changed signatures. e.g. 
"spark" or "presto"', + type=str, + ) + gh_command_parser.add_argument( + "--signature_dir", type=str, default="/tmp/signatures" + ) + gh_command_parser.add_argument( + "--base_postfix", type=str, default="_signatures_main.json" + ) + gh_command_parser.add_argument( + "--contender_postfix", type=str, default="_signatures_contender.json" + ) + gh_command_parser.add_argument( + "--output_postfix", type=str, default="_bias_functions" + ) + + bias_aggregate_command_parser = command.add_parser("bias_aggregates") + bias_aggregate_command_parser.add_argument("base", type=str) + bias_aggregate_command_parser.add_argument("contender", type=str) + bias_aggregate_command_parser.add_argument("output_path", type=str) + bias_aggregate_command_parser.add_argument("error_path", type=str, default="") + parser.set_defaults(command="help") return parser.parse_args(args) diff --git a/scripts/spark-container.dockerfile b/scripts/spark-container.dockerfile new file mode 100644 index 0000000000000..7b7df7910c794 --- /dev/null +++ b/scripts/spark-container.dockerfile @@ -0,0 +1,45 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Build the test and build container for presto_cpp +# +FROM ghcr.io/facebookincubator/velox-dev:centos9 + +ARG SPARK_VERSION=3.5.1 + +ADD scripts /velox/scripts/ +RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz +RUN wget https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/${SPARK_VERSION}/spark-connect_2.12-${SPARK_VERSION}.jar + +ARG SPARK_PKG=spark-${SPARK_VERSION}-bin-hadoop3.tgz +ARG SPARK_CONNECT_JAR=spark-connect_2.12-${SPARK_VERSION}.jar + +ENV SPARK_HOME="/opt/spark-server" + +RUN dnf install -y java-11-openjdk less procps python3 tzdata \ + && ln -s $(which python3) /usr/bin/python \ + && tar -zxf $SPARK_PKG \ + && mv ./spark-${SPARK_VERSION}-bin-hadoop3 $SPARK_HOME \ + && mkdir ${SPARK_HOME}/misc/ \ + && mv ./$SPARK_CONNECT_JAR ${SPARK_HOME}/misc/ + +# We set the timezone to America/Los_Angeles due to issue +# detailed here : https://github.com/facebookincubator/velox/issues/8127 +ENV TZ=America/Los_Angeles + +COPY scripts/spark/conf/spark-defaults.conf.example $SPARK_HOME/conf/spark-defaults.conf +COPY scripts/spark/conf/spark-env.sh.example $SPARK_HOME/conf/spark-env.sh +COPY scripts/spark/conf/workers.example $SPARK_HOME/conf/workers +COPY scripts/spark/start-spark.sh /opt + +WORKDIR /velox diff --git a/scripts/spark/conf/spark-defaults.conf.example b/scripts/spark/conf/spark-defaults.conf.example new file mode 100644 index 0000000000000..5b008b4480144 --- /dev/null +++ b/scripts/spark/conf/spark-defaults.conf.example @@ -0,0 +1 @@ +spark.master local[*] diff --git a/scripts/spark/conf/spark-env.sh.example b/scripts/spark/conf/spark-env.sh.example new file mode 100644 index 0000000000000..8cd004a86130f --- /dev/null +++ b/scripts/spark/conf/spark-env.sh.example @@ -0,0 +1 @@ +export SPARK_DAEMON_MEMORY=5g diff --git 
a/scripts/spark/conf/workers.example b/scripts/spark/conf/workers.example new file mode 100644 index 0000000000000..2fbb50c4a8dc7 --- /dev/null +++ b/scripts/spark/conf/workers.example @@ -0,0 +1 @@ +localhost diff --git a/scripts/spark/start-spark.sh b/scripts/spark/start-spark.sh new file mode 100755 index 0000000000000..7aca5999f8f94 --- /dev/null +++ b/scripts/spark/start-spark.sh @@ -0,0 +1,18 @@ +#!/bin/sh +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +"$SPARK_HOME"/sbin/start-connect-server.sh --jars "$SPARK_HOME"/misc/spark-connect_2.12-3.5.1.jar diff --git a/scripts/tests/data/foo.almost.cpp b/scripts/tests/data/foo.almost.cpp index 2804910f6e73a..e0d9ceefb65cb 100644 --- a/scripts/tests/data/foo.almost.cpp +++ b/scripts/tests/data/foo.almost.cpp @@ -14,12 +14,12 @@ */ import argparse import fnmatch import os import regex import sys - class attrdict(dict) : __getattr__ = dict.__getitem__ __setattr__ = dict.__setitem__ + class attrdict(dict) :__getattr__ = dict.__getitem__ __setattr__ = dict.__setitem__ - def parse_args() : parser = argparse.ArgumentParser(description = 'Update license headers') parser.add_argument('--header', default = 'license.header', help = 'header file') parser.add_argument('--extra', default = 30, help = 'extra characters past beginning of file to look for header') parser.add_argument('--editdist', default = 7, help = 'max edit distance between headers') parser.add_argument('--remove', default = False, action = "store_true", help = 'remove the header') parser.add_argument('--cslash', default = False, action = "store_true", help = 'use C slash "//" style comments') parser.add_argument('-v', default = False, action = "store_true", dest = "verbose", help = 'verbose output') + def parse_args() :parser = argparse.ArgumentParser(description = 'Update license headers') parser.add_argument('--header', default = 'license.header', help = 'header file') parser.add_argument('--extra', default = 30, help = 'extra characters past beginning of file to look for header') parser.add_argument('--editdist', default = 7, help = 'max edit distance between headers') parser.add_argument('--remove', default = False, action = "store_true", help = 'remove the header') parser.add_argument('--cslash', default = False, action = "store_true", help = 'use C slash "//" style comments') parser.add_argument('-v', default = False, action = "store_true", dest = "verbose", help = 'verbose output') - group = parser.add_mutually_exclusive_group() group.add_argument('-k', default = False, action = "store_true", dest = "check", help = 'check headers') group.add_argument('-i', default = False, action = "store_true", dest = "inplace", help = 'edit file inplace') + group = parser.add_mutually_exclusive_group() group.add_argument('-k', default = False, action = "store_true", dest = "check", help = 'check headers') group.add_argument('-i', default = False, action = "store_true", dest = "inplace", help = 'edit file inplace') - 
parser.add_argument('files', metavar = 'FILES', nargs = '+', help = 'files to process') + parser.add_argument('files', metavar = 'FILES', nargs = '+', help = 'files to process') - return parser.parse_args() + return parser.parse_args() diff --git a/scripts/tests/data/foo.cpp b/scripts/tests/data/foo.cpp index f4f78dfbffbe2..d631de2ff2e1d 100644 --- a/scripts/tests/data/foo.cpp +++ b/scripts/tests/data/foo.cpp @@ -1,11 +1,11 @@ import argparse import fnmatch import os import regex import sys - class attrdict(dict) : __getattr__ = dict.__getitem__ __setattr__ = dict.__setitem__ + class attrdict(dict) :__getattr__ = dict.__getitem__ __setattr__ = dict.__setitem__ - def parse_args() : parser = argparse.ArgumentParser(description = 'Update license headers') parser.add_argument('--header', default = 'license.header', help = 'header file') parser.add_argument('--extra', default = 30, help = 'extra characters past beginning of file to look for header') parser.add_argument('--editdist', default = 7, help = 'max edit distance between headers') parser.add_argument('--remove', default = False, action = "store_true", help = 'remove the header') parser.add_argument('--cslash', default = False, action = "store_true", help = 'use C slash "//" style comments') parser.add_argument('-v', default = False, action = "store_true", dest = "verbose", help = 'verbose output') + def parse_args() :parser = argparse.ArgumentParser(description = 'Update license headers') parser.add_argument('--header', default = 'license.header', help = 'header file') parser.add_argument('--extra', default = 30, help = 'extra characters past beginning of file to look for header') parser.add_argument('--editdist', default = 7, help = 'max edit distance between headers') parser.add_argument('--remove', default = False, action = "store_true", help = 'remove the header') parser.add_argument('--cslash', default = False, action = "store_true", help = 'use C slash "//" style comments') parser.add_argument('-v', default = False, action = "store_true", dest = "verbose", help = 'verbose output') - group = parser.add_mutually_exclusive_group() group.add_argument('-k', default = False, action = "store_true", dest = "check", help = 'check headers') group.add_argument('-i', default = False, action = "store_true", dest = "inplace", help = 'edit file inplace') + group = parser.add_mutually_exclusive_group() group.add_argument('-k', default = False, action = "store_true", dest = "check", help = 'check headers') group.add_argument('-i', default = False, action = "store_true", dest = "inplace", help = 'edit file inplace') - parser.add_argument('files', metavar = 'FILES', nargs = '+', help = 'files to process') + parser.add_argument('files', metavar = 'FILES', nargs = '+', help = 'files to process') - return parser.parse_args() + return parser.parse_args() diff --git a/scripts/tests/data/foo.expected.cpp b/scripts/tests/data/foo.expected.cpp index e3f75b39c6a5a..31b2bce9b2833 100644 --- a/scripts/tests/data/foo.expected.cpp +++ b/scripts/tests/data/foo.expected.cpp @@ -13,12 +13,12 @@ */ import argparse import fnmatch import os import regex import sys - class attrdict(dict) : __getattr__ = dict.__getitem__ __setattr__ = dict.__setitem__ + class attrdict(dict) :__getattr__ = dict.__getitem__ __setattr__ = dict.__setitem__ - def parse_args() : parser = argparse.ArgumentParser(description = 'Update license headers') parser.add_argument('--header', default = 'license.header', help = 'header file') parser.add_argument('--extra', default = 30, help = 'extra 
characters past beginning of file to look for header') parser.add_argument('--editdist', default = 7, help = 'max edit distance between headers') parser.add_argument('--remove', default = False, action = "store_true", help = 'remove the header') parser.add_argument('--cslash', default = False, action = "store_true", help = 'use C slash "//" style comments') parser.add_argument('-v', default = False, action = "store_true", dest = "verbose", help = 'verbose output') + def parse_args() :parser = argparse.ArgumentParser(description = 'Update license headers') parser.add_argument('--header', default = 'license.header', help = 'header file') parser.add_argument('--extra', default = 30, help = 'extra characters past beginning of file to look for header') parser.add_argument('--editdist', default = 7, help = 'max edit distance between headers') parser.add_argument('--remove', default = False, action = "store_true", help = 'remove the header') parser.add_argument('--cslash', default = False, action = "store_true", help = 'use C slash "//" style comments') parser.add_argument('-v', default = False, action = "store_true", dest = "verbose", help = 'verbose output') - group = parser.add_mutually_exclusive_group() group.add_argument('-k', default = False, action = "store_true", dest = "check", help = 'check headers') group.add_argument('-i', default = False, action = "store_true", dest = "inplace", help = 'edit file inplace') + group = parser.add_mutually_exclusive_group() group.add_argument('-k', default = False, action = "store_true", dest = "check", help = 'check headers') group.add_argument('-i', default = False, action = "store_true", dest = "inplace", help = 'edit file inplace') - parser.add_argument('files', metavar = 'FILES', nargs = '+', help = 'files to process') + parser.add_argument('files', metavar = 'FILES', nargs = '+', help = 'files to process') - return parser.parse_args() + return parser.parse_args() diff --git a/scripts/tests/data/foo.expected.h b/scripts/tests/data/foo.expected.h index e3f75b39c6a5a..31b2bce9b2833 100644 --- a/scripts/tests/data/foo.expected.h +++ b/scripts/tests/data/foo.expected.h @@ -13,12 +13,12 @@ */ import argparse import fnmatch import os import regex import sys - class attrdict(dict) : __getattr__ = dict.__getitem__ __setattr__ = dict.__setitem__ + class attrdict(dict) :__getattr__ = dict.__getitem__ __setattr__ = dict.__setitem__ - def parse_args() : parser = argparse.ArgumentParser(description = 'Update license headers') parser.add_argument('--header', default = 'license.header', help = 'header file') parser.add_argument('--extra', default = 30, help = 'extra characters past beginning of file to look for header') parser.add_argument('--editdist', default = 7, help = 'max edit distance between headers') parser.add_argument('--remove', default = False, action = "store_true", help = 'remove the header') parser.add_argument('--cslash', default = False, action = "store_true", help = 'use C slash "//" style comments') parser.add_argument('-v', default = False, action = "store_true", dest = "verbose", help = 'verbose output') + def parse_args() :parser = argparse.ArgumentParser(description = 'Update license headers') parser.add_argument('--header', default = 'license.header', help = 'header file') parser.add_argument('--extra', default = 30, help = 'extra characters past beginning of file to look for header') parser.add_argument('--editdist', default = 7, help = 'max edit distance between headers') parser.add_argument('--remove', default = False, action = 
"store_true", help = 'remove the header') parser.add_argument('--cslash', default = False, action = "store_true", help = 'use C slash "//" style comments') parser.add_argument('-v', default = False, action = "store_true", dest = "verbose", help = 'verbose output') - group = parser.add_mutually_exclusive_group() group.add_argument('-k', default = False, action = "store_true", dest = "check", help = 'check headers') group.add_argument('-i', default = False, action = "store_true", dest = "inplace", help = 'edit file inplace') + group = parser.add_mutually_exclusive_group() group.add_argument('-k', default = False, action = "store_true", dest = "check", help = 'check headers') group.add_argument('-i', default = False, action = "store_true", dest = "inplace", help = 'edit file inplace') - parser.add_argument('files', metavar = 'FILES', nargs = '+', help = 'files to process') + parser.add_argument('files', metavar = 'FILES', nargs = '+', help = 'files to process') - return parser.parse_args() + return parser.parse_args() diff --git a/scripts/tests/data/foo.h b/scripts/tests/data/foo.h index f4f78dfbffbe2..d631de2ff2e1d 100644 --- a/scripts/tests/data/foo.h +++ b/scripts/tests/data/foo.h @@ -1,11 +1,11 @@ import argparse import fnmatch import os import regex import sys - class attrdict(dict) : __getattr__ = dict.__getitem__ __setattr__ = dict.__setitem__ + class attrdict(dict) :__getattr__ = dict.__getitem__ __setattr__ = dict.__setitem__ - def parse_args() : parser = argparse.ArgumentParser(description = 'Update license headers') parser.add_argument('--header', default = 'license.header', help = 'header file') parser.add_argument('--extra', default = 30, help = 'extra characters past beginning of file to look for header') parser.add_argument('--editdist', default = 7, help = 'max edit distance between headers') parser.add_argument('--remove', default = False, action = "store_true", help = 'remove the header') parser.add_argument('--cslash', default = False, action = "store_true", help = 'use C slash "//" style comments') parser.add_argument('-v', default = False, action = "store_true", dest = "verbose", help = 'verbose output') + def parse_args() :parser = argparse.ArgumentParser(description = 'Update license headers') parser.add_argument('--header', default = 'license.header', help = 'header file') parser.add_argument('--extra', default = 30, help = 'extra characters past beginning of file to look for header') parser.add_argument('--editdist', default = 7, help = 'max edit distance between headers') parser.add_argument('--remove', default = False, action = "store_true", help = 'remove the header') parser.add_argument('--cslash', default = False, action = "store_true", help = 'use C slash "//" style comments') parser.add_argument('-v', default = False, action = "store_true", dest = "verbose", help = 'verbose output') - group = parser.add_mutually_exclusive_group() group.add_argument('-k', default = False, action = "store_true", dest = "check", help = 'check headers') group.add_argument('-i', default = False, action = "store_true", dest = "inplace", help = 'edit file inplace') + group = parser.add_mutually_exclusive_group() group.add_argument('-k', default = False, action = "store_true", dest = "check", help = 'check headers') group.add_argument('-i', default = False, action = "store_true", dest = "inplace", help = 'edit file inplace') - parser.add_argument('files', metavar = 'FILES', nargs = '+', help = 'files to process') + parser.add_argument('files', metavar = 'FILES', nargs = '+', help = 
'files to process') - return parser.parse_args() + return parser.parse_args() diff --git a/scripts/tests/test_signature.py b/scripts/tests/test_signature.py index a32c1121c0b72..e003fb5e97309 100644 --- a/scripts/tests/test_signature.py +++ b/scripts/tests/test_signature.py @@ -13,9 +13,11 @@ # limitations under the License. import unittest -from scripts.signature import bias_signatures +import unittest.mock +from scripts.signature import bias_signatures, get_error_string from pathlib import Path import json +import io def read_from_file(file_path): @@ -65,6 +67,54 @@ def test_bias(self): self.assertEqual(bias_functions, "bar=10,foo=10") + @unittest.mock.patch("sys.stdout", new_callable=io.StringIO) + def get_bias_messaging(self, base_signatures, contender_signatures, mock_stdout): + test_bias(base_signatures, contender_signatures) + return mock_stdout.getvalue() + + def assert_messaging(self, base_signatures, contender_signatures, expected_message): + test_bias(base_signatures, contender_signatures) + actual = self.get_bias_messaging(base_signatures, contender_signatures) + expected = get_error_string(expected_message) + expected += "\n" # Add trailing newline for std output. + self.assertEquals(expected, actual) + + def test_messaging(self): + # Remove a signature + self.assert_messaging( + """{"reverse": ["(array(T)) -> array(T)"]}""", + """{"reverse": []}""", + "reverse has its function signature '(array(T)) -> array(T)' removed.\n", + ) + + # Remove more than one signature + self.assert_messaging( + """{"reverse": ["(array(T)) -> array(T)", "(varchar) -> varchar"]}""", + """{"reverse": []}""", + """reverse has its function signature '(array(T)) -> array(T)' removed.\nreverse has its function signature '(varchar) -> varchar' removed.\n""", + ) + + # Mutate a signature + self.assert_messaging( + """{"reverse": ["(array(T)) -> array(T)"]}""", + """{"reverse": ["(array(T)) -> array(varchar)"]}""", + """'reverse(array(T)) -> array(T)' is changed to 'reverse(array(T)) -> array(varchar)'.\n""", + ) + + # Function repeated + self.assert_messaging( + """{"reverse": ["(array(T)) -> array(T)"]}""", + """{"reverse": ["(array(T)) -> array(T)", "(array(T)) -> array(T)"]}""", + "'reverse(array(T)) -> array(T)' is repeated 2 times.\n", + ) + + # Remove a udf + self.assert_messaging( + """{"reverse": ["(array(T)) -> array(T)"]}""", + """{}""", + "Function 'reverse' has been removed.\n", + ) + if __name__ == "__main__": unittest.main() diff --git a/scripts/ubuntu-22.04-cpp.dockerfile b/scripts/ubuntu-22.04-cpp.dockerfile index 2aa1ee7cbb3ff..ea98a0f64b9eb 100644 --- a/scripts/ubuntu-22.04-cpp.dockerfile +++ b/scripts/ubuntu-22.04-cpp.dockerfile @@ -11,10 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -ARG base=amd64/ubuntu:22.04 -# Set a default timezone, can be overriden via ARG -ARG tz="Europe/Madrid" - +ARG base=ubuntu:22.04 FROM ${base} SHELL ["/bin/bash", "-o", "pipefail", "-c"] @@ -32,6 +29,8 @@ ADD scripts /velox/scripts/ # are required to avoid tzdata installation # to prompt for region selection. 
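Two of the formats exercised by the tests above are compact enough to pin down with a toy example: the `name=tickets` bias string that `test_bias` asserts on, and the suffix collapsing that `bias_aggregates` applies to companion aggregate signatures. Plain Python, with invented values:

```
import re

# The bias string: each changed function gets "=<tickets>" appended and
# entries are comma-separated in sorted order.
tickets = 10
function_set = {"foo", "bar"}
bias = f"{f'={tickets},'.join(sorted(function_set)) + f'={tickets}'}"
assert bias == "bar=10,foo=10"

# bias_aggregates maps companion names such as avg_partial/avg_merge back
# to the base aggregate before biasing the fuzzer.
aggregate_pattern = re.compile("(.*)(_merge|_merge_extract|_partial)")
for name in ("avg_partial", "avg_merge", "avg"):
    m = aggregate_pattern.match(name)
    print(m.group(1) if m else name)  # prints "avg" three times
```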
ARG DEBIAN_FRONTEND="noninteractive" +# Set a default timezone, can be overriden via ARG +ARG tz="Etc/UTC" ENV TZ=${tz} RUN /velox/scripts/setup-ubuntu.sh diff --git a/scripts/velox-torcharrow-container.dockfile b/scripts/velox-torcharrow-container.dockfile deleted file mode 100644 index f4498101cfb9d..0000000000000 --- a/scripts/velox-torcharrow-container.dockfile +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Build container to be used for TorchArrow. - -FROM quay.io/pypa/manylinux2014_x86_64 -ARG cpu_target -COPY setup-velox-torcharrow.sh / -COPY setup-helper-functions.sh / -RUN mkdir build && ( cd build && CPU_TARGET="$cpu_target" bash /setup-velox-torcharrow.sh ) && rm -rf build diff --git a/scripts/velox_env_linux.yml b/scripts/velox_env_linux.yml new file mode 100644 index 0000000000000..fc0b4cf0c40fe --- /dev/null +++ b/scripts/velox_env_linux.yml @@ -0,0 +1,67 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: velox_base + +channels: + - conda-forge + +variables: + CC: clang + CXX: clang++ + +dependencies: +# tools + - binutils + - bison + - clangxx=14 + - cmake=3.28.3 + - ccache + - flex + - gxx=12 # has to be installed to get clang to work... + - make + - minio-server + - ninja + - nodejs + - openjdk=8.* + - python=3.8 + - sysroot_linux-64=2.17 +# dependencies + - aws-sdk-cpp + - azure-storage-blobs-cpp + - boost-cpp + - bzip2 + - double-conversion + - fmt=8.0.* + - gflags=2.2.2 + - glog=0.6.0 + - gmock=1.13 + - google-cloud-cpp + - gtest=1.13 + - libaio + - libdwarf-dev + - libevent + - libprotobuf=3.21 + - libsodium + - libtool + - libunwind + - lz4-c + - lzo + - openssl=1.1 + - re2 + - simdjson + - snappy + - xz + - zlib + - zstd diff --git a/scripts/velox_env_mac.yml b/scripts/velox_env_mac.yml new file mode 100644 index 0000000000000..a999f18d0d6b0 --- /dev/null +++ b/scripts/velox_env_mac.yml @@ -0,0 +1,66 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +name: velox_base + +channels: + - conda-forge + +variables: + CC: clang + CXX: clang++ + +dependencies: +# tools + - binutils + - bison + - clangxx=14 # pin to something recent'ish to avoid warings on upgrade + - cmake=3.28 + - ccache + - flex + - make + - minio-server + - ninja + - nodejs + - openjdk=8.* + - python=3.8 + - sysroot_linux-64=2.17 +# dependencies + - aws-sdk-cpp + - azure-storage-blobs-cpp + - boost-cpp + - bzip2 + - crc32c + - double-conversion + - fmt=8.0.* + - gflags=2.2.2 + - glog=0.6.0 + - gmock=1.13 + - google-cloud-cpp + - gtest=1.13 + - libdwarf-dev + - libevent + - libprotobuf=3.21 + - libsodium + - libtool + - lz4-c + - lzo + - openssl=1.1.* + - re2 + - snappy + - simdjson + - xz + - zlib + - zstd + diff --git a/setup.py b/setup.py index 8ec26c7d3aab7..40f8ae9645bfe 100644 --- a/setup.py +++ b/setup.py @@ -109,6 +109,16 @@ def run(self): def build_extension(self, ext): extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) + # Allow using a pre-built Velox library (for CI and development) e.g. 'VELOX_BUILD_DIR=_build/velox/debug' + # The build in question must have been built with 'VELOX_BUILD_PYTHON_PACKAGE=ON' and the same python version. + if "VELOX_BUILD_DIR" in os.environ: + velox_dir = os.path.abspath(os.environ["VELOX_BUILD_DIR"]) + + if not os.path.isdir(extdir): + os.symlink(velox_dir, os.path.dirname(extdir), target_is_directory=True) + + print(f"Using pre-built Velox library from {velox_dir}") + return # required for auto-detection of auxiliary "native" libs if not extdir.endswith(os.path.sep): @@ -126,9 +136,9 @@ def build_extension(self, ext): f"-DCMAKE_BUILD_TYPE={cfg}", f"-DCMAKE_INSTALL_PREFIX={extdir}", "-DCMAKE_VERBOSE_MAKEFILE=ON", + "-DVELOX_BUILD_MINIMAL=ON", "-DVELOX_BUILD_PYTHON_PACKAGE=ON", f"-DPYTHON_EXECUTABLE={exec_path} ", - "-DVELOX_CODEGEN_SUPPORT=OFF", ] build_args = [] @@ -149,9 +159,9 @@ def build_extension(self, ext): os.makedirs(self.build_temp) subprocess.check_call( - ["cmake", str(ROOT_DIR)] + cmake_args, cwd=self.build_temp + ["cmake", str(ROOT_DIR)] + cmake_args, + cwd=self.build_temp, ) - print(self.build_temp) subprocess.check_call( ["cmake", "--build", "."] + build_args, cwd=self.build_temp ) diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt deleted file mode 100644 index 570af4593e721..0000000000000 --- a/third_party/CMakeLists.txt +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
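The `VELOX_BUILD_DIR` shortcut added to setup.py above is easy to misread, so here it is stripped of the setuptools plumbing. The paths are illustrative, not the real layout:

```
import os

# Sketch of the setup.py fast path: when VELOX_BUILD_DIR names a tree built
# with VELOX_BUILD_PYTHON_PACKAGE=ON (and the same Python version), link it
# into place instead of configuring and compiling with CMake again.
ext_dir = os.path.abspath("build/lib/pyvelox")  # hypothetical extension dir

if "VELOX_BUILD_DIR" in os.environ:
    velox_dir = os.path.abspath(os.environ["VELOX_BUILD_DIR"])
    if not os.path.isdir(ext_dir):
        # Link the parent directory, standing in for the copy a build would
        # normally produce there.
        os.symlink(velox_dir, os.path.dirname(ext_dir), target_is_directory=True)
    print(f"Using pre-built Velox library from {velox_dir}")
```

An invocation would then look like `VELOX_BUILD_DIR=_build/velox/debug pip install .`, matching the example path quoted in the diff comment.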
- -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} - "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules/") - -include(ExternalProject) - -if(VELOX_ENABLE_ARROW) - find_package(Thrift) - if(Thrift_FOUND) - set(THRIFT_SOURCE "SYSTEM") - else() - set(THRIFT_SOURCE "BUNDLED") - endif() - set(ARROW_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/arrow_ep") - set(ARROW_CMAKE_ARGS - -DARROW_PARQUET=ON - -DARROW_WITH_LZ4=ON - -DARROW_WITH_SNAPPY=ON - -DARROW_WITH_ZLIB=ON - -DARROW_WITH_ZSTD=ON - -DARROW_JEMALLOC=OFF - -DARROW_SIMD_LEVEL=NONE - -DARROW_RUNTIME_SIMD_LEVEL=NONE - -DARROW_WITH_UTF8PROC=OFF - -DARROW_TESTING=ON - -DCMAKE_INSTALL_PREFIX=${ARROW_PREFIX}/install - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DARROW_BUILD_STATIC=ON - -DThrift_SOURCE=${THRIFT_SOURCE}) - set(ARROW_LIBDIR ${ARROW_PREFIX}/install/${CMAKE_INSTALL_LIBDIR}) - - add_library(thrift STATIC IMPORTED GLOBAL) - if(NOT Thrift_FOUND) - set(THRIFT_ROOT ${ARROW_PREFIX}/src/arrow_ep-build/thrift_ep-install) - set(THRIFT_LIB ${THRIFT_ROOT}/lib/libthrift.a) - - file(MAKE_DIRECTORY ${THRIFT_ROOT}/include) - set(THRIFT_INCLUDE_DIR ${THRIFT_ROOT}/include) - endif() - - set_property(TARGET thrift PROPERTY INTERFACE_INCLUDE_DIRECTORIES - ${THRIFT_INCLUDE_DIR}) - set_property(TARGET thrift PROPERTY IMPORTED_LOCATION ${THRIFT_LIB}) - - set(VELOX_ARROW_BUILD_VERSION 13.0.0) - set(VELOX_ARROW_BUILD_SHA256_CHECKSUM - 35dfda191262a756be934eef8afee8d09762cad25021daa626eb249e251ac9e6) - set(VELOX_ARROW_SOURCE_URL - "https://archive.apache.org/dist/arrow/arrow-${VELOX_ARROW_BUILD_VERSION}/apache-arrow-${VELOX_ARROW_BUILD_VERSION}.tar.gz" - ) - - resolve_dependency_url(ARROW) - - ExternalProject_Add( - arrow_ep - PREFIX ${ARROW_PREFIX} - URL ${VELOX_ARROW_SOURCE_URL} - URL_HASH ${VELOX_ARROW_BUILD_SHA256_CHECKSUM} - SOURCE_SUBDIR cpp - CMAKE_ARGS ${ARROW_CMAKE_ARGS} - BUILD_BYPRODUCTS ${ARROW_LIBDIR}/libarrow.a ${ARROW_LIBDIR}/libparquet.a - ${ARROW_LIBDIR}/libarrow_testing.a ${THRIFT_LIB}) - add_library(arrow STATIC IMPORTED GLOBAL) - add_library(arrow_testing STATIC IMPORTED GLOBAL) - add_library(parquet STATIC IMPORTED GLOBAL) - add_dependencies(arrow arrow_ep) - add_dependencies(arrow_testing arrow) - add_dependencies(parquet arrow) - file(MAKE_DIRECTORY ${ARROW_PREFIX}/install/include) - set_target_properties( - arrow arrow_testing parquet PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - ${ARROW_PREFIX}/install/include) - set_target_properties(arrow PROPERTIES IMPORTED_LOCATION - ${ARROW_LIBDIR}/libarrow.a) - set_property(TARGET arrow PROPERTY INTERFACE_LINK_LIBRARIES ${RE2} thrift) - set_target_properties( - arrow_testing PROPERTIES IMPORTED_LOCATION - ${ARROW_LIBDIR}/libarrow_testing.a) - set_target_properties(parquet PROPERTIES IMPORTED_LOCATION - ${ARROW_LIBDIR}/libparquet.a) - -endif() diff --git a/third_party/cmake_modules/FindThrift.cmake b/third_party/cmake_modules/FindThrift.cmake deleted file mode 100644 index 07028971d9fcc..0000000000000 --- a/third_party/cmake_modules/FindThrift.cmake +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright 2012 Cloudera Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -# - Find Thrift (a cross platform RPC lib/tool) -# -# Variables used by this module, they can change the default behaviour and need -# to be set before calling find_package: -# -# Thrift_ROOT - When set, this path is inspected instead of standard library -# locations as the root of the Thrift installation. -# The environment variable THRIFT_HOME overrides this variable. -# -# This module defines -# Thrift_FOUND, whether Thrift is found or not -# Thrift_COMPILER_FOUND, whether Thrift compiler is found or not -# -# thrift::thrift, a library target to use Thrift -# thrift::compiler, a executable target to use Thrift compiler - -function(EXTRACT_THRIFT_VERSION) - if(THRIFT_INCLUDE_DIR) - file(READ "${THRIFT_INCLUDE_DIR}/thrift/config.h" THRIFT_CONFIG_H_CONTENT) - string(REGEX MATCH "#define PACKAGE_VERSION \"[0-9.]+\"" THRIFT_VERSION_DEFINITION - "${THRIFT_CONFIG_H_CONTENT}") - string(REGEX MATCH "[0-9.]+" Thrift_VERSION "${THRIFT_VERSION_DEFINITION}") - set(Thrift_VERSION - "${Thrift_VERSION}" - PARENT_SCOPE) - else() - set(Thrift_VERSION - "" - PARENT_SCOPE) - endif() -endfunction(EXTRACT_THRIFT_VERSION) - -if(MSVC_TOOLCHAIN AND NOT DEFINED THRIFT_MSVC_LIB_SUFFIX) - if(NOT ARROW_THRIFT_USE_SHARED) - if(ARROW_USE_STATIC_CRT) - if("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") - set(THRIFT_MSVC_LIB_SUFFIX "mtd") - else() - set(THRIFT_MSVC_LIB_SUFFIX "mt") - endif() - else() - if("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") - set(THRIFT_MSVC_LIB_SUFFIX "mdd") - else() - set(THRIFT_MSVC_LIB_SUFFIX "md") - endif() - endif() - endif() -endif() -set(THRIFT_LIB_NAME_BASE "thrift${THRIFT_MSVC_LIB_SUFFIX}") - -if(ARROW_THRIFT_USE_SHARED) - set(THRIFT_LIB_NAMES thrift) - if(CMAKE_IMPORT_LIBRARY_SUFFIX) - list(APPEND - THRIFT_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_IMPORT_LIBRARY_SUFFIX}" - ) - endif() - list(APPEND - THRIFT_LIB_NAMES - "${CMAKE_SHARED_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_SHARED_LIBRARY_SUFFIX}" - ) -else() - set(THRIFT_LIB_NAMES - "${CMAKE_STATIC_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) -endif() - -if(Thrift_ROOT) - find_library(THRIFT_LIB - NAMES ${THRIFT_LIB_NAMES} - PATHS ${Thrift_ROOT} - PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib") - find_path(THRIFT_INCLUDE_DIR thrift/Thrift.h - PATHS ${Thrift_ROOT} - PATH_SUFFIXES "include") - find_program(THRIFT_COMPILER thrift - PATHS ${Thrift_ROOT} - PATH_SUFFIXES "bin") - extract_thrift_version() -else() - # THRIFT-4760: The pkgconfig files are currently only installed when using autotools. - # Starting with 0.13, they are also installed for the CMake-based installations of Thrift. 
- find_package(PkgConfig QUIET) - pkg_check_modules(THRIFT_PC thrift) - if(THRIFT_PC_FOUND) - set(THRIFT_INCLUDE_DIR "${THRIFT_PC_INCLUDEDIR}") - - list(APPEND THRIFT_PC_LIBRARY_DIRS "${THRIFT_PC_LIBDIR}") - - find_library(THRIFT_LIB - NAMES ${THRIFT_LIB_NAMES} - PATHS ${THRIFT_PC_LIBRARY_DIRS} - NO_DEFAULT_PATH) - find_program(THRIFT_COMPILER thrift - HINTS ${THRIFT_PC_PREFIX} - NO_DEFAULT_PATH - PATH_SUFFIXES "bin") - set(Thrift_VERSION ${THRIFT_PC_VERSION}) - else() - find_library(THRIFT_LIB - NAMES ${THRIFT_LIB_NAMES} - PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib") - find_path(THRIFT_INCLUDE_DIR thrift/Thrift.h PATH_SUFFIXES "include") - find_program(THRIFT_COMPILER thrift PATH_SUFFIXES "bin") - extract_thrift_version() - endif() -endif() - -if(THRIFT_COMPILER) - set(Thrift_COMPILER_FOUND TRUE) -else() - set(Thrift_COMPILER_FOUND FALSE) -endif() - -find_package_handle_standard_args( - Thrift - REQUIRED_VARS THRIFT_LIB THRIFT_INCLUDE_DIR - VERSION_VAR Thrift_VERSION - HANDLE_COMPONENTS) - -if(Thrift_FOUND) - if(ARROW_THRIFT_USE_SHARED) - add_library(thrift::thrift SHARED IMPORTED) - else() - add_library(thrift::thrift STATIC IMPORTED) - endif() - set_target_properties(thrift::thrift - PROPERTIES IMPORTED_LOCATION "${THRIFT_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${THRIFT_INCLUDE_DIR}") - if(WIN32 AND NOT MSVC_TOOLCHAIN) - # We don't need this for Visual C++ because Thrift uses - # "#pragma comment(lib, "Ws2_32.lib")" in - # thrift/windows/config.h for Visual C++. - set_target_properties(thrift::thrift PROPERTIES INTERFACE_LINK_LIBRARIES "ws2_32") - endif() - - if(Thrift_COMPILER_FOUND) - add_executable(thrift::compiler IMPORTED) - set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION - "${THRIFT_COMPILER}") - endif() -endif() diff --git a/velox/CMakeLists.txt b/velox/CMakeLists.txt index ea35aa1ea59ca..064917ef60ed4 100644 --- a/velox/CMakeLists.txt +++ b/velox/CMakeLists.txt @@ -44,7 +44,7 @@ if(${VELOX_ENABLE_PARSE}) endif() # hive connector depends on dwio -if(${VELOX_ENABLE_HIVE_CONNECTOR}) +if(${VELOX_BUILD_MINIMAL_WITH_DWIO} OR ${VELOX_ENABLE_HIVE_CONNECTOR}) add_subdirectory(dwio) endif() @@ -55,19 +55,12 @@ endif() add_subdirectory(functions) # depends on md5 (postgresql) add_subdirectory(connectors) -# exec depends on codegen if(${VELOX_ENABLE_EXEC}) add_subdirectory(exec) - add_subdirectory(codegen) endif() if(${VELOX_ENABLE_DUCKDB}) add_subdirectory(duckdb) - add_subdirectory(external/duckdb) -endif() - -if(${VELOX_CODEGEN_SUPPORT}) - add_subdirectory(experimental/codegen) endif() if(${VELOX_ENABLE_GPU}) diff --git a/velox/benchmarks/CMakeLists.txt b/velox/benchmarks/CMakeLists.txt index 263920704d544..dda7226d67150 100644 --- a/velox/benchmarks/CMakeLists.txt +++ b/velox/benchmarks/CMakeLists.txt @@ -29,10 +29,37 @@ set(velox_benchmark_deps glog::glog) add_library(velox_benchmark_builder ExpressionBenchmarkBuilder.cpp) -target_link_libraries(velox_benchmark_builder ${velox_benchmark_deps}) +target_link_libraries( + velox_benchmark_builder ${velox_benchmark_deps}) # This is a workaround for the use of VectorTestBase.h which includes gtest.h -target_link_libraries(velox_benchmark_builder gtest) +target_link_libraries( + velox_benchmark_builder GTest::gtest) if(${VELOX_ENABLE_BENCHMARKS}) add_subdirectory(tpch) + add_subdirectory(filesystem) endif() + +add_library(velox_query_benchmark QueryBenchmarkBase.cpp) +target_link_libraries( + velox_query_benchmark + velox_aggregates + velox_exec + velox_exec_test_lib + velox_dwio_common + 
velox_dwio_common_exception
+  velox_dwio_parquet_reader
+  velox_dwio_common_test_utils
+  velox_hive_connector
+  velox_exception
+  velox_memory
+  velox_process
+  velox_serialization
+  velox_encode
+  velox_type
+  velox_type_fbhive
+  velox_caching
+  velox_vector_test_lib
+  ${FOLLY_BENCHMARK}
+  Folly::folly
+  fmt::fmt)
diff --git a/velox/benchmarks/ExpressionBenchmarkBuilder.cpp b/velox/benchmarks/ExpressionBenchmarkBuilder.cpp
index fcb0621272666..bfed3aa229846 100644
--- a/velox/benchmarks/ExpressionBenchmarkBuilder.cpp
+++ b/velox/benchmarks/ExpressionBenchmarkBuilder.cpp
@@ -39,9 +39,9 @@ ExpressionBenchmarkSet& ExpressionBenchmarkSet::addExpressions(
 // Make sure all input vectors are generated.
 void ExpressionBenchmarkBuilder::ensureInputVectors() {
   for (auto& [_, benchmarkSet] : benchmarkSets_) {
-    if (!benchmarkSet.inputRowVetor_) {
+    if (!benchmarkSet.inputRowVector_) {
       VectorFuzzer fuzzer(benchmarkSet.fuzzerOptions_, pool());
-      benchmarkSet.inputRowVetor_ = std::dynamic_pointer_cast<RowVector>(
+      benchmarkSet.inputRowVector_ = std::dynamic_pointer_cast<RowVector>(
           fuzzer.fuzzFlat(benchmarkSet.inputType_));
     }
   }
@@ -64,10 +64,10 @@ void ExpressionBenchmarkBuilder::testBenchmarks() {
     }
     // Evaluate the first expression.
     auto it = benchmarkSet.expressions_.begin();
-    auto refResult = evalExpression(it->second, benchmarkSet.inputRowVetor_);
+    auto refResult = evalExpression(it->second, benchmarkSet.inputRowVector_);
     it++;
     while (it != benchmarkSet.expressions_.end()) {
-      auto result = evalExpression(it->second, benchmarkSet.inputRowVetor_);
+      auto result = evalExpression(it->second, benchmarkSet.inputRowVector_);
       test::assertEqualVectors(refResult, result);
       it++;
     }
@@ -86,9 +86,9 @@ void ExpressionBenchmarkBuilder::registerBenchmarks() {
   for (auto& [setName, benchmarkSet] : benchmarkSets_) {
     for (auto& [exprName, exprSet] : benchmarkSet.expressions_) {
       auto name = fmt::format("{}##{}", setName, exprName);
-      auto& inputVector = benchmarkSet.inputRowVetor_;
-      auto times = benchmarkSet.itterations_;
-      // The compiler does not allow capturing exprSet int the lambda
+      auto& inputVector = benchmarkSet.inputRowVector_;
+      auto times = benchmarkSet.iterations_;
+      // The compiler does not allow capturing exprSet in the lambda.
       auto& exprSetLocal = exprSet;
       folly::addBenchmark(
           __FILE__, name, [this, &inputVector, &exprSetLocal, times]() {
@@ -104,7 +104,7 @@ void ExpressionBenchmarkBuilder::registerBenchmarks() {
           for (auto i = 0; i < times; i++) {
             exprSetLocal.eval(rows, evalCtx, results);
 
-            // TODO: add flag to enable/disable flatenning.
+            // TODO: add flag to enable/disable flattening.
             BaseVector::flattenVector(results[0]);
 
             // TODO: add flag to enable/disable reuse.
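The correctness check that `testBenchmarks` performs deserves spelling out: the first expression's result becomes the reference, and every other expression in the set must reproduce it. The same contract, sketched in Python with stand-in names (the real code operates on Velox vectors and ExprSets):

```
# Sketch of the testBenchmarks() contract: within a benchmark set, every
# expression must produce the same result as the first ("reference") one.
def check_benchmark_set(expressions, eval_expression, input_rows):
    exprs = iter(expressions.items())
    _, first = next(exprs)
    reference = eval_expression(first, input_rows)
    for name, expr in exprs:
        result = eval_expression(expr, input_rows)
        assert result == reference, f"{name} disagrees with the reference"


# Tiny self-contained usage: three ways to double a number.
check_benchmark_set(
    {"mul": lambda x: x * 2, "add": lambda x: x + x, "shift": lambda x: x << 1},
    lambda expr, rows: [expr(r) for r in rows],
    [1, 2, 3],
)
```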
@@ -116,8 +116,8 @@ void ExpressionBenchmarkBuilder::registerBenchmarks() { return 1; }); } - BENCHMARK_DRAW_LINE(); - BENCHMARK_DRAW_LINE(); + + folly::addBenchmark(__FILE__, "-", []() -> unsigned { return 0; }); } } } // namespace facebook::velox diff --git a/velox/benchmarks/ExpressionBenchmarkBuilder.h b/velox/benchmarks/ExpressionBenchmarkBuilder.h index ac29707e99eec..8034397f77cfb 100644 --- a/velox/benchmarks/ExpressionBenchmarkBuilder.h +++ b/velox/benchmarks/ExpressionBenchmarkBuilder.h @@ -47,23 +47,23 @@ class ExpressionBenchmarkSet { ExpressionBenchmarkSet& withFuzzerOptions( const VectorFuzzer::Options& options) { VELOX_CHECK( - !inputRowVetor_, + !inputRowVector_, "input row vector is already passed, fuzzer wont be used"); fuzzerOptions_ = options; return *this; } - ExpressionBenchmarkSet& withIterations(int itterations) { - itterations_ = itterations; + ExpressionBenchmarkSet& withIterations(int iterations) { + iterations_ = iterations; return *this; } private: ExpressionBenchmarkSet( ExpressionBenchmarkBuilder& builder, - const RowVectorPtr& inputRowVetor) - : inputRowVetor_(inputRowVetor), - inputType_(inputRowVetor_->type()), + const RowVectorPtr& inputRowVector) + : inputRowVector_(inputRowVector), + inputType_(inputRowVector_->type()), builder_(builder) {} ExpressionBenchmarkSet( @@ -74,9 +74,9 @@ class ExpressionBenchmarkSet { // All the expressions that belongs to this set. std::vector> expressions_; - // The input that will be used for for benchmarking expressions. If not set, + // The input that will be used for benchmarking expressions. If not set, // a flat input vector is fuzzed using fuzzerOptions_. - RowVectorPtr inputRowVetor_; + RowVectorPtr inputRowVector_; // The type of the input that will be used for all the expressions // benchmarked. @@ -84,11 +84,11 @@ class ExpressionBenchmarkSet { // User can provide fuzzer options for the input row vector used for this // benchmark. Note that the fuzzer will be used to generate a flat input row - // vector if inputRowVetor_ is nullptr. + // vector if inputRowVector_ is nullptr. VectorFuzzer::Options fuzzerOptions_{.vectorSize = 10000, .nullRatio = 0}; // Number of times to run each benchmark. - int itterations_ = 1000; + int iterations_ = 1000; bool disableTesting_ = false; @@ -108,7 +108,7 @@ class ExpressionBenchmarkBuilder void registerBenchmarks(); // All benchmarks within one group set are expected to have the same results. - // If disbleTesting=true for a group set, testing is skipped. + // If disableTesting=true for a group set, testing is skipped. void testBenchmarks(); test::VectorMaker& vectorMaker() { @@ -117,9 +117,9 @@ class ExpressionBenchmarkBuilder ExpressionBenchmarkSet& addBenchmarkSet( const std::string& name, - const RowVectorPtr& inputRowVetor) { + const RowVectorPtr& inputRowVector) { VELOX_CHECK(!benchmarkSets_.count(name)); - benchmarkSets_.emplace(name, ExpressionBenchmarkSet(*this, inputRowVetor)); + benchmarkSets_.emplace(name, ExpressionBenchmarkSet(*this, inputRowVector)); return benchmarkSets_.at(name); } diff --git a/velox/benchmarks/QueryBenchmarkBase.cpp b/velox/benchmarks/QueryBenchmarkBase.cpp new file mode 100644 index 0000000000000..f51acd5544ad7 --- /dev/null +++ b/velox/benchmarks/QueryBenchmarkBase.cpp @@ -0,0 +1,391 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/benchmarks/QueryBenchmarkBase.h"
+
+DEFINE_string(data_format, "parquet", "Data format");
+
+DEFINE_validator(
+    data_format,
+    &facebook::velox::QueryBenchmarkBase::validateDataFormat);
+
+DEFINE_bool(
+    include_custom_stats,
+    false,
+    "Include custom statistics along with execution statistics");
+DEFINE_bool(include_results, false, "Include results in the output");
+DEFINE_int32(num_drivers, 4, "Number of drivers");
+
+DEFINE_int32(num_splits_per_file, 10, "Number of splits per file");
+DEFINE_int32(
+    cache_gb,
+    0,
+    "GB of process memory for cache and query. If "
+    "non-0, uses mmap to allocator and in-process data cache.");
+DEFINE_int32(num_repeats, 1, "Number of times to run each query");
+DEFINE_int32(num_io_threads, 8, "Threads for speculative IO");
+DEFINE_string(
+    test_flags_file,
+    "",
+    "Path to a file containing gflags and "
+    "values to try. Produces results for each flag combination "
+    "sorted on performance");
+DEFINE_bool(
+    full_sorted_stats,
+    true,
+    "Add full stats to the report on --test_flags_file");
+
+DEFINE_string(ssd_path, "", "Directory for local SSD cache");
+DEFINE_int32(ssd_cache_gb, 0, "Size of local SSD cache in GB");
+DEFINE_int32(
+    ssd_checkpoint_interval_gb,
+    8,
+    "Checkpoint every n "
+    "GB new data in cache");
+DEFINE_bool(
+    clear_ram_cache,
+    false,
+    "Clear RAM cache before each query. "
+    "Flushes in process and OS file system cache (if root on Linux)");
+DEFINE_bool(
+    clear_ssd_cache,
+    false,
+    "Clears SSD cache before "
+    "each query");
+
+DEFINE_bool(
+    warmup_after_clear,
+    false,
+    "Runs one warmup of the query before "
+    "measured run. Use to run warm after clearing caches.");
+
+DEFINE_int64(
+    max_coalesced_bytes,
+    128 << 20,
+    "Maximum size of single coalesced IO");
+
+DEFINE_int32(
+    max_coalesced_distance_bytes,
+    512 << 10,
+    "Maximum distance in bytes in which coalesce will combine requests");
+
+DEFINE_int32(
+    parquet_prefetch_rowgroups,
+    1,
+    "Number of next row groups to "
+    "prefetch. 1 means prefetch the next row group before decoding "
+    "the current one");
+
+DEFINE_int32(split_preload_per_driver, 2, "Prefetch split metadata");
+
+using namespace facebook::velox::exec;
+using namespace facebook::velox::exec::test;
+using namespace facebook::velox::dwio::common;
+
+namespace facebook::velox {
+
+// static
+bool QueryBenchmarkBase::validateDataFormat(
+    const char* flagname,
+    const std::string& value) {
+  if ((value.compare("parquet") == 0) || (value.compare("dwrf") == 0)) {
+    return true;
+  }
+  std::cout
+      << fmt::format(
+             "Invalid value for --{}: {}. Allowed values are [\"parquet\", \"dwrf\"]",
+             flagname,
+             value)
+      << std::endl;
+  return false;
+}
+
+// static
+void QueryBenchmarkBase::ensureTaskCompletion(exec::Task* task) {
+  // ASSERT_TRUE requires a function with return type void.
+  ASSERT_TRUE(exec::test::waitForTaskCompletion(task));
+}
+
+// static
+void QueryBenchmarkBase::printResults(
+    const std::vector<RowVectorPtr>& results,
+    std::ostream& out) {
+  out << "Results:" << std::endl;
+  bool printType = true;
+  for (const auto& vector : results) {
+    // Print RowType only once.
+    if (printType) {
+      out << vector->type()->asRow().toString() << std::endl;
+      printType = false;
+    }
+    for (vector_size_t i = 0; i < vector->size(); ++i) {
+      out << vector->toString(i) << std::endl;
+    }
+  }
+}
+
+void QueryBenchmarkBase::initialize() {
+  if (FLAGS_cache_gb) {
+    memory::MemoryManagerOptions options;
+    int64_t memoryBytes = FLAGS_cache_gb * (1LL << 30);
+    options.useMmapAllocator = true;
+    options.allocatorCapacity = memoryBytes;
+    options.useMmapArena = true;
+    options.mmapArenaCapacityRatio = 1;
+    memory::MemoryManager::testingSetInstance(options);
+    std::unique_ptr<cache::SsdCache> ssdCache;
+    if (FLAGS_ssd_cache_gb) {
+      constexpr int32_t kNumSsdShards = 16;
+      cacheExecutor_ =
+          std::make_unique<folly::IOThreadPoolExecutor>(kNumSsdShards);
+      const cache::SsdCache::Config config(
+          FLAGS_ssd_path,
+          static_cast<uint64_t>(FLAGS_ssd_cache_gb) << 30,
+          kNumSsdShards,
+          cacheExecutor_.get(),
+          static_cast<uint64_t>(FLAGS_ssd_checkpoint_interval_gb) << 30);
+      ssdCache = std::make_unique<cache::SsdCache>(config);
+    }
+
+    cache_ = cache::AsyncDataCache::create(
+        memory::memoryManager()->allocator(), std::move(ssdCache));
+    cache::AsyncDataCache::setInstance(cache_.get());
+  } else {
+    memory::MemoryManager::testingSetInstance({});
+  }
+  functions::prestosql::registerAllScalarFunctions();
+  aggregate::prestosql::registerAllAggregateFunctions();
+  parse::registerTypeResolver();
+  filesystems::registerLocalFileSystem();
+
+  ioExecutor_ =
+      std::make_unique<folly::IOThreadPoolExecutor>(FLAGS_num_io_threads);
+
+  // Add new values into the hive configuration...
+  auto configurationValues = std::unordered_map<std::string, std::string>();
+  configurationValues[connector::hive::HiveConfig::kMaxCoalescedBytes] =
+      std::to_string(FLAGS_max_coalesced_bytes);
+  configurationValues[connector::hive::HiveConfig::kMaxCoalescedDistanceBytes] =
+      std::to_string(FLAGS_max_coalesced_distance_bytes);
+  configurationValues[connector::hive::HiveConfig::kPrefetchRowGroups] =
+      std::to_string(FLAGS_parquet_prefetch_rowgroups);
+  auto properties = std::make_shared<core::MemConfig>(
+      std::move(configurationValues));
+
+  // Create hive connector with config...
+  auto hiveConnector =
+      connector::getConnectorFactory(
+          connector::hive::HiveConnectorFactory::kHiveConnectorName)
+          ->newConnector(kHiveConnectorId, properties, ioExecutor_.get());
+  connector::registerConnector(hiveConnector);
+}
+
+std::vector<std::shared_ptr<connector::ConnectorSplit>>
+QueryBenchmarkBase::listSplits(
+    const std::string& path,
+    int32_t numSplitsPerFile,
+    const exec::test::TpchPlan& plan) {
+  std::vector<std::shared_ptr<connector::ConnectorSplit>> result;
+  auto temp = HiveConnectorTestBase::makeHiveConnectorSplits(
+      path, numSplitsPerFile, plan.dataFileFormat);
+  for (auto& i : temp) {
+    result.push_back(i);
+  }
+  return result;
+}
+
+void QueryBenchmarkBase::shutdown() {
+  if (cache_) {
+    cache_->shutdown();
+  }
+}
+
+std::pair<std::unique_ptr<TaskCursor>, std::vector<RowVectorPtr>>
+QueryBenchmarkBase::run(const TpchPlan& tpchPlan) {
+  int32_t repeat = 0;
+  try {
+    for (;;) {
+      CursorParameters params;
+      params.maxDrivers = FLAGS_num_drivers;
+      params.planNode = tpchPlan.plan;
+      params.queryConfigs[core::QueryConfig::kMaxSplitPreloadPerDriver] =
+          std::to_string(FLAGS_split_preload_per_driver);
+      const int numSplitsPerFile = FLAGS_num_splits_per_file;
+
+      bool noMoreSplits = false;
+      auto addSplits = [&](exec::Task* task) {
+        if (!noMoreSplits) {
+          for (const auto& entry : tpchPlan.dataFiles) {
+            for (const auto& path : entry.second) {
+              auto splits = listSplits(path, numSplitsPerFile, tpchPlan);
+              for (auto split : splits) {
+                task->addSplit(entry.first, exec::Split(std::move(split)));
+              }
+            }
+            task->noMoreSplits(entry.first);
+          }
+        }
+        noMoreSplits = true;
+      };
+      auto result = readCursor(params, addSplits);
+      ensureTaskCompletion(result.first->task().get());
+      if (++repeat >= FLAGS_num_repeats) {
+        return result;
+      }
+    }
+  } catch (const std::exception& e) {
+    LOG(ERROR) << "Query terminated with: " << e.what();
+    return {nullptr, std::vector<RowVectorPtr>()};
+  }
+}
+
+void QueryBenchmarkBase::readCombinations() {
+  std::ifstream file(FLAGS_test_flags_file);
+  std::string line;
+  while (std::getline(file, line)) {
+    ParameterDim dim;
+    int32_t previous = 0;
+    for (auto i = 0; i < line.size(); ++i) {
+      if (line[i] == ':') {
+        dim.flag = line.substr(0, i);
+        previous = i + 1;
+      } else if (line[i] == ',') {
+        dim.values.push_back(line.substr(previous, i - previous));
+        previous = i + 1;
+      }
+    }
+    if (previous < line.size()) {
+      dim.values.push_back(line.substr(previous, line.size() - previous));
+    }
+    if (!dim.flag.empty() && !dim.values.empty()) {
+      parameters_.push_back(dim);
+    }
+  }
+}
+
+void QueryBenchmarkBase::runCombinations(int32_t level) {
+  if (level == parameters_.size()) {
+    if (FLAGS_clear_ram_cache) {
+#ifdef linux
+      // system("echo 3 >/proc/sys/vm/drop_caches");
+      bool success = false;
+      auto fd = open("/proc/sys/vm/drop_caches", O_WRONLY);
+      if (fd > 0) {
+        success = write(fd, "3", 1) == 1;
+        close(fd);
+      }
+      if (!success) {
+        LOG(ERROR) << "Failed to clear OS disk cache: errno=" << errno;
+      }
+#endif
+
+      if (cache_) {
+        cache_->clear();
+      }
+    }
+    if (FLAGS_clear_ssd_cache) {
+      if (cache_) {
+        auto ssdCache = cache_->ssdCache();
+        if (ssdCache) {
+          ssdCache->clear();
+        }
+      }
+    }
+    if (FLAGS_warmup_after_clear) {
+      std::stringstream result;
+      RunStats ignore;
+      runMain(result, ignore);
+    }
+    RunStats stats;
+    std::stringstream result;
+    uint64_t micros = 0;
+    {
+      struct rusage start;
+      getrusage(RUSAGE_SELF, &start);
+      MicrosecondTimer timer(&micros);
+      runMain(result, stats);
+      struct rusage final;
+      getrusage(RUSAGE_SELF, &final);
+      auto tvNanos = [](struct timeval tv) {
+        return tv.tv_sec * 1000000000 + tv.tv_usec * 1000;
+      };
+      stats.userNanos = tvNanos(final.ru_utime) - tvNanos(start.ru_utime);
tvNanos(start.ru_utime); + stats.systemNanos = tvNanos(final.ru_stime) - tvNanos(start.ru_stime); + } + stats.micros = micros; + stats.output = result.str(); + for (auto i = 0; i < parameters_.size(); ++i) { + std::string name; + gflags::GetCommandLineOption(parameters_[i].flag.c_str(), &name); + stats.flags[parameters_[i].flag] = name; + } + runStats_.push_back(std::move(stats)); + } else { + auto& flag = parameters_[level].flag; + for (auto& value : parameters_[level].values) { + std::string result = + gflags::SetCommandLineOption(flag.c_str(), value.c_str()); + if (result.empty()) { + LOG(ERROR) << "Failed to set " << flag << "=" << value; + } + std::cout << result << std::endl; + runCombinations(level + 1); + } + } +} + +void QueryBenchmarkBase::runOne(std::ostream& out, RunStats& stats) { + std::stringstream result; + uint64_t micros = 0; + { + struct rusage start; + getrusage(RUSAGE_SELF, &start); + MicrosecondTimer timer(µs); + runMain(out, stats); + struct rusage final; + getrusage(RUSAGE_SELF, &final); + auto tvNanos = [](struct timeval tv) { + return tv.tv_sec * 1000000000 + tv.tv_usec * 1000; + }; + stats.userNanos = tvNanos(final.ru_utime) - tvNanos(start.ru_utime); + stats.systemNanos = tvNanos(final.ru_stime) - tvNanos(start.ru_stime); + } + stats.micros = micros; + stats.output = result.str(); + out << result.str(); +} + +void QueryBenchmarkBase::runAllCombinations() { + readCombinations(); + runCombinations(0); + std::sort( + runStats_.begin(), + runStats_.end(), + [](const RunStats& left, const RunStats& right) { + return left.micros < right.micros; + }); + for (auto& stats : runStats_) { + std::cout << stats.toString(false); + } + if (FLAGS_full_sorted_stats) { + std::cout << "Detail for stats:" << std::endl; + for (auto& stats : runStats_) { + std::cout << stats.toString(true); + } + } +} + +} // namespace facebook::velox diff --git a/velox/benchmarks/QueryBenchmarkBase.h b/velox/benchmarks/QueryBenchmarkBase.h new file mode 100644 index 0000000000000..d3577fe53cf25 --- /dev/null +++ b/velox/benchmarks/QueryBenchmarkBase.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include <folly/executors/IOThreadPoolExecutor.h>
+#include <folly/init/Init.h>
+#include <gflags/gflags.h>
+
+#include <fcntl.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <fstream>
+
+#include "velox/common/base/SuccinctPrinter.h"
+#include "velox/common/file/FileSystems.h"
+#include "velox/common/memory/MmapAllocator.h"
+#include "velox/connectors/hive/HiveConfig.h"
+#include "velox/connectors/hive/HiveConnector.h"
+#include "velox/dwio/common/Options.h"
+#include "velox/exec/PlanNodeStats.h"
+#include "velox/exec/Split.h"
+#include "velox/exec/tests/utils/HiveConnectorTestBase.h"
+#include "velox/exec/tests/utils/TpchQueryBuilder.h"
+#include "velox/functions/prestosql/aggregates/RegisterAggregateFunctions.h"
+#include "velox/functions/prestosql/registration/RegistrationFunctions.h"
+#include "velox/parse/TypeResolver.h"
+
+DECLARE_string(test_flags_file);
+DECLARE_bool(include_results);
+DECLARE_bool(include_custom_stats);
+DECLARE_string(data_format);
+
+namespace facebook::velox {
+
+struct RunStats {
+  std::map<std::string, std::string> flags;
+  int64_t micros{0};
+  int64_t rawInputBytes{0};
+  int64_t userNanos{0};
+  int64_t systemNanos{0};
+  std::string output;
+
+  std::string toString(bool detail) {
+    std::stringstream out;
+    out << succinctNanos(micros * 1000) << " "
+        << succinctBytes(rawInputBytes / (micros / 1000000.0)) << "/s raw, "
+        << succinctNanos(userNanos) << " user " << succinctNanos(systemNanos)
+        << " system (" << (100 * (userNanos + systemNanos) / (micros * 1000))
+        << "%), flags: ";
+    for (auto& pair : flags) {
+      out << pair.first << "=" << pair.second << " ";
+    }
+    out << std::endl << "======" << std::endl;
+    if (detail) {
+      out << std::endl << output << std::endl;
+    }
+    return out.str();
+  }
+};
+
+struct ParameterDim {
+  std::string flag;
+  std::vector<std::string> values;
+};
+
+class QueryBenchmarkBase {
+ public:
+  virtual ~QueryBenchmarkBase() = default;
+  virtual void initialize();
+  void shutdown();
+  std::pair<std::unique_ptr<exec::test::TaskCursor>, std::vector<RowVectorPtr>>
+  run(const exec::test::TpchPlan& tpchPlan);
+
+  virtual std::vector<std::shared_ptr<connector::ConnectorSplit>> listSplits(
+      const std::string& path,
+      int32_t numSplitsPerFile,
+      const exec::test::TpchPlan& plan);
+
+  static void ensureTaskCompletion(exec::Task* task);
+
+  static bool validateDataFormat(
+      const char* flagname,
+      const std::string& value);
+
+  static void printResults(
+      const std::vector<RowVectorPtr>& results,
+      std::ostream& out);
+
+  void readCombinations();
+
+  /// Entry point invoked with different settings to run the benchmark.
+  virtual void runMain(std::ostream& out, RunStats& runStats) = 0;
+
+  void runOne(std::ostream& out, RunStats& stats);
+
+  void runCombinations(int32_t level);
+
+  void runAllCombinations();
+
+ protected:
+  std::unique_ptr<folly::IOThreadPoolExecutor> ioExecutor_;
+  std::unique_ptr<folly::IOThreadPoolExecutor> cacheExecutor_;
+  std::shared_ptr<memory::MmapAllocator> allocator_;
+  std::shared_ptr<cache::AsyncDataCache> cache_;
+  // Parameter combinations to try. Each element specifies a flag and possible
+  // values. All permutations are tried.
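The `parameters_` member declared just below is filled by `readCombinations()` (see the .cpp above): each line of --test_flags_file is `flag:value1,value2,...`, and `runCombinations()` then runs the benchmark once per permutation of values across all lines. A self-contained sketch of that parsing, with a hypothetical two-flag input (2 x 2 = 4 runs):

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

struct ParameterDim {
  std::string flag;
  std::vector<std::string> values;
};

int main() {
  // Hypothetical --test_flags_file contents; flag names are examples only.
  std::istringstream file("num_drivers:4,8\nsplit_preload_per_driver:0,2\n");
  std::vector<ParameterDim> dims;
  std::string line;
  while (std::getline(file, line)) {
    auto colon = line.find(':');
    if (colon == std::string::npos) {
      continue; // no flag name on this line, skip it
    }
    ParameterDim dim;
    dim.flag = line.substr(0, colon);
    std::istringstream values(line.substr(colon + 1));
    std::string value;
    while (std::getline(values, value, ',')) {
      dim.values.push_back(value);
    }
    dims.push_back(std::move(dim));
  }
  for (const auto& dim : dims) {
    std::cout << dim.flag << ": " << dim.values.size() << " values\n";
  }
  return 0;
}
```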
+ std::vector parameters_; + + std::vector runStats_; +}; +} // namespace facebook::velox diff --git a/velox/benchmarks/basic/CMakeLists.txt b/velox/benchmarks/basic/CMakeLists.txt index 8b14e6c8245ef..45a57b0944fe2 100644 --- a/velox/benchmarks/basic/CMakeLists.txt +++ b/velox/benchmarks/basic/CMakeLists.txt @@ -21,6 +21,7 @@ set(velox_benchmark_deps velox_parse_expression velox_serialization velox_benchmark_builder + velox_vector_test_lib Folly::folly ${FOLLY_BENCHMARK} ${DOUBLE_CONVERSION} @@ -28,48 +29,74 @@ set(velox_benchmark_deps glog::glog) add_executable(velox_benchmark_basic_simple_arithmetic SimpleArithmetic.cpp) -target_link_libraries(velox_benchmark_basic_simple_arithmetic - ${velox_benchmark_deps}) +target_link_libraries( + velox_benchmark_basic_simple_arithmetic ${velox_benchmark_deps}) add_executable(velox_benchmark_basic_comparison_conjunct ComparisonConjunct.cpp) -target_link_libraries(velox_benchmark_basic_comparison_conjunct - ${velox_benchmark_deps}) +target_link_libraries( + velox_benchmark_basic_comparison_conjunct ${velox_benchmark_deps}) add_executable(velox_benchmark_basic_simple_cast SimpleCastExpr.cpp) -target_link_libraries(velox_benchmark_basic_simple_cast ${velox_benchmark_deps}) +target_link_libraries( + velox_benchmark_basic_simple_cast ${velox_benchmark_deps}) add_executable(velox_benchmark_basic_decoded_vector DecodedVector.cpp) -target_link_libraries(velox_benchmark_basic_decoded_vector - ${velox_benchmark_deps}) +target_link_libraries( + velox_benchmark_basic_decoded_vector ${velox_benchmark_deps}) add_executable(velox_benchmark_basic_selectivity_vector SelectivityVector.cpp) -target_link_libraries(velox_benchmark_basic_selectivity_vector - ${velox_benchmark_deps}) +target_link_libraries( + velox_benchmark_basic_selectivity_vector ${velox_benchmark_deps}) add_executable(velox_benchmark_basic_vector_compare VectorCompare.cpp) -target_link_libraries(velox_benchmark_basic_vector_compare - ${velox_benchmark_deps} velox_vector_test_lib) +target_link_libraries( + velox_benchmark_basic_vector_compare ${velox_benchmark_deps} + velox_vector_test_lib) add_executable(velox_benchmark_basic_vector_slice VectorSlice.cpp) -target_link_libraries(velox_benchmark_basic_vector_slice - ${velox_benchmark_deps} velox_vector_test_lib) +target_link_libraries( + velox_benchmark_basic_vector_slice ${velox_benchmark_deps} + velox_vector_test_lib) add_executable(velox_benchmark_feature_normalization FeatureNormalization.cpp) -target_link_libraries(velox_benchmark_feature_normalization - ${velox_benchmark_deps} velox_functions_prestosql) +target_link_libraries( + velox_benchmark_feature_normalization ${velox_benchmark_deps} + velox_functions_prestosql) add_executable(velox_benchmark_basic_preproc Preproc.cpp) -target_link_libraries(velox_benchmark_basic_preproc ${velox_benchmark_deps} - velox_functions_prestosql velox_vector_test_lib) +target_link_libraries( + velox_benchmark_basic_preproc ${velox_benchmark_deps} + velox_functions_prestosql velox_vector_test_lib) -add_executable(velox_like_functions_benchmark LikeFunctionsBenchmark.cpp) -target_link_libraries(velox_like_functions_benchmark ${velox_benchmark_deps} - velox_functions_lib velox_tpch_gen velox_vector_test_lib) +add_executable(velox_like_tpch_benchmark LikeTpchBenchmark.cpp) +target_link_libraries( + velox_like_tpch_benchmark + ${velox_benchmark_deps} + velox_functions_lib + velox_tpch_gen + velox_vector_test_lib) + +add_executable(velox_like_benchmark LikeBenchmark.cpp) +target_link_libraries( + velox_like_benchmark + 
${velox_benchmark_deps} + velox_functions_lib + velox_functions_prestosql + velox_vector_test_lib) add_executable(velox_benchmark_basic_vector_fuzzer VectorFuzzer.cpp) -target_link_libraries(velox_benchmark_basic_vector_fuzzer - ${velox_benchmark_deps} velox_vector_test_lib) +target_link_libraries( + velox_benchmark_basic_vector_fuzzer ${velox_benchmark_deps} + velox_vector_test_lib) add_executable(velox_cast_benchmark CastBenchmark.cpp) -target_link_libraries(velox_cast_benchmark ${velox_benchmark_deps} - velox_vector_test_lib) +target_link_libraries( + velox_cast_benchmark ${velox_benchmark_deps} velox_vector_test_lib) + +add_executable(velox_format_datetime_benchmark FormatDateTimeBenchmark.cpp) +target_link_libraries( + velox_format_datetime_benchmark + ${velox_benchmark_deps} + velox_vector_test_lib + velox_functions_spark + velox_functions_prestosql) diff --git a/velox/benchmarks/basic/CastBenchmark.cpp b/velox/benchmarks/basic/CastBenchmark.cpp index 192997e01570f..f937f354f8935 100644 --- a/velox/benchmarks/basic/CastBenchmark.cpp +++ b/velox/benchmarks/basic/CastBenchmark.cpp @@ -25,13 +25,37 @@ using namespace facebook::velox; int main(int argc, char** argv) { folly::Init init(&argc, &argv); + memory::MemoryManager::initialize({}); ExpressionBenchmarkBuilder benchmarkBuilder; const vector_size_t vectorSize = 1000; auto vectorMaker = benchmarkBuilder.vectorMaker(); - auto invalidInput = vectorMaker.flatVector({""}); - auto validInput = vectorMaker.flatVector({""}); - auto nanInput = vectorMaker.flatVector({""}); + auto emptyInput = vectorMaker.flatVector( + vectorSize, [](auto /*row*/) { return ""; }); + auto validInput = vectorMaker.flatVector( + vectorSize, [](auto row) { return std::to_string(row); }); + auto invalidInput = vectorMaker.flatVector( + vectorSize, [](auto /*row*/) { return "$"; }); + auto validDoubleStringInput = vectorMaker.flatVector( + vectorSize, [](auto row) { return fmt::format("{}.12345678910", row); }); + auto validNaNInput = vectorMaker.flatVector( + vectorSize, [](auto /*row*/) { return "NaN"; }); + auto validInfinityInput = vectorMaker.flatVector( + vectorSize, [](auto /*row*/) { return "Infinity"; }); + auto invalidNaNInput = vectorMaker.flatVector( + vectorSize, [](auto /*row*/) { return "nan"; }); + auto invalidInfinityInput = vectorMaker.flatVector( + vectorSize, [](auto /*row*/) { return "infinity"; }); + auto spaceInput = vectorMaker.flatVector( + vectorSize, [](auto /*row*/) { return " "; }); + auto integerInput = vectorMaker.flatVector( + vectorSize, [&](auto j) { return 12345 * j; }, nullptr); + auto bigintInput = vectorMaker.flatVector( + vectorSize, + [&](auto j) { + return facebook::velox::HugeInt::build(12345 * j, 56789 * j + 12345); + }, + nullptr); auto decimalInput = vectorMaker.flatVector( vectorSize, [&](auto j) { return 12345 * j; }, nullptr, DECIMAL(9, 2)); auto shortDecimalInput = vectorMaker.flatVector( @@ -46,20 +70,84 @@ int main(int argc, char** argv) { }, nullptr, DECIMAL(38, 16)); + auto largeRealInput = vectorMaker.flatVector( + vectorSize, [&](auto j) { return 12345678.0 * j; }); + auto smallRealInput = vectorMaker.flatVector( + vectorSize, [&](auto j) { return 1.2345678 * j; }); + auto smallDoubleInput = vectorMaker.flatVector( + vectorSize, [&](auto j) { return -0.00012345678 / j; }); + auto largeDoubleInput = vectorMaker.flatVector( + vectorSize, [&](auto j) { return -123456.7 / j; }); auto timestampInput = vectorMaker.flatVector(vectorSize, [&](auto j) { return Timestamp(1695859694 + j / 1000, j % 1000 * 1'000'000); }); 
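A rendering note for this hunk as well: the element types of the `flatVector` calls were swallowed along with their angle brackets. The pattern in use is `VectorMaker::flatVector<T>(size, valueAt)`, which builds a flat vector of `size` rows by invoking the generator for each row index. A probable original form of two of the generators above, with the `<T>` arguments inferred from the returned values rather than taken from the patch:

```cpp
// <std::string> and <float> are reconstructions; REAL maps to float in Velox.
auto validInput = vectorMaker.flatVector<std::string>(
    vectorSize, [](auto row) { return std::to_string(row); });
auto largeRealInput = vectorMaker.flatVector<float>(
    vectorSize, [&](auto j) { return 12345678.0 * j; });
```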
+ auto validDateStrings = vectorMaker.flatVector( + vectorSize, + [](auto row) { return fmt::format("2024-05-{:02d}", 1 + row % 30); }); + auto invalidDateStrings = vectorMaker.flatVector( + vectorSize, [](auto row) { return fmt::format("2024-05...{}", row); }); + + benchmarkBuilder + .addBenchmarkSet( + "cast_varhar_as_date", + vectorMaker.rowVector( + {"empty", "invalid_date", "valid_date"}, + {emptyInput, invalidDateStrings, validDateStrings})) + .addExpression("try_cast_invalid_empty_input", "try_cast(empty as date) ") + .addExpression( + "tryexpr_cast_invalid_empty_input", "try(cast (empty as date))") + .addExpression( + "try_cast_invalid_input", "try_cast(invalid_date as date) ") + .addExpression( + "tryexpr_cast_invalid_input", "try(cast (invalid_date as date))") + .addExpression("cast_valid", "cast(valid_date as date)"); + + benchmarkBuilder + .addBenchmarkSet( + "cast_varhar_as_timestamp", + vectorMaker.rowVector( + {"empty", "invalid_date", "valid_date"}, + {emptyInput, invalidDateStrings, validDateStrings})) + .addExpression( + "try_cast_invalid_empty_input", "try_cast(empty as timestamp) ") + .addExpression( + "tryexpr_cast_invalid_empty_input", "try(cast (empty as timestamp))") + .addExpression( + "try_cast_invalid_input", "try_cast(invalid_date as timestamp) ") + .addExpression( + "tryexpr_cast_invalid_input", "try(cast (invalid_date as timestamp))") + .addExpression("cast_valid", "cast(valid_date as timestamp)"); - invalidInput->resize(vectorSize); - validInput->resize(vectorSize); - nanInput->resize(vectorSize); + benchmarkBuilder + .addBenchmarkSet( + "cast_timestamp_as_varchar", + vectorMaker.rowVector({"timestamp"}, {timestampInput})) + .addExpression("cast", "cast (timestamp as varchar)"); - for (int i = 0; i < vectorSize; i++) { - nanInput->set(i, "$"_sv); - invalidInput->set(i, StringView::makeInline(std::string(""))); - validInput->set(i, StringView::makeInline(std::to_string(i))); - } + benchmarkBuilder + .addBenchmarkSet( + "cast_varchar_as_double", + vectorMaker.rowVector( + {"valid", + "valid_nan", + "valid_infinity", + "invalid_nan", + "invalid_infinity", + "space"}, + {validDoubleStringInput, + validNaNInput, + validInfinityInput, + invalidNaNInput, + invalidInfinityInput, + spaceInput})) + .addExpression("cast_valid", "cast (valid as double)") + .addExpression("cast_valid_nan", "cast (valid_nan as double)") + .addExpression("cast_valid_infinity", "cast (valid_infinity as double)") + .addExpression("try_cast_invalid_nan", "try_cast (invalid_nan as double)") + .addExpression( + "try_cast_invalid_infinity", "try_cast (invalid_infinity as double)") + .addExpression("try_cast_space", "try_cast (space as double)"); benchmarkBuilder .addBenchmarkSet( @@ -67,23 +155,34 @@ int main(int argc, char** argv) { vectorMaker.rowVector( {"valid", "empty", - "nan", + "invalid", + "integer", + "bigint", "decimal", "short_decimal", "long_decimal", - "timestamp"}, + "large_real", + "small_real", + "small_double", + "large_double"}, {validInput, + emptyInput, invalidInput, - nanInput, + integerInput, + bigintInput, decimalInput, shortDecimalInput, longDecimalInput, - timestampInput})) + largeRealInput, + smallRealInput, + smallDoubleInput, + largeDoubleInput})) .addExpression("try_cast_invalid_empty_input", "try_cast (empty as int) ") .addExpression( "tryexpr_cast_invalid_empty_input", "try (cast (empty as int))") - .addExpression("try_cast_invalid_nan", "try_cast (nan as int)") - .addExpression("tryexpr_cast_invalid_nan", "try (cast (nan as int))") + 
.addExpression("try_cast_invalid_number", "try_cast (invalid as int)") + .addExpression( + "tryexpr_cast_invalid_number", "try (cast (invalid as int))") .addExpression("try_cast_valid", "try_cast (valid as int)") .addExpression("tryexpr_cast_valid", "try (cast (valid as int))") .addExpression("cast_valid", "cast(valid as int)") @@ -91,7 +190,27 @@ int main(int argc, char** argv) { "cast_decimal_to_inline_string", "cast (decimal as varchar)") .addExpression("cast_short_decimal", "cast (short_decimal as varchar)") .addExpression("cast_long_decimal", "cast (long_decimal as varchar)") - .addExpression("cast_timestamp", "cast (timestamp as varchar)") + .addExpression( + "cast_large_real_to_scientific_notation", + "cast(large_real as varchar)") + .addExpression( + "cast_small_real_to_standard_notation", "cast(small_real as varchar)") + .addExpression( + "cast_small_double_to_scientific_notation", + "cast(small_double as varchar)") + .addExpression( + "cast_large_double_to_standard_notation", + "cast(large_double as varchar)") + .addExpression("cast_real_as_int", "cast (small_real as integer)") + .addExpression("cast_decimal_as_bigint", "cast (short_decimal as bigint)") + .addExpression( + "cast_int_as_short_decimal", "cast (integer as decimal(18,6))") + .addExpression( + "cast_int_as_long_decimal", "cast (integer as decimal(38,16))") + .addExpression( + "cast_bigint_as_short_decimal", "cast (bigint as decimal(18,6))") + .addExpression( + "cast_bigint_as_long_decimal", "cast (bigint as decimal(38,16))") .withIterations(100) .disableTesting(); diff --git a/velox/benchmarks/basic/ComparisonConjunct.cpp b/velox/benchmarks/basic/ComparisonConjunct.cpp index 24aafb96c2ce2..cf40330eef4bc 100644 --- a/velox/benchmarks/basic/ComparisonConjunct.cpp +++ b/velox/benchmarks/basic/ComparisonConjunct.cpp @@ -175,9 +175,9 @@ BENCHMARK(conjunctsNested) { } // namespace int main(int argc, char* argv[]) { - folly::init(&argc, &argv); + folly::Init init{&argc, &argv}; gflags::ParseCommandLineFlags(&argc, &argv, true); - + memory::MemoryManager::initialize({}); benchmark = std::make_unique(1'000); folly::runBenchmarks(); benchmark.reset(); diff --git a/velox/benchmarks/basic/DecodedVector.cpp b/velox/benchmarks/basic/DecodedVector.cpp index ef13901452c7f..3489967da7a24 100644 --- a/velox/benchmarks/basic/DecodedVector.cpp +++ b/velox/benchmarks/basic/DecodedVector.cpp @@ -188,9 +188,9 @@ BENCHMARK(decodeDictionary5Nested) { } // namespace int main(int argc, char* argv[]) { - folly::init(&argc, &argv); + folly::Init init{&argc, &argv}; gflags::ParseCommandLineFlags(&argc, &argv, true); - + memory::MemoryManager::initialize({}); benchmark = std::make_unique(10'000); folly::runBenchmarks(); benchmark.reset(); diff --git a/velox/benchmarks/basic/FeatureNormalization.cpp b/velox/benchmarks/basic/FeatureNormalization.cpp index 4730510f43243..405e14819aa6b 100644 --- a/velox/benchmarks/basic/FeatureNormalization.cpp +++ b/velox/benchmarks/basic/FeatureNormalization.cpp @@ -110,9 +110,9 @@ BENCHMARK(normalizeConstant) { } // namespace int main(int argc, char* argv[]) { - folly::init(&argc, &argv); + folly::Init init{&argc, &argv}; gflags::ParseCommandLineFlags(&argc, &argv, true); - + memory::MemoryManager::initialize({}); benchmark = std::make_unique(); folly::runBenchmarks(); benchmark.reset(); diff --git a/velox/benchmarks/basic/FormatDateTimeBenchmark.cpp b/velox/benchmarks/basic/FormatDateTimeBenchmark.cpp new file mode 100644 index 0000000000000..a53b8eb770878 --- /dev/null +++ 
b/velox/benchmarks/basic/FormatDateTimeBenchmark.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "velox/benchmarks/ExpressionBenchmarkBuilder.h" +#include "velox/functions/prestosql/registration/RegistrationFunctions.h" + +using namespace facebook; + +using namespace facebook::velox; + +int main(int argc, char** argv) { + folly::Init init(&argc, &argv); + memory::MemoryManager::initialize({}); + functions::prestosql::registerDateTimeFunctions(""); + + ExpressionBenchmarkBuilder benchmarkBuilder; + VectorFuzzer::Options options; + options.vectorSize = 1000; + auto* pool = benchmarkBuilder.pool(); + VectorFuzzer fuzzer(options, pool); + auto vectorMaker = benchmarkBuilder.vectorMaker(); + + benchmarkBuilder + .addBenchmarkSet( + "Benchmark format_datetime", + vectorMaker.rowVector({fuzzer.fuzz(TIMESTAMP())})) + .addExpression("", "format_datetime(c0, 'yyyy-MM-dd HH:mm:ss.SSS')") + .disableTesting(); + + benchmarkBuilder.registerBenchmarks(); + folly::runBenchmarks(); + return 0; +} diff --git a/velox/benchmarks/basic/LikeBenchmark.cpp b/velox/benchmarks/basic/LikeBenchmark.cpp new file mode 100644 index 0000000000000..615cf625e37b7 --- /dev/null +++ b/velox/benchmarks/basic/LikeBenchmark.cpp @@ -0,0 +1,124 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "velox/benchmarks/ExpressionBenchmarkBuilder.h" +#include "velox/functions/lib/Re2Functions.h" +#include "velox/functions/prestosql/registration/RegistrationFunctions.h" + +using namespace facebook; +using namespace facebook::velox; +using namespace facebook::velox::functions; +using namespace facebook::velox::functions::test; +using namespace facebook::velox::memory; +using namespace facebook::velox; + +int main(int argc, char** argv) { + folly::Init init(&argc, &argv); + memory::MemoryManager::initialize({}); + exec::registerStatefulVectorFunction("like", likeSignatures(), makeLike); + // Register the scalar functions. 
+ prestosql::registerAllScalarFunctions(""); + + ExpressionBenchmarkBuilder benchmarkBuilder; + const vector_size_t vectorSize = 1000; + auto vectorMaker = benchmarkBuilder.vectorMaker(); + + auto makeInput = [&](vector_size_t vectorSize, + bool padAtHead, + bool padAtTail, + std::string content = "a_b_c", + std::string paddingStr = "xxx") { + return vectorMaker.flatVector(vectorSize, [&](auto row) { + // Strings in even rows contain/start with/end with a_b_c depends on + // value of padAtHead && padAtTail. + + // Calculates the padding. + std::ostringstream os; + for (auto i = 0; i < row / 2 + 1; ++i) { + os << paddingStr; + } + auto padding = os.str(); + + if (row % 2 == 0) { + if (padAtHead && padAtTail) { + return fmt::format("{}{}{}", padding, content, padding); + } else if (padAtHead) { + return fmt::format("{}{}", padding, content); + } else if (padAtTail) { + return fmt::format("{}{}", content, padding); + } else { + return content; + } + } else { + // Yes, two padding concatenated, since we have a '/2' above. + return padding + padding; + } + }); + }; + + auto substringInput = makeInput(vectorSize, true, true); + auto prefixInput = makeInput(vectorSize, false, true); + auto prefixUnicodeInput = makeInput(vectorSize, false, true, "你_好_啊"); + auto suffixInput = makeInput(vectorSize, true, false); + auto suffixUnicodeInput = makeInput(vectorSize, true, false, "你_好_啊"); + + benchmarkBuilder + .addBenchmarkSet( + "substring", vectorMaker.rowVector({"col0"}, {substringInput})) + .addExpression("substring", R"(like(col0, '%a\_b\_c%', '\'))") + .addExpression("strpos", R"(strpos(col0, 'a_b_c') > 0)"); + + benchmarkBuilder + .addBenchmarkSet( + "prefix", + vectorMaker.rowVector( + {"col0", "col1"}, {prefixInput, prefixUnicodeInput})) + .addExpression("prefix", R"(like(col0, 'a\_b\_c%', '\'))") + .addExpression("relaxed_prefix_1", R"(like(col0, 'a\__\_c%', '\'))") + .addExpression("relaxed_prefix_2", R"(like(col0, '_\__\_c%', '\'))") + .addExpression( + "relaxed_prefix_unicode_1", R"(like(col1, '你\__\_啊%', '\'))") + .addExpression( + "relaxed_prefix_unicode_2", R"(like(col1, '_\__\_啊%', '\'))") + .addExpression("starts_with", R"(starts_with(col0, 'a_b_c'))"); + + benchmarkBuilder + .addBenchmarkSet( + "suffix", + vectorMaker.rowVector( + {"col0", "col1"}, {suffixInput, suffixUnicodeInput})) + .addExpression("suffix", R"(like(col0, '%a\_b\_c', '\'))") + .addExpression("relaxed_suffix_1", R"(like(col0, '%a\__\_c', '\'))") + .addExpression("relaxed_suffix_2", R"(like(col0, '%_\__\_c', '\'))") + .addExpression( + "relaxed_suffix_unicode_1", R"(like(col1, '%你\__\_啊', '\'))") + .addExpression( + "relaxed_suffix_unicode_2", R"(like(col1, '%_\__\_啊', '\'))") + .addExpression("ends_with", R"(ends_with(col0, 'a_b_c'))"); + + benchmarkBuilder + .addBenchmarkSet( + "generic", vectorMaker.rowVector({"col0"}, {substringInput})) + .addExpression("generic", R"(like(col0, '%a%b%c'))"); + + benchmarkBuilder.registerBenchmarks(); + benchmarkBuilder.testBenchmarks(); + folly::runBenchmarks(); + return 0; +} diff --git a/velox/benchmarks/basic/LikeFunctionsBenchmark.cpp b/velox/benchmarks/basic/LikeFunctionsBenchmark.cpp deleted file mode 100644 index 1931585fdc9a6..0000000000000 --- a/velox/benchmarks/basic/LikeFunctionsBenchmark.cpp +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include "velox/functions/lib/Re2Functions.h" -#include "velox/functions/lib/benchmarks/FunctionBenchmarkBase.h" -#include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h" -#include "velox/tpch/gen/TpchGen.h" -#include "velox/vector/fuzzer/VectorFuzzer.h" - -using namespace facebook::velox; -using namespace facebook::velox::tpch; -using namespace facebook::velox::functions; -using namespace facebook::velox::functions::test; -using namespace facebook::velox::memory; - -DEFINE_int32(vector_size, 10000, "Vector size"); -DEFINE_int32(num_runs, 100, "Number of runs"); -DEFINE_int32(num_rows, 10000, "Number of rows"); - -namespace { - -enum class TpchBenchmarkCase { - TpchQuery2, - TpchQuery9, - TpchQuery13, - TpchQuery14, - TpchQuery16Part, - TpchQuery16Supplier, - TpchQuery20, -}; - -class LikeFunctionsBenchmark : public FunctionBaseTest, - public FunctionBenchmarkBase { - public: - explicit LikeFunctionsBenchmark() { - exec::registerStatefulVectorFunction("like", likeSignatures(), makeLike); - - VectorFuzzer::Options opts; - opts.vectorSize = FLAGS_vector_size; - VectorFuzzer fuzzer(opts, FunctionBenchmarkBase::pool()); - inputFuzzer_ = fuzzer.fuzzFlat(VARCHAR()); - } - - // Generate random string using characters from characterSet. - std::string generateRandomString(const char* characterSet) { - auto characterSetLength = strlen(characterSet); - auto minimumLength = 1; - auto maximumLength = 10; - auto outputStringLength = rand() % maximumLength + minimumLength; - std::string output; - - for (int i = 0; i < outputStringLength; i++) { - output += characterSet[rand() % characterSetLength]; - } - return output; - } - - std::string generatePattern( - PatternKind patternKind, - const std::string& inputString) { - switch (patternKind) { - case PatternKind::kExactlyN: - return std::string(inputString.size(), '_'); - case PatternKind::kAtLeastN: - return generateRandomString(kWildcardCharacterSet); - case PatternKind::kPrefix: { - auto fixedPatternLength = - std::min(vector_size_t(inputString.size()), 10); - auto fixedPatternString = inputString.substr(0, fixedPatternLength); - return fixedPatternString + generateRandomString(kAnyWildcardCharacter); - } - case PatternKind::kSuffix: { - auto fixedPatternStartIdx = - std::max(vector_size_t(inputString.size() - 10), 0); - auto fixedPatternString = inputString.substr(fixedPatternStartIdx, 10); - return generateRandomString(kAnyWildcardCharacter) + fixedPatternString; - } - default: - return inputString; - } - } - - const VectorPtr getTpchData(const TpchBenchmarkCase tpchCase) { - switch (tpchCase) { - case TpchBenchmarkCase::TpchQuery2: - case TpchBenchmarkCase::TpchQuery14: - case TpchBenchmarkCase::TpchQuery16Part: { - auto tpchPart = genTpchPart(pool_.get(), FLAGS_num_rows); - return tpchPart->childAt(4); - } - case TpchBenchmarkCase::TpchQuery9: - case TpchBenchmarkCase::TpchQuery20: { - auto tpchPart = genTpchPart(pool_.get(), FLAGS_num_rows); - return tpchPart->childAt(1); - } - case TpchBenchmarkCase::TpchQuery13: { - auto tpchOrders = genTpchOrders(pool_.get(), FLAGS_num_rows); - return 
tpchOrders->childAt(8); - } - case TpchBenchmarkCase::TpchQuery16Supplier: { - auto tpchSupplier = genTpchSupplier(pool_.get(), FLAGS_num_rows); - return tpchSupplier->childAt(6); - } - default: - VELOX_FAIL(fmt::format( - "Tpch data generation for case {} is not supported", tpchCase)); - } - } - - size_t run(const TpchBenchmarkCase tpchCase, const StringView patternString) { - folly::BenchmarkSuspender kSuspender; - const auto input = getTpchData(tpchCase); - const auto data = makeRowVector({input}); - auto likeExpression = fmt::format("like(c0, '{}')", patternString); - auto rowType = std::dynamic_pointer_cast(data->type()); - exec::ExprSet exprSet = - FunctionBenchmarkBase::compileExpression(likeExpression, rowType); - kSuspender.dismiss(); - - size_t cnt = 0; - for (auto i = 0; i < FLAGS_num_runs; i++) { - auto result = FunctionBenchmarkBase::evaluate(exprSet, data); - cnt += result->size(); - } - folly::doNotOptimizeAway(cnt); - - return cnt; - } - - size_t run(PatternKind patternKind) { - folly::BenchmarkSuspender kSuspender; - const auto input = inputFuzzer_->values()->as(); - auto patternString = generatePattern(patternKind, input[0].str()); - std::vector patternVector(FLAGS_vector_size, patternString); - const auto data = makeRowVector({inputFuzzer_}); - auto likeExpression = fmt::format("like(c0, '{}')", patternString); - auto rowType = std::dynamic_pointer_cast(data->type()); - exec::ExprSet exprSet = - FunctionBenchmarkBase::compileExpression(likeExpression, rowType); - kSuspender.dismiss(); - - size_t cnt = 0; - for (auto i = 0; i < FLAGS_num_runs; i++) { - auto result = FunctionBenchmarkBase::evaluate(exprSet, data); - cnt += result->size(); - } - folly::doNotOptimizeAway(cnt); - - return cnt; - } - - // We inherit from FunctionBaseTest so that we can get access to the helpers - // it defines, but since it is supposed to be a test fixture TestBody() is - // declared pure virtual. We must provide an implementation here. 
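The `run()` helpers above (carried over essentially unchanged into the new LikeTpchBenchmark.cpp below) follow a standard folly pattern: compile the expression while a `folly::BenchmarkSuspender` holds the clock, then `dismiss()` it so only the evaluation loop is measured. The same pattern in a self-contained folly benchmark, with made-up work standing in for expression evaluation:

```cpp
#include <folly/Benchmark.h>
#include <folly/init/Init.h>
#include <numeric>
#include <vector>

BENCHMARK(sumAfterSetup, n) {
  folly::BenchmarkSuspender suspender; // clock stopped during setup
  std::vector<int64_t> data(1'000);
  std::iota(data.begin(), data.end(), 0);
  suspender.dismiss(); // timing resumes here
  int64_t sum = 0;
  for (unsigned i = 0; i < n; ++i) {
    sum += std::accumulate(data.begin(), data.end(), int64_t{0});
  }
  folly::doNotOptimizeAway(sum); // keep the loop from being optimized out
}

int main(int argc, char* argv[]) {
  folly::Init init{&argc, &argv};
  folly::runBenchmarks();
  return 0;
}
```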
- void TestBody() override {} - - private: - static constexpr const char* kWildcardCharacterSet = "%_"; - static constexpr const char* kAnyWildcardCharacter = "%"; - std::shared_ptr pool_{addDefaultLeafMemoryPool()}; - VectorPtr inputFuzzer_; -}; - -std::unique_ptr benchmark; - -BENCHMARK(wildcardExactlyN) { - benchmark->run(PatternKind::kExactlyN); -} - -BENCHMARK(wildcardAtLeastN) { - benchmark->run(PatternKind::kAtLeastN); -} - -BENCHMARK(fixedPattern) { - benchmark->run(PatternKind::kFixed); -} - -BENCHMARK(prefixPattern) { - benchmark->run(PatternKind::kPrefix); -} - -BENCHMARK(suffixPattern) { - benchmark->run(PatternKind::kSuffix); -} - -BENCHMARK_DRAW_LINE(); - -BENCHMARK(tpchQuery2) { - benchmark->run(TpchBenchmarkCase::TpchQuery2, "%BRASS"); -} - -BENCHMARK(tpchQuery9) { - benchmark->run(TpchBenchmarkCase::TpchQuery9, "%green%"); -} - -BENCHMARK(tpchQuery13) { - benchmark->run(TpchBenchmarkCase::TpchQuery13, "%special%requests%"); -} - -BENCHMARK(tpchQuery14) { - benchmark->run(TpchBenchmarkCase::TpchQuery14, "PROMO%"); -} - -BENCHMARK(tpchQuery16Part) { - benchmark->run(TpchBenchmarkCase::TpchQuery16Part, "MEDIUM POLISHED%"); -} - -BENCHMARK(tpchQuery16Supplier) { - benchmark->run( - TpchBenchmarkCase::TpchQuery16Supplier, "%Customer%Complaints%"); -} - -BENCHMARK(tpchQuery20) { - benchmark->run(TpchBenchmarkCase::TpchQuery20, "forest%"); -} - -} // namespace - -int main(int argc, char* argv[]) { - folly::init(&argc, &argv, true); - benchmark = std::make_unique(); - folly::runBenchmarks(); - benchmark.reset(); - - return 0; -} diff --git a/velox/benchmarks/basic/LikeTpchBenchmark.cpp b/velox/benchmarks/basic/LikeTpchBenchmark.cpp new file mode 100644 index 0000000000000..29cde56e51e37 --- /dev/null +++ b/velox/benchmarks/basic/LikeTpchBenchmark.cpp @@ -0,0 +1,252 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include "velox/functions/lib/Re2Functions.h" +#include "velox/functions/lib/benchmarks/FunctionBenchmarkBase.h" +#include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h" +#include "velox/tpch/gen/TpchGen.h" +#include "velox/vector/fuzzer/VectorFuzzer.h" + +using namespace facebook::velox; +using namespace facebook::velox::tpch; +using namespace facebook::velox::functions; +using namespace facebook::velox::functions::test; +using namespace facebook::velox::memory; + +DEFINE_int32(vector_size, 10000, "Vector size"); +DEFINE_int32(num_runs, 100, "Number of runs"); +DEFINE_int32(num_rows, 10000, "Number of rows"); + +namespace { + +enum class TpchBenchmarkCase { + TpchQuery2, + TpchQuery9, + TpchQuery13, + TpchQuery14, + TpchQuery16Part, + TpchQuery16Supplier, + TpchQuery20, +}; +} + +template <> +struct fmt::formatter : fmt::formatter { + auto format(const TpchBenchmarkCase& s, format_context& ctx) const { + return formatter::format(static_cast(s), ctx); + } +}; + +namespace { +class LikeFunctionsBenchmark : public FunctionBaseTest, + public FunctionBenchmarkBase { + public: + explicit LikeFunctionsBenchmark() { + exec::registerStatefulVectorFunction("like", likeSignatures(), makeLike); + + VectorFuzzer::Options opts; + opts.vectorSize = FLAGS_vector_size; + VectorFuzzer fuzzer(opts, FunctionBenchmarkBase::pool()); + inputFuzzer_ = fuzzer.fuzzFlat(VARCHAR()); + } + + // Generate random string using characters from characterSet. + std::string generateRandomString(const char* characterSet) { + auto characterSetLength = strlen(characterSet); + auto minimumLength = 1; + auto maximumLength = 10; + auto outputStringLength = rand() % maximumLength + minimumLength; + std::string output; + + for (int i = 0; i < outputStringLength; i++) { + output += characterSet[rand() % characterSetLength]; + } + return output; + } + + std::string generatePattern( + PatternKind patternKind, + const std::string& inputString) { + switch (patternKind) { + case PatternKind::kExactlyN: + return std::string(inputString.size(), '_'); + case PatternKind::kAtLeastN: + return generateRandomString(kWildcardCharacterSet); + case PatternKind::kPrefix: { + auto fixedPatternLength = + std::min(vector_size_t(inputString.size()), 10); + auto fixedPatternString = inputString.substr(0, fixedPatternLength); + return fixedPatternString + generateRandomString(kAnyWildcardCharacter); + } + case PatternKind::kSuffix: { + auto fixedPatternStartIdx = + std::max(vector_size_t(inputString.size() - 10), 0); + auto fixedPatternString = inputString.substr(fixedPatternStartIdx, 10); + return generateRandomString(kAnyWildcardCharacter) + fixedPatternString; + } + default: + return inputString; + } + } + + const VectorPtr getTpchData(const TpchBenchmarkCase tpchCase) { + switch (tpchCase) { + case TpchBenchmarkCase::TpchQuery2: + case TpchBenchmarkCase::TpchQuery14: + case TpchBenchmarkCase::TpchQuery16Part: { + auto tpchPart = genTpchPart(pool_.get(), FLAGS_num_rows); + return tpchPart->childAt(4); + } + case TpchBenchmarkCase::TpchQuery9: + case TpchBenchmarkCase::TpchQuery20: { + auto tpchPart = genTpchPart(pool_.get(), FLAGS_num_rows); + return tpchPart->childAt(1); + } + case TpchBenchmarkCase::TpchQuery13: { + auto tpchOrders = genTpchOrders(pool_.get(), FLAGS_num_rows); + return tpchOrders->childAt(8); + } + case TpchBenchmarkCase::TpchQuery16Supplier: { + auto tpchSupplier = genTpchSupplier(pool_.get(), FLAGS_num_rows); + return tpchSupplier->childAt(6); + } + default: + VELOX_FAIL(fmt::format( + 
"Tpch data generation for case {} is not supported", tpchCase)); + } + } + + size_t run(const TpchBenchmarkCase tpchCase, const StringView patternString) { + folly::BenchmarkSuspender kSuspender; + const auto input = getTpchData(tpchCase); + const auto data = makeRowVector({input}); + auto likeExpression = fmt::format("like(c0, '{}')", patternString); + auto rowType = std::dynamic_pointer_cast(data->type()); + exec::ExprSet exprSet = + FunctionBenchmarkBase::compileExpression(likeExpression, rowType); + kSuspender.dismiss(); + + size_t cnt = 0; + for (auto i = 0; i < FLAGS_num_runs; i++) { + auto result = FunctionBenchmarkBase::evaluate(exprSet, data); + cnt += result->size(); + } + folly::doNotOptimizeAway(cnt); + + return cnt; + } + + size_t run(PatternKind patternKind) { + folly::BenchmarkSuspender kSuspender; + const auto input = inputFuzzer_->values()->as(); + auto patternString = generatePattern(patternKind, input[0].str()); + std::vector patternVector(FLAGS_vector_size, patternString); + const auto data = makeRowVector({inputFuzzer_}); + auto likeExpression = fmt::format("like(c0, '{}')", patternString); + auto rowType = std::dynamic_pointer_cast(data->type()); + exec::ExprSet exprSet = + FunctionBenchmarkBase::compileExpression(likeExpression, rowType); + kSuspender.dismiss(); + + size_t cnt = 0; + for (auto i = 0; i < FLAGS_num_runs; i++) { + auto result = FunctionBenchmarkBase::evaluate(exprSet, data); + cnt += result->size(); + } + folly::doNotOptimizeAway(cnt); + + return cnt; + } + + // We inherit from FunctionBaseTest so that we can get access to the helpers + // it defines, but since it is supposed to be a test fixture TestBody() is + // declared pure virtual. We must provide an implementation here. + void TestBody() override {} + + private: + static constexpr const char* kWildcardCharacterSet = "%_"; + static constexpr const char* kAnyWildcardCharacter = "%"; + std::shared_ptr pool_{memory::memoryManager()->addLeafPool()}; + VectorPtr inputFuzzer_; +}; + +std::unique_ptr benchmark; + +BENCHMARK(wildcardExactlyN) { + benchmark->run(PatternKind::kExactlyN); +} + +BENCHMARK(wildcardAtLeastN) { + benchmark->run(PatternKind::kAtLeastN); +} + +BENCHMARK(fixedPattern) { + benchmark->run(PatternKind::kFixed); +} + +BENCHMARK(prefixPattern) { + benchmark->run(PatternKind::kPrefix); +} + +BENCHMARK(suffixPattern) { + benchmark->run(PatternKind::kSuffix); +} + +BENCHMARK_DRAW_LINE(); + +BENCHMARK(tpchQuery2) { + benchmark->run(TpchBenchmarkCase::TpchQuery2, "%BRASS"); +} + +BENCHMARK(tpchQuery9) { + benchmark->run(TpchBenchmarkCase::TpchQuery9, "%green%"); +} + +BENCHMARK(tpchQuery13) { + benchmark->run(TpchBenchmarkCase::TpchQuery13, "%special%requests%"); +} + +BENCHMARK(tpchQuery14) { + benchmark->run(TpchBenchmarkCase::TpchQuery14, "PROMO%"); +} + +BENCHMARK(tpchQuery16Part) { + benchmark->run(TpchBenchmarkCase::TpchQuery16Part, "MEDIUM POLISHED%"); +} + +BENCHMARK(tpchQuery16Supplier) { + benchmark->run( + TpchBenchmarkCase::TpchQuery16Supplier, "%Customer%Complaints%"); +} + +BENCHMARK(tpchQuery20) { + benchmark->run(TpchBenchmarkCase::TpchQuery20, "forest%"); +} + +} // namespace + +int main(int argc, char* argv[]) { + folly::Init init{&argc, &argv, true}; + memory::MemoryManager::initialize({}); + benchmark = std::make_unique(); + folly::runBenchmarks(); + benchmark.reset(); + + return 0; +} diff --git a/velox/benchmarks/basic/Preproc.cpp b/velox/benchmarks/basic/Preproc.cpp index 214be1fe773e2..528e1a6461f9e 100644 --- a/velox/benchmarks/basic/Preproc.cpp +++ 
b/velox/benchmarks/basic/Preproc.cpp @@ -444,8 +444,8 @@ BENCHMARK(allFusedWithNulls) { } // namespace int main(int argc, char** argv) { - folly::init(&argc, &argv); - + folly::Init init{&argc, &argv}; + memory::MemoryManager::initialize({}); benchmark = std::make_unique(); // Verify that benchmark calculations are correct. benchmark->test(); diff --git a/velox/benchmarks/basic/SelectivityVector.cpp b/velox/benchmarks/basic/SelectivityVector.cpp index 22273d3d77906..cef36633157eb 100644 --- a/velox/benchmarks/basic/SelectivityVector.cpp +++ b/velox/benchmarks/basic/SelectivityVector.cpp @@ -164,9 +164,9 @@ BENCHMARK(sumSelectivity1PerCent) { } // namespace int main(int argc, char* argv[]) { - folly::init(&argc, &argv); + folly::Init init{&argc, &argv}; gflags::ParseCommandLineFlags(&argc, &argv, true); - + memory::MemoryManager::initialize({}); benchmark = std::make_unique(10'000); folly::runBenchmarks(); benchmark.reset(); diff --git a/velox/benchmarks/basic/SimpleArithmetic.cpp b/velox/benchmarks/basic/SimpleArithmetic.cpp index c2715fdca028c..4da3453a1863d 100644 --- a/velox/benchmarks/basic/SimpleArithmetic.cpp +++ b/velox/benchmarks/basic/SimpleArithmetic.cpp @@ -20,9 +20,9 @@ #include #include "velox/functions/Registerer.h" +#include "velox/functions/lib/CheckedArithmeticImpl.h" #include "velox/functions/lib/benchmarks/FunctionBenchmarkBase.h" #include "velox/functions/prestosql/ArithmeticImpl.h" -#include "velox/functions/prestosql/CheckedArithmeticImpl.h" #include "velox/vector/fuzzer/VectorFuzzer.h" DEFINE_int64(fuzzer_seed, 99887766, "Seed for random input dataset generator"); @@ -343,9 +343,9 @@ BENCHMARK(plusCheckedLarge) { } // namespace int main(int argc, char* argv[]) { - folly::init(&argc, &argv); + folly::Init init{&argc, &argv}; gflags::ParseCommandLineFlags(&argc, &argv, true); - + memory::MemoryManager::initialize({}); benchmark = std::make_unique(); folly::runBenchmarks(); benchmark.reset(); diff --git a/velox/benchmarks/basic/SimpleCastExpr.cpp b/velox/benchmarks/basic/SimpleCastExpr.cpp index 5858d5602bba2..497b66d8c0b88 100644 --- a/velox/benchmarks/basic/SimpleCastExpr.cpp +++ b/velox/benchmarks/basic/SimpleCastExpr.cpp @@ -19,9 +19,7 @@ #include -#include "velox/functions/Registerer.h" #include "velox/functions/lib/benchmarks/FunctionBenchmarkBase.h" -#include "velox/functions/prestosql/ArithmeticImpl.h" #include "velox/vector/fuzzer/VectorFuzzer.h" DEFINE_int64(fuzzer_seed, 99887766, "Seed for random input dataset generator"); @@ -31,83 +29,230 @@ using namespace facebook::velox::exec; using namespace facebook::velox::test; namespace { +const std::string_view kColName{"a"}; + class SimpleCastBenchmark : public functions::test::FunctionBenchmarkBase { public: explicit SimpleCastBenchmark() : FunctionBenchmarkBase() {} - RowVectorPtr makeRowVector(vector_size_t size) { + RowVectorPtr makeRowVector(const TypePtr& type, vector_size_t size) { VectorFuzzer::Options opts; opts.vectorSize = size; opts.nullRatio = 0; - VectorFuzzer fuzzer(opts, pool(), FLAGS_fuzzer_seed); - std::vector children; - children.emplace_back(fuzzer.fuzzFlat(TIMESTAMP())); // Col a - return std::make_shared( - pool(), inputType_, nullptr, size, std::move(children)); + VectorFuzzer fuzzer(std::move(opts), pool(), FLAGS_fuzzer_seed); + VectorPtr input = fuzzer.fuzzFlat(type); // Col a + return vectorMaker_.rowVector( + std::vector{std::string(kColName)}, + std::vector{std::move(input)}); } - static constexpr auto kNumSmallRuns = 10'000; + static constexpr auto kNumSmallRuns = 100; static constexpr 
auto kNumMediumRuns = 1000; - static constexpr auto kNumLargeRuns = 100; + static constexpr auto kNumLargeRuns = 10'000; - void runSmall(const std::string& expression) { - run(expression, kNumSmallRuns, smallRowVector_); + void runSmall(const TypePtr& inputType, const TypePtr& outputType) { + run(inputType, outputType, kNumSmallRuns); } - void runMedium(const std::string& expression) { - run(expression, kNumMediumRuns, mediumRowVector_); + void runMedium(const TypePtr& inputType, const TypePtr& outputType) { + run(inputType, outputType, kNumMediumRuns); } - void runLarge(const std::string& expression) { - run(expression, kNumLargeRuns, largeRowVector_); + void runLarge(const TypePtr& inputType, const TypePtr& outputType) { + run(inputType, outputType, kNumLargeRuns); } - // Compiles and runs the `expression` `iterations` number of times. - size_t run( - const std::string& expression, - size_t iterations, - const RowVectorPtr& input) { + size_t + run(const TypePtr& inputType, const TypePtr& outputType, size_t batchSize) { folly::BenchmarkSuspender suspender; - auto exprSet = compileExpression(expression, inputType_); + auto input = makeRowVector(inputType, batchSize); + auto castInput = + std::make_shared( + inputType, std::string(kColName)); + std::vector expr{ + std::make_shared( + outputType, castInput, false)}; + exec::ExprSet exprSet(expr, &execCtx_); + SelectivityVector rows(input->size()); + VectorPtr result; suspender.dismiss(); size_t count = 0; - for (auto i = 0; i < iterations; i++) { - count += evaluate(exprSet, input)->size(); + for (auto i = 0; i < 1000; i++) { + evaluate(exprSet, input, rows, result); + count += result->size(); } return count; } - - private: - const TypePtr inputType_ = ROW({ - {"a", TIMESTAMP()}, - }); - const RowVectorPtr smallRowVector_ = makeRowVector(100); - const RowVectorPtr mediumRowVector_ = makeRowVector(1'000); - const RowVectorPtr largeRowVector_ = makeRowVector(10'000); }; +TypePtr buildStructType( + std::function&& nameGenerator, + const TypePtr& fieldType, + size_t numChildren) { + std::vector names; + std::vector types; + for (int i = 0; i < numChildren; i++) { + names.push_back(nameGenerator(i)); + types.push_back(fieldType); + } + return ROW(std::move(names), std::move(types)); +} + std::unique_ptr benchmark; -BENCHMARK(castTimestampDate) { +BENCHMARK(castTimestampDateSmall) { + benchmark->setAdjustTimestampToTimezone("false"); + benchmark->runSmall(TIMESTAMP(), DATE()); +} + +BENCHMARK(castTimestampDateMedium) { + benchmark->setAdjustTimestampToTimezone("false"); + benchmark->runMedium(TIMESTAMP(), DATE()); +} + +BENCHMARK(castTimestampDateLarge) { benchmark->setAdjustTimestampToTimezone("false"); - benchmark->runSmall("cast (a as date)"); - benchmark->runMedium("cast (a as date)"); - benchmark->runLarge("cast (a as date)"); + benchmark->runLarge(TIMESTAMP(), DATE()); } -BENCHMARK(castTimestampDateAdjustTimeZone) { +BENCHMARK(castTimestampDateAdjustTimeZoneSmall) { benchmark->setTimezone("America/Los_Angeles"); benchmark->setAdjustTimestampToTimezone("true"); - benchmark->runSmall("cast (a as date)"); - benchmark->runMedium("cast (a as date)"); - benchmark->runLarge("cast (a as date)"); + benchmark->runSmall(TIMESTAMP(), DATE()); +} + +BENCHMARK(castTimestampDateAdjustTimeZoneMedium) { + benchmark->setTimezone("America/Los_Angeles"); + benchmark->setAdjustTimestampToTimezone("true"); + benchmark->runMedium(TIMESTAMP(), DATE()); +} + +BENCHMARK(castTimestampDateAdjustTimeZoneLarge) { + benchmark->setTimezone("America/Los_Angeles"); + 
benchmark->setAdjustTimestampToTimezone("true"); + benchmark->runLarge(TIMESTAMP(), DATE()); +} + +BENCHMARK(castStructFewFieldsRenameSmall) { + folly::BenchmarkSuspender suspender; + auto oldType = buildStructType([](int) { return ""; }, INTEGER(), 3); + auto newType = buildStructType( + [](int i) { return fmt::format("col{}", i); }, INTEGER(), 3); + suspender.dismiss(); + + benchmark->runSmall(oldType, newType); +} + +BENCHMARK(castStructFewFieldsRenameMedium) { + folly::BenchmarkSuspender suspender; + auto oldType = buildStructType([](int) { return ""; }, INTEGER(), 3); + auto newType = buildStructType( + [](int i) { return fmt::format("col{}", i); }, INTEGER(), 3); + suspender.dismiss(); + + benchmark->runMedium(oldType, newType); +} + +BENCHMARK(castStructFewFieldsRenameLarge) { + folly::BenchmarkSuspender suspender; + auto oldType = buildStructType([](int) { return ""; }, INTEGER(), 3); + auto newType = buildStructType( + [](int i) { return fmt::format("col{}", i); }, INTEGER(), 3); + suspender.dismiss(); + + benchmark->runLarge(oldType, newType); +} + +BENCHMARK(castStructManyFieldsRenameSmall) { + folly::BenchmarkSuspender suspender; + auto oldType = buildStructType([](int) { return ""; }, INTEGER(), 1000); + auto newType = buildStructType( + [](int i) { return fmt::format("col{}", i); }, INTEGER(), 1000); + suspender.dismiss(); + + benchmark->runSmall(oldType, newType); +} + +BENCHMARK(castStructManyFieldsRenameMedium) { + folly::BenchmarkSuspender suspender; + auto oldType = buildStructType([](int) { return ""; }, INTEGER(), 1000); + auto newType = buildStructType( + [](int i) { return fmt::format("col{}", i); }, INTEGER(), 1000); + suspender.dismiss(); + + benchmark->runMedium(oldType, newType); } + +BENCHMARK(castStructManyFieldsRenameLarge) { + folly::BenchmarkSuspender suspender; + auto oldType = buildStructType([](int) { return ""; }, INTEGER(), 1000); + auto newType = buildStructType( + [](int i) { return fmt::format("col{}", i); }, INTEGER(), 1000); + suspender.dismiss(); + + benchmark->runLarge(oldType, newType); +} + +BENCHMARK(castStructFewFieldsNestedCastSmall) { + folly::BenchmarkSuspender suspender; + auto oldType = buildStructType([](int) { return ""; }, INTEGER(), 3); + auto newType = buildStructType( + [](int i) { return fmt::format("col{}", i); }, BIGINT(), 3); + suspender.dismiss(); + + benchmark->runSmall(oldType, newType); +} + +BENCHMARK(castStructFewFieldsNestedCastMedium) { + folly::BenchmarkSuspender suspender; + auto oldType = buildStructType([](int) { return ""; }, INTEGER(), 3); + auto newType = buildStructType( + [](int i) { return fmt::format("col{}", i); }, BIGINT(), 3); + suspender.dismiss(); + + benchmark->runMedium(oldType, newType); +} + +BENCHMARK(castStructFewFieldsNestedCastLarge) { + folly::BenchmarkSuspender suspender; + auto oldType = buildStructType([](int) { return ""; }, INTEGER(), 3); + auto newType = buildStructType( + [](int i) { return fmt::format("col{}", i); }, BIGINT(), 3); + suspender.dismiss(); + + benchmark->runLarge(oldType, newType); +} + +BENCHMARK(castStructManyFieldsNestedCastSmall) { + folly::BenchmarkSuspender suspender; + auto oldType = buildStructType([](int) { return ""; }, INTEGER(), 1000); + auto newType = buildStructType( + [](int i) { return fmt::format("col{}", i); }, BIGINT(), 1000); + suspender.dismiss(); + + benchmark->runSmall(oldType, newType); +} + +BENCHMARK(castStructManyFieldsNestedCastMedium) { + folly::BenchmarkSuspender suspender; + auto oldType = buildStructType([](int) { return ""; }, INTEGER(), 
1000); + auto newType = buildStructType( + [](int i) { return fmt::format("col{}", i); }, BIGINT(), 1000); + suspender.dismiss(); + + benchmark->runMedium(oldType, newType); +} + +// castStructManyFieldsNestedCastLarge is skipped because it takes too long to +// run. + } // namespace int main(int argc, char* argv[]) { - folly::init(&argc, &argv); + folly::Init init{&argc, &argv}; gflags::ParseCommandLineFlags(&argc, &argv, true); + memory::MemoryManager::initialize({}); benchmark = std::make_unique(); folly::runBenchmarks(); benchmark.reset(); diff --git a/velox/benchmarks/basic/VectorCompare.cpp b/velox/benchmarks/basic/VectorCompare.cpp index b967b49ffbb6a..6b67e267d8820 100644 --- a/velox/benchmarks/basic/VectorCompare.cpp +++ b/velox/benchmarks/basic/VectorCompare.cpp @@ -81,7 +81,7 @@ class VectorCompareBenchmark : public functions::test::FunctionBenchmarkBase { true, true, false, - CompareFlags::NullHandlingMode::NoStop}; + CompareFlags::NullHandlingMode::kNullAsValue}; const size_t vectorSize_; SelectivityVector rows_; @@ -114,9 +114,9 @@ BENCHMARK_DRAW_LINE(); } // namespace int main(int argc, char* argv[]) { - folly::init(&argc, &argv); + folly::Init init{&argc, &argv}; gflags::ParseCommandLineFlags(&argc, &argv, true); - + memory::MemoryManager::initialize({}); benchmark = std::make_unique(1000); folly::runBenchmarks(); benchmark.reset(); diff --git a/velox/benchmarks/basic/VectorFuzzer.cpp b/velox/benchmarks/basic/VectorFuzzer.cpp index ba76519e9a2f9..2196e8bf9952d 100644 --- a/velox/benchmarks/basic/VectorFuzzer.cpp +++ b/velox/benchmarks/basic/VectorFuzzer.cpp @@ -28,7 +28,10 @@ namespace { using namespace facebook::velox; -std::shared_ptr pool{memory::addDefaultLeafMemoryPool()}; +memory::MemoryPool* pool() { + static auto leaf = memory::MemoryManager::getInstance()->addLeafPool(); + return leaf.get(); +} VectorFuzzer::Options getOpts(size_t n, double nullRatio = 0) { VectorFuzzer::Options opts; @@ -38,25 +41,25 @@ VectorFuzzer::Options getOpts(size_t n, double nullRatio = 0) { } BENCHMARK_MULTI(flatInteger, n) { - VectorFuzzer fuzzer(getOpts(n), pool.get(), FLAGS_fuzzer_seed); + VectorFuzzer fuzzer(getOpts(n), pool(), FLAGS_fuzzer_seed); folly::doNotOptimizeAway(fuzzer.fuzzFlat(BIGINT())); return n; } BENCHMARK_RELATIVE_MULTI(flatIntegerHalfNull, n) { - VectorFuzzer fuzzer(getOpts(n, 0.5), pool.get(), FLAGS_fuzzer_seed); + VectorFuzzer fuzzer(getOpts(n, 0.5), pool(), FLAGS_fuzzer_seed); folly::doNotOptimizeAway(fuzzer.fuzzFlat(BIGINT())); return n; } BENCHMARK_RELATIVE_MULTI(flatDouble, n) { - VectorFuzzer fuzzer(getOpts(n), pool.get(), FLAGS_fuzzer_seed); + VectorFuzzer fuzzer(getOpts(n), pool(), FLAGS_fuzzer_seed); folly::doNotOptimizeAway(fuzzer.fuzzFlat(DOUBLE())); return n; } BENCHMARK_RELATIVE_MULTI(flatBool, n) { - VectorFuzzer fuzzer(getOpts(n), pool.get(), FLAGS_fuzzer_seed); + VectorFuzzer fuzzer(getOpts(n), pool(), FLAGS_fuzzer_seed); folly::doNotOptimizeAway(fuzzer.fuzzFlat(BOOLEAN())); return n; } @@ -65,7 +68,7 @@ BENCHMARK_RELATIVE_MULTI(flatVarcharAscii, n) { auto opts = getOpts(n); opts.charEncodings = {UTF8CharList::ASCII}; - VectorFuzzer fuzzer(opts, pool.get(), FLAGS_fuzzer_seed); + VectorFuzzer fuzzer(opts, pool(), FLAGS_fuzzer_seed); folly::doNotOptimizeAway(fuzzer.fuzzFlat(VARCHAR())); return n; } @@ -74,7 +77,7 @@ BENCHMARK_RELATIVE_MULTI(flatVarcharUtf8, n) { auto opts = getOpts(n); opts.charEncodings = {UTF8CharList::EXTENDED_UNICODE}; - VectorFuzzer fuzzer(opts, pool.get(), FLAGS_fuzzer_seed); + VectorFuzzer fuzzer(opts, pool(), FLAGS_fuzzer_seed); 
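The `pool()` accessor in this hunk replaces a namespace-scope `std::shared_ptr` pool for an initialization-order reason: a global is constructed during static initialization, before `main()` has run `memory::MemoryManager::initialize({})`, whereas a function-local static is constructed on first call. A standalone illustration of that ordering; `Manager` and `Leaf` are stand-ins, not Velox APIs:

```cpp
#include <iostream>

struct Manager {
  // Stand-in for a process-wide singleton that main() must set up first.
  static Manager*& instance() {
    static Manager* m = nullptr;
    return m;
  }
};

struct Leaf {
  Leaf() {
    // A namespace-scope `Leaf leaf;` would run this before main().
    std::cout << (Manager::instance() ? "manager ready\n" : "too early!\n");
  }
};

Leaf* leaf() {
  static Leaf l; // constructed lazily, on the first call from main()
  return &l;
}

int main() {
  Manager::instance() = new Manager(); // plays the role of initialize({})
  leaf(); // prints "manager ready"
  return 0;
}
```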
folly::doNotOptimizeAway(fuzzer.fuzzFlat(VARCHAR())); return n; } @@ -82,13 +85,13 @@ BENCHMARK_RELATIVE_MULTI(flatVarcharUtf8, n) { BENCHMARK_DRAW_LINE(); BENCHMARK_RELATIVE_MULTI(constantInteger, n) { - VectorFuzzer fuzzer(getOpts(n), pool.get(), FLAGS_fuzzer_seed); + VectorFuzzer fuzzer(getOpts(n), pool(), FLAGS_fuzzer_seed); folly::doNotOptimizeAway(fuzzer.fuzzConstant(BIGINT())); return n; } BENCHMARK_RELATIVE_MULTI(dictionaryInteger, n) { - VectorFuzzer fuzzer(getOpts(n), pool.get(), FLAGS_fuzzer_seed); + VectorFuzzer fuzzer(getOpts(n), pool(), FLAGS_fuzzer_seed); folly::doNotOptimizeAway(fuzzer.fuzzDictionary(fuzzer.fuzzFlat(BIGINT()))); return n; } @@ -96,7 +99,7 @@ BENCHMARK_RELATIVE_MULTI(dictionaryInteger, n) { BENCHMARK_DRAW_LINE(); BENCHMARK_RELATIVE_MULTI(flatArray, n) { - VectorFuzzer fuzzer(getOpts(n), pool.get(), FLAGS_fuzzer_seed); + VectorFuzzer fuzzer(getOpts(n), pool(), FLAGS_fuzzer_seed); const size_t elementsSize = n * fuzzer.getOptions().containerLength; folly::doNotOptimizeAway( fuzzer.fuzzArray(fuzzer.fuzzFlat(BIGINT(), elementsSize), n)); @@ -104,7 +107,7 @@ BENCHMARK_RELATIVE_MULTI(flatArray, n) { } BENCHMARK_RELATIVE_MULTI(flatMap, n) { - VectorFuzzer fuzzer(getOpts(n), pool.get(), FLAGS_fuzzer_seed); + VectorFuzzer fuzzer(getOpts(n), pool(), FLAGS_fuzzer_seed); const size_t elementsSize = n * fuzzer.getOptions().containerLength; folly::doNotOptimizeAway(fuzzer.fuzzMap( fuzzer.fuzzFlat(BIGINT(), elementsSize), @@ -114,7 +117,7 @@ BENCHMARK_RELATIVE_MULTI(flatMap, n) { } BENCHMARK_RELATIVE_MULTI(flatMapArrayNested, n) { - VectorFuzzer fuzzer(getOpts(n), pool.get(), FLAGS_fuzzer_seed); + VectorFuzzer fuzzer(getOpts(n), pool(), FLAGS_fuzzer_seed); const size_t elementsSize = n * fuzzer.getOptions().containerLength; folly::doNotOptimizeAway(fuzzer.fuzzMap( @@ -128,8 +131,9 @@ BENCHMARK_RELATIVE_MULTI(flatMapArrayNested, n) { } // namespace int main(int argc, char* argv[]) { - folly::init(&argc, &argv); + folly::Init init{&argc, &argv}; gflags::ParseCommandLineFlags(&argc, &argv, true); + memory::MemoryManager::initialize({}); folly::runBenchmarks(); return 0; } diff --git a/velox/benchmarks/basic/VectorSlice.cpp b/velox/benchmarks/basic/VectorSlice.cpp index 58fa5642abf25..d03cc06ea6afd 100644 --- a/velox/benchmarks/basic/VectorSlice.cpp +++ b/velox/benchmarks/basic/VectorSlice.cpp @@ -37,7 +37,7 @@ constexpr int kVectorSize = 16 << 10; struct BenchmarkData { BenchmarkData() - : pool_(memory::addDefaultLeafMemoryPool( + : pool_(memory::memoryManager()->addLeafPool( "BenchmarkData", FLAGS_use_thread_safe_memory_usage_track)) { VectorFuzzer::Options opts; @@ -110,10 +110,11 @@ DEFINE_BENCHMARKS(row) } // namespace facebook::velox int main(int argc, char* argv[]) { - folly::init(&argc, &argv); + folly::Init init{&argc, &argv}; using namespace facebook::velox; gflags::ParseCommandLineFlags(&argc, &argv, true); VELOX_CHECK_LE(FLAGS_slice_size, kVectorSize); + memory::MemoryManager::initialize({}); data = std::make_unique(); folly::runBenchmarks(); data.reset(); diff --git a/velox/benchmarks/filesystem/CMakeLists.txt b/velox/benchmarks/filesystem/CMakeLists.txt new file mode 100644 index 0000000000000..10a5ebdf710d0 --- /dev/null +++ b/velox/benchmarks/filesystem/CMakeLists.txt @@ -0,0 +1,31 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_library(velox_read_benchmark_lib ReadBenchmark.cpp) + +target_link_libraries( + velox_read_benchmark_lib + PUBLIC velox_file velox_time Folly::folly gflags::gflags) + +add_executable(velox_read_benchmark ReadBenchmarkMain.cpp) + +target_link_libraries( + velox_read_benchmark + PRIVATE + velox_read_benchmark_lib + velox_hive_config + velox_s3fs + velox_hdfs + velox_abfs + velox_gcs) diff --git a/velox/benchmarks/filesystem/ReadBenchmark.cpp b/velox/benchmarks/filesystem/ReadBenchmark.cpp new file mode 100644 index 0000000000000..f99639b8c9663 --- /dev/null +++ b/velox/benchmarks/filesystem/ReadBenchmark.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/benchmarks/filesystem/ReadBenchmark.h" + +#include "velox/common/config/Config.h" +#include "velox/connectors/hive/storage_adapters/abfs/RegisterAbfsFileSystem.h" +#include "velox/connectors/hive/storage_adapters/gcs/RegisterGCSFileSystem.h" +#include "velox/connectors/hive/storage_adapters/hdfs/RegisterHdfsFileSystem.h" +#include "velox/connectors/hive/storage_adapters/s3fs/RegisterS3FileSystem.h" + +DEFINE_string(path, "", "Path of the input file"); +DEFINE_int64( + file_size_gb, + 0, + "Limits the test to the first --file_size_gb " + "of --path. 0 means use the whole file"); +DEFINE_int32(num_threads, 16, "Test parallelism"); +DEFINE_int32(seed, 0, "Random seed, 0 means no seed"); +DEFINE_bool(odirect, false, "Use O_DIRECT"); + +DEFINE_int32( + bytes, + 0, + "If 0, runs through a set of predefined read patterns. " + "If non-0, this is the size of a single read.
The reads are " "made in --num_in_run consecutive batches with --gap bytes between each read"); +DEFINE_int32(gap, 0, "Gap between consecutive reads if --bytes is non-0"); +DEFINE_int32( + num_in_run, + 10, + "Number of consecutive reads of --bytes separated by --gap bytes"); +DEFINE_int32( + measurement_size, + 100 << 20, + "Total reads per thread when measuring throughput for a --bytes/--gap/" + "--num_in_run combination"); +DEFINE_string(config, "", "Path of the config file"); + +namespace { +static bool notEmpty(const char* /*flagName*/, const std::string& value) { + return !value.empty(); +} +} // namespace + +DEFINE_validator(path, &notEmpty); + +namespace facebook::velox { + +std::shared_ptr readConfig(const std::string& filePath) { + std::ifstream configFile(filePath); + if (!configFile.is_open()) { + throw std::runtime_error( + fmt::format("Couldn't open config file {} for reading.", filePath)); + } + + std::unordered_map properties; + std::string line; + while (getline(configFile, line)) { + line.erase(std::remove_if(line.begin(), line.end(), isspace), line.end()); + if (line.empty() || line[0] == '#') { + continue; + } + auto delimiterPos = line.find('='); + auto name = line.substr(0, delimiterPos); + auto value = line.substr(delimiterPos + 1); + properties.emplace(name, value); + } + + return std::make_shared(std::move(properties)); +} + +// Initialize a LocalReadFile instance for the specified 'path'. +void ReadBenchmark::initialize() { + executor_ = std::make_unique(FLAGS_num_threads); + if (FLAGS_odirect) { + int32_t o_direct = +#ifdef linux + O_DIRECT; +#else + 0; +#endif + fd_ = open( + FLAGS_path.c_str(), + O_CREAT | O_RDWR | (FLAGS_odirect ? o_direct : 0), + S_IRUSR | S_IWUSR); + if (fd_ < 0) { + LOG(ERROR) << "Could not open " << FLAGS_path; + exit(1); + } + readFile_ = std::make_unique(fd_); + } else { + filesystems::registerLocalFileSystem(); + filesystems::registerS3FileSystem(); + filesystems::registerGCSFileSystem(); + filesystems::registerHdfsFileSystem(); + filesystems::abfs::registerAbfsFileSystem(); + std::shared_ptr config; + if (!FLAGS_config.empty()) { + config = readConfig(FLAGS_config); + } + auto fs = filesystems::getFileSystem(FLAGS_path, config); + readFile_ = fs->openFileForRead(FLAGS_path); + fileSize_ = readFile_->size(); + if (FLAGS_file_size_gb) { + fileSize_ = std::min(FLAGS_file_size_gb << 30, fileSize_); + } + } + + if (fileSize_ <= FLAGS_measurement_size) { + LOG(ERROR) << "File size " << fileSize_ << " is <= --measurement_size " + << FLAGS_measurement_size; + exit(1); + } + if (FLAGS_seed) { + rng_.seed(FLAGS_seed); + } +} + +void ReadBenchmark::finalize() { + filesystems::finalizeS3FileSystem(); +} + +void ReadBenchmark::run() { + if (FLAGS_bytes) { + modes(FLAGS_bytes, FLAGS_gap, FLAGS_num_in_run); + return; + } + modes(1100, 0, 10); + modes(1100, 1200, 10); + modes(16 * 1024, 0, 10); + modes(16 * 1024, 10000, 10); + modes(1000000, 0, 8); + modes(1000000, 100000, 8); +} +} // namespace facebook::velox diff --git a/velox/common/file/benchmark/ReadBenchmark.h b/velox/benchmarks/filesystem/ReadBenchmark.h similarity index 88% rename from velox/common/file/benchmark/ReadBenchmark.h rename to velox/benchmarks/filesystem/ReadBenchmark.h index 5a59e09e3cb0f..e033bc953acac 100644 --- a/velox/common/file/benchmark/ReadBenchmark.h +++ b/velox/benchmarks/filesystem/ReadBenchmark.h @@ -43,6 +43,7 @@ DECLARE_int32(gap); DECLARE_int32(num_in_run); DECLARE_int32(measurement_size); +DECLARE_string(config); namespace facebook::velox { @@ -62,46 +63,9 @@
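The rewritten initialize() above registers every storage adapter up front, so the same benchmark binary can read local, S3, GCS, HDFS, or ABFS files based purely on the path scheme. A hedged sketch of that dispatch follows; the URI and the null config are made-up placeholders, not values from this change, and a real S3 run would supply credentials through the --config file parsed by readConfig():

```cpp
#include <glog/logging.h>

#include "velox/common/file/FileSystems.h"
#include "velox/connectors/hive/storage_adapters/s3fs/RegisterS3FileSystem.h"

using namespace facebook::velox;

void openByScheme() {
  filesystems::registerLocalFileSystem();
  filesystems::registerS3FileSystem();
  // getFileSystem() picks the registered filesystem whose scheme matcher
  // accepts the path; "s3://bucket/data.orc" is a hypothetical example.
  auto fs = filesystems::getFileSystem("s3://bucket/data.orc", nullptr);
  auto file = fs->openFileForRead("s3://bucket/data.orc");
  LOG(INFO) << "file size: " << file->size();
}
```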
class ReadBenchmark { public: virtual ~ReadBenchmark() = default; - // Initialize a LocalReadFile instance for the specified 'path'. - virtual void initialize() { - executor_ = - std::make_unique(FLAGS_num_threads); - if (FLAGS_odirect) { - int32_t o_direct = -#ifdef linux - O_DIRECT; -#else - 0; -#endif - fd_ = open( - FLAGS_path.c_str(), - O_CREAT | O_RDWR | (FLAGS_odirect ? o_direct : 0), - S_IRUSR | S_IWUSR); - if (fd_ < 0) { - LOG(ERROR) << "Could not open " << FLAGS_path; - exit(1); - } - readFile_ = std::make_unique(fd_); + virtual void initialize(); - } else { - filesystems::registerLocalFileSystem(); - auto lfs = filesystems::getFileSystem(FLAGS_path, nullptr); - readFile_ = lfs->openFileForRead(FLAGS_path); - } - fileSize_ = readFile_->size(); - if (FLAGS_file_size_gb) { - fileSize_ = std::min(FLAGS_file_size_gb << 30, fileSize_); - } - - if (fileSize_ <= FLAGS_measurement_size) { - LOG(ERROR) << "File size " << fileSize_ - << " is <= then --measurement_size " << FLAGS_measurement_size; - exit(1); - } - if (FLAGS_seed) { - rng_.seed(FLAGS_seed); - } - } + virtual void finalize(); void clearCache() { #ifdef linux diff --git a/velox/common/file/benchmark/ReadBenchmarkMain.cpp b/velox/benchmarks/filesystem/ReadBenchmarkMain.cpp similarity index 90% rename from velox/common/file/benchmark/ReadBenchmarkMain.cpp rename to velox/benchmarks/filesystem/ReadBenchmarkMain.cpp index 848321c92de17..0556ff3a6d1f2 100644 --- a/velox/common/file/benchmark/ReadBenchmarkMain.cpp +++ b/velox/benchmarks/filesystem/ReadBenchmarkMain.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "velox/common/file/benchmark/ReadBenchmark.h" +#include "velox/benchmarks/filesystem/ReadBenchmark.h" using namespace facebook::velox; @@ -24,8 +24,9 @@ using namespace facebook::velox; // and the IO throughput is 100 MBps, then it takes 10 seconds to just read the // data. int main(int argc, char** argv) { - folly::init(&argc, &argv, false); + folly::Init init{&argc, &argv, false}; ReadBenchmark bm; bm.initialize(); bm.run(); + bm.finalize(); } diff --git a/velox/benchmarks/tpch/CMakeLists.txt b/velox/benchmarks/tpch/CMakeLists.txt index ef0147ad5c93c..3cbe9a0a6a6ba 100644 --- a/velox/benchmarks/tpch/CMakeLists.txt +++ b/velox/benchmarks/tpch/CMakeLists.txt @@ -16,6 +16,7 @@ add_library(velox_tpch_benchmark_lib TpchBenchmark.cpp) target_link_libraries( velox_tpch_benchmark_lib + velox_query_benchmark velox_aggregates velox_exec velox_exec_test_lib @@ -33,10 +34,11 @@ target_link_libraries( velox_type_fbhive velox_caching velox_vector_test_lib - Folly::folly ${FOLLY_BENCHMARK} + Folly::folly fmt::fmt) add_executable(velox_tpch_benchmark TpchBenchmarkMain.cpp) -target_link_libraries(velox_tpch_benchmark velox_tpch_benchmark_lib) +target_link_libraries( + velox_tpch_benchmark velox_tpch_benchmark_lib) diff --git a/velox/benchmarks/tpch/TpchBenchmark.cpp b/velox/benchmarks/tpch/TpchBenchmark.cpp index 9ce35e2ec334f..25d02224fc58d 100644 --- a/velox/benchmarks/tpch/TpchBenchmark.cpp +++ b/velox/benchmarks/tpch/TpchBenchmark.cpp @@ -14,75 +14,13 @@ * limitations under the License. 
*/ -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "velox/common/base/SuccinctPrinter.h" -#include "velox/common/file/FileSystems.h" -#include "velox/common/memory/MmapAllocator.h" -#include "velox/connectors/hive/HiveConfig.h" -#include "velox/connectors/hive/HiveConnector.h" -#include "velox/dwio/common/Options.h" -#include "velox/exec/PlanNodeStats.h" -#include "velox/exec/Split.h" -#include "velox/exec/tests/utils/HiveConnectorTestBase.h" -#include "velox/exec/tests/utils/TpchQueryBuilder.h" -#include "velox/functions/prestosql/aggregates/RegisterAggregateFunctions.h" -#include "velox/functions/prestosql/registration/RegistrationFunctions.h" -#include "velox/parse/TypeResolver.h" +#include "velox/benchmarks/QueryBenchmarkBase.h" using namespace facebook::velox; using namespace facebook::velox::exec; using namespace facebook::velox::exec::test; using namespace facebook::velox::dwio::common; -namespace { -static bool notEmpty(const char* /*flagName*/, const std::string& value) { - return !value.empty(); -} - -static bool validateDataFormat(const char* flagname, const std::string& value) { - if ((value.compare("parquet") == 0) || (value.compare("dwrf") == 0)) { - return true; - } - std::cout - << fmt::format( - "Invalid value for --{}: {}. Allowed values are [\"parquet\", \"dwrf\"]", - flagname, - value) - << std::endl; - return false; -} - -void ensureTaskCompletion(exec::Task* task) { - // ASSERT_TRUE requires a function with return type void. - ASSERT_TRUE(waitForTaskCompletion(task)); -} - -void printResults(const std::vector& results, std::ostream& out) { - out << "Results:" << std::endl; - bool printType = true; - for (const auto& vector : results) { - // Print RowType only once. - if (printType) { - out << vector->type()->asRow().toString() << std::endl; - printType = false; - } - for (vector_size_t i = 0; i < vector->size(); ++i) { - out << vector->toString(i) << std::endl; - } - } -} -} // namespace - DEFINE_string( data_path, "", @@ -100,6 +38,13 @@ DEFINE_string( "each table. If they are files, they contain a file system path for each " "data file, one per line. This allows running against cloud storage or " "HDFS"); +namespace { +static bool notEmpty(const char* /*flagName*/, const std::string& value) { + return !value.empty(); +} +} // namespace + +DEFINE_validator(data_path, &notEmpty); DEFINE_int32( run_query_verbose, -1, "Run a given query and print execution statistics"); DEFINE_int32( io_meter_column_pct, 0, "Percentage of lineitem columns to " "include in IO meter query. The columns are sorted by name and the n% first " "are scanned"); -DEFINE_bool( - include_custom_stats, - false, - "Include custom statistics along with execution statistics"); -DEFINE_bool(include_results, false, "Include results in the output"); -DEFINE_int32(num_drivers, 4, "Number of drivers"); -DEFINE_string(data_format, "parquet", "Data format"); -DEFINE_int32(num_splits_per_file, 10, "Number of splits per file"); -DEFINE_int32( - cache_gb, - 0, - "GB of process memory for cache and query. If " - "non-0, uses mmap allocator and in-process data cache."); -DEFINE_int32(num_repeats, 1, "Number of times to run each query"); -DEFINE_int32(num_io_threads, 8, "Threads for speculative IO"); -DEFINE_string( - test_flags_file, - "", - "Path to a file containing gflags and " - "values to try.
Produces results for each flag combination " - "sorted on performance"); -DEFINE_bool( - full_sorted_stats, - true, - "Add full stats to the report on --test_flags_file"); - -DEFINE_string(ssd_path, "", "Directory for local SSD cache"); -DEFINE_int32(ssd_cache_gb, 0, "Size of local SSD cache in GB"); -DEFINE_int32( - ssd_checkpoint_interval_gb, - 8, - "Checkpoint every n " - "GB new data in cache"); -DEFINE_bool( - clear_ram_cache, - false, - "Clear RAM cache before each query." - "Flushes in process and OS file system cache (if root on Linux)"); -DEFINE_bool( - clear_ssd_cache, - false, - "Clears SSD cache before " - "each query"); - -DEFINE_bool( - warmup_after_clear, - false, - "Runs one warmup of the query before " - "measured run. Use to run warm after clearing caches."); - -DEFINE_validator(data_path, &notEmpty); -DEFINE_validator(data_format, &validateDataFormat); - -DEFINE_int64( - max_coalesced_bytes, - 128 << 20, - "Maximum size of single coalesced IO"); - -DEFINE_int32( - max_coalesced_distance_bytes, - 512 << 10, - "Maximum distance in bytes in which coalesce will combine requests"); - -DEFINE_int32( - parquet_prefetch_rowgroups, - 1, - "Number of next row groups to " - "prefetch. 1 means prefetch the next row group before decoding " - "the current one"); - -struct RunStats { - std::map flags; - int64_t micros{0}; - int64_t rawInputBytes{0}; - int64_t userNanos{0}; - int64_t systemNanos{0}; - std::string output; - - std::string toString(bool detail) { - std::stringstream out; - out << succinctNanos(micros * 1000) << " " - << succinctBytes(rawInputBytes / (micros / 1000000.0)) << "/s raw, " - << succinctNanos(userNanos) << " user " << succinctNanos(systemNanos) - << " system (" << (100 * (userNanos + systemNanos) / (micros * 1000)) - << "%), flags: "; - for (auto& pair : flags) { - out << pair.first << "=" << pair.second << " "; - } - out << std::endl << "======" << std::endl; - if (detail) { - out << std::endl << output << std::endl; - } - return out.str(); - } -}; - -struct ParameterDim { - std::string flag; - std::vector values; -}; - std::shared_ptr queryBuilder; -class TpchBenchmark { +class TpchBenchmark : public QueryBenchmarkBase { public: - void initialize() { - if (FLAGS_cache_gb) { - int64_t memoryBytes = FLAGS_cache_gb * (1LL << 30); - memory::MmapAllocator::Options options; - options.capacity = memoryBytes; - options.useMmapArena = true; - options.mmapArenaCapacityRatio = 1; - std::unique_ptr ssdCache; - if (FLAGS_ssd_cache_gb) { - constexpr int32_t kNumSsdShards = 16; - cacheExecutor_ = - std::make_unique(kNumSsdShards); - ssdCache = std::make_unique( - FLAGS_ssd_path, - static_cast(FLAGS_ssd_cache_gb) << 30, - kNumSsdShards, - cacheExecutor_.get(), - static_cast(FLAGS_ssd_checkpoint_interval_gb) << 30); - } - - allocator_ = std::make_shared(options); - cache_ = - cache::AsyncDataCache::create(allocator_.get(), std::move(ssdCache)); - cache::AsyncDataCache::setInstance(cache_.get()); - memory::MemoryAllocator::setDefaultInstance(allocator_.get()); - } - functions::prestosql::registerAllScalarFunctions(); - aggregate::prestosql::registerAllAggregateFunctions(); - parse::registerTypeResolver(); - filesystems::registerLocalFileSystem(); - - ioExecutor_ = - std::make_unique(FLAGS_num_io_threads); - - // Add new values into the hive configuration...
- auto configurationValues = std::unordered_map(); - configurationValues[connector::hive::HiveConfig::kMaxCoalescedBytes] = - std::to_string(FLAGS_max_coalesced_bytes); - configurationValues - [connector::hive::HiveConfig::kMaxCoalescedDistanceBytes] = - std::to_string(FLAGS_max_coalesced_distance_bytes); - auto properties = - std::make_shared(configurationValues); - - // Create hive connector with config... - auto hiveConnector = - connector::getConnectorFactory( - connector::hive::HiveConnectorFactory::kHiveConnectorName) - ->newConnector(kHiveConnectorId, properties, ioExecutor_.get()); - connector::registerConnector(hiveConnector); - } - - void shutdown() { - cache_->shutdown(); - } - - std::pair, std::vector> run( - const TpchPlan& tpchPlan) { - int32_t repeat = 0; - try { - for (;;) { - CursorParameters params; - params.maxDrivers = FLAGS_num_drivers; - params.planNode = tpchPlan.plan; - const int numSplitsPerFile = FLAGS_num_splits_per_file; - - bool noMoreSplits = false; - auto addSplits = [&](exec::Task* task) { - if (!noMoreSplits) { - for (const auto& entry : tpchPlan.dataFiles) { - for (const auto& path : entry.second) { - auto const splits = - HiveConnectorTestBase::makeHiveConnectorSplits( - path, numSplitsPerFile, tpchPlan.dataFileFormat); - for (const auto& split : splits) { - task->addSplit(entry.first, exec::Split(split)); - } - } - task->noMoreSplits(entry.first); - } - } - noMoreSplits = true; - }; - auto result = readCursor(params, addSplits); - ensureTaskCompletion(result.first->task().get()); - if (++repeat >= FLAGS_num_repeats) { - return result; - } - } - } catch (const std::exception& e) { - LOG(ERROR) << "Query terminated with: " << e.what(); - return {nullptr, std::vector()}; - } - } - - void runMain(std::ostream& out, RunStats& runStats) { + void runMain(std::ostream& out, RunStats& runStats) override { if (FLAGS_run_query_verbose == -1 && FLAGS_io_meter_column_pct == 0) { folly::runBenchmarks(); } else { @@ -354,130 +103,6 @@ class TpchBenchmark { << std::endl; } } - - void readCombinations() { - std::ifstream file(FLAGS_test_flags_file); - std::string line; - while (std::getline(file, line)) { - ParameterDim dim; - int32_t previous = 0; - for (auto i = 0; i < line.size(); ++i) { - if (line[i] == ':') { - dim.flag = line.substr(0, i); - previous = i + 1; - } else if (line[i] == ',') { - dim.values.push_back(line.substr(previous, i - previous)); - previous = i + 1; - } - } - if (previous < line.size()) { - dim.values.push_back(line.substr(previous, line.size() - previous)); - } - - parameters_.push_back(dim); - } - } - - void runCombinations(int32_t level) { - if (level == parameters_.size()) { - if (FLAGS_clear_ram_cache) { -#ifdef linux - // system("echo 3 >/proc/sys/vm/drop_caches"); - bool success = false; - auto fd = open("/proc/sys/vm/drop_caches", O_WRONLY); - if (fd > 0) { - success = write(fd, "3", 1) == 1; - close(fd); - } - if (!success) { - LOG(ERROR) << "Failed to clear OS disk cache: errno=" << errno; - } -#endif - - if (cache_) { - cache_->clear(); - } - } - if (FLAGS_clear_ssd_cache) { - if (cache_) { - auto ssdCache = cache_->ssdCache(); - if (ssdCache) { - ssdCache->clear(); - } - } - } - if (FLAGS_warmup_after_clear) { - std::stringstream result; - RunStats ignore; - runMain(result, ignore); - } - RunStats stats; - std::stringstream result; - uint64_t micros = 0; - { - struct rusage start; - getrusage(RUSAGE_SELF, &start); - MicrosecondTimer timer(&micros); - runMain(result, stats); - struct rusage final; - getrusage(RUSAGE_SELF, &final); - auto
tvNanos = [](struct timeval tv) { - return tv.tv_sec * 1000000000 + tv.tv_usec * 1000; - }; - stats.userNanos = tvNanos(final.ru_utime) - tvNanos(start.ru_utime); - stats.systemNanos = tvNanos(final.ru_stime) - tvNanos(start.ru_stime); - } - stats.micros = micros; - stats.output = result.str(); - for (auto i = 0; i < parameters_.size(); ++i) { - std::string name; - gflags::GetCommandLineOption(parameters_[i].flag.c_str(), &name); - stats.flags[parameters_[i].flag] = name; - } - runStats_.push_back(std::move(stats)); - } else { - auto& flag = parameters_[level].flag; - for (auto& value : parameters_[level].values) { - std::string result = - gflags::SetCommandLineOption(flag.c_str(), value.c_str()); - if (result.empty()) { - LOG(ERROR) << "Failed to set " << flag << "=" << value; - } - std::cout << result << std::endl; - runCombinations(level + 1); - } - } - } - - void runAllCombinations() { - readCombinations(); - runCombinations(0); - std::sort( - runStats_.begin(), - runStats_.end(), - [](const RunStats& left, const RunStats& right) { - return left.micros < right.micros; - }); - for (auto& stats : runStats_) { - std::cout << stats.toString(false); - } - if (FLAGS_full_sorted_stats) { - std::cout << "Detail for stats:" << std::endl; - for (auto& stats : runStats_) { - std::cout << stats.toString(true); - } - } - } - - std::unique_ptr ioExecutor_; - std::unique_ptr cacheExecutor_; - std::shared_ptr allocator_; - std::shared_ptr cache_; - // Parameter combinations to try. Each element specifies a flag and possible - // values. All permutations are tried. - std::vector parameters_; - - std::vector runStats_; }; TpchBenchmark benchmark; @@ -487,11 +112,21 @@ BENCHMARK(q1) { benchmark.run(planContext); } +BENCHMARK(q2) { + const auto planContext = queryBuilder->getQueryPlan(2); + benchmark.run(planContext); +} + BENCHMARK(q3) { const auto planContext = queryBuilder->getQueryPlan(3); benchmark.run(planContext); } +BENCHMARK(q4) { + const auto planContext = queryBuilder->getQueryPlan(4); + benchmark.run(planContext); +} + BENCHMARK(q5) { const auto planContext = queryBuilder->getQueryPlan(5); benchmark.run(planContext); @@ -522,6 +157,11 @@ BENCHMARK(q10) { benchmark.run(planContext); } +BENCHMARK(q11) { + const auto planContext = queryBuilder->getQueryPlan(11); + benchmark.run(planContext); +} + BENCHMARK(q12) { const auto planContext = queryBuilder->getQueryPlan(12); benchmark.run(planContext); diff --git a/velox/benchmarks/tpch/TpchBenchmarkMain.cpp b/velox/benchmarks/tpch/TpchBenchmarkMain.cpp index acd59b3878fc8..4477455d8f3c6 100644 --- a/velox/benchmarks/tpch/TpchBenchmarkMain.cpp +++ b/velox/benchmarks/tpch/TpchBenchmarkMain.cpp @@ -23,6 +23,6 @@ int main(int argc, char** argv) { std::string kUsage( "This program benchmarks TPC-H queries. 
Run 'velox_tpch_benchmark -helpon=TpchBenchmark' for available options.\n"); gflags::SetUsageMessage(kUsage); - folly::init(&argc, &argv, false); + folly::Init init{&argc, &argv, false}; tpchBenchmarkMain(); } diff --git a/velox/benchmarks/unstable/CMakeLists.txt b/velox/benchmarks/unstable/CMakeLists.txt index fc192487f8f19..e3d264786d151 100644 --- a/velox/benchmarks/unstable/CMakeLists.txt +++ b/velox/benchmarks/unstable/CMakeLists.txt @@ -27,5 +27,5 @@ set(velox_benchmark_deps glog::glog) add_executable(velox_memory_alloc_benchmark MemoryAllocationBenchmark.cpp) -target_link_libraries(velox_memory_alloc_benchmark ${velox_benchmark_deps} - velox_memory pthread) +target_link_libraries( + velox_memory_alloc_benchmark ${velox_benchmark_deps} velox_memory pthread) diff --git a/velox/benchmarks/unstable/MemoryAllocationBenchmark.cpp b/velox/benchmarks/unstable/MemoryAllocationBenchmark.cpp index d267274ce56f0..6038c358dfb0d 100644 --- a/velox/benchmarks/unstable/MemoryAllocationBenchmark.cpp +++ b/velox/benchmarks/unstable/MemoryAllocationBenchmark.cpp @@ -435,7 +435,7 @@ BENCHMARK_RELATIVE_MULTI(MmapReallocateMix64) { } // namespace int main(int argc, char* argv[]) { - folly::init(&argc, &argv); + folly::Init init{&argc, &argv}; // TODO: add to run benchmark as a standalone program with multithreading as // well as actual memory access to trigger minor page faults in OS which traps // into kernel context to setup physical pages for the lazy-mapped virtual diff --git a/velox/buffer/Buffer.h b/velox/buffer/Buffer.h index cecf03359c6a5..b7aad5d075306 100644 --- a/velox/buffer/Buffer.h +++ b/velox/buffer/Buffer.h @@ -56,9 +56,9 @@ class Buffer { // and trivially copyable (so memcpy works) template static inline constexpr bool is_pod_like_v = - std::is_trivially_destructible_v&& std::is_trivially_copyable_v; + std::is_trivially_destructible_v && std::is_trivially_copyable_v; - virtual ~Buffer(){}; + virtual ~Buffer() {} void addRef() { referenceCount_.fetch_add(1); @@ -381,34 +381,19 @@ class AlignedBuffer : public Buffer { auto oldCapacity = checkedPlus(old->capacity(), kPaddedSize); auto preferredSize = pool->preferredSize(checkedPlus(size, kPaddedSize)); - // Make the buffer no longer owned by '*buffer' because reallocate - // may free the old buffer. Reassigning the new buffer to - // '*buffer' would be a double free. + + void* newPtr = pool->reallocate(old, oldCapacity, preferredSize); + + // Make the old buffer no longer owned by '*buffer' because reallocate + // freed the old buffer. Reassigning the new buffer to + // '*buffer' would be a double free if we didn't do this. buffer->detach(); - // Decrement the reference count. No need to check, we just - // checked old->unique(). - old->referenceCount_.fetch_sub(1); - void* newPtr; - try { - newPtr = pool->reallocate(old, oldCapacity, preferredSize); - } catch (const std::exception&) { - *buffer = old; - throw; - } - if (newPtr == reinterpret_cast(old)) { - // The pointer did not change. Put the old pointer back in the - // smart pointer and adjust capacity. 
- *buffer = old; - (*buffer)->setCapacity(preferredSize - kPaddedSize); - (*buffer)->setSize(size); - reinterpret_cast(buffer->get()) - ->fillNewMemory(oldSize, size, initValue); - return; - } + auto newBuffer = new (newPtr) AlignedBuffer(pool, preferredSize - kPaddedSize); newBuffer->setSize(size); newBuffer->fillNewMemory(oldSize, size, initValue); + *buffer = newBuffer; } diff --git a/velox/buffer/CMakeLists.txt b/velox/buffer/CMakeLists.txt index 8a6df1c1e87ee..2b1e8ea65a78b 100644 --- a/velox/buffer/CMakeLists.txt +++ b/velox/buffer/CMakeLists.txt @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_buffer StringViewBufferHolder.cpp) +velox_add_library(velox_buffer StringViewBufferHolder.cpp) -target_link_libraries(velox_buffer velox_memory velox_common_base Folly::folly) +velox_link_libraries(velox_buffer velox_memory velox_common_base Folly::folly) if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) diff --git a/velox/buffer/tests/BufferTest.cpp b/velox/buffer/tests/BufferTest.cpp index 90ce32f93170b..2f6107f551ed1 100644 --- a/velox/buffer/tests/BufferTest.cpp +++ b/velox/buffer/tests/BufferTest.cpp @@ -17,6 +17,8 @@ #include "velox/buffer/Buffer.h" #include "folly/Range.h" +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/common/testutil/TestValue.h" #include "velox/type/StringView.h" #include @@ -67,7 +69,7 @@ TEST_F(BufferTest, testAlignedBuffer) { testString, testStringLength); other = buffer; - EXPECT_EQ(pool_->currentBytes(), pool_->preferredSize(sizeWithHeader)); + EXPECT_EQ(pool_->usedBytes(), pool_->preferredSize(sizeWithHeader)); AlignedBuffer::reallocate(&other, size * 3, 'e'); EXPECT_NE(other, buffer); @@ -83,18 +85,17 @@ TEST_F(BufferTest, testAlignedBuffer) { 0); EXPECT_EQ(other->as()[buffer->capacity()], 'e'); EXPECT_EQ( - pool_->currentBytes(), + pool_->usedBytes(), pool_->preferredSize(sizeWithHeader) + pool_->preferredSize(3 * size + kHeaderSize)); } - EXPECT_EQ( - pool_->currentBytes(), pool_->preferredSize(3 * size + kHeaderSize)); + EXPECT_EQ(pool_->usedBytes(), pool_->preferredSize(3 * size + kHeaderSize)); other = nullptr; BufferPtr bits = AlignedBuffer::allocate(65, pool_.get(), true); EXPECT_EQ(bits->size(), 9); EXPECT_EQ(bits->as()[8], 0xff); bits = nullptr; - EXPECT_EQ(pool_->currentBytes(), 0); + EXPECT_EQ(pool_->usedBytes(), 0); } TEST_F(BufferTest, testAsRange) { @@ -177,11 +178,122 @@ TEST_F(BufferTest, testReallocate) { } } buffers.clear(); - EXPECT_EQ(pool_->currentBytes(), 0); + EXPECT_EQ(pool_->usedBytes(), 0); EXPECT_GT(numInPlace, 0); EXPECT_GT(numMoved, 0); } +TEST_F(BufferTest, testReallocateNoReuse) { + // This test checks that regardless of how we resize a Buffer in reallocate + // (up, down, the same) as long as we hit MemoryPool::reallocate, the Buffer + // always points to a new location in memory. If this test fails, it's not + // necessarily a problem, but it's worth looking at optimizations we could do + // in reallocate when the pointer doesn't change. 
+ + enum BufferResizeOption { + BIGGER, + SMALLER, + SAME, + }; + + auto test = [&](BufferResizeOption bufferResizeOption, + bool useMmapAllocator) { + memory::MemoryManagerOptions options; + options.useMmapAllocator = useMmapAllocator; + options.allocatorCapacity = 1024 * 1024; + memory::MemoryManager memoryManager(options); + + auto pool = memoryManager.addLeafPool("testReallocateNoReuse"); + + const size_t originalBufferSize = 10; + auto buffer = AlignedBuffer::allocate(originalBufferSize, pool.get()); + auto* originalBufferPtr = buffer.get(); + + size_t newSize; + switch (bufferResizeOption) { + case SMALLER: + newSize = originalBufferSize - 1; + break; + case SAME: + newSize = originalBufferSize; + break; + case BIGGER: + // Make sure the new size is large enough that we hit + // MemoryPoolImpl::reallocate. + newSize = buffer->capacity() + 1; + break; + default: + VELOX_FAIL("Unexpected buffer resize option"); + } + + AlignedBuffer::reallocate(&buffer, newSize); + + EXPECT_NE(buffer.get(), originalBufferPtr); + }; + + test(SMALLER, true); + test(SAME, true); + test(BIGGER, true); + + test(SMALLER, false); + test(SAME, false); + test(BIGGER, false); +} + +DEBUG_ONLY_TEST_F(BufferTest, testReallocateFails) { + // Reallocating a buffer can cause an exception to be thrown e.g. if we + // run out of memory. If the buffer is left in an invalid state this can + // cause crashes, e.g. if VectorSaver attempts to write out a Vector that + // was in the midst of resizing. This test verifies the buffer is valid at + // different points in the exception's lifecycle. + + const size_t bufferSize = 10; + auto buffer = AlignedBuffer::allocate(bufferSize, pool_.get()); + + ::memset(buffer->asMutable(), 'a', bufferSize); + + common::testutil::TestValue::enable(); + + const std::string kErrorMessage = "Expected out of memory exception"; + SCOPED_TESTVALUE_SET( + "facebook::velox::memory::MemoryPoolImpl::reserveThreadSafe", + std::function([&](memory::MemoryPool*) { + VELOX_MEM_POOL_CAP_EXCEEDED(kErrorMessage); + })); + + { + ExceptionContextSetter setter( + {.messageFunc = [](VeloxException::Type, + void* untypedArg) -> std::string { + // Validate that the buffer is still valid at the point + // the exception is thrown. + auto bufferArg = *static_cast(untypedArg); + + const auto* bufferContents = bufferArg->as(); + VELOX_CHECK_EQ(bufferArg->size(), 10); + for (int i = 0; i < 10; i++) { + VELOX_CHECK_EQ(bufferContents[i], 'a'); + } + + return "Exception context message func called."; + }, + .arg = &buffer}); + + VELOX_ASSERT_THROW_CODE( + AlignedBuffer::reallocate( + &buffer, pool_->availableReservation() + 1), + error_code::kMemCapExceeded, + kErrorMessage); + } + + // Validate the buffer is valid after the exception is caught.
+ const auto* bufferContents = buffer->as(); + VELOX_CHECK_EQ(buffer->size(), bufferSize); + for (int i = 0; i < bufferSize; i++) { + VELOX_CHECK_EQ(bufferContents[i], 'a'); + } +} + struct MockCachePin { void addRef() { ++pinCount; @@ -327,9 +439,9 @@ TEST_F(BufferTest, testNonPOD) { TEST_F(BufferTest, testNonPODMemoryUsage) { using T = std::shared_ptr; - const int64_t currentBytes = pool_->currentBytes(); + const int64_t currentBytes = pool_->usedBytes(); { auto buffer = AlignedBuffer::allocate(0, pool_.get()); } - EXPECT_EQ(pool_->currentBytes(), currentBytes); + EXPECT_EQ(pool_->usedBytes(), currentBytes); } TEST_F(BufferTest, testAllocateSizeOverflow) { diff --git a/velox/buffer/tests/CMakeLists.txt b/velox/buffer/tests/CMakeLists.txt index a6029241f9a17..097f5ee3bb3f4 100644 --- a/velox/buffer/tests/CMakeLists.txt +++ b/velox/buffer/tests/CMakeLists.txt @@ -20,9 +20,9 @@ target_link_libraries( velox_memory velox_buffer velox_test_util - gtest - gtest_main - gmock + GTest::gtest + GTest::gtest_main + GTest::gmock glog::glog gflags::gflags pthread) diff --git a/velox/buffer/tests/StringViewBufferHolderTest.cpp b/velox/buffer/tests/StringViewBufferHolderTest.cpp index e67cae3af6f1e..eae13a75cc849 100644 --- a/velox/buffer/tests/StringViewBufferHolderTest.cpp +++ b/velox/buffer/tests/StringViewBufferHolderTest.cpp @@ -35,11 +35,16 @@ std::string inlinedString() { class StringViewBufferHolderTest : public testing::Test { protected: + static void SetUpTestCase() { + memory::MemoryManager::initialize({}); + } + StringViewBufferHolder makeHolder() { return StringViewBufferHolder(pool_.get()); } - std::shared_ptr pool_{memory::addDefaultLeafMemoryPool()}; + std::shared_ptr pool_{ + memory::memoryManager()->addLeafPool()}; }; TEST_F(StringViewBufferHolderTest, inlinedStringViewDoesNotCopyToBuffer) { diff --git a/velox/codegen/CMakeLists.txt b/velox/codegen/CMakeLists.txt deleted file mode 100644 index e54a0d133c07d..0000000000000 --- a/velox/codegen/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -add_library(velox_codegen Codegen.cpp) -if(${VELOX_CODEGEN_SUPPORT}) - target_link_libraries(velox_codegen velox_experimental_codegen) -else() - target_link_libraries(velox_codegen velox_core velox_exec velox_expression - glog::glog) -endif() diff --git a/velox/codegen/Codegen-Stubs.h b/velox/codegen/Codegen-Stubs.h deleted file mode 100644 index 44f371dc573be..0000000000000 --- a/velox/codegen/Codegen-Stubs.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
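The DEBUG_ONLY test above hinges on Velox's TestValue injection points. In outline, the mechanism looks like the following sketch; the names are taken from the test itself, and this is an illustration of the pattern rather than new API:

```cpp
// Fault injection via TestValue: production code calls
// TestValue::adjust("<injection point>", arg) at interesting points; a
// test registers a callback under that name, and the callback runs on
// the production thread. Here the callback throws, simulating an
// out-of-memory condition in the middle of a reallocation.
common::testutil::TestValue::enable();
SCOPED_TESTVALUE_SET(
    "facebook::velox::memory::MemoryPoolImpl::reserveThreadSafe",
    std::function<void(memory::MemoryPool*)>([](memory::MemoryPool*) {
      VELOX_MEM_POOL_CAP_EXCEEDED("injected allocation failure");
    }));
// From here on, any reservation through the pool throws a
// VeloxException carrying error_code::kMemCapExceeded.
```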
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#if CODEGEN_ENABLED == 1 -#error "This files shouldn't be included when the codegen is enabled" -#endif - -#include -#include -#include "velox/core/PlanNode.h" - -namespace facebook { -namespace velox { -namespace codegen { - -/// Main interface/entry point with the code generation system. -/// This is stub code used when the code generation is disabled. - -class Codegen { - public: - Codegen() {} - - bool initialize( - const std::string_view& codegenOptionsJson, - bool lazyLoading = true) { - LOG(INFO) << "Codegen disabled, doing nothing : " << std::endl; - return true; - } - - bool initializeFromFile( - const std::filesystem::path& codegenOptionsJsonFile, - bool lazyLoading = true) { - LOG(INFO) << "Codegen disabled, doing nothing : " << std::endl; - return true; - }; - - std::shared_ptr compile( - const core::PlanNode& planNode) { - LOG(INFO) << "Codegen disabled, doing nothing" << std::endl; - return nullptr; - } -}; - -} // namespace codegen -} // namespace velox -}; // namespace facebook diff --git a/velox/codegen/Codegen.cpp b/velox/codegen/Codegen.cpp deleted file mode 100644 index 4f6fe911619a1..0000000000000 --- a/velox/codegen/Codegen.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "velox/codegen/Codegen.h" - -namespace facebook { -namespace velox { -namespace codegen {} -} // namespace velox -} // namespace facebook diff --git a/velox/codegen/Codegen.h b/velox/codegen/Codegen.h deleted file mode 100644 index 692fcf85a0502..0000000000000 --- a/velox/codegen/Codegen.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#if CODEGEN_ENABLED == 1 -#include "velox/experimental/codegen/Codegen.h" -#else -#include "velox/codegen/Codegen-Stubs.h" -#endif diff --git a/velox/codegen/platform-support.h b/velox/codegen/platform-support.h deleted file mode 100644 index 55243e89658a7..0000000000000 --- a/velox/codegen/platform-support.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -// Collection of preprocessor test - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-W#warnings" - -#ifndef __cpp_concepts -#warning "Concept support is preferred" -#else -#define ENABLE_CONCEPT 1 -#endif - -#ifndef __cpp_lib_ranges -#warning "Range support is preferred" -#else -#define USE_STD_RANGE 1 -#endif - -#pragma clang diagnostic pop - -#if __cplusplus < 201703L -#error "C++17 is required" -#endif diff --git a/velox/common/base/AsyncSource.h b/velox/common/base/AsyncSource.h index d650d4ebecca5..76740dd8681a6 100644 --- a/velox/common/base/AsyncSource.h +++ b/velox/common/base/AsyncSource.h @@ -21,10 +21,12 @@ #include #include #include +#include "velox/common/time/CpuWallTimer.h" #include "velox/common/base/Exceptions.h" #include "velox/common/base/Portability.h" #include "velox/common/future/VeloxPromise.h" +#include "velox/common/process/ThreadDebugInfo.h" #include "velox/common/testutil/TestValue.h" namespace facebook::velox { @@ -39,7 +41,24 @@ template class AsyncSource { public: explicit AsyncSource(std::function()> make) - : make_(make) {} + : make_(std::move(make)) { + if (process::GetThreadDebugInfo() != nullptr) { + auto* currentThreadDebugInfo = process::GetThreadDebugInfo(); + // We explicitly leave out the callback when copying the ThreadDebugInfo + // as that may have captured state that goes out of scope by the time + // _make is called. + threadDebugInfo_ = std::make_optional( + {currentThreadDebugInfo->queryId_, + currentThreadDebugInfo->taskId_, + nullptr}); + } + } + + ~AsyncSource() { + VELOX_CHECK( + moved_ || closed_, + "AsyncSource should be properly consumed or closed."); + } // Makes an item if it is not already made. To be called on a background // executor. @@ -57,8 +76,9 @@ class AsyncSource { } std::unique_ptr item; try { - item = make(); - } catch (std::exception& e) { + CpuWallTimer timer(timing_); + item = runMake(make); + } catch (std::exception&) { std::lock_guard l(mutex_); exception_ = std::current_exception(); } @@ -77,9 +97,9 @@ class AsyncSource { } } - // Returns the item to the first caller and nullptr to subsequent callers. If - // the item is preparing on the executor, waits for the item and otherwise - // makes it on the caller thread. + // Returns the item to the first caller and nullptr to subsequent callers. + // If the item is preparing on the executor, waits for the item and + // otherwise makes it on the caller thread. 
std::unique_ptr move() { common::testutil::TestValue::adjust( "facebook::velox::AsyncSource::move", this); @@ -87,6 +107,7 @@ class AsyncSource { ContinueFuture wait; { std::lock_guard l(mutex_); + moved_ = true; // 'making_' can be read atomically, 'exception' maybe not. So test // 'making' so as not to read half-assigned 'exception_'. if (!making_ && exception_) { @@ -112,8 +133,8 @@ class AsyncSource { // Outside of mutex_. if (make) { try { - return make(); - } catch (const std::exception& e) { + return runMake(make); + } catch (const std::exception&) { std::lock_guard l(mutex_); exception_ = std::current_exception(); throw; @@ -135,7 +156,57 @@ class AsyncSource { return item_ != nullptr || exception_ != nullptr; } + /// Returns the timing of prepare(). If the item was made on the calling + /// thread, the timing is 0 since only off-thread activity needs to be added + /// to the caller's timing. + const CpuWallTiming& prepareTiming() { + return timing_; + } + + /// This function assists the caller in ensuring that resources allocated in + /// AsyncSource are promptly released: + /// 1. Waits for the completion of the 'make_' function if it is executing + /// in the thread pool. + /// 2. Resets the 'make_' function if it has not started yet. + /// 3. Cleans up the 'item_' if 'make_' has completed, but the result + /// 'item_' has not been returned to the caller. + void close() { + if (closed_ || moved_) { + return; + } + ContinueFuture wait; + { + std::lock_guard l(mutex_); + if (making_) { + promise_ = std::make_unique(); + wait = promise_->getSemiFuture(); + } else if (make_) { + make_ = nullptr; + } + } + + auto& exec = folly::QueuedImmediateExecutor::instance(); + std::move(wait).via(&exec).wait(); + { + std::lock_guard l(mutex_); + if (item_) { + item_ = nullptr; + } + closed_ = true; + } + } + private: + std::unique_ptr runMake(std::function()>& make) { + process::ScopedThreadDebugInfo threadDebugInfo( + threadDebugInfo_.has_value() ? &threadDebugInfo_.value() : nullptr); + return make(); + } + + // Stored context (if present upon construction) so they can be restored when + // make_ is invoked. + std::optional threadDebugInfo_; + mutable std::mutex mutex_; // True if 'prepare() is making the item. bool making_{false}; @@ -143,5 +214,8 @@ class AsyncSource { std::unique_ptr item_; std::function()> make_; std::exception_ptr exception_; + CpuWallTiming timing_; + bool closed_{false}; + bool moved_{false}; }; } // namespace facebook::velox diff --git a/velox/common/base/BitSet.h b/velox/common/base/BitSet.h index baf2d9a6f2f05..bcdb6e0a38ba0 100644 --- a/velox/common/base/BitSet.h +++ b/velox/common/base/BitSet.h @@ -22,12 +22,12 @@ #include "velox/common/base/BitUtil.h" namespace facebook::velox { -// Dynamic size dense bit set that Keeps track of maximum set bit. +/// Dynamic size dense bit set that keeps track of maximum set bit. class BitSet { public: - // Constructs a bitSet. 'min' is the lowest possible member of the - // set. Values below this are not present and inserting these is a - // no-op. 'min' is used when using this as an IN predicate filter. + /// Constructs a bitSet. 'min' is the lowest possible member of the set. + /// Values below this are not present and inserting these is a no-op. 'min' is + /// used when using this as an IN predicate filter. 
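Stepping back to the AsyncSource changes above: together with the new destructor check, close() gives the type a strict lifecycle. A hypothetical usage sketch, where the executor and the item type are assumptions for illustration:

```cpp
// Every AsyncSource must end in move() or close(), otherwise the new
// destructor VELOX_CHECK fires. Typical flow: hand prepare() to a
// background executor, then consume the item with move(), which waits
// if prepare() is mid-flight and makes the item on the calling thread
// if prepare() never started.
auto source = std::make_shared<AsyncSource<std::string>>(
    [] { return std::make_unique<std::string>("made off-thread"); });
executor->add([source] { source->prepare(); });  // 'executor' is assumed
std::unique_ptr<std::string> item = source->move();
// On error paths that never reach move(), release resources explicitly:
// source->close();
```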
explicit BitSet(int64_t min) : min_(min) {} void insert(int64_t index) { @@ -46,7 +46,7 @@ class BitSet { bits::setBit(bits_.data(), bit, true); } - bool contains(uint32_t index) { + bool contains(uint32_t index) const { uint64_t bit = index - min_; if (bit >= bits_.size() * 64) { // If index was < min_, bit will have wrapped around and will be > @@ -56,7 +56,7 @@ class BitSet { return bits::isBitSet(bits_.data(), bit); } - // Returns the largest element of the set or 'min_ - 1' if empty. + /// Returns the largest element of the set or 'min_ - 1' if empty. int64_t max() const { return lastSetBit_ + min_; } @@ -66,8 +66,8 @@ class BitSet { } private: - std::vector bits_; const int64_t min_; + std::vector bits_; int64_t lastSetBit_ = -1; }; diff --git a/velox/common/base/BitUtil.cpp b/velox/common/base/BitUtil.cpp index cb964e405d4dd..686bce02b1a2e 100644 --- a/velox/common/base/BitUtil.cpp +++ b/velox/common/base/BitUtil.cpp @@ -16,8 +16,11 @@ #include "velox/common/base/BitUtil.h" #include "velox/common/base/Exceptions.h" +#include "velox/common/base/SimdUtil.h" #include "velox/common/process/ProcessBase.h" +#include + namespace facebook::velox::bits { namespace { @@ -30,12 +33,13 @@ void scatterBitsSimple( char* target) { int64_t from = numSource - 1; for (int64_t to = numTarget - 1; to >= 0; to--) { - bool maskIsSet = bits::isBitSet(targetMask, to); + const bool maskIsSet = bits::isBitSet(targetMask, to); bits::setBit(target, to, maskIsSet && bits::isBitSet(source, from)); from -= maskIsSet ? 1 : 0; } } +#ifdef __BMI2__ // Fetches 'numBits' bits of data, from data starting at lastBit - // numbits (inclusive) and ending at lastBit (exclusive). 'lastBit' is // updated to be the bit offset of the lowest returned bit. Successive @@ -55,6 +59,7 @@ uint64_t getBitField(const char* data, int32_t numBits, int32_t& lastBit) { lastBit -= numBits; return bits; } +#endif // Copy bits backward while the remaining data is still larger than size of T. template @@ -117,6 +122,17 @@ void scatterBits( int32_t highBit = numTarget & 7; int lowByte = std::max(0, highByte - 7); auto maskAsBytes = reinterpret_cast(targetMask); +#if defined(__has_feature) +#if __has_feature(__address_sanitizer__) + int32_t sourceOffset = std::min(0, (numSource / 8) - 7) + 1; + folly::doNotOptimizeAway( + *reinterpret_cast(source + sourceOffset)); + folly::doNotOptimizeAway( + *reinterpret_cast(maskAsBytes + lowByte + 1)); + folly::doNotOptimizeAway(*reinterpret_cast(target + lowByte + 1)); +#endif +#endif + // Loop from top to bottom of 'targetMask' up to 64 bits at a time, // with a partial word at either end. Count the set bits and fetch // as many consecutive bits of source data. 
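The BitSet contract documented above is easiest to see with concrete values; a small worked example (namespace qualifiers omitted, values arbitrary):

```cpp
#include <cassert>

// 'min' offsets the dense bit storage: values below it are silently
// ignored, and max() reports 'min - 1' while the set is empty.
BitSet set(/*min=*/100);
assert(set.max() == 99);    // empty set: min - 1
set.insert(150);            // stored internally as bit 50
set.insert(50);             // below min: a no-op by design
assert(set.contains(150));
assert(!set.contains(50));  // 50 was never inserted
assert(set.max() == 150);   // largest member of the set
```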
Scatter the source bits @@ -158,4 +174,49 @@ void scatterBits( #endif } +uint64_t hashBytes(uint64_t seed, const char* data, size_t size) { + auto begin = reinterpret_cast(data); + const uint64_t kMul = 0x9ddfea08eb382d69ULL; + if (size < 8) { + auto word = loadPartialWord(begin, size); + uint64_t crc = simd::crc32U64(seed, word); + uint64_t crc2 = simd::crc32U64(seed, word >> 32); + return crc | (crc2 << 32); + } + uint64_t a0 = seed; + uint64_t a1 = seed << 32; + uint64_t a2 = seed >> 16; + int32_t toGo = size; + auto words = reinterpret_cast(data); + while (toGo >= 24) { + a0 = simd::crc32U64(a0, words[0]); + a1 = simd::crc32U64(a1, words[1]); + a2 = simd::crc32U64(a2, words[2]); + words += 3; + toGo -= 24; + } + if (toGo > 16) { + a0 = simd::crc32U64(a0, words[0]); + a1 = simd::crc32U64(a1, words[1]); + a2 = simd::crc32U64( + a2, + loadPartialWord( + reinterpret_cast(words + 2), toGo - 16)); + } else if (toGo > 8) { + a0 = simd::crc32U64(a0, words[0]); + a1 = simd::crc32U64( + a1, + toGo == 16 + ? words[1] + : loadPartialWord( + reinterpret_cast(words + 1), toGo - 8)); + } else if (toGo > 0) { + a0 = simd::crc32U64( + a0, + toGo == 8 + ? words[0] + : loadPartialWord(reinterpret_cast(words), toGo)); + } + return a0 ^ ((a1 * kMul)) ^ (a2 * kMul); +} } // namespace facebook::velox::bits diff --git a/velox/common/base/BitUtil.h b/velox/common/base/BitUtil.h index 8bd07344aafbf..75b118ce094fc 100644 --- a/velox/common/base/BitUtil.h +++ b/velox/common/base/BitUtil.h @@ -16,6 +16,8 @@ #pragma once +#include "velox/common/base/Exceptions.h" + #include #include #include @@ -93,6 +95,11 @@ constexpr inline T roundUp(T value, U factor) { return (value + (factor - 1)) / factor * factor; } +template +constexpr inline T divRoundUp(T value, U factor) { + return (value + (factor - 1)) / factor; +} + constexpr inline uint64_t lowMask(int32_t bits) { return (1UL << bits) - 1; } @@ -714,7 +721,7 @@ inline uint64_t nextPowerOfTwo(uint64_t size) { return 0; } uint32_t bits = 63 - countLeadingZeros(size); - uint64_t lower = 1U << bits; + uint64_t lower = 1ULL << bits; // Size is a power of 2. if (lower == size) { return size; @@ -784,24 +791,7 @@ inline uint64_t loadPartialWord(const uint8_t* data, int32_t size) { return result; } -inline size_t hashBytes(size_t seed, const char* data, size_t size) { - auto begin = reinterpret_cast(data); - if (size < 8) { - return hashMix(seed, loadPartialWord(begin, size)); - } - auto result = seed; - auto end = begin + size; - while (begin + 8 <= end) { - result = hashMix(result, *reinterpret_cast(begin)); - begin += 8; - } - if (end != begin) { - // Accesses the last 64 bits. Some bytes may get processed twice but the - // access is safe. - result = hashMix(result, *reinterpret_cast(end - 8)); - } - return result; -} +uint64_t hashBytes(uint64_t seed, const char* data, size_t size); namespace detail { // Returns at least 'numBits' bits of data starting at bit 'bitOffset' @@ -890,13 +880,17 @@ void copyBitsBackward( uint64_t targetOffset, uint64_t numBits); -// Copies consecutive bits from 'source' to positions in 'target' -// where 'targetMask' has a 1. 'source' may be a prefix of 'target', -// so that contiguous bits of source are scattered in place. The -// positions of 'target' where 'targetMask' is 0 are 0. A sample use -// case is reading a column of boolean with nulls. The booleans -// from the column get inserted into the places given by ones in the -// present bitmap. 
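The hashBytes() change above moves the implementation out of line and replaces the word-at-a-time hashMix loop with three CRC32 accumulators over interleaved 8-byte lanes, letting the CPU overlap the latency of successive crc32 instructions. Call sites are unchanged, as in this minimal example (seed and payload are arbitrary):

```cpp
#include "velox/common/base/BitUtil.h"

// Same call shape as before, but the produced values differ from the
// old hashMix()-based version, so hashes persisted by older code must
// not be compared against newly computed ones.
uint64_t h = facebook::velox::bits::hashBytes(42 /*seed*/, "payload", 7);
```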
+/// Copies consecutive bits from 'source' to positions in 'target' where +/// 'targetMask' has a 1. 'source' may be a prefix of 'target', so that +/// contiguous bits of source are scattered in place. The positions of 'target' +/// where 'targetMask' is 0 are 0. A sample use case is reading a column of +/// boolean with nulls. The booleans from the column get inserted into the +/// places given by ones in the present bitmap. All source, target and mask bit +/// arrays are accessed at 64 bit width and must have a minimum of 64 bits plus +/// one addressable byte after the last bit. Using std::vector as a +/// bit array without explicit padding, for example, can crash with +/// access to unmapped address if the vector happens to border on +/// unmapped memory. void scatterBits( int32_t numSource, int32_t numTarget, @@ -904,9 +898,9 @@ void scatterBits( const uint64_t* targetMask, char* target); -// Extract bits from integer 'a' at the corresponding bit locations -// specified by 'mask' to contiguous low bits in return value; the -// remaining upper bits in return value are set to zero. +/// Extract bits from integer 'a' at the corresponding bit locations specified +/// by 'mask' to contiguous low bits in return value; the remaining upper bits +/// in return value are set to zero. template inline T extractBits(T a, T mask); @@ -989,6 +983,26 @@ inline __int128_t builtin_bswap128(__int128_t value) { #endif } +/// Store `bits' into the memory region pointed by `byte', at `index' (bit +/// index). If `kSize' is 8, we store the whole byte directly; otherwise it +/// must be 4 and we store either the whole byte or the upper 4 bits only, +/// depending on the `index'. +template +void storeBitsToByte(uint8_t bits, uint8_t* bytes, unsigned index) { + VELOX_DCHECK_EQ(index % kSize, 0); + VELOX_DCHECK_EQ(bits >> kSize, 0); + if constexpr (kSize == 8) { + bytes[index / 8] = bits; + } else { + VELOX_DCHECK_EQ(kSize, 4); + if (index % 8 == 0) { + bytes[index / 8] = bits; + } else { + bytes[index / 8] |= bits << 4; + } + } +} + } // namespace bits } // namespace velox } // namespace facebook diff --git a/velox/common/base/BloomFilter.h b/velox/common/base/BloomFilter.h index 0ad1b94f66c7c..1d23e382834bb 100644 --- a/velox/common/base/BloomFilter.h +++ b/velox/common/base/BloomFilter.h @@ -114,16 +114,14 @@ class BloomFilter { } inline static void - set(uint64_t* FOLLY_NONNULL bloom, int32_t bloomSize, uint64_t hashCode) { + set(uint64_t* bloom, int32_t bloomSize, uint64_t hashCode) { auto mask = bloomMask(hashCode); auto index = bloomIndex(bloomSize, hashCode); bloom[index] |= mask; } - inline static bool test( - const uint64_t* FOLLY_NONNULL bloom, - int32_t bloomSize, - uint64_t hashCode) { + inline static bool + test(const uint64_t* bloom, int32_t bloomSize, uint64_t hashCode) { auto mask = bloomMask(hashCode); auto index = bloomIndex(bloomSize, hashCode); return mask == (bloom[index] & mask); diff --git a/velox/common/base/CMakeLists.txt b/velox/common/base/CMakeLists.txt index cc10739914017..6c26222ef56cd 100644 --- a/velox/common/base/CMakeLists.txt +++ b/velox/common/base/CMakeLists.txt @@ -12,27 +12,36 @@ # See the License for the specific language governing permissions and # limitations under the License. 
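The expanded scatterBits() documentation above warns that all three bit arrays are accessed at 64-bit width; a worked example that respects that rule (values are illustrative only):

```cpp
// Spread 3 consecutive source bits into the positions where the mask
// has ones. Buffers are uint64_t-backed and oversized so the
// 64-bit-wide reads and writes described above stay in bounds.
uint64_t source[2] = {0b111, 0};  // bits 0..2 carry the values
uint64_t mask[2] = {0b10110, 0};  // target positions 1, 2 and 4
uint64_t target[2] = {0, 0};
facebook::velox::bits::scatterBits(
    /*numSource=*/3,
    /*numTarget=*/5,
    reinterpret_cast<const char*>(source),
    mask,
    reinterpret_cast<char*>(target));
// target[0] == 0b10110: the three source bits landed at positions 1, 2
// and 4, and the masked-out positions 0 and 3 read as 0.
```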
-add_library(velox_exception Exceptions.cpp VeloxException.cpp Exceptions.h) -target_link_libraries( - velox_exception PUBLIC velox_flag_definitions velox_process Folly::folly - fmt::fmt gflags::gflags glog::glog) +velox_add_library(velox_exception Exceptions.cpp VeloxException.cpp + Exceptions.h) +velox_link_libraries( + velox_exception + PUBLIC velox_flag_definitions + velox_process + Folly::folly + fmt::fmt + gflags::gflags + glog::glog) -add_library( +velox_add_library( velox_common_base BitUtil.cpp Counters.cpp Fs.cpp + PeriodicStatsReporter.cpp RandomUtil.cpp RawVector.cpp RuntimeMetrics.cpp SimdUtil.cpp + SpillConfig.cpp + SpillStats.cpp StatsReporter.cpp SuccinctPrinter.cpp) -target_link_libraries( +velox_link_libraries( velox_common_base PUBLIC velox_exception Folly::folly fmt::fmt xsimd - PRIVATE velox_process velox_test_util glog::glog) + PRIVATE velox_common_compression velox_process velox_test_util glog::glog) if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) @@ -42,8 +51,8 @@ if(${VELOX_ENABLE_BENCHMARKS}) add_subdirectory(benchmarks) endif() -add_library(velox_id_map BigintIdMap.cpp) -target_link_libraries( +velox_add_library(velox_id_map BigintIdMap.cpp) +velox_link_libraries( velox_id_map velox_memory velox_flag_definitions @@ -52,3 +61,9 @@ target_link_libraries( Folly::folly fmt::fmt gflags::gflags) + +velox_add_library(velox_status Status.cpp) +velox_link_libraries( + velox_status + PUBLIC fmt::fmt Folly::folly + PRIVATE glog::glog) diff --git a/velox/common/base/CoalesceIo.h b/velox/common/base/CoalesceIo.h index 718ddd5926df7..f5f58ba1092bf 100644 --- a/velox/common/base/CoalesceIo.h +++ b/velox/common/base/CoalesceIo.h @@ -19,9 +19,9 @@ #include #include namespace facebook::velox { -// Utility for combining IOs to nearby location into fewer coalesced -// IOs. This may increase data transfer but generally reduces -// latency and may reduce throttling. +/// Utility for combining IOs to nearby location into fewer coalesced IOs. This +/// may increase data transfer but generally reduces latency and may reduce +/// throttling. /// Describes the outcome of coalescedIo(). struct CoalesceIoStats { diff --git a/velox/common/base/CompareFlags.h b/velox/common/base/CompareFlags.h index 436e6c203ef75..265f84652f81d 100644 --- a/velox/common/base/CompareFlags.h +++ b/velox/common/base/CompareFlags.h @@ -14,6 +14,7 @@ * limitations under the License. */ #include +#include #include #include @@ -21,14 +22,10 @@ namespace facebook::velox { +constexpr auto kIndeterminate = std::nullopt; + // Describes value collation in comparison. struct CompareFlags { - // NoStop: The compare doesn't stop at null. - // StopAtNull: The compare returns std::nullopt if null is encountered in rhs - // or lhs. - enum class NullHandlingMode { NoStop, StopAtNull }; - - // This flag will be ignored if nullHandlingMode is true. bool nullsFirst = true; bool ascending = true; @@ -36,18 +33,138 @@ struct CompareFlags { // When true, comparison should return non-0 early when sizes mismatch. bool equalsOnly = false; - NullHandlingMode nullHandlingMode = NullHandlingMode::NoStop; + enum class NullHandlingMode { + + /// The default null handling mode where nulls are treated as values such + /// that: + /// - null == null is true, + /// - null == value is false. + /// - when equalsOnly=false null ordering is determined using the + /// nullsFirst flag. + kNullAsValue, + + /// Presto semantics for handling nulls. 
+    /// It matches the behavior of the ==, >, < functions and of many other
+    /// Presto functions such as array_remove and array_contains.
+    ///
+    /// Under this mode, the result of a comparison can be indeterminate. Such
+    /// a result is represented as std::nullopt and means that the function
+    /// cannot decide on the result of the comparison due to some existing
+    /// nulls. Not every null produces an indeterminate result.
+    ///
+    /// ## When equalsOnly=true:
+    /// The compare can return kIndeterminate, or a value according to the
+    /// following:
+    /// 1. Primitive types and top level nulls:
+    ///    - Comparing null with anything else is indeterminate; otherwise a
+    ///      value result is returned.
+    ///
+    ///    - Comparing top level nulls in complex types is indeterminate.
+    ///
+    /// 2. Arrays:
+    ///    - If the compared array sizes are different, the result is false,
+    ///      ex: [null] = [null, null] is false.
+    ///
+    ///    - If any two elements compare as false, the result is false,
+    ///      ex: [null, 1] = [null, 2] is false.
+    ///
+    ///    - If the result is not false and any two compared elements are
+    ///      indeterminate, the result is indeterminate,
+    ///      ex: [null, 1] = [null, 1] is indeterminate.
+    ///
+    ///    - If all element compare results are true, the result is true,
+    ///      ex: [1, 1] = [1, 1] is true.
+    ///
+    /// 3. Rows: Follow the same logic as arrays, with fields as elements.
+    ///    - If any two fields compare as false, the result is false,
+    ///      ex: (null, 1) = (null, 2) is false.
+    ///
+    ///    - If the result is not false and any two compared fields are
+    ///      indeterminate, the result is indeterminate,
+    ///      ex: (null, 1) = (null, 1) is indeterminate.
+    ///
+    ///    - If all field compare results are true, the result is true,
+    ///      ex: (1, 1) = (1, 1) is true.
+    ///
+    /// 4. Maps:
+    ///    - Keys are compared first; if the keys are not equal, values are not
+    ///      checked even if they contain nulls, and the result is false.
+    ///
+    ///    - If the keys are the same for both maps, values are compared by
+    ///      applying the array compare logic explained above, viewing each map
+    ///      as an array of values,
+    ///      ex: {1:null, 2:2} = {1:null, 2:3} is false.
+    ///      ex: {1:null, 2:2} = {1:null, 2:2} is indeterminate.
+    ///      ex: {1:1, 2:2} = {1:1, 2:2} is true.
+    ///
+    /// ## When equalsOnly=false:
+    /// The compare either returns a value or throws a user error if a null is
+    /// encountered before the result is determined, as explained below:
+    ///
+    /// 1. Primitive types and top level nulls:
+    ///    - Comparing null with anything else throws (note that functions like
+    ///      >, < do not pass nulls to compare since they are default-null
+    ///      functions).
+    ///
+    ///    - Comparing top level nulls also throws.
+    ///
+    /// 2. Arrays:
+    ///    - Only elements up to index min(rhs.size(), lhs.size()) are
+    ///      compared.
+    ///
+    ///    - Elements are compared in order starting from index 0.
+    ///
+    ///    - If all elements in the range above are the same, the sizes are
+    ///      compared,
+    ///      ex: [1, null] > [1] -> result is true, and the null is not read.
+    ///
+    ///    - If any two elements are different but not null, the result is
+    ///      determined and the remaining elements are not considered for
+    ///      comparison,
+    ///      ex: [1, null] > [2, null] -> result is false and the nulls are not
+    ///      compared.
+    ///
+    ///    - If a null element is encountered before the result is determined,
+    ///      the compare throws a user exception,
+    ///      ex: [1, null, null] > [1, 2] -> throws.
+    ///
+    /// 3. Rows:
+    ///    - Fields are compared in order starting from index 0.
+    ///
+    ///    - If any two fields are different but not null, the result is
+    ///      determined and the remaining fields are not considered for
+    ///      comparison,
+    ///      ex: (1, null) > (2, null) -> result is false and the nulls are not
+    ///      compared.
+    ///
+    ///    - If a null field is encountered before the result is determined,
+    ///      the compare throws a user exception,
+    ///      ex: (1, null) > (1, 2) -> throws.
+    ///
+    /// 4. Maps:
+    ///    - This mode does not allow ordering maps.
+    kNullAsIndeterminate
+  };
+
+  NullHandlingMode nullHandlingMode = NullHandlingMode::kNullAsValue;
+
+  bool nullAsValue() const {
+    return nullHandlingMode == CompareFlags::NullHandlingMode::kNullAsValue;
+  }

-  bool mayStopAtNull() {
-    return nullHandlingMode == CompareFlags::NullHandlingMode::StopAtNull;
+  // Helper method to construct compare flags with equalsOnly = true; in that
+  // case nullsFirst and ascending are not needed.
+  static constexpr CompareFlags equality(NullHandlingMode nullHandlingMode) {
+    return CompareFlags{
+        .equalsOnly = true, .nullHandlingMode = nullHandlingMode};
   }

   static std::string nullHandlingModeToStr(NullHandlingMode mode) {
     switch (mode) {
-      case CompareFlags::NullHandlingMode::NoStop:
-        return "NoStop";
-      case CompareFlags::NullHandlingMode::StopAtNull:
-        return "StopAtNull";
+      case CompareFlags::NullHandlingMode::kNullAsValue:
+        return "NullAsValue";
+      case CompareFlags::NullHandlingMode::kNullAsIndeterminate:
+        return "NullAsIndeterminate";
       default:
         return fmt::format(
             "Unknown Null Handling mode {}", static_cast<int>(mode));
diff --git a/velox/common/base/ConcurrentCounter.h b/velox/common/base/ConcurrentCounter.h
new file mode 100644
index 0000000000000..689c91d152519
--- /dev/null
+++ b/velox/common/base/ConcurrentCounter.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "velox/common/base/BitUtil.h"
+#include "velox/common/base/Exceptions.h"
+
+namespace facebook::velox {
+
+/// The class provides concurrent updates to a counter with minimum lock
+/// contention. The template argument T specifies the counter type. The counter
+/// is N-way sharded internally. Each update goes to one of the sharded
+/// counters based on the updating thread's id.
+template <typename T>
+class ConcurrentCounter {
+ public:
+  /// Creates a concurrent counter with the specified number of shards.
+  ///
+  /// NOTE: the constructor sets the actual number of shards to be the next
+  /// power of 2.
+  explicit ConcurrentCounter(size_t numShards)
+      : numShards_(bits::nextPowerOfTwo(numShards)),
+        shardMask_(numShards_ - 1),
+        counters_(numShards_) {
+    VELOX_CHECK_GE(numShards_, 1);
+    for (auto& counter : counters_) {
+      counter.value = T();
+    }
+  }
+
+  ConcurrentCounter(const ConcurrentCounter&) = delete;
+  ConcurrentCounter& operator=(const ConcurrentCounter&) = delete;
+
+  /// Invoked to read the sum of values from 'counters_'.
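Before the rest of `ConcurrentCounter`, a small usage sketch of the `CompareFlags::equality()` helper introduced above. The expected outcomes in the comments restate the doc examples; how the flags drive the actual vector comparison lives in the comparison code, which is not part of this hunk.

```
#include <iostream>
#include "velox/common/base/CompareFlags.h"

int main() {
  using facebook::velox::CompareFlags;

  // Equality-only flags; nullsFirst and ascending are irrelevant here.
  constexpr auto valueEq =
      CompareFlags::equality(CompareFlags::NullHandlingMode::kNullAsValue);
  constexpr auto prestoEq = CompareFlags::equality(
      CompareFlags::NullHandlingMode::kNullAsIndeterminate);

  // Restating the doc examples: under kNullAsValue, [null, 1] = [null, 1] is
  // true; under kNullAsIndeterminate it is kIndeterminate (std::nullopt),
  // while [null, 1] = [null, 2] is false since the non-null pair differs.
  std::cout << CompareFlags::nullHandlingModeToStr(valueEq.nullHandlingMode)
            << " / "
            << CompareFlags::nullHandlingModeToStr(prestoEq.nullHandlingMode)
            << std::endl;
  return 0;
}
```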
+  T read() const {
+    T sum = T();
+    for (size_t i = 0; i < numShards_; ++i) {
+      sum += counters_[i].read();
+    }
+    return sum;
+  }
+
+  /// Invoked to update with 'delta'.
+  void update(T delta) {
+    counters_[shardIndex()].update(delta);
+  }
+
+  /// Invoked to update with 'delta' and a user-provided 'updateFn'. The
+  /// function picks the shard to apply the customized update to.
+  using UpdateFn = std::function<bool(T& value, T delta, std::mutex& lock)>;
+  bool update(T delta, const UpdateFn& updateFn) {
+    return counters_[shardIndex()].update(delta, updateFn);
+  }
+
+  void testingClear() {
+    for (auto& counter : counters_) {
+      counter.value = T();
+    }
+  }
+
+  T testingRead(size_t index) const {
+    return counters_[index].read();
+  }
+
+  bool testingUpdate(size_t index, T delta, const UpdateFn& updateFn) {
+    return counters_[index].update(delta, updateFn);
+  }
+
+ private:
+  struct alignas(folly::hardware_destructive_interference_size) Counter {
+    mutable std::mutex lock;
+    T value;
+
+    T read() const {
+      std::lock_guard<std::mutex> l(lock);
+      return value;
+    }
+
+    void update(T delta) {
+      std::lock_guard<std::mutex> l(lock);
+      value += delta;
+    }
+
+    bool update(T delta, const UpdateFn& updateFn) {
+      return updateFn(value, delta, lock);
+    }
+  };
+
+  size_t shardIndex() const {
+    const size_t hash =
+        std::hash<std::thread::id>{}(std::this_thread::get_id());
+    const size_t index = hash & shardMask_;
+    VELOX_DCHECK_LT(index, counters_.size());
+    return index;
+  }
+
+  const size_t numShards_;
+  const size_t shardMask_;
+
+  std::vector<Counter> counters_;
+};
+} // namespace facebook::velox
diff --git a/velox/common/base/CountBits.h b/velox/common/base/CountBits.h
new file mode 100644
index 0000000000000..b267d2f636ef0
--- /dev/null
+++ b/velox/common/base/CountBits.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace facebook::velox {
+
+// Copied from format.h of fmt.
+FOLLY_ALWAYS_INLINE int countDigits(__uint128_t n) {
+  int count = 1;
+  for (;;) {
+    if (n < 10) {
+      return count;
+    }
+    if (n < 100) {
+      return count + 1;
+    }
+    if (n < 1000) {
+      return count + 2;
+    }
+    if (n < 10000) {
+      return count + 3;
+    }
+    n /= 10000u;
+    count += 4;
+  }
+}
+
+} // namespace facebook::velox
diff --git a/velox/common/base/Counters.cpp b/velox/common/base/Counters.cpp
index a7c1b931a4544..67421ced6be9f 100644
--- a/velox/common/base/Counters.cpp
+++ b/velox/common/base/Counters.cpp
@@ -19,11 +19,503 @@

 namespace facebook::velox {

-void registerVeloxCounters() {
-  // Track hive handle generation latency in range of [0, 100s] and reports
+void registerVeloxMetrics() {
+  /// ================== Task Execution Counters =================
+  // The number of times a driver yields because it exceeded the per-driver
+  // cpu time slice limit, if enforced.
+  DEFINE_METRIC(kMetricDriverYieldCount, facebook::velox::StatType::COUNT);
+
+  // Tracks driver queue latency in range of [0, 10s] with 20 buckets and
+  // reports P50, P90, P99, and P100.
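The `ConcurrentCounter` above trades memory for contention: each shard is padded to its own cache line via `alignas(folly::hardware_destructive_interference_size)`, so writers on different threads do not false-share. A hedged usage sketch (shard and thread counts are arbitrary choices for illustration):

```
#include <cstdint>
#include <thread>
#include <vector>
#include "velox/common/base/ConcurrentCounter.h"

int main() {
  // 16 shards (already a power of two, so kept as-is by the constructor).
  facebook::velox::ConcurrentCounter<int64_t> counter(16);

  std::vector<std::thread> threads;
  for (int i = 0; i < 8; ++i) {
    threads.emplace_back([&]() {
      for (int j = 0; j < 1'000; ++j) {
        counter.update(1); // Routed to a shard keyed on the thread id.
      }
    });
  }
  for (auto& t : threads) {
    t.join();
  }
  // read() sums all shards: 8 threads x 1000 updates.
  return counter.read() == 8'000 ? 0 : 1;
}
```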
+ DEFINE_HISTOGRAM_METRIC( + kMetricDriverQueueTimeMs, 500, 0, 10'000, 50, 90, 99, 100); + + // Tracks driver execution latency in range of [0, 30s] with 30 buckets and + // reports P50, P90, P99, and P100. + DEFINE_HISTOGRAM_METRIC( + kMetricDriverExecTimeMs, 1'000, 0, 30'000, 50, 90, 99, 100); + + /// ================== Cache Counters ================= + + // Tracks hive handle generation latency in range of [0, 100s] and reports // P50, P90, P99, and P100. - REPORT_ADD_HISTOGRAM_EXPORT_PERCENTILE( - kCounterHiveFileHandleGenerateLatencyMs, 10, 0, 100000, 50, 90, 99, 100); -} + DEFINE_HISTOGRAM_METRIC( + kMetricHiveFileHandleGenerateLatencyMs, + 10'000, + 0, + 100'000, + 50, + 90, + 99, + 100); + + DEFINE_METRIC(kMetricCacheShrinkCount, facebook::velox::StatType::COUNT); + + // Tracks cache shrink latency in range of [0, 100s] with 10 buckets and + // reports P50, P90, P99, and P100. + DEFINE_HISTOGRAM_METRIC( + kMetricCacheShrinkTimeMs, 10'000, 0, 100'000, 50, 90, 99, 100); + + /// ================== Memory Allocator Counters ================= + + // Number of bytes currently mapped in MemoryAllocator. These bytes represent + // the bytes that are either currently being allocated or were in the past + // allocated, not yet been returned back to the operating system, in the + // form of 'Allocation' or 'ContiguousAllocation'. + DEFINE_METRIC(kMetricMappedMemoryBytes, facebook::velox::StatType::AVG); + + // Number of bytes currently allocated (used) from MemoryAllocator in the form + // of 'Allocation' or 'ContiguousAllocation'. + DEFINE_METRIC(kMetricAllocatedMemoryBytes, facebook::velox::StatType::AVG); + + // Number of bytes currently mapped in MmapAllocator, in the form of + // 'ContiguousAllocation'. + // + // NOTE: This applies only to MmapAllocator + DEFINE_METRIC(kMetricMmapExternalMappedBytes, facebook::velox::StatType::AVG); + + // Number of bytes currently allocated from MmapAllocator directly from raw + // allocateBytes() interface, and internally allocated by malloc. Only small + // chunks of memory are delegated to malloc. + // + // NOTE: This applies only to MmapAllocator + DEFINE_METRIC(kMetricMmapDelegatedAllocBytes, facebook::velox::StatType::AVG); + + /// ================== AsyncDataCache Counters ================= + + // Max possible age of AsyncDataCache and SsdCache entries since the raw file + // was opened to load the cache. + DEFINE_METRIC(kMetricCacheMaxAgeSecs, facebook::velox::StatType::AVG); + + // Total number of cache entries. + DEFINE_METRIC(kMetricMemoryCacheNumEntries, facebook::velox::StatType::AVG); + + // Total number of cache entries that do not cache anything. + DEFINE_METRIC( + kMetricMemoryCacheNumEmptyEntries, facebook::velox::StatType::AVG); + + // Total number of cache entries that are pinned for shared access. + DEFINE_METRIC( + kMetricMemoryCacheNumSharedEntries, facebook::velox::StatType::AVG); + + // Total number of cache entries that are pinned for exclusive access. + DEFINE_METRIC( + kMetricMemoryCacheNumExclusiveEntries, facebook::velox::StatType::AVG); + + // Total number of cache entries that are being or have been prefetched but + // have not been hit. + DEFINE_METRIC( + kMetricMemoryCacheNumPrefetchedEntries, facebook::velox::StatType::AVG); + + // Total number of bytes of the cached data that is much smaller than + // kTinyDataSize. + DEFINE_METRIC( + kMetricMemoryCacheTotalTinyBytes, facebook::velox::StatType::AVG); + + // Total number of bytes of the cached data excluding + // 'kMetricMemoryCacheTotalTinyBytes'. 
+ DEFINE_METRIC( + kMetricMemoryCacheTotalLargeBytes, facebook::velox::StatType::AVG); + + // Total unused capacity bytes in 'kMetricMemoryCacheTotalTinyBytes'. + DEFINE_METRIC( + kMetricMemoryCacheTotalTinyPaddingBytes, facebook::velox::StatType::AVG); + + // Total unused capacity bytes in 'kMetricMemoryCacheTotalLargeBytes'. + DEFINE_METRIC( + kMetricMemoryCacheTotalLargePaddingBytes, facebook::velox::StatType::AVG); + + // Total bytes of cache entries in prefetch state. + DEFINE_METRIC( + kMetricMemoryCacheTotalPrefetchBytes, facebook::velox::StatType::AVG); + + // Sum of scores of evicted entries. This serves to infer an average lifetime + // for entries in cache. + DEFINE_METRIC( + kMetricMemoryCacheSumEvictScore, facebook::velox::StatType::SUM); + + // Number of hits (saved IO) since last counter retrieval. The first hit to a + // prefetched entry does not count. + DEFINE_METRIC(kMetricMemoryCacheNumHits, facebook::velox::StatType::SUM); + + // Amount of hit bytes (saved IO) since last counter retrieval. The first hit + // to a prefetched entry does not count. + DEFINE_METRIC(kMetricMemoryCacheHitBytes, facebook::velox::StatType::SUM); + + // Number of new entries created since last counter retrieval. + DEFINE_METRIC(kMetricMemoryCacheNumNew, facebook::velox::StatType::SUM); + + // Number of times a valid entry was removed in order to make space, since + // last counter retrieval. + DEFINE_METRIC(kMetricMemoryCacheNumEvicts, facebook::velox::StatType::SUM); + + // Number of times a valid entry was removed in order to make space but has + // not been saved to SSD yet, since last counter retrieval. + DEFINE_METRIC( + kMetricMemoryCacheNumSavableEvicts, facebook::velox::StatType::SUM); + + // Number of entries considered for evicting, since last counter retrieval. + DEFINE_METRIC( + kMetricMemoryCacheNumEvictChecks, facebook::velox::StatType::SUM); + + // Number of times a user waited for an entry to transit from exclusive to + // shared mode, since last counter retrieval. + DEFINE_METRIC( + kMetricMemoryCacheNumWaitExclusive, facebook::velox::StatType::SUM); + + // Clocks spent in allocating or freeing memory for backing cache entries, + // since last counter retrieval + DEFINE_METRIC( + kMetricMemoryCacheNumAllocClocks, facebook::velox::StatType::SUM); + + // Number of AsyncDataCache entries that are aged out and evicted + // given configured TTL. + DEFINE_METRIC( + kMetricMemoryCacheNumAgedOutEntries, facebook::velox::StatType::SUM); + + // Number of AsyncDataCache entries that are stale because of cache request + // size mismatch. + DEFINE_METRIC( + kMetricMemoryCacheNumStaleEntries, facebook::velox::StatType::COUNT); + + /// ================== SsdCache Counters ================== + + // Number of regions currently cached by SSD. + DEFINE_METRIC(kMetricSsdCacheCachedRegions, facebook::velox::StatType::AVG); + + // Number of entries currently cached by SSD. + DEFINE_METRIC(kMetricSsdCacheCachedEntries, facebook::velox::StatType::AVG); + + // Total bytes currently cached by SSD. + DEFINE_METRIC(kMetricSsdCacheCachedBytes, facebook::velox::StatType::AVG); + + // Total number of entries read from SSD. + DEFINE_METRIC(kMetricSsdCacheReadEntries, facebook::velox::StatType::SUM); + + // Total number of bytes read from SSD. + DEFINE_METRIC(kMetricSsdCacheReadBytes, facebook::velox::StatType::SUM); + + // Total number of entries written to SSD. + DEFINE_METRIC(kMetricSsdCacheWrittenEntries, facebook::velox::StatType::SUM); + + // Total number of bytes written to SSD. 
+  DEFINE_METRIC(kMetricSsdCacheWrittenBytes, facebook::velox::StatType::SUM);
+
+  // Total number of SsdCache entries that are aged out and evicted given
+  // configured TTL.
+  DEFINE_METRIC(kMetricSsdCacheAgedOutEntries, facebook::velox::StatType::SUM);
+
+  // Total number of SsdCache regions that are aged out and evicted given
+  // configured TTL.
+  DEFINE_METRIC(kMetricSsdCacheAgedOutRegions, facebook::velox::StatType::SUM);
+
+  // Total number of SSD file open errors.
+  DEFINE_METRIC(kMetricSsdCacheOpenSsdErrors, facebook::velox::StatType::SUM);
+
+  // Total number of SSD checkpoint file open errors.
+  DEFINE_METRIC(
+      kMetricSsdCacheOpenCheckpointErrors, facebook::velox::StatType::SUM);
+
+  // Total number of SSD evict log file open errors.
+  DEFINE_METRIC(kMetricSsdCacheOpenLogErrors, facebook::velox::StatType::SUM);
+
+  // Total number of errors while deleting SSD checkpoint files.
+  DEFINE_METRIC(
+      kMetricSsdCacheDeleteCheckpointErrors, facebook::velox::StatType::SUM);
+
+  // Total number of errors while growing SSD cache files.
+  DEFINE_METRIC(kMetricSsdCacheGrowFileErrors, facebook::velox::StatType::SUM);
+
+  // Total number of errors while writing to SSD cache files.
+  DEFINE_METRIC(kMetricSsdCacheWriteSsdErrors, facebook::velox::StatType::SUM);
+
+  // Total number of errors while writing SSD checkpoint files.
+  DEFINE_METRIC(
+      kMetricSsdCacheWriteCheckpointErrors, facebook::velox::StatType::SUM);
+
+  // Total number of writes dropped due to no cache space.
+  DEFINE_METRIC(kMetricSsdCacheWriteSsdDropped, facebook::velox::StatType::SUM);
+
+  // Total number of errors while reading from SSD cache files.
+  DEFINE_METRIC(kMetricSsdCacheReadSsdErrors, facebook::velox::StatType::SUM);
+
+  // Total number of corrupted SSD data reads detected by checksum.
+  DEFINE_METRIC(kMetricSsdCacheReadCorruptions, facebook::velox::StatType::SUM);
+
+  // Total number of errors while reading from SSD checkpoint files.
+  DEFINE_METRIC(
+      kMetricSsdCacheReadCheckpointErrors, facebook::velox::StatType::SUM);
+
+  // Total number of SSD cache reads without checksum verification due to
+  // mismatch in SSD cache request size.
+  DEFINE_METRIC(
+      kMetricSsdCacheReadWithoutChecksum, facebook::velox::StatType::SUM);
+
+  // Total number of checkpoints read.
+  DEFINE_METRIC(kMetricSsdCacheCheckpointsRead, facebook::velox::StatType::SUM);
+
+  // Total number of checkpoints written.
+  DEFINE_METRIC(
+      kMetricSsdCacheCheckpointsWritten, facebook::velox::StatType::SUM);
+
+  // Total number of cache regions evicted.
+  DEFINE_METRIC(kMetricSsdCacheRegionsEvicted, facebook::velox::StatType::SUM);
+
+  // Total number of cache entries recovered from checkpoint.
+  DEFINE_METRIC(
+      kMetricSsdCacheRecoveredEntries, facebook::velox::StatType::SUM);
+
+  /// ================== Memory Arbitration Counters =================
+
+  // The number of arbitration requests.
+  DEFINE_METRIC(
+      kMetricArbitratorRequestsCount, facebook::velox::StatType::COUNT);
+
+  // The number of times a query level memory pool is aborted as a result of a
+  // memory arbitration process. The aborted memory pool will eventually result
+  // in cancelling the original query.
+  DEFINE_METRIC(
+      kMetricArbitratorAbortedCount, facebook::velox::StatType::COUNT);
+
+  // The number of times a memory arbitration request failed. This may occur
+  // either because the requester was terminated during the processing of its
+  // request, the arbitration request would surpass the maximum allowed
+  // capacity for the requester, or the arbitration process couldn't release
+  // the requested amount of memory.
+  DEFINE_METRIC(
+      kMetricArbitratorFailuresCount, facebook::velox::StatType::COUNT);
+
+  // Tracks the memory reclaim count on an operator.
+  DEFINE_METRIC(kMetricMemoryReclaimCount, facebook::velox::StatType::COUNT);
+
+  // Tracks op memory reclaim exec time in range of [0, 600s] with 20 buckets
+  // and reports P50, P90, P99, and P100.
+  DEFINE_HISTOGRAM_METRIC(
+      kMetricMemoryReclaimExecTimeMs, 30'000, 0, 600'000, 50, 90, 99, 100);
+
+  // Tracks op memory reclaim bytes distribution in range of [0, 4GB] with 64
+  // buckets and reports P50, P90, P99, and P100.
+  DEFINE_HISTOGRAM_METRIC(
+      kMetricMemoryReclaimedBytes,
+      67'108'864,
+      0,
+      4'294'967'296,
+      50,
+      90,
+      99,
+      100);
+
+  // Tracks the memory reclaim count on a task.
+  DEFINE_METRIC(
+      kMetricTaskMemoryReclaimCount, facebook::velox::StatType::COUNT);
+
+  // Tracks task memory reclaim wait time in range of [0, 60s] with 60 buckets
+  // and reports P50, P90, P99, and P100.
+  DEFINE_HISTOGRAM_METRIC(
+      kMetricTaskMemoryReclaimWaitTimeMs, 1'000, 0, 60'000, 50, 90, 99, 100);
+
+  // Tracks task memory reclaim exec time in range of [0, 240s] with 60 buckets
+  // and reports P50, P90, P99, and P100.
+  DEFINE_HISTOGRAM_METRIC(
+      kMetricTaskMemoryReclaimExecTimeMs, 4'000, 0, 240'000, 50, 90, 99, 100);
+
+  // Tracks the number of times that the task memory reclaim wait times out.
+  DEFINE_METRIC(
+      kMetricTaskMemoryReclaimWaitTimeoutCount,
+      facebook::velox::StatType::COUNT);
+
+  // The number of times that the memory reclaim fails because the operator is
+  // executing a non-reclaimable section where it is expected to have reserved
+  // enough memory to execute without asking for more. Therefore, it is an
+  // indicator that the memory reservation is not sufficient. It excludes
+  // counting instances where the operator is in a non-reclaimable state due to
+  // currently being on-thread and running or being already cancelled.
+  DEFINE_METRIC(
+      kMetricMemoryNonReclaimableCount, facebook::velox::StatType::COUNT);
+
+  // The number of arbitrations that reclaim the used memory from the query
+  // which initiates the memory arbitration request itself. It ensures the
+  // memory arbitration request won't exceed its per-query memory capacity
+  // limit.
+  DEFINE_METRIC(
+      kMetricArbitratorLocalArbitrationCount, facebook::velox::StatType::COUNT);
+
+  // The number of arbitrations which ensure the total allocated query capacity
+  // won't exceed the arbitrator capacity limit. They may or may not reclaim
+  // memory from the query which initiated the memory arbitration request. This
+  // indicates the velox runtime doesn't have enough memory to run all the
+  // queries at their peak memory usage, and we have to trigger spilling to let
+  // them run to completion.
+  DEFINE_METRIC(
+      kMetricArbitratorGlobalArbitrationCount,
+      facebook::velox::StatType::COUNT);
+
+  // The number of global arbitrations that reclaim used memory by slow disk
+  // spilling.
+  DEFINE_METRIC(
+      kMetricArbitratorSlowGlobalArbitrationCount,
+      facebook::velox::StatType::COUNT);
+
+  // The distribution of the amount of time an arbitration operation stays in
+  // the arbitration queues and waits for the arbitration r/w locks, in range
+  // of [0, 600s] with 20 buckets. It is configured to report the latency at
+  // P50, P90, P99, and P100 percentiles.
+  DEFINE_HISTOGRAM_METRIC(
+      kMetricArbitratorWaitTimeMs, 30'000, 0, 600'000, 50, 90, 99, 100);
+
+  // The distribution of the amount of time it takes to complete a single
+  // arbitration request, in range of [0, 600s] with 20 buckets. It is
+  // configured to report the latency at P50, P90, P99, and P100 percentiles.
+  DEFINE_HISTOGRAM_METRIC(
+      kMetricArbitratorArbitrationTimeMs, 30'000, 0, 600'000, 50, 90, 99, 100);
+
+  // Tracks the average free memory capacity managed by the arbitrator in
+  // bytes.
+  DEFINE_METRIC(
+      kMetricArbitratorFreeCapacityBytes, facebook::velox::StatType::AVG);
+
+  DEFINE_METRIC(
+      kMetricArbitratorFreeReservedCapacityBytes,
+      facebook::velox::StatType::AVG);
+
+  // Tracks the leaf memory pool usage leak in bytes.
+  DEFINE_METRIC(
+      kMetricMemoryPoolUsageLeakBytes, facebook::velox::StatType::SUM);
+
+  // Tracks the leaf memory pool reservation leak in bytes.
+  DEFINE_METRIC(
+      kMetricMemoryPoolReservationLeakBytes, facebook::velox::StatType::SUM);
+
+  // The distribution of a root memory pool's initial capacity in range of [0,
+  // 256MB] with 32 buckets. It is configured to report the capacity at P50,
+  // P90, P99, and P100 percentiles.
+  DEFINE_HISTOGRAM_METRIC(
+      kMetricMemoryPoolInitialCapacityBytes,
+      8L << 20,
+      0,
+      256L << 20,
+      50,
+      90,
+      99,
+      100);
+
+  // The distribution of a root memory pool's capacity growth attempts through
+  // memory arbitration in range of [0, 256] with 32 buckets. It is configured
+  // to report the count at P50, P90, P99, and P100 percentiles.
+  DEFINE_HISTOGRAM_METRIC(
+      kMetricMemoryPoolCapacityGrowCount, 8, 0, 256, 50, 90, 99, 100);
+
+  // Tracks the count of double frees in the memory allocator, indicating the
+  // possibility of buffer ownership issues when a buffer is freed more than
+  // once.
+  DEFINE_METRIC(
+      kMetricMemoryAllocatorDoubleFreeCount, facebook::velox::StatType::COUNT);
+
+  /// ================== Spill related Counters =================
+
+  // The number of bytes in memory to spill.
+  DEFINE_METRIC(kMetricSpilledInputBytes, facebook::velox::StatType::SUM);
+
+  // The number of bytes spilled to disk, which can be the number of compressed
+  // bytes if compression is enabled.
+  DEFINE_METRIC(kMetricSpilledBytes, facebook::velox::StatType::SUM);
+
+  // The number of spilled rows.
+  DEFINE_METRIC(kMetricSpilledRowsCount, facebook::velox::StatType::COUNT);
+
+  // The number of spilled files.
+  DEFINE_METRIC(kMetricSpilledFilesCount, facebook::velox::StatType::COUNT);
+
+  // The distribution of the amount of time spent on filling rows for spilling,
+  // in range of [0, 600s] with 20 buckets. It is configured to report the
+  // latency at P50, P90, P99, and P100 percentiles.
+  DEFINE_HISTOGRAM_METRIC(
+      kMetricSpillFillTimeMs, 30'000, 0, 600'000, 50, 90, 99, 100);
+
+  // The distribution of the amount of time spent on sorting rows for spilling
+  // in range of [0, 600s] with 20 buckets. It is configured to report the
+  // latency at P50, P90, P99, and P100 percentiles.
+  DEFINE_HISTOGRAM_METRIC(
+      kMetricSpillSortTimeMs, 30'000, 0, 600'000, 50, 90, 99, 100);
+
+  // The distribution of the amount of time spent on serializing rows for
+  // spilling in range of [0, 600s] with 20 buckets. It is configured to report
+  // the latency at P50, P90, P99, and P100 percentiles.
+  DEFINE_HISTOGRAM_METRIC(
+      kMetricSpillSerializationTimeMs, 30'000, 0, 600'000, 50, 90, 99, 100);
+
+  // The number of spill writes to storage, which is the number of write calls
+  // to the velox filesystem.
+  DEFINE_METRIC(kMetricSpillWritesCount, facebook::velox::StatType::COUNT);
+
+  // The distribution of the amount of time spent on copying out serialized
+  // rows for disk write, in range of [0, 600s] with 20 buckets. It is
+  // configured to report the latency at P50, P90, P99, and P100 percentiles.
+  // Note: If compression is enabled, this includes the compression time.
+  DEFINE_HISTOGRAM_METRIC(
+      kMetricSpillFlushTimeMs, 30'000, 0, 600'000, 50, 90, 99, 100);
+
+  // The distribution of the amount of time spent on writing spilled rows to
+  // disk in range of [0, 600s] with 20 buckets. It is configured to report the
+  // latency at P50, P90, P99, and P100 percentiles.
+  DEFINE_HISTOGRAM_METRIC(
+      kMetricSpillWriteTimeMs, 30'000, 0, 600'000, 50, 90, 99, 100);
+
+  // Tracks the number of times that we hit the max spill level limit.
+  DEFINE_METRIC(
+      kMetricMaxSpillLevelExceededCount, facebook::velox::StatType::COUNT);
+
+  // Tracks the total number of bytes in file writers that are prematurely
+  // flushed due to memory reclaiming.
+  DEFINE_METRIC(
+      kMetricFileWriterEarlyFlushedRawBytes, facebook::velox::StatType::SUM);
+
+  // The current spilling memory usage in bytes.
+  DEFINE_METRIC(kMetricSpillMemoryBytes, facebook::velox::StatType::AVG);
+
+  // The peak spilling memory usage in bytes.
+  DEFINE_METRIC(kMetricSpillPeakMemoryBytes, facebook::velox::StatType::AVG);
+
+  // The data exchange time distribution in range of [0, 5s] with 50 buckets.
+  // It is configured to report the latency at P50, P90, P99, and P100
+  // percentiles.
+  DEFINE_HISTOGRAM_METRIC(
+      kMetricExchangeDataTimeMs, 100, 0, 5'000, 50, 90, 99, 100);
+
+  // The exchange data size in bytes.
+  DEFINE_METRIC(kMetricExchangeDataBytes, facebook::velox::StatType::SUM);
+
+  // The number of data exchange requests.
+  DEFINE_METRIC(kMetricExchangeDataCount, facebook::velox::StatType::COUNT);
+
+  // The data exchange size time distribution in range of [0, 5s] with 50
+  // buckets. It is configured to report the latency at P50, P90, P99, and P100
+  // percentiles.
+  DEFINE_HISTOGRAM_METRIC(
+      kMetricExchangeDataSizeTimeMs, 100, 0, 5'000, 50, 90, 99, 100);
+
+  // The distribution of exchange data size in range of [0, 128MB] with 128
+  // buckets. It is configured to report the capacity at P50, P90, P99, and
+  // P100 percentiles.
+  DEFINE_HISTOGRAM_METRIC(
+      kMetricExchangeDataSize, 1L << 20, 0, 128L << 20, 50, 90, 99, 100);
+
+  // The number of data size exchange requests.
+  DEFINE_METRIC(kMetricExchangeDataSizeCount, facebook::velox::StatType::COUNT);
+
+  /// ================== Storage Counters =================
+
+  // The time distribution of storage IO throttled duration in range of
+  // [0, 30s] with 30 buckets. It is configured to report the latency at P50,
+  // P90, P99, and P100 percentiles.
+  DEFINE_HISTOGRAM_METRIC(
+      kMetricStorageThrottledDurationMs, 1'000, 0, 30'000, 50, 90, 99, 100);
+
+  // The number of times that storage IOs get throttled in a storage directory.
+  DEFINE_METRIC(kMetricStorageLocalThrottled, facebook::velox::StatType::COUNT);
+
+  // The number of times that storage IOs get throttled in a storage cluster.
+ DEFINE_METRIC( + kMetricStorageGlobalThrottled, facebook::velox::StatType::COUNT); +} } // namespace facebook::velox diff --git a/velox/common/base/Counters.h b/velox/common/base/Counters.h index bd661709fddcd..eb1e46d971a53 100644 --- a/velox/common/base/Counters.h +++ b/velox/common/base/Counters.h @@ -20,9 +20,309 @@ namespace facebook::velox { -// Velox Counter Registration -void registerVeloxCounters(); +/// Velox metrics Registration. +void registerVeloxMetrics(); -constexpr folly::StringPiece kCounterHiveFileHandleGenerateLatencyMs{ +constexpr folly::StringPiece kMetricHiveFileHandleGenerateLatencyMs{ "velox.hive_file_handle_generate_latency_ms"}; + +constexpr folly::StringPiece kMetricCacheShrinkCount{ + "velox.cache_shrink_count"}; + +constexpr folly::StringPiece kMetricCacheShrinkTimeMs{"velox.cache_shrink_ms"}; + +constexpr folly::StringPiece kMetricMaxSpillLevelExceededCount{ + "velox.spill_max_level_exceeded_count"}; + +constexpr folly::StringPiece kMetricMemoryReclaimExecTimeMs{ + "velox.memory_reclaim_exec_ms"}; + +constexpr folly::StringPiece kMetricMemoryReclaimedBytes{ + "velox.memory_reclaim_bytes"}; + +constexpr folly::StringPiece kMetricMemoryReclaimCount{ + "velox.memory_reclaim_count"}; + +constexpr folly::StringPiece kMetricTaskMemoryReclaimCount{ + "velox.task_memory_reclaim_count"}; + +constexpr folly::StringPiece kMetricTaskMemoryReclaimWaitTimeMs{ + "velox.task_memory_reclaim_wait_ms"}; + +constexpr folly::StringPiece kMetricTaskMemoryReclaimExecTimeMs{ + "velox.task_memory_reclaim_exec_ms"}; + +constexpr folly::StringPiece kMetricTaskMemoryReclaimWaitTimeoutCount{ + "velox.task_memory_reclaim_wait_timeout_count"}; + +constexpr folly::StringPiece kMetricMemoryNonReclaimableCount{ + "velox.memory_non_reclaimable_count"}; + +constexpr folly::StringPiece kMetricMemoryPoolInitialCapacityBytes{ + "velox.memory_pool_initial_capacity_bytes"}; + +constexpr folly::StringPiece kMetricMemoryPoolCapacityGrowCount{ + "velox.memory_pool_capacity_growth_count"}; + +constexpr folly::StringPiece kMetricMemoryPoolUsageLeakBytes{ + "velox.memory_pool_usage_leak_bytes"}; + +constexpr folly::StringPiece kMetricMemoryPoolReservationLeakBytes{ + "velox.memory_pool_reservation_leak_bytes"}; + +constexpr folly::StringPiece kMetricMemoryAllocatorDoubleFreeCount{ + "velox.memory_allocator_double_free_count"}; + +constexpr folly::StringPiece kMetricArbitratorLocalArbitrationCount{ + "velox.arbitrator_local_arbitration_count"}; + +constexpr folly::StringPiece kMetricArbitratorGlobalArbitrationCount{ + "velox.arbitrator_global_arbitration_count"}; + +constexpr folly::StringPiece kMetricArbitratorSlowGlobalArbitrationCount{ + "velox.arbitrator_slow_global_arbitration_count"}; + +constexpr folly::StringPiece kMetricArbitratorAbortedCount{ + "velox.arbitrator_aborted_count"}; + +constexpr folly::StringPiece kMetricArbitratorFailuresCount{ + "velox.arbitrator_failures_count"}; + +constexpr folly::StringPiece kMetricArbitratorArbitrationTimeMs{ + "velox.arbitrator_arbitration_time_ms"}; + +constexpr folly::StringPiece kMetricArbitratorWaitTimeMs{ + "velox.arbitrator_wait_time_ms"}; + +constexpr folly::StringPiece kMetricArbitratorFreeCapacityBytes{ + "velox.arbitrator_free_capacity_bytes"}; + +constexpr folly::StringPiece kMetricArbitratorFreeReservedCapacityBytes{ + "velox.arbitrator_free_reserved_capacity_bytes"}; + +constexpr folly::StringPiece kMetricDriverYieldCount{ + "velox.driver_yield_count"}; + +constexpr folly::StringPiece kMetricDriverQueueTimeMs{ + "velox.driver_queue_time_ms"}; + 
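All the `DEFINE_HISTOGRAM_METRIC` registrations above follow the same arithmetic, bucket count = (max - min) / bucket width, with the argument order (name, bucketWidth, min, max, percentiles...). A small sketch double-checking a few of the numbers quoted in the comments; recording against a registered name then mirrors the `RECORD_METRIC_VALUE` calls that appear later in `PeriodicStatsReporter.cpp`.

```
// Bucket count = (max - min) / bucketWidth for the argument order used in
// registerVeloxMetrics() above.
constexpr long histogramBuckets(long bucketWidth, long min, long max) {
  return (max - min) / bucketWidth;
}

static_assert(histogramBuckets(500, 0, 10'000) == 20); // driver queue time
static_assert(histogramBuckets(1'000, 0, 30'000) == 30); // driver exec time
static_assert(histogramBuckets(30'000, 0, 600'000) == 20); // reclaim exec time

// Recording side (sketch only; the '1' increment for a COUNT metric is an
// assumption of this sketch):
// RECORD_METRIC_VALUE(kMetricDriverYieldCount, 1);
```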
+constexpr folly::StringPiece kMetricDriverExecTimeMs{ + "velox.driver_exec_time_ms"}; + +constexpr folly::StringPiece kMetricSpilledInputBytes{ + "velox.spill_input_bytes"}; + +constexpr folly::StringPiece kMetricSpilledBytes{"velox.spill_bytes"}; + +constexpr folly::StringPiece kMetricSpilledRowsCount{"velox.spill_rows_count"}; + +constexpr folly::StringPiece kMetricSpilledFilesCount{ + "velox.spill_files_count"}; + +constexpr folly::StringPiece kMetricSpillFillTimeMs{"velox.spill_fill_time_ms"}; + +constexpr folly::StringPiece kMetricSpillSortTimeMs{"velox.spill_sort_time_ms"}; + +constexpr folly::StringPiece kMetricSpillSerializationTimeMs{ + "velox.spill_serialization_time_ms"}; + +constexpr folly::StringPiece kMetricSpillWritesCount{ + "velox.spill_writes_count"}; + +constexpr folly::StringPiece kMetricSpillFlushTimeMs{ + "velox.spill_flush_time_ms"}; + +constexpr folly::StringPiece kMetricSpillWriteTimeMs{ + "velox.spill_write_time_ms"}; + +constexpr folly::StringPiece kMetricSpillMemoryBytes{ + "velox.spill_memory_bytes"}; + +constexpr folly::StringPiece kMetricSpillPeakMemoryBytes{ + "velox.spill_peak_memory_bytes"}; + +constexpr folly::StringPiece kMetricFileWriterEarlyFlushedRawBytes{ + "velox.file_writer_early_flushed_raw_bytes"}; + +constexpr folly::StringPiece kMetricArbitratorRequestsCount{ + "velox.arbitrator_requests_count"}; + +constexpr folly::StringPiece kMetricMappedMemoryBytes{ + "velox.memory_allocator_mapped_bytes"}; + +constexpr folly::StringPiece kMetricAllocatedMemoryBytes{ + "velox.memory_allocator_alloc_bytes"}; + +constexpr folly::StringPiece kMetricMmapExternalMappedBytes{ + "velox.mmap_allocator_external_mapped_bytes"}; + +constexpr folly::StringPiece kMetricMmapDelegatedAllocBytes{ + "velox.mmap_allocator_delegated_alloc_bytes"}; + +constexpr folly::StringPiece kMetricCacheMaxAgeSecs{"velox.cache_max_age_secs"}; + +constexpr folly::StringPiece kMetricMemoryCacheNumEntries{ + "velox.memory_cache_num_entries"}; + +constexpr folly::StringPiece kMetricMemoryCacheNumEmptyEntries{ + "velox.memory_cache_num_empty_entries"}; + +constexpr folly::StringPiece kMetricMemoryCacheNumSharedEntries{ + "velox.memory_cache_num_shared_entries"}; + +constexpr folly::StringPiece kMetricMemoryCacheNumExclusiveEntries{ + "velox.memory_cache_num_exclusive_entries"}; + +constexpr folly::StringPiece kMetricMemoryCacheNumPrefetchedEntries{ + "velox.memory_cache_num_prefetched_entries"}; + +constexpr folly::StringPiece kMetricMemoryCacheTotalTinyBytes{ + "velox.memory_cache_total_tiny_bytes"}; + +constexpr folly::StringPiece kMetricMemoryCacheTotalLargeBytes{ + "velox.memory_cache_total_large_bytes"}; + +constexpr folly::StringPiece kMetricMemoryCacheTotalTinyPaddingBytes{ + "velox.memory_cache_total_tiny_padding_bytes"}; + +constexpr folly::StringPiece kMetricMemoryCacheTotalLargePaddingBytes{ + "velox.memory_cache_total_large_padding_bytes"}; + +constexpr folly::StringPiece kMetricMemoryCacheTotalPrefetchBytes{ + "velox.memory_cache_total_prefetched_bytes"}; + +constexpr folly::StringPiece kMetricMemoryCacheSumEvictScore{ + "velox.memory_cache_sum_evict_score"}; + +constexpr folly::StringPiece kMetricMemoryCacheNumHits{ + "velox.memory_cache_num_hits"}; + +constexpr folly::StringPiece kMetricMemoryCacheHitBytes{ + "velox.memory_cache_hit_bytes"}; + +constexpr folly::StringPiece kMetricMemoryCacheNumNew{ + "velox.memory_cache_num_new"}; + +constexpr folly::StringPiece kMetricMemoryCacheNumEvicts{ + "velox.memory_cache_num_evicts"}; + +constexpr folly::StringPiece 
kMetricMemoryCacheNumSavableEvicts{ + "velox.memory_cache_num_savable_evicts"}; + +constexpr folly::StringPiece kMetricMemoryCacheNumEvictChecks{ + "velox.memory_cache_num_evict_checks"}; + +constexpr folly::StringPiece kMetricMemoryCacheNumWaitExclusive{ + "velox.memory_cache_num_wait_exclusive"}; + +constexpr folly::StringPiece kMetricMemoryCacheNumAllocClocks{ + "velox.memory_cache_num_alloc_clocks"}; + +constexpr folly::StringPiece kMetricMemoryCacheNumAgedOutEntries{ + "velox.memory_cache_num_aged_out_entries"}; + +constexpr folly::StringPiece kMetricMemoryCacheNumStaleEntries{ + "velox.memory_cache_num_stale_entries"}; + +constexpr folly::StringPiece kMetricSsdCacheCachedRegions{ + "velox.ssd_cache_cached_regions"}; + +constexpr folly::StringPiece kMetricSsdCacheCachedEntries{ + "velox.ssd_cache_cached_entries"}; + +constexpr folly::StringPiece kMetricSsdCacheCachedBytes{ + "velox.ssd_cache_cached_bytes"}; + +constexpr folly::StringPiece kMetricSsdCacheReadEntries{ + "velox.ssd_cache_read_entries"}; + +constexpr folly::StringPiece kMetricSsdCacheReadBytes{ + "velox.ssd_cache_read_bytes"}; + +constexpr folly::StringPiece kMetricSsdCacheWrittenEntries{ + "velox.ssd_cache_written_entries"}; + +constexpr folly::StringPiece kMetricSsdCacheWrittenBytes{ + "velox.ssd_cache_written_bytes"}; + +constexpr folly::StringPiece kMetricSsdCacheAgedOutEntries{ + "velox.ssd_cache_aged_out_entries"}; + +constexpr folly::StringPiece kMetricSsdCacheAgedOutRegions{ + "velox.ssd_cache_aged_out_regions"}; + +constexpr folly::StringPiece kMetricSsdCacheOpenSsdErrors{ + "velox.ssd_cache_open_ssd_errors"}; + +constexpr folly::StringPiece kMetricSsdCacheOpenCheckpointErrors{ + "velox.ssd_cache_open_checkpoint_errors"}; + +constexpr folly::StringPiece kMetricSsdCacheOpenLogErrors{ + "velox.ssd_cache_open_log_errors"}; + +constexpr folly::StringPiece kMetricSsdCacheDeleteCheckpointErrors{ + "velox.ssd_cache_delete_checkpoint_errors"}; + +constexpr folly::StringPiece kMetricSsdCacheGrowFileErrors{ + "velox.ssd_cache_grow_file_errors"}; + +constexpr folly::StringPiece kMetricSsdCacheWriteSsdErrors{ + "velox.ssd_cache_write_ssd_errors"}; + +constexpr folly::StringPiece kMetricSsdCacheWriteSsdDropped{ + "velox.ssd_cache_write_ssd_dropped"}; + +constexpr folly::StringPiece kMetricSsdCacheWriteCheckpointErrors{ + "velox.ssd_cache_write_checkpoint_errors"}; + +constexpr folly::StringPiece kMetricSsdCacheReadCorruptions{ + "velox.ssd_cache_read_corruptions"}; + +constexpr folly::StringPiece kMetricSsdCacheReadSsdErrors{ + "velox.ssd_cache_read_ssd_errors"}; + +constexpr folly::StringPiece kMetricSsdCacheReadCheckpointErrors{ + "velox.ssd_cache_read_checkpoint_errors"}; + +constexpr folly::StringPiece kMetricSsdCacheReadWithoutChecksum{ + "velox.ssd_cache_read_without_checksum"}; + +constexpr folly::StringPiece kMetricSsdCacheCheckpointsRead{ + "velox.ssd_cache_checkpoints_read"}; + +constexpr folly::StringPiece kMetricSsdCacheCheckpointsWritten{ + "velox.ssd_cache_checkpoints_written"}; + +constexpr folly::StringPiece kMetricSsdCacheRegionsEvicted{ + "velox.ssd_cache_regions_evicted"}; + +constexpr folly::StringPiece kMetricSsdCacheRecoveredEntries{ + "velox.ssd_cache_recovered_entries"}; + +constexpr folly::StringPiece kMetricExchangeDataTimeMs{ + "velox.exchange_data_time_ms"}; + +constexpr folly::StringPiece kMetricExchangeDataBytes{ + "velox.exchange_data_bytes"}; + +constexpr folly::StringPiece kMetricExchangeDataSize{ + "velox.exchange_data_size"}; + +constexpr folly::StringPiece kMetricExchangeDataCount{ + 
"velox.exchange_data_count"}; + +constexpr folly::StringPiece kMetricExchangeDataSizeTimeMs{ + "velox.exchange_data_size_time_ms"}; + +constexpr folly::StringPiece kMetricExchangeDataSizeCount{ + "velox.exchange_data_size_count"}; + +constexpr folly::StringPiece kMetricStorageThrottledDurationMs{ + "velox.storage_throttled_duration_ms"}; + +constexpr folly::StringPiece kMetricStorageLocalThrottled{ + "velox.storage_local_throttled_count"}; + +constexpr folly::StringPiece kMetricStorageGlobalThrottled{ + "velox.storage_global_throttled_count"}; } // namespace facebook::velox diff --git a/velox/common/base/Doubles.h b/velox/common/base/Doubles.h new file mode 100644 index 0000000000000..d400d92ba1885 --- /dev/null +++ b/velox/common/base/Doubles.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace facebook::velox { + +/// Comparing double against int64:max() like the following would get compile +/// time error on some compilers(e.g. clang 12 and above): +/// +/// ``` +/// double_value <= int64::max() +/// double_value >= int64::max() +/// ``` +/// +/// Here `int64::max()` will be implicitly converted to double, but due to the +/// floating point nature of double, converting `int64::max()` to double lose +/// precision(see [1]), so instead of comparing double with int64:max(), we +/// suggest compare it with the max double value below int64:max() 2 ^ 63 - +/// 1024 for < or <=, 2 ^ 63 for >, >=. 
+/// +/// [1].https://en.wikipedia.org/wiki/Double-precision_floating-point_format#Precision_limitations_on_integer_values + +/// 2 ^ 63 - 1024 +constexpr double kMaxDoubleBelowInt64Max = 9223372036854774784.0; +/// 2 ^ 63 +constexpr double kMinDoubleAboveInt64Max = 9223372036854775808.0; + +/// For int128::max() +/// 2 ^ 127 - 2 ^ 74 +constexpr double kMaxDoubleBelowInt128Max = + 170141183460469212842221372237303250944.0; +} // namespace facebook::velox diff --git a/velox/common/base/Exceptions.cpp b/velox/common/base/Exceptions.cpp index 62a28388496c6..0d6e65b1d76c2 100644 --- a/velox/common/base/Exceptions.cpp +++ b/velox/common/base/Exceptions.cpp @@ -16,13 +16,9 @@ #include "velox/common/base/Exceptions.h" -namespace facebook { -namespace velox { -namespace detail { +namespace facebook::velox::detail { DEFINE_CHECK_FAIL_TEMPLATES(::facebook::velox::VeloxRuntimeError); DEFINE_CHECK_FAIL_TEMPLATES(::facebook::velox::VeloxUserError); -} // namespace detail -} // namespace velox -} // namespace facebook +} // namespace facebook::velox::detail diff --git a/velox/common/base/Exceptions.h b/velox/common/base/Exceptions.h index 1a96c5e59da5f..47aedd2af7b7a 100644 --- a/velox/common/base/Exceptions.h +++ b/velox/common/base/Exceptions.h @@ -21,15 +21,12 @@ #include #include -#include - -#include -#include #include + +#include "velox/common/base/FmtStdFormatters.h" #include "velox/common/base/VeloxException.h" -namespace facebook { -namespace velox { +namespace facebook::velox { namespace detail { struct VeloxCheckFailArgs { @@ -157,15 +154,15 @@ std::string errorMessage(fmt::string_view fmt, const Args&... args) { } // namespace detail #define _VELOX_THROW_IMPL( \ - exception, expr_str, errorSource, errorCode, isRetriable, ...) \ - { \ + exception, exprStr, errorSource, errorCode, isRetriable, ...) \ + do { \ /* GCC 9.2.1 doesn't accept this code with constexpr. */ \ static const ::facebook::velox::detail::VeloxCheckFailArgs \ veloxCheckFailArgs = { \ __FILE__, \ __LINE__, \ __FUNCTION__, \ - expr_str, \ + exprStr, \ errorSource, \ errorCode, \ isRetriable}; \ @@ -174,35 +171,53 @@ std::string errorMessage(fmt::string_view fmt, const Args&... args) { exception, \ typename ::facebook::velox::detail::VeloxCheckFailStringType< \ decltype(message)>::type>(veloxCheckFailArgs, message); \ - } - -#define _VELOX_CHECK_AND_THROW_IMPL( \ - expr, expr_str, exception, errorSource, errorCode, isRetriable, ...) \ - if (UNLIKELY(!(expr))) { \ - _VELOX_THROW_IMPL( \ - exception, \ - expr_str, \ - errorSource, \ - errorCode, \ - isRetriable, \ - __VA_ARGS__); \ - } + } while (0) + +#define _VELOX_CHECK_AND_THROW_IMPL( \ + expr, exprStr, exception, errorSource, errorCode, isRetriable, ...) \ + do { \ + if (UNLIKELY(!(expr))) { \ + _VELOX_THROW_IMPL( \ + exception, \ + exprStr, \ + errorSource, \ + errorCode, \ + isRetriable, \ + __VA_ARGS__); \ + } \ + } while (0) #define _VELOX_THROW(exception, ...) \ _VELOX_THROW_IMPL(exception, "", ##__VA_ARGS__) DECLARE_CHECK_FAIL_TEMPLATES(::facebook::velox::VeloxRuntimeError); -#define _VELOX_CHECK_IMPL(expr, expr_str, ...) \ +#define _VELOX_CHECK_IMPL(expr, exprStr, ...) \ _VELOX_CHECK_AND_THROW_IMPL( \ expr, \ - expr_str, \ + exprStr, \ ::facebook::velox::VeloxRuntimeError, \ ::facebook::velox::error_source::kErrorSourceRuntime.c_str(), \ ::facebook::velox::error_code::kInvalidState.c_str(), \ /* isRetriable */ false, \ ##__VA_ARGS__) +/// Throws VeloxRuntimeError when functions receive input values out of the +/// supported range. 
This should only be used when we want to force TRY() to not +/// suppress the error. +#define VELOX_CHECK_UNSUPPORTED_INPUT_UNCATCHABLE(expr, ...) \ + do { \ + if (UNLIKELY(!(expr))) { \ + _VELOX_THROW_IMPL( \ + ::facebook::velox::VeloxRuntimeError, \ + #expr, \ + ::facebook::velox::error_source::kErrorSourceRuntime.c_str(), \ + ::facebook::velox::error_code::kUnsupportedInputUncatchable.c_str(), \ + /* isRetriable */ false, \ + __VA_ARGS__); \ + } \ + } while (0) + // If the caller passes a custom message (4 *or more* arguments), we // have to construct a format string from ours ("({} vs. {})") plus // theirs by adding a space and shuffling arguments. If they don't (exactly 3 @@ -220,25 +235,27 @@ DECLARE_CHECK_FAIL_TEMPLATES(::facebook::velox::VeloxRuntimeError); ##__VA_ARGS__) #define _VELOX_CHECK_OP_HELPER(implmacro, expr1, expr2, op, ...) \ - if constexpr (FOLLY_PP_DETAIL_NARGS(__VA_ARGS__) > 0) { \ - _VELOX_CHECK_OP_WITH_USER_FMT_HELPER( \ - implmacro, expr1, expr2, op, __VA_ARGS__); \ - } else { \ - implmacro( \ - (expr1)op(expr2), \ - #expr1 " " #op " " #expr2, \ - "({} vs. {})", \ - expr1, \ - expr2); \ - } + do { \ + if constexpr (FOLLY_PP_DETAIL_NARGS(__VA_ARGS__) > 0) { \ + _VELOX_CHECK_OP_WITH_USER_FMT_HELPER( \ + implmacro, expr1, expr2, op, __VA_ARGS__); \ + } else { \ + implmacro( \ + (expr1)op(expr2), \ + #expr1 " " #op " " #expr2, \ + "({} vs. {})", \ + expr1, \ + expr2); \ + } \ + } while (0) #define _VELOX_CHECK_OP(expr1, expr2, op, ...) \ _VELOX_CHECK_OP_HELPER(_VELOX_CHECK_IMPL, expr1, expr2, op, ##__VA_ARGS__) -#define _VELOX_USER_CHECK_IMPL(expr, expr_str, ...) \ +#define _VELOX_USER_CHECK_IMPL(expr, exprStr, ...) \ _VELOX_CHECK_AND_THROW_IMPL( \ expr, \ - expr_str, \ + exprStr, \ ::facebook::velox::VeloxUserError, \ ::facebook::velox::error_source::kErrorSourceUser.c_str(), \ ::facebook::velox::error_code::kInvalidArgument.c_str(), \ @@ -261,6 +278,12 @@ DECLARE_CHECK_FAIL_TEMPLATES(::facebook::velox::VeloxRuntimeError); #define VELOX_CHECK_NULL(e, ...) VELOX_CHECK(e == nullptr, ##__VA_ARGS__) #define VELOX_CHECK_NOT_NULL(e, ...) VELOX_CHECK(e != nullptr, ##__VA_ARGS__) +#define VELOX_CHECK_OK(expr) \ + do { \ + ::facebook::velox::Status _s = (expr); \ + _VELOX_CHECK_IMPL(_s.ok(), #expr, _s.toString()); \ + } while (false) + #define VELOX_UNSUPPORTED(...) \ _VELOX_THROW( \ ::facebook::velox::VeloxUserError, \ @@ -285,6 +308,14 @@ DECLARE_CHECK_FAIL_TEMPLATES(::facebook::velox::VeloxRuntimeError); /* isRetriable */ false, \ ##__VA_ARGS__) +#define VELOX_FILE_NOT_FOUND_ERROR(...) \ + _VELOX_THROW( \ + ::facebook::velox::VeloxRuntimeError, \ + ::facebook::velox::error_source::kErrorSourceRuntime.c_str(), \ + ::facebook::velox::error_code::kFileNotFound.c_str(), \ + /* isRetriable */ false, \ + ##__VA_ARGS__) + #define VELOX_UNREACHABLE(...) \ _VELOX_THROW( \ ::facebook::velox::VeloxRuntimeError, \ @@ -311,6 +342,7 @@ DECLARE_CHECK_FAIL_TEMPLATES(::facebook::velox::VeloxRuntimeError); #define VELOX_DCHECK_LE(e1, e2, ...) VELOX_CHECK(true) #define VELOX_DCHECK_EQ(e1, e2, ...) VELOX_CHECK(true) #define VELOX_DCHECK_NE(e1, e2, ...) VELOX_CHECK(true) +#define VELOX_DCHECK_NULL(e, ...) VELOX_CHECK(true) #define VELOX_DCHECK_NOT_NULL(e, ...) VELOX_CHECK(true) #endif @@ -322,6 +354,17 @@ DECLARE_CHECK_FAIL_TEMPLATES(::facebook::velox::VeloxRuntimeError); /* isRetriable */ false, \ ##__VA_ARGS__) +/// Throws VeloxRuntimeError when functions receive input values out of the +/// supported range. This should only be used when we want to force TRY() to not +/// suppress the error. 
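A hedged usage sketch tying together the check macros above and the `Doubles.h` constants; `copyPrefix` and `fitsInInt64` are illustrative functions, not Velox APIs.

```
#include <cstdint>
#include <cstring>
#include "velox/common/base/Doubles.h"
#include "velox/common/base/Exceptions.h"

// VELOX_CHECK_LE throws VeloxRuntimeError with "(a vs. b)" context appended
// to the optional user message; VELOX_CHECK_NOT_NULL throws on nullptr.
void copyPrefix(const char* src, char* dst, int32_t n, int32_t capacity) {
  VELOX_CHECK_NOT_NULL(src);
  VELOX_CHECK_LE(n, capacity, "copy would overflow the destination");
  std::memcpy(dst, src, n);
}

// Exact upper-bound check per Doubles.h: no implicit int64 -> double
// rounding is involved. (The missing lower-bound check is deliberate; this
// sketch only shows the constant's intended use.)
bool fitsInInt64(double d) {
  return d <= facebook::velox::kMaxDoubleBelowInt64Max;
}
```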
+#define VELOX_FAIL_UNSUPPORTED_INPUT_UNCATCHABLE(...) \ + _VELOX_THROW( \ + ::facebook::velox::VeloxRuntimeError, \ + ::facebook::velox::error_source::kErrorSourceRuntime.c_str(), \ + ::facebook::velox::error_code::kUnsupportedInputUncatchable.c_str(), \ + /* isRetriable */ false, \ + ##__VA_ARGS__) + DECLARE_CHECK_FAIL_TEMPLATES(::facebook::velox::VeloxUserError); // For all below macros, an additional message can be passed using a @@ -390,5 +433,4 @@ DECLARE_CHECK_FAIL_TEMPLATES(::facebook::velox::VeloxUserError); /* isRetriable */ false, \ ##__VA_ARGS__) -} // namespace velox -} // namespace facebook +} // namespace facebook::velox diff --git a/velox/common/base/FmtStdFormatters.h b/velox/common/base/FmtStdFormatters.h new file mode 100644 index 0000000000000..90784d8ce461a --- /dev/null +++ b/velox/common/base/FmtStdFormatters.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#if FMT_VERSION >= 100100 +#include +#endif + +#include +#include +#include +#include +#include +#include + +template +struct fmt::formatter + : formatter::type, Char> { + template + auto format(std::errc v, FormatContext& ctx) const -> decltype(ctx.out()) { + using underlying_type = std::underlying_type::type; + return formatter::format( + static_cast(v), ctx); + } +}; + +#if FMT_VERSION < 100100 +// This should be 100101 but FMT_VERSION was not bumped in 10.1.1 +// but under a month has passed since 10.1.0 release so we can assume 10.1.1 +// +// Backport from fmt 10.1.1 see fmtlib/fmt#3574 +// Formats std::atomic +template +struct fmt::formatter< + std::atomic, + Char, + std::enable_if_t::value>> + : formatter { + template + auto format(const std::atomic& v, FormatContext& ctx) const + -> decltype(ctx.out()) { + return formatter::format(v.load(), ctx); + } +}; +#endif + +#if FMT_VERSION < 100100 +// Backport from fmt 10.1 see fmtlib/fmt#3570 +// Formats std::vector +namespace fmt::detail { +template +struct has_flip : std::false_type {}; + +template +struct has_flip().flip())>> + : std::true_type {}; + +template +struct is_bit_reference_like { + static constexpr const bool value = std::is_convertible::value && + std::is_nothrow_assignable::value && has_flip::value; +}; + +#ifdef _LIBCPP_VERSION + +// Workaround for libc++ incompatibility with C++ standard. +// According to the Standard, `bitset::operator[] const` returns bool. +template +struct is_bit_reference_like> { + static constexpr const bool value = true; +}; + +#endif +} // namespace fmt::detail + +// We can't use std::vector::reference and +// std::bitset::reference because the compiler can't deduce Allocator and N +// in partial specialization. 
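Usage-wise, the formatters in this header let `std::errc` and `std::atomic` values flow straight into `fmt::format`, including via the bit-reference specialization that follows below. A hedged sketch (the exact numeric output of the errc value is platform-specific):

```
#include <atomic>
#include <string>
#include <system_error>

#include <fmt/format.h>
#include "velox/common/base/FmtStdFormatters.h"

std::string demo() {
  std::atomic<int> inFlight{7};
  // std::errc formats as its underlying integer value; std::atomic<int>
  // formats as the loaded value (natively on fmt >= 10.1, via the backport
  // above otherwise).
  return fmt::format(
      "errc={} inFlight={}", std::errc::invalid_argument, inFlight);
}
```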
+template +struct fmt::formatter< + BitRef, + Char, + std::enable_if_t::value>> + : formatter { + template + FMT_CONSTEXPR auto format(const BitRef& v, FormatContext& ctx) const + -> decltype(ctx.out()) { + return formatter::format(v, ctx); + } +}; +#endif diff --git a/velox/common/base/Fs.cpp b/velox/common/base/Fs.cpp index 82c748fae843a..bbea7b77878ba 100644 --- a/velox/common/base/Fs.cpp +++ b/velox/common/base/Fs.cpp @@ -15,6 +15,7 @@ */ #include "velox/common/base/Fs.h" + #include #include diff --git a/velox/common/base/Macros.h b/velox/common/base/Macros.h index 03e45f7ce1f94..48bc0f38dcdfc 100644 --- a/velox/common/base/Macros.h +++ b/velox/common/base/Macros.h @@ -18,19 +18,13 @@ // Macros to disable deprecation warnings #ifdef __clang__ -#define VELOX_SUPPRESS_DEPRECATION_WARNING \ - _Pragma("clang diagnostic push"); \ - _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") -#define VELOX_UNSUPPRESS_DEPRECATION_WARNING _Pragma("clang diagnostic pop"); -#define VELOX_SUPPRESS_RETURN_LOCAL_ADDR_WARNING -#define VELOX_UNSUPPRESS_RETURN_LOCAL_ADDR_WARNING +#define VELOX_SUPPRESS_STRINGOP_OVERFLOW_WARNING +#define VELOX_UNSUPPRESS_STRINGOP_OVERFLOW_WARNING #else -#define VELOX_SUPPRESS_DEPRECATION_WARNING -#define VELOX_UNSUPPRESS_DEPRECATION_WARNING -#define VELOX_SUPPRESS_RETURN_LOCAL_ADDR_WARNING \ +#define VELOX_SUPPRESS_STRINGOP_OVERFLOW_WARNING \ _Pragma("GCC diagnostic push"); \ - _Pragma("GCC diagnostic ignored \"-Wreturn-local-addr\"") -#define VELOX_UNSUPPRESS_RETURN_LOCAL_ADDR_WARNING \ + _Pragma("GCC diagnostic ignored \"-Wstringop-overflow\"") +#define VELOX_UNSUPPRESS_STRINGOP_OVERFLOW_WARNING \ _Pragma("GCC diagnostic pop"); #endif diff --git a/velox/common/base/PeriodicStatsReporter.cpp b/velox/common/base/PeriodicStatsReporter.cpp new file mode 100644 index 0000000000000..5bd8781a6b083 --- /dev/null +++ b/velox/common/base/PeriodicStatsReporter.cpp @@ -0,0 +1,259 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/base/PeriodicStatsReporter.h" +#include "velox/common/base/Counters.h" +#include "velox/common/base/StatsReporter.h" +#include "velox/common/caching/CacheTTLController.h" +#include "velox/common/memory/Memory.h" +#include "velox/common/memory/MmapAllocator.h" + +namespace facebook::velox { + +namespace { +#define REPORT_IF_NOT_ZERO(name, counter) \ + if ((counter) != 0) { \ + RECORD_METRIC_VALUE((name), (counter)); \ + } + +std::mutex& instanceMutex() { + static std::mutex instanceMu; + return instanceMu; +} + +// Global instance. Must be called while holding a lock over instanceMutex(). 
+std::unique_ptr<PeriodicStatsReporter>& instance() {
+  static std::unique_ptr<PeriodicStatsReporter> reporter;
+  return reporter;
+}
+} // namespace
+
+void startPeriodicStatsReporter(const PeriodicStatsReporter::Options& options) {
+  std::lock_guard<std::mutex> l(instanceMutex());
+  auto& instanceRef = instance();
+  VELOX_CHECK_NULL(
+      instanceRef, "The periodic stats reporter has already started.");
+  instanceRef = std::make_unique<PeriodicStatsReporter>(options);
+  instanceRef->start();
+}
+
+void stopPeriodicStatsReporter() {
+  std::lock_guard<std::mutex> l(instanceMutex());
+  auto& instanceRef = instance();
+  VELOX_CHECK_NOT_NULL(instanceRef, "No periodic stats reporter to stop.");
+  instanceRef->stop();
+  instanceRef.reset();
+}
+
+PeriodicStatsReporter::PeriodicStatsReporter(const Options& options)
+    : allocator_(options.allocator),
+      cache_(options.cache),
+      arbitrator_(options.arbitrator),
+      spillMemoryPool_(options.spillMemoryPool),
+      options_(options) {}
+
+void PeriodicStatsReporter::start() {
+  LOG(INFO) << "Starting PeriodicStatsReporter with options "
+            << options_.toString();
+  addTask(
+      "report_allocator_stats",
+      [this]() { reportAllocatorStats(); },
+      options_.allocatorStatsIntervalMs);
+  addTask(
+      "report_cache_stats",
+      [this]() { reportCacheStats(); },
+      options_.cacheStatsIntervalMs);
+  addTask(
+      "report_arbitrator_stats",
+      [this]() { reportArbitratorStats(); },
+      options_.arbitratorStatsIntervalMs);
+  addTask(
+      "report_spill_stats",
+      [this]() { reportSpillStats(); },
+      options_.spillStatsIntervalMs);
+}
+
+void PeriodicStatsReporter::stop() {
+  LOG(INFO) << "Stopping PeriodicStatsReporter";
+  scheduler_.stop();
+}
+
+void PeriodicStatsReporter::reportArbitratorStats() {
+  if (arbitrator_ == nullptr) {
+    return;
+  }
+
+  const auto stats = arbitrator_->stats();
+  RECORD_METRIC_VALUE(
+      kMetricArbitratorFreeCapacityBytes,
+      stats.freeCapacityBytes + stats.freeReservedCapacityBytes);
+  RECORD_METRIC_VALUE(
+      kMetricArbitratorFreeReservedCapacityBytes,
+      stats.freeReservedCapacityBytes);
+}
+
+void PeriodicStatsReporter::reportAllocatorStats() {
+  if (allocator_ == nullptr) {
+    return;
+  }
+  RECORD_METRIC_VALUE(
+      kMetricMappedMemoryBytes,
+      (velox::memory::AllocationTraits::pageBytes(allocator_->numMapped())));
+  RECORD_METRIC_VALUE(
+      kMetricAllocatedMemoryBytes,
+      (velox::memory::AllocationTraits::pageBytes(
+          allocator_->numAllocated())));
+  // TODO(jtan6): Remove condition after T150019700 is done
+  if (auto* mmapAllocator =
+          dynamic_cast<const velox::memory::MmapAllocator*>(allocator_)) {
+    RECORD_METRIC_VALUE(
+        kMetricMmapDelegatedAllocBytes, (mmapAllocator->numMallocBytes()));
+    RECORD_METRIC_VALUE(
+        kMetricMmapExternalMappedBytes,
+        velox::memory::AllocationTraits::pageBytes(
+            (mmapAllocator->numExternalMapped())));
+  }
+  // TODO(xiaoxmeng): add memory allocation size stats.
+}
+
+void PeriodicStatsReporter::reportCacheStats() {
+  if (cache_ == nullptr) {
+    return;
+  }
+  const auto cacheStats = cache_->refreshStats();
+
+  // Memory cache snapshot stats.
+  RECORD_METRIC_VALUE(kMetricMemoryCacheNumEntries, cacheStats.numEntries);
+  RECORD_METRIC_VALUE(
+      kMetricMemoryCacheNumEmptyEntries, cacheStats.numEmptyEntries);
+  RECORD_METRIC_VALUE(kMetricMemoryCacheNumSharedEntries, cacheStats.numShared);
+  RECORD_METRIC_VALUE(
+      kMetricMemoryCacheNumExclusiveEntries, cacheStats.numExclusive);
+  RECORD_METRIC_VALUE(
+      kMetricMemoryCacheNumPrefetchedEntries, cacheStats.numPrefetch);
+  RECORD_METRIC_VALUE(kMetricMemoryCacheTotalTinyBytes, cacheStats.tinySize);
+  RECORD_METRIC_VALUE(kMetricMemoryCacheTotalLargeBytes, cacheStats.largeSize);
+  RECORD_METRIC_VALUE(
+      kMetricMemoryCacheTotalTinyPaddingBytes, cacheStats.tinyPadding);
+  RECORD_METRIC_VALUE(
+      kMetricMemoryCacheTotalLargePaddingBytes, cacheStats.largePadding);
+  RECORD_METRIC_VALUE(
+      kMetricMemoryCacheTotalPrefetchBytes, cacheStats.prefetchBytes);
+
+  // Memory cache cumulative stats.
+  const auto deltaCacheStats = cacheStats - lastCacheStats_;
+
+  REPORT_IF_NOT_ZERO(kMetricMemoryCacheNumHits, deltaCacheStats.numHit);
+  REPORT_IF_NOT_ZERO(kMetricMemoryCacheHitBytes, deltaCacheStats.hitBytes);
+  REPORT_IF_NOT_ZERO(kMetricMemoryCacheNumNew, deltaCacheStats.numNew);
+  REPORT_IF_NOT_ZERO(kMetricMemoryCacheNumEvicts, deltaCacheStats.numEvict);
+  REPORT_IF_NOT_ZERO(
+      kMetricMemoryCacheNumSavableEvicts, deltaCacheStats.numSavableEvict);
+  REPORT_IF_NOT_ZERO(
+      kMetricMemoryCacheNumEvictChecks, deltaCacheStats.numEvictChecks);
+  REPORT_IF_NOT_ZERO(
+      kMetricMemoryCacheNumWaitExclusive, deltaCacheStats.numWaitExclusive);
+  REPORT_IF_NOT_ZERO(
+      kMetricMemoryCacheNumAllocClocks, deltaCacheStats.allocClocks);
+  REPORT_IF_NOT_ZERO(
+      kMetricMemoryCacheNumAgedOutEntries, deltaCacheStats.numAgedOut);
+  REPORT_IF_NOT_ZERO(
+      kMetricMemoryCacheSumEvictScore, deltaCacheStats.sumEvictScore);
+
+  // SSD cache snapshot stats.
+  if (cacheStats.ssdStats != nullptr) {
+    RECORD_METRIC_VALUE(
+        kMetricSsdCacheCachedEntries, cacheStats.ssdStats->entriesCached);
+    RECORD_METRIC_VALUE(
+        kMetricSsdCacheCachedRegions, cacheStats.ssdStats->regionsCached);
+    RECORD_METRIC_VALUE(
+        kMetricSsdCacheCachedBytes, cacheStats.ssdStats->bytesCached);
+  }
+
+  // SSD cache cumulative stats.
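+  // Cumulative counters are reported as deltas against the previous report,
+  // so downstream consumers can derive per-interval rates.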
+  if (deltaCacheStats.ssdStats != nullptr) {
+    const auto deltaSsdStats = *deltaCacheStats.ssdStats;
+    REPORT_IF_NOT_ZERO(kMetricSsdCacheReadEntries, deltaSsdStats.entriesRead);
+    REPORT_IF_NOT_ZERO(kMetricSsdCacheReadBytes, deltaSsdStats.bytesRead);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheWrittenEntries, deltaSsdStats.entriesWritten);
+    REPORT_IF_NOT_ZERO(kMetricSsdCacheWrittenBytes, deltaSsdStats.bytesWritten);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheOpenSsdErrors, deltaSsdStats.openFileErrors);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheOpenCheckpointErrors,
+        deltaSsdStats.openCheckpointErrors);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheOpenLogErrors, deltaSsdStats.openLogErrors);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheDeleteCheckpointErrors,
+        deltaSsdStats.deleteCheckpointErrors);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheGrowFileErrors, deltaSsdStats.growFileErrors);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheWriteSsdErrors, deltaSsdStats.writeSsdErrors);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheWriteSsdDropped, deltaSsdStats.writeSsdDropped);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheWriteCheckpointErrors,
+        deltaSsdStats.writeCheckpointErrors);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheReadSsdErrors, deltaSsdStats.readSsdErrors);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheReadCorruptions, deltaSsdStats.readSsdCorruptions);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheReadCheckpointErrors,
+        deltaSsdStats.readCheckpointErrors);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheCheckpointsRead, deltaSsdStats.checkpointsRead);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheCheckpointsWritten, deltaSsdStats.checkpointsWritten);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheRegionsEvicted, deltaSsdStats.regionsEvicted);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheAgedOutEntries, deltaSsdStats.entriesAgedOut);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheAgedOutRegions, deltaSsdStats.regionsAgedOut);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheReadWithoutChecksum,
+        deltaSsdStats.readWithoutChecksumChecks);
+    REPORT_IF_NOT_ZERO(
+        kMetricSsdCacheRecoveredEntries, deltaSsdStats.entriesRecovered);
+  }
+
+  // TTL controller snapshot stats.
+  if (auto* cacheTTLController =
+          velox::cache::CacheTTLController::getInstance()) {
+    RECORD_METRIC_VALUE(
+        kMetricCacheMaxAgeSecs,
+        cacheTTLController->getCacheAgeStats().maxAgeSecs);
+  }
+
+  lastCacheStats_ = cacheStats;
+}
+
+void PeriodicStatsReporter::reportSpillStats() {
+  if (spillMemoryPool_ == nullptr) {
+    return;
+  }
+  const auto spillMemoryStats = spillMemoryPool_->stats();
+  LOG(INFO) << "Spill memory usage: current["
+            << velox::succinctBytes(spillMemoryStats.usedBytes) << "] peak["
+            << velox::succinctBytes(spillMemoryStats.peakBytes) << "]";
+  RECORD_METRIC_VALUE(kMetricSpillMemoryBytes, spillMemoryStats.usedBytes);
+  RECORD_METRIC_VALUE(kMetricSpillPeakMemoryBytes, spillMemoryStats.peakBytes);
+}
+
+} // namespace facebook::velox
diff --git a/velox/common/base/PeriodicStatsReporter.h b/velox/common/base/PeriodicStatsReporter.h
new file mode 100644
index 0000000000000..a902f3de8222b
--- /dev/null
+++ b/velox/common/base/PeriodicStatsReporter.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <folly/experimental/ThreadedRepeatingFunctionRunner.h>
+#include "velox/common/caching/AsyncDataCache.h"
+#include "velox/common/caching/SsdFile.h"
+#include "velox/common/memory/MemoryArbitrator.h"
+
+namespace folly {
+class CPUThreadPoolExecutor;
+}
+
+namespace facebook::velox {
+
+namespace memory {
+class MemoryAllocator;
+}
+
+namespace cache {
+class AsyncDataCache;
+}
+
+/// Manages a background daemon thread to report stats through 'StatsReporter'.
+class PeriodicStatsReporter {
+ public:
+  struct Options {
+    Options() {}
+
+    const velox::memory::MemoryAllocator* allocator{nullptr};
+    uint64_t allocatorStatsIntervalMs{2'000};
+
+    const velox::cache::AsyncDataCache* cache{nullptr};
+    uint64_t cacheStatsIntervalMs{60'000};
+
+    const memory::MemoryArbitrator* arbitrator{nullptr};
+    uint64_t arbitratorStatsIntervalMs{60'000};
+
+    const memory::MemoryPool* spillMemoryPool{nullptr};
+    uint64_t spillStatsIntervalMs{60'000};
+
+    std::string toString() const {
+      return fmt::format(
+          "allocatorStatsIntervalMs:{}, cacheStatsIntervalMs:{}, "
+          "arbitratorStatsIntervalMs:{}, spillStatsIntervalMs:{}",
+          allocatorStatsIntervalMs,
+          cacheStatsIntervalMs,
+          arbitratorStatsIntervalMs,
+          spillStatsIntervalMs);
+    }
+  };
+
+  PeriodicStatsReporter(const Options& options = Options());
+
+  /// Starts the report daemon in the background.
+  void start();
+
+  /// Stops the report daemon in the background.
+  void stop();
+
+ private:
+  // Adds a task to run periodically.
+  template <typename TFunc>
+  void addTask(const std::string& taskName, TFunc&& func, size_t intervalMs) {
+    scheduler_.add(
+        taskName,
+        [taskName,
+         intervalMs,
+         func = std::forward<TFunc>(func)]() mutable noexcept {
+          try {
+            func();
+          } catch (const std::exception& e) {
+            LOG(ERROR) << "Error running periodic task " << taskName << ": "
+                       << e.what();
+          }
+          return std::chrono::milliseconds(intervalMs);
+        });
+  }
+
+  void reportCacheStats();
+  void reportAllocatorStats();
+  void reportArbitratorStats();
+  void reportSpillStats();
+
+  const velox::memory::MemoryAllocator* const allocator_{nullptr};
+  const velox::cache::AsyncDataCache* const cache_{nullptr};
+  const velox::memory::MemoryArbitrator* const arbitrator_{nullptr};
+  const velox::memory::MemoryPool* const spillMemoryPool_{nullptr};
+  const Options options_;
+
+  cache::CacheStats lastCacheStats_;
+
+  folly::ThreadedRepeatingFunctionRunner scheduler_;
+};
+
+/// Initializes and starts the process-wide periodic stats reporter. Before
+/// 'stopPeriodicStatsReporter()' is called, this method can only be called
+/// once process-wide; additional calls throw.
+void startPeriodicStatsReporter(const PeriodicStatsReporter::Options& options);
+
+/// Stops the process-wide periodic stats reporter.
+void stopPeriodicStatsReporter();
+
+} // namespace facebook::velox
diff --git a/velox/common/base/Pointers.h b/velox/common/base/Pointers.h
new file mode 100644
index 0000000000000..50eb1bf0b2634
--- /dev/null
+++ b/velox/common/base/Pointers.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <memory>
+
+#include "velox/common/base/Exceptions.h"
+
+namespace facebook::velox {
+
+/// Dynamically casts a unique pointer from 'SourceType' to 'DestinationType'.
+/// The raw object is moved from 'srcPtr' to 'dstPtr' on success. The function
+/// throws if the cast fails, in which case ownership is returned to 'srcPtr'
+/// so the object is still freed.
+template <typename SourceType, typename DestinationType>
+inline void castUniquePointer(
+    std::unique_ptr<SourceType>&& srcPtr,
+    std::unique_ptr<DestinationType>& dstPtr) {
+  auto* rawSrcPtr = srcPtr.release();
+  VELOX_CHECK_NOT_NULL(rawSrcPtr);
+  try {
+    auto* rawDstPtr = dynamic_cast<DestinationType*>(rawSrcPtr);
+    VELOX_CHECK_NOT_NULL(rawDstPtr);
+    dstPtr.reset(rawDstPtr);
+  } catch (const std::exception& e) {
+    srcPtr.reset(rawSrcPtr);
+    throw;
+  }
+}
+} // namespace facebook::velox
diff --git a/velox/common/base/PrefixSortConfig.h b/velox/common/base/PrefixSortConfig.h
new file mode 100644
index 0000000000000..34adee1b7914a
--- /dev/null
+++ b/velox/common/base/PrefixSortConfig.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace facebook::velox::common {
+
+/// Specifies the config for prefix-sort.
+struct PrefixSortConfig {
+  explicit PrefixSortConfig(
+      int64_t maxNormalizedKeySize,
+      int32_t threshold = 130)
+      : maxNormalizedKeySize(maxNormalizedKeySize), threshold(threshold) {}
+
+  /// Max number of bytes that can be used to store normalized keys in the
+  /// prefix-sort buffer per entry.
+  const int64_t maxNormalizedKeySize;
+
+  /// PrefixSort has a performance regression when the dataset is too small.
+  const int32_t threshold;
+};
+} // namespace facebook::velox::common
diff --git a/velox/common/base/RandomUtil.cpp b/velox/common/base/RandomUtil.cpp
index 7353109e263d7..5fe3dcf765989 100644
--- a/velox/common/base/RandomUtil.cpp
+++ b/velox/common/base/RandomUtil.cpp
@@ -16,10 +16,6 @@
 
 #include "velox/common/base/RandomUtil.h"
 
-#include <folly/Random.h>
-
-#include <optional>
-
 namespace facebook::velox::random {
 
 namespace {
@@ -36,4 +32,13 @@ uint32_t getSeed() {
   return customSeed ? *customSeed : folly::Random::rand32();
 }
 
+RandomSkipTracker::RandomSkipTracker(double sampleRate)
+    : sampleRate_(sampleRate) {
+  VELOX_CHECK(0 <= sampleRate && sampleRate < 1);
+  if (sampleRate > 0) {
+    dist_ = std::geometric_distribution<uint64_t>(sampleRate);
+    rng_.seed(getSeed());
+  }
+}
+
 } // namespace facebook::velox::random
diff --git a/velox/common/base/RandomUtil.h b/velox/common/base/RandomUtil.h
index e9ea8f0d2173d..89645a33fa519 100644
--- a/velox/common/base/RandomUtil.h
+++ b/velox/common/base/RandomUtil.h
@@ -16,7 +16,13 @@
 
 #pragma once
 
+#include "velox/common/base/Exceptions.h"
+
+#include <folly/Random.h>
+
 #include <cstdint>
+#include <optional>
+#include <random>
 
 namespace facebook::velox::random {
 
@@ -27,4 +33,66 @@ void setSeed(uint32_t);
 // Return a true random seed unless setSeed() is called before.
 uint32_t getSeed();
 
+/// Utility class to accelerate random sampling based on Bernoulli trials.
+/// Internally this keeps the number of skips before the next hit. Users can
+/// consume a batch of trials by calling `nextSkip' followed by `consume', or
+/// call `testOne' to run the trials one at a time.
+class RandomSkipTracker {
+ public:
+  explicit RandomSkipTracker(double sampleRate);
+
+  RandomSkipTracker(const RandomSkipTracker&) = delete;
+  RandomSkipTracker& operator=(const RandomSkipTracker&) = delete;
+
+  /// Returns the number of skips needed to get a hit. Must be called before
+  /// calling `consume'.
+  uint64_t nextSkip() {
+    if (sampleRate_ == 0) {
+      return std::numeric_limits<uint64_t>::max();
+    }
+    if (skip_.has_value()) {
+      return *skip_;
+    }
+    skip_ = dist_(rng_);
+    return *skip_;
+  }
+
+  /// Consumes the remaining skips followed by at most one hit.
+  void consume(uint64_t numElements) {
+    if (sampleRate_ == 0) {
+      return;
+    }
+    VELOX_DCHECK(skip_.has_value());
+    if (*skip_ >= numElements) {
+      *skip_ -= numElements;
+    } else {
+      VELOX_DCHECK_EQ(numElements - *skip_, 1);
+      skip_.reset();
+    }
+  }
+
+  /// Consumes one trial and returns the result.
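+  ///
+  /// A minimal per-element sampling sketch (caller code assumed, not part of
+  /// this change):
+  ///   RandomSkipTracker tracker(0.01);
+  ///   for (const auto& row : rows) {
+  ///     if (tracker.testOne()) {
+  ///       sample(row);
+  ///     }
+  ///   }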
+  bool testOne() {
+    if (sampleRate_ == 0) {
+      return false;
+    }
+    if (nextSkip() == 0) {
+      skip_.reset();
+      return true;
+    }
+    --*skip_;
+    return false;
+  }
+
+  double sampleRate() const {
+    return sampleRate_;
+  }
+
+ private:
+  const double sampleRate_;
+  std::geometric_distribution<uint64_t> dist_;
+  folly::Random::DefaultGenerator rng_;
+  std::optional<uint64_t> skip_;
+};
+
 } // namespace facebook::velox::random
diff --git a/velox/common/base/Range.h b/velox/common/base/Range.h
index bf061a398ab7b..cc68ccb119469 100644
--- a/velox/common/base/Range.h
+++ b/velox/common/base/Range.h
@@ -16,12 +16,10 @@
 
 #pragma once
 
-#include
 #include
 
 #include "velox/common/base/BitUtil.h"
 
-namespace facebook {
-namespace velox {
+namespace facebook::velox {
 
 template <typename T>
 class Range {
@@ -63,12 +61,12 @@ inline bool Range<bool>::operator[](int32_t idx) const {
 
 template <typename T>
 class WritablePosition {
  public:
-  WritablePosition(uint64_t* pointer, unsigned char bit)
+  WritablePosition(uint64_t* pointer, int8_t bitIndex)
       : pointer_(
             reinterpret_cast<uint64_t>(pointer) |
-            (static_cast<uint64_t>(bit) << 56)) {}
+            (static_cast<uint64_t>(bitIndex) << 56)) {}
 
-  WritablePosition(T* pointer)
+  explicit WritablePosition(T* pointer)
       : pointer_(reinterpret_cast<uint64_t>(pointer)) {}
 
   operator T() const {
@@ -144,8 +142,8 @@ template <>
 inline WritablePosition<bool> MutableRange<bool>::operator[](int32_t idx) {
   int32_t bit = begin_ + idx;
   return WritablePosition<bool>(
-      reinterpret_cast<uint64_t*>(data_) + (bit / 64), bit & 63);
+      reinterpret_cast<uint64_t*>(data_) + (bit / 64),
+      static_cast<int8_t>(bit & 63));
 }
 
-} // namespace velox
-} // namespace facebook
+} // namespace facebook::velox
diff --git a/velox/common/base/RawVector.cpp b/velox/common/base/RawVector.cpp
index d541bbe26b968..6dc1c048d57f6 100644
--- a/velox/common/base/RawVector.cpp
+++ b/velox/common/base/RawVector.cpp
@@ -30,13 +30,14 @@ bool initializeIota() {
   }
 } // namespace
 
-const int32_t* iota(int32_t size, raw_vector<int32_t>& storage) {
-  if (iotaData.size() < size) {
+const int32_t*
+iota(int32_t size, raw_vector<int32_t>& storage, int32_t offset) {
+  if (iotaData.size() < offset + size) {
     storage.resize(size);
-    std::iota(&storage[0], &storage[storage.size()], 0);
+    std::iota(storage.begin(), storage.end(), offset);
     return storage.data();
   }
-  return iotaData.data();
+  return iotaData.data() + offset;
 }
 
 static bool FB_ANONYMOUS_VARIABLE(g_iotaConstants) = initializeIota();
diff --git a/velox/common/base/RawVector.h b/velox/common/base/RawVector.h
index d4941a7fdf5cc..fc6e01d133208 100644
--- a/velox/common/base/RawVector.h
+++ b/velox/common/base/RawVector.h
@@ -67,9 +67,9 @@ class raw_vector {
     data_ = other.data_;
     size_ = other.size_;
     capacity_ = other.capacity_;
-    simd::memset(&other, 0, sizeof(other));
+    other.data_ = nullptr;
+    other.size_ = 0;
+    other.capacity_ = 0;
   }
 
   bool empty() const {
@@ -199,6 +201,7 @@ class raw_vector {
 // SIMD width. Typically returns preallocated memory but if this is
 // not large enough, resizes and initializes 'storage' to the requested
 // size and returns storage.data().
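+// For example (illustrative): iota(4, storage, 10) returns a pointer to
+// {10, 11, 12, 13}, served from the preallocated table whenever it covers
+// offset + size.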
-const int32_t* iota(int32_t size, raw_vector<int32_t>& storage);
+const int32_t*
+iota(int32_t size, raw_vector<int32_t>& storage, int32_t offset = 0);
 
 } // namespace facebook::velox
diff --git a/velox/common/base/RuntimeMetrics.cpp b/velox/common/base/RuntimeMetrics.cpp
index f1150292b8d0b..dc4ad630366d8 100644
--- a/velox/common/base/RuntimeMetrics.cpp
+++ b/velox/common/base/RuntimeMetrics.cpp
@@ -34,7 +34,13 @@ void RuntimeMetric::aggregate() {
   min = max = sum;
 }
 
-void RuntimeMetric::merge(const RuntimeMetric& other) {
+void RuntimeMetric::merge(const RuntimeMetric& other)
+#if defined(__has_feature)
+#if __has_feature(__address_sanitizer__)
+    __attribute__((__no_sanitize__("signed-integer-overflow")))
+#endif
+#endif
+{
   VELOX_CHECK_EQ(unit, other.unit);
   sum += other.sum;
   count += other.count;
diff --git a/velox/common/base/RuntimeMetrics.h b/velox/common/base/RuntimeMetrics.h
index 9a81ff1837916..7bb45ab4807d6 100644
--- a/velox/common/base/RuntimeMetrics.h
+++ b/velox/common/base/RuntimeMetrics.h
@@ -16,11 +16,12 @@
 
 #pragma once
 
-#include
-#include
 #include
 #include
 
+#include
+#include
+
 namespace facebook::velox {
 
 struct RuntimeCounter {
@@ -111,3 +112,10 @@ class RuntimeStatWriterScopeGuard {
 };
 
 } // namespace facebook::velox
+
+template <>
+struct fmt::formatter<facebook::velox::RuntimeCounter::Unit> : formatter<int> {
+  auto format(facebook::velox::RuntimeCounter::Unit s, format_context& ctx)
+      const {
+    return formatter<int>::format(static_cast<int>(s), ctx);
+  }
+};
diff --git a/velox/common/base/Scratch.h b/velox/common/base/Scratch.h
new file mode 100644
index 0000000000000..5866c82063fd9
--- /dev/null
+++ b/velox/common/base/Scratch.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "velox/common/base/RawVector.h"
+
+/// A utility for reusable scoped temporary scratch areas.
+namespace facebook::velox {
+
+/// A collection of temporary reusable scratch vectors. The vectors are
+/// accessed via the ScratchPtr scoped lease. The vectors are padded so that
+/// their last element can be written at full SIMD width, as with raw_vector.
+class Scratch {
+ public:
+  using Item = raw_vector<char>;
+
+  Scratch() = default;
+  Scratch(const Scratch& other) = delete;
+
+  ~Scratch() {
+    reserve(0);
+    ::free(items_);
+    items_ = nullptr;
+    capacity_ = 0;
+    fill_ = 0;
+  }
+  void operator=(const Scratch& other) = delete;
+
+  /// Returns the next reusable scratch vector or makes a new one.
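+  ///
+  /// Callers normally go through ScratchPtr rather than calling this directly
+  /// (a minimal sketch; usage assumed, not part of this change):
+  ///   Scratch scratch;
+  ///   ScratchPtr<int32_t, 64> rowsHolder(scratch);
+  ///   int32_t* rows = rowsHolder.get(numRows); // returned on destruction.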
+  Item get() {
+    if (fill_ == 0) {
+      return Item();
+    }
+    auto temp = std::move(items_[fill_ - 1]);
+    --fill_;
+    retainedSize_ -= temp.capacity();
+    return temp;
+  }
+
+  void release(Item&& item) {
+    retainedSize_ += item.capacity();
+    if (fill_ == capacity_) {
+      reserve(std::max(16, 2 * capacity_));
+    }
+    items_[fill_++] = std::move(item);
+  }
+
+  void trim() {
+    reserve(0);
+    retainedSize_ = 0;
+  }
+
+  size_t retainedSize() {
+    return retainedSize_;
+  }
+
+ private:
+  void reserve(int32_t newCapacity) {
+    VELOX_CHECK_LE(fill_, capacity_);
+    // Delete the items above the new capacity.
+    for (auto i = newCapacity; i < fill_; ++i) {
+      std::destroy_at(&items_[i]);
+    }
+    // Add hint to prevent the compiler from generating the
+    // stringop-overflow warning when 'newCapacity' is 0.
+    folly::assume(capacity_ >= 0);
+    if (newCapacity > capacity_) {
+      Item* newItems =
+          reinterpret_cast<Item*>(::malloc(sizeof(Item) * newCapacity));
+      if (fill_ > 0) {
+        ::memcpy(newItems, items_, fill_ * sizeof(Item));
+      }
+      ::memset(newItems + fill_, 0, (newCapacity - fill_) * sizeof(Item));
+      ::free(items_);
+      items_ = newItems;
+      capacity_ = newCapacity;
+    }
+    fill_ = std::min(fill_, newCapacity);
+  }
+
+  Item* items_{nullptr};
+  int32_t fill_{0};
+  int32_t capacity_{0};
+  // The total size held. If too large from outlier use cases, 'this' should
+  // be trimmed.
+  int64_t retainedSize_{0};
+};
+
+/// A scoped lease for a scratch area of T. For scratch areas <=
+/// 'inlineSize' the scratch area is inlined, typically on stack, and
+/// no allocation will ever take place. The inline storage is padded
+/// with a trailer of simd::kPadding bytes to allow writing at full
+/// SIMD width at the end of the area.
+template <typename T, int32_t inlineSize>
+class ScratchPtr {
+ public:
+  ScratchPtr(Scratch& scratch) : scratch_(&scratch) {}
+
+  ScratchPtr(const ScratchPtr& other) = delete;
+  ScratchPtr(ScratchPtr&& other) = delete;
+
+  inline ~ScratchPtr() {
+    if (data_.data() != nullptr) {
+      scratch_->release(std::move(data_));
+    }
+  }
+
+  void operator=(ScratchPtr&& other) = delete;
+  void operator=(const ScratchPtr& other) = delete;
+
+  /// Returns a writable pointer to at least 'size' uninitialized
+  /// elements of T. The last element is followed by simd::kPadding
+  /// bytes to allow a full width SIMD store for any element. This may
+  /// be called once per lifetime.
+  T* get(int32_t size) {
+    VELOX_CHECK_NULL(ptr_);
+    size_ = size;
+    if (size <= inlineSize) {
+      ptr_ = inline_;
+      return ptr_;
+    }
+    data_ = scratch_->get();
+    data_.resize(size * sizeof(T));
+    ptr_ = reinterpret_cast<T*>(data_.data());
+    return ptr_;
+  }
+
+  /// Returns the pointer returned by a previous get(int32_t).
+  T* get() const {
+    VELOX_DCHECK_NOT_NULL(ptr_);
+    return ptr_;
+  }
+
+  /// Returns the size of the previous get(int32_t).
+  int32_t size() const {
+    return size_;
+  }
+
+ private:
+  Scratch* const scratch_{nullptr};
+
+  raw_vector<char> data_;
+  T* ptr_{nullptr};
+  int32_t size_{0};
+  T inline_[inlineSize];
+  char padding_[inlineSize == 0 ? 0 : simd::kPadding];
+};
+
+} // namespace facebook::velox
diff --git a/velox/common/base/SelectivityInfo.h b/velox/common/base/SelectivityInfo.h
index 825e476321a4b..6d52790d637eb 100644
--- a/velox/common/base/SelectivityInfo.h
+++ b/velox/common/base/SelectivityInfo.h
@@ -58,8 +58,8 @@ class SelectivityInfo {
 class SelectivityTimer {
  public:
   SelectivityTimer(SelectivityInfo& info, uint64_t numIn)
-      : startClocks_(folly::hardware_timestamp()),
-        totalClocks_(&info.timeClocks_) {
+      : totalClocks_(&info.timeClocks_),
+        startClocks_(folly::hardware_timestamp()) {
     info.numIn_ += numIn;
   }
 
@@ -72,8 +72,8 @@ class SelectivityTimer {
   }
 
  private:
-  uint64_t startClocks_;
   uint64_t* const totalClocks_;
+  uint64_t startClocks_;
 };
 
 } // namespace velox
diff --git a/velox/common/base/SimdUtil-inl.h b/velox/common/base/SimdUtil-inl.h
index 63d4fdbe92eaf..87ff71f8b181c 100644
--- a/velox/common/base/SimdUtil-inl.h
+++ b/velox/common/base/SimdUtil-inl.h
@@ -42,22 +42,29 @@ int genericToBitMask(xsimd::batch_bool<T, A> mask) {
 }
 
 template <typename T, typename A>
-xsimd::batch_bool<T, A> fromBitMaskImpl(int mask) {
-  static const auto kMemo = ({
-    constexpr int N = xsimd::batch_bool<T, A>::size;
+struct FromBitMask {
+  FromBitMask() {
     static_assert(N <= 8);
-    std::array<xsimd::batch_bool<T, A>, (1 << N)> memo;
     for (int i = 0; i < (1 << N); ++i) {
       bool tmp[N];
      for (int bit = 0; bit < N; ++bit) {
        tmp[bit] = (i & (1 << bit)) ? true : false;
      }
-      memo[i] = xsimd::batch_bool<T, A>::load_unaligned(tmp);
+      memo_[i] = xsimd::batch_bool<T, A>::load_unaligned(tmp);
     }
-    memo;
-  });
-  return kMemo[mask];
-}
+  }
+
+  xsimd::batch_bool<T, A> operator[](size_t i) const {
+    return memo_[i];
+  }
+
+ private:
+  static constexpr int N = xsimd::batch_bool<T, A>::size;
+  xsimd::batch_bool<T, A> memo_[1 << N];
+};
+
+extern const FromBitMask<int32_t, xsimd::default_arch> fromBitMask32;
+extern const FromBitMask<int64_t, xsimd::default_arch> fromBitMask64;
 
 template <typename T, typename A>
 struct BitMask {
@@ -132,9 +139,10 @@ struct BitMask {
     return genericToBitMask(mask);
   }
 
-  static xsimd::batch_bool<int32_t, A> fromBitMask(int mask, const A&) {
-    return UNLIKELY(mask == kAllSet) ? xsimd::batch_bool<int32_t, A>(true)
-                                     : fromBitMaskImpl<int32_t, A>(mask);
+  static xsimd::batch_bool<int32_t, A> fromBitMask(
+      int mask,
+      const xsimd::default_arch&) {
+    return fromBitMask32[mask];
   }
 };
 
@@ -158,9 +166,10 @@ struct BitMask {
     return genericToBitMask(mask);
   }
 
-  static xsimd::batch_bool<int64_t, A> fromBitMask(int mask, const A&) {
-    return UNLIKELY(mask == kAllSet) ? xsimd::batch_bool<int64_t, A>(true)
-                                     : fromBitMaskImpl<int64_t, A>(mask);
+  static xsimd::batch_bool<int64_t, A> fromBitMask(
+      int mask,
+      const xsimd::default_arch&) {
+    return fromBitMask64[mask];
   }
 };
 
@@ -242,19 +251,72 @@ int32_t indicesOfSetBits(
   return result - originalResult;
 }
 
+namespace detail {
+
 template <typename T, typename A>
-xsimd::batch_bool<T, A> leadingMask(int n, const A&) {
-  constexpr int N = xsimd::batch_bool<T, A>::size;
-  static const auto kMemo = ({
-    std::array<xsimd::batch_bool<T, A>, N> memo;
+struct LeadingMask {
+  LeadingMask() {
     bool tmp[N]{};
     for (int i = 0; i < N; ++i) {
-      memo[i] = xsimd::batch_bool<T, A>::load_unaligned(tmp);
+      memo_[i] = xsimd::batch_bool<T, A>::load_unaligned(tmp);
       tmp[i] = true;
     }
-    memo;
-  });
-  return LIKELY(n >= N) ? xsimd::batch_bool<T, A>(true) : kMemo[n];
+    memo_[N] = xsimd::batch_bool<T, A>::load_unaligned(tmp);
+  }
+
+  xsimd::batch_bool<T, A> operator[](size_t i) const {
+    return memo_[i];
+  }
+
+ private:
+  static constexpr int N = xsimd::batch_bool<T, A>::size;
+  xsimd::batch_bool<T, A> memo_[N + 1];
+};
+
+extern const LeadingMask<int32_t, xsimd::default_arch> leadingMask32;
+extern const LeadingMask<int64_t, xsimd::default_arch> leadingMask64;
+
+template <typename T, typename A>
+xsimd::batch_bool<T, A> leadingMask(int i, const A&);
+
+template <>
+inline xsimd::batch_bool<int32_t, xsimd::default_arch> leadingMask(
+    int i,
+    const xsimd::default_arch&) {
+  return leadingMask32[i];
+}
+
+template <>
+inline xsimd::batch_bool<float, xsimd::default_arch> leadingMask(
+    int i,
+    const xsimd::default_arch&) {
+  return reinterpret_cast<
+      xsimd::batch_bool<float, xsimd::default_arch>::register_type>(
+      leadingMask32[i].data);
+}
+
+template <>
+inline xsimd::batch_bool<int64_t, xsimd::default_arch> leadingMask(
+    int i,
+    const xsimd::default_arch&) {
+  return leadingMask64[i];
+}
+
+template <>
+inline xsimd::batch_bool<double, xsimd::default_arch> leadingMask(
+    int i,
+    const xsimd::default_arch&) {
+  return reinterpret_cast<
+      xsimd::batch_bool<double, xsimd::default_arch>::register_type>(
+      leadingMask64[i].data);
+}
+
+} // namespace detail
+
+template <typename T, typename A>
+xsimd::batch_bool<T, A> leadingMask(int n, const A& arch) {
+  constexpr int N = xsimd::batch_bool<T, A>::size;
+  return detail::leadingMask<T, A>(std::min(n, N), arch);
 }
 
 namespace detail {
@@ -294,7 +356,7 @@ inline bool copyNextWord(void*& to, const void*& from, int32_t& bytes) {
 } // namespace detail
 
 template <typename A>
-void memcpy(void* to, const void* from, int32_t bytes, const A& arch) {
+inline void memcpy(void* to, const void* from, int32_t bytes, const A& arch) {
   while (bytes >= batchByteSize(arch)) {
    if (!detail::copyNextWord<xsimd::batch<int8_t, A>, A>(to, from, bytes)) {
      return;
diff --git a/velox/common/base/SimdUtil.cpp b/velox/common/base/SimdUtil.cpp
index d8a190917e22f..03576ac31ec43 100644
--- a/velox/common/base/SimdUtil.cpp
+++ b/velox/common/base/SimdUtil.cpp
@@ -19,11 +19,49 @@
 
 namespace facebook::velox::simd {
 
+void gatherBits(
+    const uint64_t* bits,
+    folly::Range<const int32_t*> indexRange,
+    uint64_t* result) {
+  constexpr int32_t kStep = xsimd::batch<int32_t>::size;
+  const auto size = indexRange.size();
+  auto indices = indexRange.data();
+  uint8_t* resultPtr = reinterpret_cast<uint8_t*>(result);
+  if (FOLLY_LIKELY(size < 5)) {
+    uint8_t smallResult = 0;
+    for (auto i = 0; i < size; ++i) {
+      smallResult |= static_cast<uint8_t>(bits::isBitSet(bits, indices[i]))
+          << i;
+    }
+    *resultPtr = smallResult;
+    return;
+  }
+
+  int32_t i = 0;
+  for (; i + kStep < size; i += kStep) {
+    uint16_t flags =
+        simd::gather8Bits(bits, xsimd::load_unaligned(indices + i), kStep);
+    bits::storeBitsToByte(flags, resultPtr, i);
+  }
+  const auto bitsLeft = size - i;
+  if (bitsLeft > 0) {
+    uint16_t flags =
+        simd::gather8Bits(bits, xsimd::load_unaligned(indices + i), bitsLeft);
+    bits::storeBitsToByte(flags, resultPtr, i);
+  }
+}
+
 namespace detail {
 
 alignas(kPadding) int32_t byteSetBits[256][8];
 alignas(kPadding) int32_t permute4x64Indices[16][8];
 
+const LeadingMask<int32_t, xsimd::default_arch> leadingMask32;
+const LeadingMask<int64_t, xsimd::default_arch> leadingMask64;
+
+const FromBitMask<int32_t, xsimd::default_arch> fromBitMask32;
+const FromBitMask<int64_t, xsimd::default_arch> fromBitMask64;
+
 } // namespace detail
 
 namespace {
diff --git a/velox/common/base/SimdUtil.h b/velox/common/base/SimdUtil.h
index 7ebbf7931f0bd..9a6ad0c374253 100644
--- a/velox/common/base/SimdUtil.h
+++ b/velox/common/base/SimdUtil.h
@@ -376,6 +376,73 @@ xsimd::batch<T, A> setAll(T value, const A& = {}) {
   }
 }
 
+// Stores 'data' into 'destination' for the lanes in 'mask'. 'mask' is
+// expected to specify contiguous lower lanes of 'data'. For non-SIMD builds,
+// 'mask' is not used; instead, 'n' gives the number of leading lanes of
+// 'data' to store.
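+// (On AVX2 the 4- and 8-byte lane cases compile to a single maskstore; other
+// builds take the scalar fallback over the first 'n' lanes.)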
+template <typename T, typename A>
+inline void storeLeading(
+    const xsimd::batch<T, A>& data,
+    const xsimd::batch_bool<T, A>& mask,
+    int32_t n,
+    T* destination) {
+#if XSIMD_WITH_AVX2
+  if constexpr (sizeof(T) == 8) {
+    _mm256_maskstore_epi64(
+        reinterpret_cast<long long*>(destination),
+        *reinterpret_cast<const __m256i*>(&mask),
+        *reinterpret_cast<const __m256i*>(&data));
+  } else if constexpr (sizeof(T) == 4) {
+    _mm256_maskstore_epi32(
+        reinterpret_cast<int*>(destination),
+        *reinterpret_cast<const __m256i*>(&mask),
+        *reinterpret_cast<const __m256i*>(&data));
+  } else {
+#endif
+    for (auto i = 0; i < n; ++i) {
+      reinterpret_cast<T*>(destination)[i] =
+          reinterpret_cast<const T*>(&data)[i];
+    }
+#if XSIMD_WITH_AVX2
+  }
+#endif
+}
+
+/// Stores elements of 'input' selected by 'indices' into 'output'. output[i] =
+/// input[indices[i]].
+/// Indices and output may be the same. May over-read indices but will not
+/// dereference indices that are not in range. Writes exactly indices.size()
+/// elements of 'output'.
+template <typename TData, typename TIndex>
+inline void transpose(
+    const TData* input,
+    folly::Range<const TIndex*> indices,
+    TData* output) {
+  constexpr int32_t kBatch = xsimd::batch<TData>::size;
+  const auto size = indices.size();
+  int32_t i = 0;
+  for (; i + kBatch < size; i += kBatch) {
+    auto indexBatch = loadGatherIndices<TData, TIndex>(indices.data() + i);
+    simd::gather(input, indexBatch).store_unaligned(output + i);
+  }
+  if (i < size) {
+    const auto numLeft = size - i;
+    auto mask = simd::leadingMask<TData>(numLeft);
+    auto indexBatch = loadGatherIndices<TData, TIndex>(indices.data() + i);
+    const auto values = simd::maskGather(
+        xsimd::broadcast<TData>(0), mask, input, indexBatch);
+    storeLeading(values, mask, numLeft, output + i);
+  }
+}
+
+/// Gathers the bit from 'bits' for each bit offset in 'indices'. Stores the
+/// result in 'result'. Writes one byte of 'result' for each 8 bits. If the
+/// last byte is not full, the trailing bits are undefined.
+void gatherBits(
+    const uint64_t* bits,
+    folly::Range<const int32_t*> indices,
+    uint64_t* result);
+
 // Adds 'bytes' bytes to an address of arbitrary type.
 template <typename T>
 inline T* addBytes(T* pointer, int32_t bytes) {
@@ -385,7 +452,7 @@ inline T* addBytes(T* pointer, int32_t bytes) {
 // 'memcpy' implementation that copies at maximum width and unrolls
 // when 'bytes' is constant.
 template <typename A = xsimd::default_arch>
-void memcpy(void* to, const void* from, int32_t bytes, const A& = {});
+inline void memcpy(void* to, const void* from, int32_t bytes, const A& = {});
 
 // memset implementation that writes at maximum width and unrolls for
 // constant values of 'bytes'.
diff --git a/velox/common/base/SpillConfig.cpp b/velox/common/base/SpillConfig.cpp
new file mode 100644
index 0000000000000..eb02a00b4f1d6
--- /dev/null
+++ b/velox/common/base/SpillConfig.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/common/base/SpillConfig.h"
+#include "velox/common/base/Exceptions.h"
+
+namespace facebook::velox::common {
+SpillConfig::SpillConfig(
+    GetSpillDirectoryPathCB _getSpillDirPathCb,
+    UpdateAndCheckSpillLimitCB _updateAndCheckSpillLimitCb,
+    std::string _fileNamePrefix,
+    uint64_t _maxFileSize,
+    uint64_t _writeBufferSize,
+    uint64_t _readBufferSize,
+    folly::Executor* _executor,
+    int32_t _minSpillableReservationPct,
+    int32_t _spillableReservationGrowthPct,
+    uint8_t _startPartitionBit,
+    uint8_t _numPartitionBits,
+    int32_t _maxSpillLevel,
+    uint64_t _maxSpillRunRows,
+    uint64_t _writerFlushThresholdSize,
+    const std::string& _compressionKind,
+    const std::string& _fileCreateConfig)
+    : getSpillDirPathCb(std::move(_getSpillDirPathCb)),
+      updateAndCheckSpillLimitCb(std::move(_updateAndCheckSpillLimitCb)),
+      fileNamePrefix(std::move(_fileNamePrefix)),
+      maxFileSize(
+          _maxFileSize == 0 ? std::numeric_limits<uint64_t>::max()
+                            : _maxFileSize),
+      writeBufferSize(_writeBufferSize),
+      readBufferSize(_readBufferSize),
+      executor(_executor),
+      minSpillableReservationPct(_minSpillableReservationPct),
+      spillableReservationGrowthPct(_spillableReservationGrowthPct),
+      startPartitionBit(_startPartitionBit),
+      numPartitionBits(_numPartitionBits),
+      maxSpillLevel(_maxSpillLevel),
+      maxSpillRunRows(_maxSpillRunRows),
+      writerFlushThresholdSize(_writerFlushThresholdSize),
+      compressionKind(common::stringToCompressionKind(_compressionKind)),
+      fileCreateConfig(_fileCreateConfig) {
+  VELOX_USER_CHECK_GE(
+      spillableReservationGrowthPct,
+      minSpillableReservationPct,
+      "Spillable memory reservation growth pct should not be lower than minimum available pct");
+}
+
+int32_t SpillConfig::spillLevel(uint8_t startBitOffset) const {
+  VELOX_CHECK_LE(
+      startBitOffset + numPartitionBits,
+      64,
+      "startBitOffset:{} numPartitionBits:{}",
+      startBitOffset,
+      numPartitionBits);
+  const int32_t deltaBits = startBitOffset - startPartitionBit;
+  VELOX_CHECK_GE(deltaBits, 0, "deltaBits:{}", deltaBits);
+  VELOX_CHECK_EQ(
+      deltaBits % numPartitionBits,
+      0,
+      "deltaBits:{} numPartitionBits:{}",
+      deltaBits,
+      numPartitionBits);
+  return deltaBits / numPartitionBits;
+}
+
+bool SpillConfig::exceedSpillLevelLimit(uint8_t startBitOffset) const {
+  if (startBitOffset + numPartitionBits > 64) {
+    return true;
+  }
+  if (maxSpillLevel == -1) {
+    return false;
+  }
+  return spillLevel(startBitOffset) > maxSpillLevel;
+}
+} // namespace facebook::velox::common
diff --git a/velox/common/base/SpillConfig.h b/velox/common/base/SpillConfig.h
new file mode 100644
index 0000000000000..d6ced347e7870
--- /dev/null
+++ b/velox/common/base/SpillConfig.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <functional>
+#include <string>
+
+#include <folly/Executor.h>
+#include "velox/common/compression/Compression.h"
+
+namespace facebook::velox::common {
+
+#define VELOX_SPILL_LIMIT_EXCEEDED(errorMessage)                         \
+  _VELOX_THROW(                                                          \
+      ::facebook::velox::VeloxRuntimeError,                              \
+      ::facebook::velox::error_source::kErrorSourceRuntime.c_str(),     \
+      ::facebook::velox::error_code::kSpillLimitExceeded.c_str(),       \
+      /* isRetriable */ true,                                            \
+      "{}",                                                              \
+      errorMessage);
+
+/// Type of the callback function that returns the spill directory path.
+/// Implementations can use it to ensure the path exists before returning.
+using GetSpillDirectoryPathCB = std::function<const std::string&()>;
+
+/// The callback used to update the aggregated spill bytes of a query. If the
+/// query spill limit is set, the callback throws if the aggregated spilled
+/// bytes exceed the set limit.
+using UpdateAndCheckSpillLimitCB = std::function<void(uint64_t)>;
+
+/// Specifies the config for spilling.
+struct SpillConfig {
+  SpillConfig() = default;
+  SpillConfig(
+      GetSpillDirectoryPathCB _getSpillDirPathCb,
+      UpdateAndCheckSpillLimitCB _updateAndCheckSpillLimitCb,
+      std::string _fileNamePrefix,
+      uint64_t _maxFileSize,
+      uint64_t _writeBufferSize,
+      uint64_t _readBufferSize,
+      folly::Executor* _executor,
+      int32_t _minSpillableReservationPct,
+      int32_t _spillableReservationGrowthPct,
+      uint8_t _startPartitionBit,
+      uint8_t _numPartitionBits,
+      int32_t _maxSpillLevel,
+      uint64_t _maxSpillRunRows,
+      uint64_t _writerFlushThresholdSize,
+      const std::string& _compressionKind,
+      const std::string& _fileCreateConfig = {});
+
+  /// Returns the spilling level with given 'startBitOffset' and
+  /// 'numPartitionBits'.
+  ///
+  /// NOTE: we advance (or right shift) the partition bit offset when going to
+  /// the next level of recursive spilling.
+  int32_t spillLevel(uint8_t startBitOffset) const;
+
+  /// Checks if the given 'startBitOffset' has exceeded the max spill limit.
+  bool exceedSpillLevelLimit(uint8_t startBitOffset) const;
+
+  /// A callback function that returns the spill directory path.
+  /// Implementations can use it to ensure the path exists before returning.
+  GetSpillDirectoryPathCB getSpillDirPathCb;
+
+  /// The callback used to update the aggregated spill bytes of a query. If
+  /// the query spill limit is set, the callback throws if the aggregated
+  /// spilled bytes exceed the set limit.
+  UpdateAndCheckSpillLimitCB updateAndCheckSpillLimitCb;
+
+  /// Prefix for spill files.
+  std::string fileNamePrefix;
+
+  /// The max spill file size. If it is zero, there is no limit on the spill
+  /// file size.
+  uint64_t maxFileSize;
+
+  /// Specifies the buffer size for serialized spill data before it is written
+  /// to the storage system, for IO efficiency.
+  uint64_t writeBufferSize;
+
+  /// Specifies the buffer size to read from one spilled file. If the
+  /// underlying filesystem supports async read, we do read-ahead with double
+  /// buffering, which doubles the buffer used to read from each spill file.
+  uint64_t readBufferSize;
+
+  /// Executor for spilling. If nullptr, spilling writes on the Driver's
+  /// thread.
+  folly::Executor* executor; // Not owned.
+
+  /// The minimal spillable memory reservation in percentage of the current
+  /// memory usage.
+  int32_t minSpillableReservationPct;
+
+  /// The spillable memory reservation growth in percentage of the current
+  /// memory usage.
+  int32_t spillableReservationGrowthPct;
+
+  /// The start partition bit offset of the top (the first level) partitions.
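+  ///
+  /// Worked example (values assumed): with startPartitionBit = 29 and
+  /// numPartitionBits = 3, a spill with startBitOffset 29 is level 0, 32 is
+  /// level 1, and 35 is level 2; with maxSpillLevel = 1,
+  /// exceedSpillLevelLimit(35) returns true.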
+  uint8_t startPartitionBit;
+
+  /// Used to calculate the spill hash partition number for hash join and
+  /// RowNumber with 'startPartitionBit'.
+  uint8_t numPartitionBits;
+
+  /// The max allowed spilling level, with zero being the initial spilling
+  /// level. This only applies to hash build spilling, which needs recursive
+  /// spilling when the build table is too big. If set to -1, there is no
+  /// limit, and an extremely large query might then run out of spilling
+  /// partition bits.
+  int32_t maxSpillLevel;
+
+  /// The max number of rows to fill and spill for each spill run. This is
+  /// used to cap the memory used for spilling. If it is zero, there is no
+  /// limit and spilling might run out of memory.
+  uint64_t maxSpillRunRows;
+
+  /// Minimum memory footprint size required to reclaim memory from a file
+  /// writer by flushing its buffered data to disk.
+  uint64_t writerFlushThresholdSize;
+
+  /// CompressionKind when spilling; CompressionKind_NONE means no
+  /// compression.
+  common::CompressionKind compressionKind;
+
+  /// Custom options passed to velox::FileSystem to create spill WriteFile.
+  std::string fileCreateConfig;
+};
+} // namespace facebook::velox::common
diff --git a/velox/common/base/SpillStats.cpp b/velox/common/base/SpillStats.cpp
new file mode 100644
index 0000000000000..a6a3dae5f9d2c
--- /dev/null
+++ b/velox/common/base/SpillStats.cpp
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/common/base/SpillStats.h"
+#include "velox/common/base/Counters.h"
+#include "velox/common/base/Exceptions.h"
+#include "velox/common/base/StatsReporter.h"
+#include "velox/common/base/SuccinctPrinter.h"
+
+namespace facebook::velox::common {
+namespace {
+std::vector<folly::Synchronized<SpillStats>>& allSpillStats() {
+  static std::vector<folly::Synchronized<SpillStats>> spillStatsList(
+      std::thread::hardware_concurrency());
+  return spillStatsList;
+}
+
+folly::Synchronized<SpillStats>& localSpillStats() {
+  const auto idx = std::hash<std::thread::id>{}(std::this_thread::get_id());
+  auto& spillStatsVector = allSpillStats();
+  return spillStatsVector[idx % spillStatsVector.size()];
+}
+} // namespace
+
+SpillStats::SpillStats(
+    uint64_t _spillRuns,
+    uint64_t _spilledInputBytes,
+    uint64_t _spilledBytes,
+    uint64_t _spilledRows,
+    uint32_t _spilledPartitions,
+    uint64_t _spilledFiles,
+    uint64_t _spillFillTimeNanos,
+    uint64_t _spillSortTimeNanos,
+    uint64_t _spillSerializationTimeNanos,
+    uint64_t _spillWrites,
+    uint64_t _spillFlushTimeNanos,
+    uint64_t _spillWriteTimeNanos,
+    uint64_t _spillMaxLevelExceededCount,
+    uint64_t _spillReadBytes,
+    uint64_t _spillReads,
+    uint64_t _spillReadTimeNanos,
+    uint64_t _spillDeserializationTimeNanos)
+    : spillRuns(_spillRuns),
+      spilledInputBytes(_spilledInputBytes),
+      spilledBytes(_spilledBytes),
+      spilledRows(_spilledRows),
+      spilledPartitions(_spilledPartitions),
+      spilledFiles(_spilledFiles),
+      spillFillTimeNanos(_spillFillTimeNanos),
+      spillSortTimeNanos(_spillSortTimeNanos),
+      spillSerializationTimeNanos(_spillSerializationTimeNanos),
+      spillWrites(_spillWrites),
+      spillFlushTimeNanos(_spillFlushTimeNanos),
+      spillWriteTimeNanos(_spillWriteTimeNanos),
+      spillMaxLevelExceededCount(_spillMaxLevelExceededCount),
+      spillReadBytes(_spillReadBytes),
+      spillReads(_spillReads),
+      spillReadTimeNanos(_spillReadTimeNanos),
+      spillDeserializationTimeNanos(_spillDeserializationTimeNanos) {}
+
+SpillStats& SpillStats::operator+=(const SpillStats& other) {
+  spillRuns += other.spillRuns;
+  spilledInputBytes += other.spilledInputBytes;
+  spilledBytes += other.spilledBytes;
+  spilledRows += other.spilledRows;
+  spilledPartitions += other.spilledPartitions;
+  spilledFiles += other.spilledFiles;
+  spillFillTimeNanos += other.spillFillTimeNanos;
+  spillSortTimeNanos += other.spillSortTimeNanos;
+  spillSerializationTimeNanos += other.spillSerializationTimeNanos;
+  spillWrites += other.spillWrites;
+  spillFlushTimeNanos += other.spillFlushTimeNanos;
+  spillWriteTimeNanos += other.spillWriteTimeNanos;
+  spillMaxLevelExceededCount += other.spillMaxLevelExceededCount;
+  spillReadBytes += other.spillReadBytes;
+  spillReads += other.spillReads;
+  spillReadTimeNanos += other.spillReadTimeNanos;
+  spillDeserializationTimeNanos += other.spillDeserializationTimeNanos;
+  return *this;
+}
+
+SpillStats SpillStats::operator-(const SpillStats& other) const {
+  SpillStats result;
+  result.spillRuns = spillRuns - other.spillRuns;
+  result.spilledInputBytes = spilledInputBytes - other.spilledInputBytes;
+  result.spilledBytes = spilledBytes - other.spilledBytes;
+  result.spilledRows = spilledRows - other.spilledRows;
+  result.spilledPartitions = spilledPartitions - other.spilledPartitions;
+  result.spilledFiles = spilledFiles - other.spilledFiles;
+  result.spillFillTimeNanos = spillFillTimeNanos - other.spillFillTimeNanos;
+  result.spillSortTimeNanos = spillSortTimeNanos - other.spillSortTimeNanos;
+  result.spillSerializationTimeNanos =
+      spillSerializationTimeNanos - other.spillSerializationTimeNanos;
+  result.spillWrites = spillWrites - other.spillWrites;
+  result.spillFlushTimeNanos = spillFlushTimeNanos - other.spillFlushTimeNanos;
+  result.spillWriteTimeNanos = spillWriteTimeNanos - other.spillWriteTimeNanos;
+  result.spillMaxLevelExceededCount =
+      spillMaxLevelExceededCount - other.spillMaxLevelExceededCount;
+  result.spillReadBytes = spillReadBytes - other.spillReadBytes;
+  result.spillReads = spillReads - other.spillReads;
+  result.spillReadTimeNanos = spillReadTimeNanos - other.spillReadTimeNanos;
+  result.spillDeserializationTimeNanos =
+      spillDeserializationTimeNanos - other.spillDeserializationTimeNanos;
+  return result;
+}
+
+bool SpillStats::operator<(const SpillStats& other) const {
+  uint32_t gtCount{0};
+  uint32_t ltCount{0};
+#define UPDATE_COUNTER(counter)           \
+  do {                                    \
+    if (counter < other.counter) {        \
+      ++ltCount;                          \
+    } else if (counter > other.counter) { \
+      ++gtCount;                          \
+    }                                     \
+  } while (0);
+
+  UPDATE_COUNTER(spillRuns);
+  UPDATE_COUNTER(spilledInputBytes);
+  UPDATE_COUNTER(spilledBytes);
+  UPDATE_COUNTER(spilledRows);
+  UPDATE_COUNTER(spilledPartitions);
+  UPDATE_COUNTER(spilledFiles);
+  UPDATE_COUNTER(spillFillTimeNanos);
+  UPDATE_COUNTER(spillSortTimeNanos);
+  UPDATE_COUNTER(spillSerializationTimeNanos);
+  UPDATE_COUNTER(spillWrites);
+  UPDATE_COUNTER(spillFlushTimeNanos);
+  UPDATE_COUNTER(spillWriteTimeNanos);
+  UPDATE_COUNTER(spillMaxLevelExceededCount);
+  UPDATE_COUNTER(spillReadBytes);
+  UPDATE_COUNTER(spillReads);
+  UPDATE_COUNTER(spillReadTimeNanos);
+  UPDATE_COUNTER(spillDeserializationTimeNanos);
+#undef UPDATE_COUNTER
+  VELOX_CHECK(
+      !((gtCount > 0) && (ltCount > 0)),
+      "gtCount {} ltCount {}",
+      gtCount,
+      ltCount);
+  return ltCount > 0;
+}
+
+bool SpillStats::operator>(const SpillStats& other) const {
+  return !(*this < other) && (*this != other);
+}
+
+bool SpillStats::operator>=(const SpillStats& other) const {
+  return !(*this < other);
+}
+
+bool SpillStats::operator<=(const SpillStats& other) const {
+  return !(*this > other);
+}
+
+bool SpillStats::operator==(const SpillStats& other) const {
+  return std::tie(
+             spillRuns,
+             spilledInputBytes,
+             spilledBytes,
+             spilledRows,
+             spilledPartitions,
+             spilledFiles,
+             spillFillTimeNanos,
+             spillSortTimeNanos,
+             spillSerializationTimeNanos,
+             spillWrites,
+             spillFlushTimeNanos,
+             spillWriteTimeNanos,
+             spillMaxLevelExceededCount,
+             spillReadBytes,
+             spillReads,
+             spillReadTimeNanos,
+             spillDeserializationTimeNanos) ==
+      std::tie(
+             other.spillRuns,
+             other.spilledInputBytes,
+             other.spilledBytes,
+             other.spilledRows,
+             other.spilledPartitions,
+             other.spilledFiles,
+             other.spillFillTimeNanos,
+             other.spillSortTimeNanos,
+             other.spillSerializationTimeNanos,
+             other.spillWrites,
+             other.spillFlushTimeNanos,
+             other.spillWriteTimeNanos,
+             other.spillMaxLevelExceededCount,
+             other.spillReadBytes,
+             other.spillReads,
+             other.spillReadTimeNanos,
+             other.spillDeserializationTimeNanos);
+}
+
+void SpillStats::reset() {
+  spillRuns = 0;
+  spilledInputBytes = 0;
+  spilledBytes = 0;
+  spilledRows = 0;
+  spilledPartitions = 0;
+  spilledFiles = 0;
+  spillFillTimeNanos = 0;
+  spillSortTimeNanos = 0;
+  spillSerializationTimeNanos = 0;
+  spillWrites = 0;
+  spillFlushTimeNanos = 0;
+  spillWriteTimeNanos = 0;
+  spillMaxLevelExceededCount = 0;
+  spillReadBytes = 0;
+  spillReads = 0;
+  spillReadTimeNanos = 0;
+  spillDeserializationTimeNanos = 0;
+}
+
+std::string SpillStats::toString() const {
+  return fmt::format(
+      "spillRuns[{}] spilledInputBytes[{}] spilledBytes[{}] spilledRows[{}] "
+      "spilledPartitions[{}] spilledFiles[{}] spillFillTimeNanos[{}] "
+      "spillSortTimeNanos[{}] spillSerializationTimeNanos[{}] spillWrites[{}] "
+      "spillFlushTimeNanos[{}] spillWriteTimeNanos[{}] "
+      "maxSpillExceededLimitCount[{}] spillReadBytes[{}] spillReads[{}] "
+      "spillReadTimeNanos[{}] spillReadDeserializationTimeNanos[{}]",
+      spillRuns,
+      succinctBytes(spilledInputBytes),
+      succinctBytes(spilledBytes),
+      spilledRows,
+      spilledPartitions,
+      spilledFiles,
+      succinctNanos(spillFillTimeNanos),
+      succinctNanos(spillSortTimeNanos),
+      succinctNanos(spillSerializationTimeNanos),
+      spillWrites,
+      succinctNanos(spillFlushTimeNanos),
+      succinctNanos(spillWriteTimeNanos),
+      spillMaxLevelExceededCount,
+      succinctBytes(spillReadBytes),
+      spillReads,
+      succinctNanos(spillReadTimeNanos),
+      succinctNanos(spillDeserializationTimeNanos));
+}
+
+void updateGlobalSpillRunStats(uint64_t numRuns) {
+  auto statsLocked = localSpillStats().wlock();
+  statsLocked->spillRuns += numRuns;
+}
+
+void updateGlobalSpillAppendStats(
+    uint64_t numRows,
+    uint64_t serializationTimeNs) {
+  RECORD_METRIC_VALUE(kMetricSpilledRowsCount, numRows);
+  RECORD_HISTOGRAM_METRIC_VALUE(
+      kMetricSpillSerializationTimeMs, serializationTimeNs / 1'000'000);
+  auto statsLocked = localSpillStats().wlock();
+  statsLocked->spilledRows += numRows;
+  statsLocked->spillSerializationTimeNanos += serializationTimeNs;
+}
+
+void incrementGlobalSpilledPartitionStats() {
+  ++localSpillStats().wlock()->spilledPartitions;
+}
+
+void updateGlobalSpillFillTime(uint64_t timeNs) {
+  RECORD_HISTOGRAM_METRIC_VALUE(kMetricSpillFillTimeMs, timeNs / 1'000'000);
+  localSpillStats().wlock()->spillFillTimeNanos += timeNs;
+}
+
+void updateGlobalSpillSortTime(uint64_t timeNs) {
+  RECORD_HISTOGRAM_METRIC_VALUE(kMetricSpillSortTimeMs, timeNs / 1'000'000);
+  localSpillStats().wlock()->spillSortTimeNanos += timeNs;
+}
+
+void updateGlobalSpillWriteStats(
+    uint64_t spilledBytes,
+    uint64_t flushTimeNs,
+    uint64_t writeTimeNs) {
+  RECORD_METRIC_VALUE(kMetricSpillWritesCount);
+  RECORD_METRIC_VALUE(kMetricSpilledBytes, spilledBytes);
+  RECORD_HISTOGRAM_METRIC_VALUE(
+      kMetricSpillFlushTimeMs, flushTimeNs / 1'000'000);
+  RECORD_HISTOGRAM_METRIC_VALUE(
+      kMetricSpillWriteTimeMs, writeTimeNs / 1'000'000);
+  auto statsLocked = localSpillStats().wlock();
+  ++statsLocked->spillWrites;
+  statsLocked->spilledBytes += spilledBytes;
+  statsLocked->spillFlushTimeNanos += flushTimeNs;
+  statsLocked->spillWriteTimeNanos += writeTimeNs;
+}
+
+void updateGlobalSpillReadStats(
+    uint64_t spillReads,
+    uint64_t spillReadBytes,
+    uint64_t spillReadTimeNs) {
+  auto statsLocked = localSpillStats().wlock();
+  statsLocked->spillReads += spillReads;
+  statsLocked->spillReadBytes += spillReadBytes;
+  statsLocked->spillReadTimeNanos += spillReadTimeNs;
+}
+
+void updateGlobalSpillMemoryBytes(uint64_t spilledInputBytes) {
+  RECORD_METRIC_VALUE(kMetricSpilledInputBytes, spilledInputBytes);
+  auto statsLocked = localSpillStats().wlock();
+  statsLocked->spilledInputBytes += spilledInputBytes;
+}
+
+void incrementGlobalSpilledFiles() {
+  RECORD_METRIC_VALUE(kMetricSpilledFilesCount);
+  ++localSpillStats().wlock()->spilledFiles;
+}
+
+void updateGlobalMaxSpillLevelExceededCount(
+    uint64_t maxSpillLevelExceededCount) {
+  localSpillStats().wlock()->spillMaxLevelExceededCount +=
+      maxSpillLevelExceededCount;
+}
+
+void updateGlobalSpillDeserializationTimeNs(uint64_t timeNs) {
+  localSpillStats().wlock()->spillDeserializationTimeNanos += timeNs;
+}
+
+SpillStats globalSpillStats() {
+  SpillStats gSpillStats;
+  for (auto& spillStats : allSpillStats()) {
+    gSpillStats += spillStats.copy();
+  }
+  return gSpillStats;
+}
+} // namespace facebook::velox::common
diff --git a/velox/common/base/SpillStats.h b/velox/common/base/SpillStats.h
new file mode 100644
index 0000000000000..e88db0527c057
--- /dev/null
+++ b/velox/common/base/SpillStats.h
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#include <fmt/format.h>
+
+#include "velox/common/compression/Compression.h"
+
+namespace facebook::velox::common {
+/// Provides the fine-grained spill execution stats.
+struct SpillStats {
+  /// The number of times that spilling runs on an operator.
+  uint64_t spillRuns{0};
+  /// The number of bytes in memory to spill.
+  uint64_t spilledInputBytes{0};
+  /// The number of bytes spilled to disks.
+  ///
+  /// NOTE: if compression is enabled, this counts the compressed bytes.
+  uint64_t spilledBytes{0};
+  /// The number of spilled rows.
+  uint64_t spilledRows{0};
+  /// NOTE: when summing up the stats from a group of spill operators, this is
+  /// the total number of spilled partitions times the number of operators.
+  uint32_t spilledPartitions{0};
+  /// The number of spilled files.
+  uint64_t spilledFiles{0};
+  /// The time spent on filling rows for spilling.
+  uint64_t spillFillTimeNanos{0};
+  /// The time spent on sorting rows for spilling.
+  uint64_t spillSortTimeNanos{0};
+  /// The time spent on serializing rows for spilling.
+  uint64_t spillSerializationTimeNanos{0};
+  /// The number of spill writer flushes, equivalent to the number of write
+  /// calls to the underlying filesystem.
+  uint64_t spillWrites{0};
+  /// The time spent copying out serialized rows for disk write. If
+  /// compression is enabled, this includes the compression time.
+  uint64_t spillFlushTimeNanos{0};
+  /// The time spent on writing spilled rows to disk.
+  uint64_t spillWriteTimeNanos{0};
+  /// The number of times that a hash build operator exceeds the max spill
+  /// limit.
+  uint64_t spillMaxLevelExceededCount{0};
+  /// The number of bytes read from spilled files.
+  uint64_t spillReadBytes{0};
+  /// The number of spill reader reads, equivalent to the number of read calls
+  /// to the underlying filesystem.
+  uint64_t spillReads{0};
+  /// The time spent reading data from spilled files.
+  uint64_t spillReadTimeNanos{0};
+  /// The time spent on deserializing rows read from spilled files.
+ uint64_t spillDeserializationTimeNanos{0}; + + SpillStats( + uint64_t _spillRuns, + uint64_t _spilledInputBytes, + uint64_t _spilledBytes, + uint64_t _spilledRows, + uint32_t _spilledPartitions, + uint64_t _spilledFiles, + uint64_t _spillFillTimeNanos, + uint64_t _spillSortTimeNanos, + uint64_t _spillSerializationTimeNanos, + uint64_t _spillWrites, + uint64_t _spillFlushTimeNanos, + uint64_t _spillWriteTimeNanos, + uint64_t _spillMaxLevelExceededCount, + uint64_t _spillReadBytes, + uint64_t _spillReads, + uint64_t _spillReadTimeNanos, + uint64_t _spillDeserializationTimeNanos); + + SpillStats() = default; + + bool empty() const { + return spilledBytes == 0; + } + + SpillStats& operator+=(const SpillStats& other); + SpillStats operator-(const SpillStats& other) const; + bool operator==(const SpillStats& other) const; + bool operator!=(const SpillStats& other) const { + return !(*this == other); + } + bool operator>(const SpillStats& other) const; + bool operator<(const SpillStats& other) const; + bool operator>=(const SpillStats& other) const; + bool operator<=(const SpillStats& other) const; + + void reset(); + + std::string toString() const; +}; + +FOLLY_ALWAYS_INLINE std::ostream& operator<<( + std::ostream& o, + const common::SpillStats& stats) { + return o << stats.toString(); +} + +/// The utilities to update the process wide spilling stats. +/// Updates the number of spill runs. +void updateGlobalSpillRunStats(uint64_t numRuns); + +/// Updates the stats of new append spilled rows including the number of spilled +/// rows and the serializaion time. +void updateGlobalSpillAppendStats( + uint64_t numRows, + uint64_t serializaionTimeUs); + +/// Increments the number of spilled partitions. +void incrementGlobalSpilledPartitionStats(); + +/// Updates the time spent on filling rows to spill. +void updateGlobalSpillFillTime(uint64_t timeUs); + +/// Updates the time spent on sorting rows to spill. +void updateGlobalSpillSortTime(uint64_t timeUs); + +/// Updates the stats for disk write including the number of disk writes, +/// the written bytes, the time spent on copying out (compression) for disk +/// writes, the time spent on disk writes. +void updateGlobalSpillWriteStats( + uint64_t spilledBytes, + uint64_t flushTimeUs, + uint64_t writeTimeUs); + +/// Updates the stats for disk read including the number of disk reads, the +/// amount of data read in bytes, and the time it takes to read from the disk. +void updateGlobalSpillReadStats( + uint64_t spillReads, + uint64_t spillReadBytes, + uint64_t spillRadTimeUs); + +/// Increments the spill memory bytes. +void updateGlobalSpillMemoryBytes(uint64_t spilledInputBytes); + +/// Increments the spilled files by one. +void incrementGlobalSpilledFiles(); + +/// Increments the exceeded max spill level count. +void updateGlobalMaxSpillLevelExceededCount( + uint64_t maxSpillLevelExceededCount); + +/// Increments the spill read deserialization time. +void updateGlobalSpillDeserializationTimeNs(uint64_t timeUs); + +/// Gets the cumulative global spill stats. 
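For orientation, here is a minimal usage sketch of the process-wide counters declared above. The operator workload and the byte/time values are made up for illustration; only functions declared in this header are assumed, plus glog for printing.

```cpp
#include <glog/logging.h>

#include "velox/common/base/SpillStats.h"

using namespace facebook::velox::common;

// Hypothetical spill pass of an operator: one run that serializes 1,000 rows
// and writes 4 MB in a single flush. All values are made up.
void recordSpillForIllustration() {
  updateGlobalSpillRunStats(1);
  updateGlobalSpillAppendStats(
      /*numRows=*/1'000, /*serializationTimeNs=*/50'000);
  updateGlobalSpillWriteStats(
      /*spilledBytes=*/4 << 20,
      /*flushTimeNs=*/10'000,
      /*writeTimeNs=*/200'000);
  incrementGlobalSpilledFiles();

  // Read back the cumulative, process-wide view.
  const SpillStats stats = globalSpillStats();
  LOG(INFO) << stats.toString();
}
```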
+ +template <> +struct fmt::formatter<facebook::velox::common::SpillStats> + : fmt::formatter<std::string> { + auto format(const facebook::velox::common::SpillStats& s, format_context& ctx) + const { + return formatter<std::string>::format(s.toString(), ctx); + } +}; diff --git a/velox/common/base/StatsReporter.h b/velox/common/base/StatsReporter.h index b27ad242a672a..d306c31cde182 100644 --- a/velox/common/base/StatsReporter.h +++ b/velox/common/base/StatsReporter.h @@ -19,11 +19,11 @@ #include #include -/// StatsReporter designed to assist in reporting various stats of the +/// StatsReporter designed to assist in reporting various metrics of the /// application that uses velox library. The library itself does not implement /// the StatsReporter and it should be implemented by the application. /// -/// To inialize the reporter singleton in your application use this pattern +/// To initialize the reporter singleton in your application use this pattern /// (note that MyReporter should implement the abstract class /// BaseStatsReporter): /// @@ -31,34 +31,56 @@ /// return new MyReporter(); /// }); /// -/// Then, for every stat that needs to be reported, it is required to register -/// one (usually) or more types (StatType) before reporting the stat: +/// Then, for every metric that needs to be reported, register one (usually) +/// or more types (StatType) before reporting the metric: /// -/// REPORT_ADD_STAT_EXPORT_TYPE("my_stat1", facebook::velox::StatType::COUNT); +/// DEFINE_METRIC("my_stat1", facebook::velox::StatType::COUNT); /// /// To register one histogram, it requires the min and max value of -// the range, the bucket width as well as the percentiles to be reported. -/// REPORT_ADD_HISTOGRAM_EXPORT_PERCENTILE("my_stat2", 10, 0, 100, 50, 99, -/// 100); +/// the range, the bucket width as well as the percentiles to be reported. +/// DEFINE_HISTOGRAM_METRIC("my_stat2", 10, 0, 100, 50, 99, 100); /// -/// The StatType controls how counter/stat is aggregated. -/// After that, every call to REPORT_ADD_STAT_VALUE increases the counter by the +/// The StatType controls how a metric is aggregated. +/// After that, every call to RECORD_METRIC_VALUE increases the metric by the /// given value: /// -/// By default the following will add 1 to the stat if not provided value -/// REPORT_ADD_STAT_VALUE("my_stat1"); -/// REPORT_ADD_STAT_VALUE("my_stat2", 10); -/// REPORT_ADD_STAT_VALUE("my_stat1", numOfFailures); +/// By default, the following will add 1 to the metric if no value is provided: +/// RECORD_METRIC_VALUE("my_stat1"); +/// RECORD_METRIC_VALUE("my_stat2", 10); +/// RECORD_METRIC_VALUE("my_stat1", numOfFailures); namespace facebook::velox { enum class StatType { + /// Tracks the average of the inserted values. AVG, + /// Tracks the sum of the inserted values. SUM, + /// Tracks the sum of the inserted values per second. RATE, + /// Tracks the count of inserted values. COUNT, + /// Tracks the histogram of inserted values. + HISTOGRAM, }; +inline std::string statTypeString(StatType stat) { + switch (stat) { + case StatType::AVG: + return "Avg"; + case StatType::SUM: + return "Sum"; + case StatType::RATE: + return "Rate"; + case StatType::COUNT: + return "Count"; + case StatType::HISTOGRAM: + return "Histogram"; + default: + return fmt::format("UNKNOWN: {}", static_cast<int>(stat)); + } +} + /// This is the base stats reporter interface that should be extended by /// different implementations.
class BaseStatsReporter { @@ -68,25 +90,27 @@ class BaseStatsReporter { /// Register a stat of the given stat type. /// @param key The key to identify the stat. /// @param statType How the stat is aggregated. - virtual void addStatExportType(const char* key, StatType statType) const = 0; - - virtual void addStatExportType(folly::StringPiece key, StatType statType) + virtual void registerMetricExportType(const char* key, StatType statType) const = 0; + virtual void registerMetricExportType( + folly::StringPiece key, + StatType statType) const = 0; + /// Register a histogram with a list of percentiles defined. /// @param key The key to identify the histogram. /// @param bucketWidth The width of the buckets. /// @param min The starting value of the buckets. /// @param max The ending value of the buckets. /// @param pcts The aggregated percentiles to be reported. - virtual void addHistogramExportPercentiles( + virtual void registerHistogramMetricExportType( const char* key, int64_t bucketWidth, int64_t min, int64_t max, const std::vector<int32_t>& pcts) const = 0; - virtual void addHistogramExportPercentiles( + virtual void registerHistogramMetricExportType( folly::StringPiece key, int64_t bucketWidth, int64_t min, @@ -94,112 +118,122 @@ const std::vector<int32_t>& pcts) const = 0; /// Add the given value to the stat. - virtual void addStatValue(const std::string& key, size_t value = 1) const = 0; + virtual void addMetricValue(const std::string& key, size_t value = 1) + const = 0; - virtual void addStatValue(const char* key, size_t value = 1) const = 0; + virtual void addMetricValue(const char* key, size_t value = 1) const = 0; - virtual void addStatValue(folly::StringPiece key, size_t value = 1) const = 0; + virtual void addMetricValue(folly::StringPiece key, size_t value = 1) + const = 0; /// Add the given value to the histogram. - virtual void addHistogramValue(const std::string& key, size_t value) + virtual void addHistogramMetricValue(const std::string& key, size_t value) const = 0; - virtual void addHistogramValue(const char* key, size_t value) const = 0; + virtual void addHistogramMetricValue(const char* key, size_t value) const = 0; - virtual void addHistogramValue(folly::StringPiece key, size_t value) + virtual void addHistogramMetricValue(folly::StringPiece key, size_t value) const = 0; + /// Return the aggregated metrics in a serialized string format. + virtual std::string fetchMetrics() = 0; + static bool registered; };
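Since the header leaves BaseStatsReporter for the application to implement, a minimal sketch of such an implementation may help; the class name, the std::map storage, and the folly::Synchronized locking below are illustrative choices, not part of Velox. An application would still wire it in through the folly::Singleton pattern shown in the header comment and set BaseStatsReporter::registered to true.

```cpp
#include <fmt/format.h>
#include <folly/Synchronized.h>

#include <map>

#include "velox/common/base/StatsReporter.h"

namespace myapp {

// Toy reporter: sums every metric into an in-memory map. A production
// implementation would forward to a real metrics library instead.
class InMemoryReporter : public facebook::velox::BaseStatsReporter {
 public:
  // Registration is a no-op here; a real reporter would record the
  // aggregation type and histogram buckets per key.
  void registerMetricExportType(const char*, facebook::velox::StatType)
      const override {}
  void registerMetricExportType(folly::StringPiece, facebook::velox::StatType)
      const override {}
  void registerHistogramMetricExportType(
      const char*,
      int64_t,
      int64_t,
      int64_t,
      const std::vector<int32_t>&) const override {}
  void registerHistogramMetricExportType(
      folly::StringPiece,
      int64_t,
      int64_t,
      int64_t,
      const std::vector<int32_t>&) const override {}

  void addMetricValue(const std::string& key, size_t value) const override {
    (*values_.wlock())[key] += value;
  }
  void addMetricValue(const char* key, size_t value) const override {
    addMetricValue(std::string(key), value);
  }
  void addMetricValue(folly::StringPiece key, size_t value) const override {
    addMetricValue(key.str(), value);
  }

  void addHistogramMetricValue(const std::string& key, size_t value)
      const override {
    addMetricValue(key, value);
  }
  void addHistogramMetricValue(const char* key, size_t value) const override {
    addMetricValue(std::string(key), value);
  }
  void addHistogramMetricValue(folly::StringPiece key, size_t value)
      const override {
    addMetricValue(key.str(), value);
  }

  std::string fetchMetrics() override {
    std::string out;
    // Hold the read lock in a named local; a temporary LockedPtr in the
    // range expression would be destroyed before the loop body runs.
    auto locked = values_.rlock();
    for (const auto& [key, value] : *locked) {
      out += fmt::format("{}={}\n", key, value);
    }
    return out;
  }

 private:
  mutable folly::Synchronized<std::map<std::string, size_t>> values_;
};

} // namespace myapp
```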
// This is a dummy reporter that does nothing class DummyStatsReporter : public BaseStatsReporter { public: - void addStatExportType(const char* /*key*/, StatType /*statType*/) + void registerMetricExportType(const char* /*key*/, StatType /*statType*/) const override {} - void addStatExportType(folly::StringPiece /*key*/, StatType /*statType*/) - const override {} + void registerMetricExportType( + folly::StringPiece /*key*/, + StatType /*statType*/) const override {} - void addHistogramExportPercentiles( + void registerHistogramMetricExportType( const char* /*key*/, int64_t /* bucketWidth */, int64_t /* min */, int64_t /* max */, const std::vector<int32_t>& /* pcts */) const override {} - void addHistogramExportPercentiles( + void registerHistogramMetricExportType( folly::StringPiece /* key */, int64_t /* bucketWidth */, int64_t /* min */, int64_t /* max */, const std::vector<int32_t>& /* pcts */) const override {} - void addStatValue(const std::string& /* key */, size_t /* value */) + void addMetricValue(const std::string& /* key */, size_t /* value */) const override {} - void addStatValue(const char* /* key */, size_t /* value */) const override {} + void addMetricValue(const char* /* key */, size_t /* value */) + const override {} - void addStatValue(folly::StringPiece /* key */, size_t /* value */) + void addMetricValue(folly::StringPiece /* key */, size_t /* value */) const override {} - void addHistogramValue(const std::string& /* key */, size_t /* value */) + void addHistogramMetricValue(const std::string& /* key */, size_t /* value */) const override {} - void addHistogramValue(const char* /* key */, size_t /* value */) + void addHistogramMetricValue(const char* /* key */, size_t /* value */) const override {} - void addHistogramValue(folly::StringPiece /* key */, size_t /* value */) + void addHistogramMetricValue(folly::StringPiece /* key */, size_t /* value */) const override {} + + std::string fetchMetrics() override { + return ""; + } }; -#define REPORT_ADD_STAT_VALUE(key, ...) \ +#define DEFINE_METRIC(key, type) \ { \ if (::facebook::velox::BaseStatsReporter::registered) { \ auto reporter = folly::Singleton< \ facebook::velox::BaseStatsReporter>::try_get_fast(); \ if (FOLLY_LIKELY(reporter != nullptr)) { \ - reporter->addStatValue((key), ##__VA_ARGS__); \ + reporter->registerMetricExportType((key), (type)); \ } \ } \ } -#define REPORT_ADD_STAT_EXPORT_TYPE(key, type) \ +#define RECORD_METRIC_VALUE(key, ...) \ { \ if (::facebook::velox::BaseStatsReporter::registered) { \ auto reporter = folly::Singleton< \ facebook::velox::BaseStatsReporter>::try_get_fast(); \ if (FOLLY_LIKELY(reporter != nullptr)) { \ - reporter->addStatExportType((key), (type)); \ + reporter->addMetricValue((key), ##__VA_ARGS__); \ } \ } \ } -#define REPORT_ADD_HISTOGRAM_VALUE(key, ...) \ +#define DEFINE_HISTOGRAM_METRIC(key, bucket, min, max, ...) \ { \ if (::facebook::velox::BaseStatsReporter::registered) { \ auto reporter = folly::Singleton< \ facebook::velox::BaseStatsReporter>::try_get_fast(); \ if (FOLLY_LIKELY(reporter != nullptr)) { \ - reporter->addHistogramValue((key), ##__VA_ARGS__); \ + reporter->registerHistogramMetricExportType( \ + (key), \ + (bucket), \ + (min), \ + (max), \ + (std::vector<int32_t>({__VA_ARGS__}))); \ } \ } \ }
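A short usage sketch of the renamed macros, mirroring the header comment above; the metric names are hypothetical.

```cpp
#include "velox/common/base/StatsReporter.h"

void reportForIllustration(size_t numFailures) {
  // Register the metrics once, typically at process startup.
  DEFINE_METRIC("myapp.failures", facebook::velox::StatType::COUNT);
  DEFINE_HISTOGRAM_METRIC("myapp.latency_ms", 10, 0, 1000, 50, 95, 99);

  // Record values on the hot path; both are no-ops until a reporter
  // singleton is registered and BaseStatsReporter::registered is true.
  RECORD_METRIC_VALUE("myapp.failures", numFailures);
  RECORD_HISTOGRAM_METRIC_VALUE("myapp.latency_ms", 42);
}
```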
-#define REPORT_ADD_HISTOGRAM_EXPORT_PERCENTILE(key, bucket, min, max, ...) \ - { \ - if (::facebook::velox::BaseStatsReporter::registered) { \ - auto reporter = folly::Singleton< \ - facebook::velox::BaseStatsReporter>::try_get_fast(); \ - if (FOLLY_LIKELY(reporter != nullptr)) { \ - reporter->addHistogramExportPercentiles( \ - (key), \ - (bucket), \ - (min), \ - (max), \ - (std::vector<int32_t>({__VA_ARGS__}))); \ - } \ - } \ +#define RECORD_HISTOGRAM_METRIC_VALUE(key, ...) \ + { \ + if (::facebook::velox::BaseStatsReporter::registered) { \ + auto reporter = folly::Singleton< \ + facebook::velox::BaseStatsReporter>::try_get_fast(); \ + if (FOLLY_LIKELY(reporter != nullptr)) { \ + reporter->addHistogramMetricValue((key), ##__VA_ARGS__); \ + } \ + } \ } - } // namespace facebook::velox diff --git a/velox/common/base/Status.cpp b/velox/common/base/Status.cpp new file mode 100644 index 0000000000000..a170d3d9053f4 --- /dev/null +++ b/velox/common/base/Status.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/base/Status.h" + +#include <glog/logging.h> +#include <cstdlib> +#include <iostream> +#include <stdexcept> +#include <string> + +namespace facebook::velox { + +std::string_view toString(StatusCode code) { + switch (code) { + case StatusCode::kOK: + return "OK"; + case StatusCode::kUserError: + return "User error"; + case StatusCode::kTypeError: + return "Type error"; + case StatusCode::kIndexError: + return "Index error"; + case StatusCode::kKeyError: + return "Key error"; + case StatusCode::kAlreadyExists: + return "Already exists"; + case StatusCode::kOutOfMemory: + return "Out of memory"; + case StatusCode::kIOError: + return "IOError"; + case StatusCode::kCancelled: + return "Cancelled"; + case StatusCode::kInvalid: + return "Invalid"; + case StatusCode::kUnknownError: + return "Unknown error"; + case StatusCode::kNotImplemented: + return "NotImplemented"; + } + return ""; // Unreachable: all codes are handled above. +} + +Status::Status(StatusCode code) { + state_ = new State; + state_->code = code; +} + +Status::Status(StatusCode code, std::string msg) { + if (FOLLY_UNLIKELY(code == StatusCode::kOK)) { + throw std::invalid_argument("Cannot construct ok status with message"); + } + state_ = new State; + state_->code = code; + state_->msg = std::move(msg); +} + +void Status::copyFrom(const Status& s) { + delete state_; + if (s.state_ == nullptr) { + state_ = nullptr; + } else { + state_ = new State(*s.state_); + } +} + +std::string_view Status::codeAsString() const { + if (state_ == nullptr) { + return "OK"; + } + return ::facebook::velox::toString(code()); +} + +std::string Status::toString() const { + std::string result(codeAsString()); + if (state_ == nullptr) { + return result; + } + result += ": "; + result += state_->msg; + return result; +} + +void Status::abort() const { + abort(""); +} + +void Status::abort(const std::string_view& message) const { + std::cerr << "-- Velox Fatal Error --\n"; + if (!message.empty()) { + std::cerr << message << "\n"; + } + std::cerr << toString() << std::endl; + std::abort(); +} + +void Status::warn() const
{ + LOG(WARNING) << toString(); +} + +void Status::warn(const std::string_view& message) const { + LOG(WARNING) << message << ": " << toString(); +} + +} // namespace facebook::velox diff --git a/velox/common/base/Status.h b/velox/common/base/Status.h new file mode 100644 index 0000000000000..2f99acca70ec9 --- /dev/null +++ b/velox/common/base/Status.h @@ -0,0 +1,535 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Adapted from Apache Arrow. + +#pragma once + +#include <fmt/format.h> +#include <folly/Expected.h> + +#include <cstdint> +#include <memory> +#include <string> +#include <string_view> + +namespace facebook::velox { + +/// The Status object is an object holding the outcome of an operation (success +/// or error). +/// +/// The outcome is represented as a StatusCode, holding either a success +/// (StatusCode::kOK) or an error (any other of the StatusCode enumeration +/// values). If an error occurred, a specific error message is generally +/// attached. +/// +/// The Status object is commonly allocated on the stack, so it needs to be +/// compact to be efficient. For the common success case, its size is a single +/// (nullptr) pointer. For failure cases, it allocates an external object +/// containing the StatusCode and error message. This keeps the object compact +/// and prevents allocations in the common success case. The same strategy is +/// used for the Status object in other well-known libraries such as Apache +/// Arrow, RocksDB, and Kudu. +/// +/// Simple usage: +/// +/// Status operation() { +/// if (noMoreMemory) { +/// return Status::OutOfMemory("Not enough memory to run 'operation'!"); +/// } +/// return Status::OK(); +/// } +/// +/// Call site: +/// +/// auto status = operation(); +/// if (status.ok()) { +/// ... +/// } else if (status.isOutOfMemory()) { +/// (gracefully handle out of memory) +/// } else if (status.isNotImplemented()) { +/// (and so on) +/// } +/// +/// Other common usage patterns: +/// +/// The same logic above can be implemented using helper macros: +/// +/// Status operation() { +/// VELOX_RETURN_IF(noMoreMemory, Status::OutOfMemory( +/// "Not enough memory to run 'operation'!")); +/// ... +/// return Status::OK(); +/// } +/// +/// To ensure operations succeed (or if not, return the same status from the +/// current function): +/// +/// ... +/// VELOX_RETURN_NOT_OK(operation1()); +/// VELOX_RETURN_NOT_OK(operation2()); +/// VELOX_RETURN_NOT_OK(operation3()); +/// ... + +/// This enum represents common categories of errors found in the library. These +/// are not meant to cover every specific error situation, but rather cover +/// broader categories of errors. Therefore, additions to this list should be +/// infrequent. +/// +/// Errors should be further described by attaching an error message to the +/// Status object. +/// +/// The error classes are loosely defined as follows: +/// +/// - kOk: A successful operation. No errors. +/// +/// - kUserError: An error triggered by bad input from an API user. The user +/// in this context usually means the program using Velox (or its end users). +/// +/// - kTypeError: An error triggered by a logical type mismatch (e.g. expecting +/// BIGINT but REAL provided). +/// +/// - kIndexError: An error triggered by the index of something being invalid or +/// out-of-bounds. +/// +/// - kKeyError: An error triggered by the key of something in a map/set being +/// invalid or not found. +/// +/// - kAlreadyExists: An error triggered by an operation meant to create some +/// form of resource which already exists. +/// +/// - kOutOfMemory: A failure triggered by a lack of available memory to +/// complete the operation. +/// +/// - kIOError: An error triggered by IO failures (e.g: network or disk/SSD read +/// error). +/// +/// - kCancelled: An error triggered because a certain resource required has +/// been stopped or cancelled. +/// +/// - kInvalid: An error triggered by an invalid program state. Usually +/// triggered by bugs. +/// +/// - kUnknownError: An error triggered by an unknown cause. Also usually +/// triggered by bugs. Should be used scarcely, favoring a more specific error +/// class above. +/// +/// - kNotImplemented: An error triggered by a feature not being implemented +/// yet. +/// +enum class StatusCode : int8_t { + kOK = 0, + kUserError = 1, + kTypeError = 2, + kIndexError = 3, + kKeyError = 4, + kAlreadyExists = 5, + kOutOfMemory = 6, + kIOError = 7, + kCancelled = 8, + kInvalid = 9, + kUnknownError = 10, + kNotImplemented = 11, +}; +std::string_view toString(StatusCode code); + +class [[nodiscard]] Status { + public: + // Create a success status. + constexpr Status() noexcept : state_(nullptr) {} + + ~Status() noexcept { + if (FOLLY_UNLIKELY(state_ != nullptr)) { + deleteState(); + } + } + + explicit Status(StatusCode code); + + Status(StatusCode code, std::string msg); + + // Copy the specified status. + inline Status(const Status& s); + inline Status& operator=(const Status& s); + + // Move the specified status. + inline Status(Status&& s) noexcept; + inline Status& operator=(Status&& s) noexcept; + + inline bool operator==(const Status& other) const noexcept; + inline bool operator!=(const Status& other) const noexcept { + return !(*this == other); + } + + // AND the statuses. + inline Status operator&(const Status& s) const noexcept; + inline Status operator&(Status&& s) const noexcept; + inline Status& operator&=(const Status& s) noexcept; + inline Status& operator&=(Status&& s) noexcept; + + inline friend std::ostream& operator<<(std::ostream& ss, const Status& s) { + return ss << s.toString(); + } + + /// Return a success status. + static Status OK() { + return Status(); + } + + // The static factory methods below do not follow the lower camel-case pattern + // as they are meant to represent classes of errors. For example: + // + // auto st1 = Status::UserError("my error"); + // auto st2 = Status::TypeError("my other error"); + + /// Return an error status for user errors. + template <typename... Args> + static Status UserError(Args&&... args) { + return Status::fromArgs( + StatusCode::kUserError, std::forward<Args>(args)...); + } + + /// Return an error status for type errors (such as mismatching data types). + template <typename... Args> + static Status TypeError(Args&&... args) { + return Status::fromArgs( + StatusCode::kTypeError, std::forward<Args>(args)...); + } + + /// Return an error status when an index is out of bounds. + template <typename... Args> + static Status IndexError(Args&&...
args) { + return Status::fromArgs( + StatusCode::kIndexError, std::forward<Args>(args)...); + } + + /// Return an error status for failed key lookups (e.g. column name in a + /// table). + template <typename... Args> + static Status KeyError(Args&&... args) { + return Status::fromArgs( + StatusCode::kKeyError, std::forward<Args>(args)...); + } + + /// Return an error status when something already exists (e.g. a file). + template <typename... Args> + static Status AlreadyExists(Args&&... args) { + return Status::fromArgs( + StatusCode::kAlreadyExists, std::forward<Args>(args)...); + } + + /// Return an error status for out-of-memory conditions. + template <typename... Args> + static Status OutOfMemory(Args&&... args) { + return Status::fromArgs( + StatusCode::kOutOfMemory, std::forward<Args>(args)...); + } + + /// Return an error status when some IO-related operation failed. + template <typename... Args> + static Status IOError(Args&&... args) { + return Status::fromArgs( + StatusCode::kIOError, std::forward<Args>(args)...); + } + + /// Return an error status for a cancelled operation. + template <typename... Args> + static Status Cancelled(Args&&... args) { + return Status::fromArgs( + StatusCode::kCancelled, std::forward<Args>(args)...); + } + + /// Return an error status for invalid data (for example a string that fails + /// parsing). + template <typename... Args> + static Status Invalid(Args&&... args) { + return Status::fromArgs( + StatusCode::kInvalid, std::forward<Args>(args)...); + } + + /// Return an error status for unknown errors. + template <typename... Args> + static Status UnknownError(Args&&... args) { + return Status::fromArgs( + StatusCode::kUnknownError, std::forward<Args>(args)...); + } + + /// Return an error status when an operation or a combination of operation and + /// data types is unimplemented. + template <typename... Args> + static Status NotImplemented(Args&&... args) { + return Status::fromArgs( + StatusCode::kNotImplemented, std::forward<Args>(args)...); + } + + /// Return true iff the status indicates success. + constexpr bool ok() const { + return (state_ == nullptr); + } + + /// Return true iff the status indicates a user error. + constexpr bool isUserError() const { + return code() == StatusCode::kUserError; + } + + /// Return true iff the status indicates a type error. + constexpr bool isTypeError() const { + return code() == StatusCode::kTypeError; + } + + /// Return true iff the status indicates an out of bounds index. + constexpr bool isIndexError() const { + return code() == StatusCode::kIndexError; + } + + /// Return true iff the status indicates a key lookup error. + constexpr bool isKeyError() const { + return code() == StatusCode::kKeyError; + } + + /// Return true iff the status indicates that something already exists. + constexpr bool isAlreadyExists() const { + return code() == StatusCode::kAlreadyExists; + } + + /// Return true iff the status indicates an out-of-memory error. + constexpr bool isOutOfMemory() const { + return code() == StatusCode::kOutOfMemory; + } + + /// Return true iff the status indicates an IO-related failure. + constexpr bool isIOError() const { + return code() == StatusCode::kIOError; + } + + /// Return true iff the status indicates a cancelled operation. + constexpr bool isCancelled() const { + return code() == StatusCode::kCancelled; + } + + /// Return true iff the status indicates invalid data. + constexpr bool isInvalid() const { + return code() == StatusCode::kInvalid; + } + + /// Return true iff the status indicates an unknown error. + constexpr bool isUnknownError() const { + return code() == StatusCode::kUnknownError; + } + + /// Return true iff the status indicates an unimplemented operation. + constexpr bool isNotImplemented() const { + return code() == StatusCode::kNotImplemented; + } + + /// Return a string representation of this status suitable for printing. + /// + /// The string "OK" is returned for success. + std::string toString() const; + + /// Return a string representation of the status code, without the message + /// text or POSIX code information. + std::string_view codeAsString() const; + static std::string_view codeAsString(StatusCode); + + /// Return the StatusCode value attached to this status. + constexpr StatusCode code() const { + return ok() ? StatusCode::kOK : state_->code; + } + + /// Return the specific error message attached to this status. + const std::string& message() const { + static const std::string kNoMessage = ""; + return ok() ? kNoMessage : state_->msg; + } + + /// Return a new Status with changed message, copying the existing status + /// code. + template <typename... Args> + Status withMessage(Args&&... args) const { + return fromArgs(code(), std::forward<Args>(args)...); + } + + void warn() const; + void warn(const std::string_view& message) const; + + [[noreturn]] void abort() const; + [[noreturn]] void abort(const std::string_view& message) const; + + private: + template <typename... Args> + static Status + fromArgs(StatusCode code, fmt::string_view fmt, Args&&... args) { + return Status(code, fmt::vformat(fmt, fmt::make_format_args(args...))); + } + + static Status fromArgs(StatusCode code) { + return Status(code); + } + + void deleteState() { + delete state_; + state_ = nullptr; + } + + void copyFrom(const Status& s); + inline void moveFrom(Status& s); + + struct State { + StatusCode code; + std::string msg; + }; + + // OK status has a `nullptr` state_. Otherwise, `state_` points to + // a `State` structure containing the error code and message(s) + State* state_; +}; + +// Copy the specified status. +Status::Status(const Status& s) + : state_((s.state_ == nullptr) ? nullptr : new State(*s.state_)) {} + +Status& Status::operator=(const Status& s) { + // The following condition catches both aliasing (when this == &s), + // and the common case where both s and *this are ok. + if (state_ != s.state_) { + copyFrom(s); + } + return *this; +} + +// Move the specified status. +Status::Status(Status&& s) noexcept : state_(s.state_) { + s.state_ = nullptr; +} + +Status& Status::operator=(Status&& s) noexcept { + moveFrom(s); + return *this; +} + +inline bool Status::operator==(const Status& other) const noexcept { + if (state_ == other.state_) { + return true; + } + + if (ok() || other.ok()) { + return false; + } + return (code() == other.code()) && (message() == other.message()); +} + +Status Status::operator&(const Status& s) const noexcept { + if (ok()) { + return s; + } else { + return *this; + } +} + +Status Status::operator&(Status&& s) const noexcept { + if (ok()) { + return std::move(s); + } else { + return *this; + } +} + +Status& Status::operator&=(const Status& s) noexcept { + if (ok() && !s.ok()) { + copyFrom(s); + } + return *this; +} + +Status& Status::operator&=(Status&& s) noexcept { + if (ok() && !s.ok()) { + moveFrom(s); + } + return *this; +} + +void Status::moveFrom(Status& s) { + delete state_; + state_ = s.state_; + s.state_ = nullptr; +} + +// Helper Macros. + +#define _VELOX_STRINGIFY(x) #x + +/// Return with given status if condition is met. +#define VELOX_RETURN_IF(condition, status) \ + do { \ + if (FOLLY_UNLIKELY(condition)) { \ + return (status); \ + } \ + } while (0) + +/// Propagate any non-successful Status to the caller.
+#define VELOX_RETURN_NOT_OK(status) \ + do { \ + ::facebook::velox::Status __s = \ + ::facebook::velox::internal::genericToStatus(status); \ + VELOX_RETURN_IF(!__s.ok(), __s); \ + } while (false) + +namespace internal { + +/// Common API for extracting Status from either Status or Result (the latter +/// is defined in Result.h). +/// Useful for status check macros such as VELOX_RETURN_NOT_OK. +inline const Status& genericToStatus(const Status& st) { + return st; +} +inline Status genericToStatus(Status&& st) { + return std::move(st); +} + +} // namespace internal + +/// Holds a result or an error. Designed to be used by APIs that do not throw. +/// +/// Here is an example of a modulo operation that doesn't throw, but indicates +/// failure using Status. +/// +/// Expected<int> modulo(int a, int b) { +/// if (b == 0) { +/// return folly::makeUnexpected(Status::UserError("division by zero")); +/// } +/// +/// return a % b; +/// } +/// +/// The error Status should not be OK. +template <typename T> +using Expected = folly::Expected<T, Status>; + +} // namespace facebook::velox + +template <> +struct fmt::formatter<facebook::velox::Status> : fmt::formatter<std::string> { + auto format(const facebook::velox::Status& s, format_context& ctx) const { + return formatter<std::string>::format(s.toString(), ctx); + } +}; + +template <> +struct fmt::formatter<facebook::velox::StatusCode> + : fmt::formatter<std::string_view> { + auto format(facebook::velox::StatusCode code, format_context& ctx) const { + return formatter<std::string_view>::format( + facebook::velox::toString(code), ctx); + } +};
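To make the no-throw flow concrete, here is a small caller-side sketch; `parsePort` and `connectForIllustration` are hypothetical functions, and only Status, Expected, and folly::makeUnexpected are assumed from the code above.

```cpp
#include <string>

#include "velox/common/base/Status.h"

using facebook::velox::Expected;
using facebook::velox::Status;

// A fallible parse that reports failure through Status instead of throwing.
Expected<int> parsePort(const std::string& s) {
  if (s.empty()) {
    return folly::makeUnexpected(Status::Invalid("empty port string"));
  }
  return std::stoi(s); // Assume digits-only input for this sketch.
}

Status connectForIllustration(const std::string& portString) {
  auto port = parsePort(portString);
  if (port.hasError()) {
    return port.error(); // Propagate the embedded Status to the caller.
  }
  // ... use port.value() to open the connection ...
  return Status::OK();
}
```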
diff --git a/velox/common/base/SuccinctPrinter.h b/velox/common/base/SuccinctPrinter.h index e5c3cc9cf9871..a09b487a912a8 100644 --- a/velox/common/base/SuccinctPrinter.h +++ b/velox/common/base/SuccinctPrinter.h @@ -15,6 +15,8 @@ */ #pragma once + +#include <cstdint> #include <string> namespace facebook::velox { diff --git a/velox/common/base/VeloxException.cpp b/velox/common/base/VeloxException.cpp index ffdb32ac853eb..0c00c461b5e95 100644 --- a/velox/common/base/VeloxException.cpp +++ b/velox/common/base/VeloxException.cpp @@ -25,7 +25,7 @@ namespace velox { std::exception_ptr toVeloxException(const std::exception_ptr& exceptionPtr) { try { std::rethrow_exception(exceptionPtr); - } catch (const VeloxException& e) { + } catch (const VeloxException&) { return exceptionPtr; } catch (const std::exception& e) { return std::make_exception_ptr( @@ -38,34 +38,41 @@ int64_t& threadNumVeloxThrow() { return numThrow; } +bool& threadSkipErrorDetails() { + thread_local bool skipErrorDetails{false}; + return skipErrorDetails; +} + ExceptionContext& getExceptionContext() { thread_local ExceptionContext context; return context; } -// Retrieves the message of the top-level ancestor of the current exception -// context. If the top-level context message is not empty and is the same as the -// current one, returns a string indicating they are the same. -std::string getTopLevelExceptionContextString( +// Traverses the context hierarchy and appends messages from all contexts that +// are marked as essential. +std::string getAdditionalExceptionContextString( VeloxException::Type exceptionType, const std::string& currentMessage) { auto* context = &getExceptionContext(); - if (context->parent && context->parent->parent) { - while (context->parent && context->parent->parent) { - context = context->parent; - } - auto topLevelMessage = context->message(exceptionType); - if (!topLevelMessage.empty() && topLevelMessage == currentMessage) { - return "Same as context."; - } else { - return topLevelMessage; + std::string additionalMessage = ""; + if (!context->parent || !context->parent->parent) { + return additionalMessage; + } + context = context->parent; + while (context->parent) { + if (context->isEssential) { + auto message = context->message(exceptionType); + if (!message.empty()) { + additionalMessage += message + " "; + } } + context = context->parent; } - - if (!currentMessage.empty()) { - return "Same as context."; + if (!additionalMessage.empty()) { + // Get rid of the extra space at the end. + additionalMessage.pop_back(); } - return ""; + return additionalMessage; } VeloxException::VeloxException( @@ -90,8 +97,8 @@ VeloxException::VeloxException( state.errorSource = errorSource; state.errorCode = errorCode; state.context = getExceptionContext().message(exceptionType); - state.topLevelContext = - getTopLevelExceptionContextString(exceptionType, state.context); + state.additionalContext = + getAdditionalExceptionContextString(exceptionType, state.context); state.isRetriable = isRetriable; })) {} @@ -114,8 +121,8 @@ VeloxException::VeloxException( state.errorSource = errorSource; state.errorCode = errorCode; state.context = getExceptionContext().message(exceptionType); - state.topLevelContext = - getTopLevelExceptionContextString(exceptionType, state.context); + state.additionalContext = + getAdditionalExceptionContextString(exceptionType, state.context); state.isRetriable = isRetriable; state.wrappedException = e; })) {} @@ -223,8 +230,8 @@ void VeloxException::State::finalize() const { elaborateMessage += "Context: " + context + "\n"; } - if (!topLevelContext.empty()) { - elaborateMessage += "Top-Level Context: " + topLevelContext + "\n"; + if (!additionalContext.empty()) { + elaborateMessage += "Additional Context: " + additionalContext + "\n"; } if (function) { diff --git a/velox/common/base/VeloxException.h b/velox/common/base/VeloxException.h index 15930f4ee49df..bb22e9a490c23 100644 --- a/velox/common/base/VeloxException.h +++ b/velox/common/base/VeloxException.h @@ -99,11 +99,23 @@ inline constexpr auto kMemAllocError = "MEM_ALLOC_ERROR"_fs; // Error caused by failing to allocate cache buffer space for IO. inline constexpr auto kNoCacheSpace = "NO_CACHE_SPACE"_fs; +// An error raised when spill bytes exceed limits. +inline constexpr auto kSpillLimitExceeded = "SPILL_LIMIT_EXCEEDED"_fs; + // Errors indicating file read corruptions. inline constexpr auto kFileCorruption = "FILE_CORRUPTION"_fs; +// Errors indicating file not found. +inline constexpr auto kFileNotFound = "FILE_NOT_FOUND"_fs; + // We do not know how to classify it yet. inline constexpr auto kUnknown = "UNKNOWN"_fs; + +// VeloxRuntimeErrors due to unsupported input values such as unicode input to +// cast-varchar-to-integer and timestamps beyond the year 2037 to datetime +// functions. This kind of error is allowed in the expression fuzzer.
+inline constexpr auto kUnsupportedInputUncatchable = + "UNSUPPORTED_INPUT_UNCATCHABLE"_fs; } // namespace error_code class VeloxException : public std::exception { @@ -201,8 +213,8 @@ class VeloxException : public std::exception { return state_->context; } - const std::string& topLevelContext() const { - return state_->topLevelContext; + const std::string& additionalContext() const { + return state_->additionalContext; } const std::exception_ptr& wrappedException() const { @@ -224,7 +236,7 @@ // The current exception context. std::string context; // The top-level ancestor of the current exception context. - std::string topLevelContext; + std::string additionalContext; bool isRetriable; // The original std::exception. std::exception_ptr wrappedException; @@ -335,6 +347,25 @@ class VeloxRuntimeError final : public VeloxException { /// Returns a reference to a thread level counter of Velox error throws. int64_t& threadNumVeloxThrow(); +/// Returns a reference to a thread-level boolean that controls whether +/// no-throw APIs include detailed error messages in Status. +bool& threadSkipErrorDetails(); + +class ScopedThreadSkipErrorDetails { + public: + ScopedThreadSkipErrorDetails(bool skip = true) + : original_{threadSkipErrorDetails()} { + threadSkipErrorDetails() = skip; + } + + ~ScopedThreadSkipErrorDetails() { + threadSkipErrorDetails() = original_; + } + + private: + bool original_; +}; + /// Holds a pointer to a function that provides additional context to be /// added to the detailed error message in case of an exception. struct ExceptionContext { @@ -347,6 +378,10 @@ struct ExceptionContext { /// Value to pass to `messageFunc`. Can be null. void* arg{nullptr}; + /// If true, the additional context in 'this' is always included when there + /// are hierarchical exception contexts. + bool isEssential{false}; + /// Pointer to the parent context when there are hierarchical exception /// contexts. ExceptionContext* parent{nullptr}; diff --git a/velox/common/base/benchmarks/BitUtilBenchmark.cpp b/velox/common/base/benchmarks/BitUtilBenchmark.cpp index 6f8ac7cefbbd2..453c61ddcaefc 100644 --- a/velox/common/base/benchmarks/BitUtilBenchmark.cpp +++ b/velox/common/base/benchmarks/BitUtilBenchmark.cpp @@ -191,7 +191,7 @@ BENCHMARK_RELATIVE_MULTI(forEachBitFirstBitFalse) { } // namespace facebook int main(int argc, char** argv) { - folly::init(&argc, &argv); + folly::Init init{&argc, &argv}; folly::runBenchmarks(); return 0; }
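A sketch of how the new isEssential flag in ExceptionContext above is meant to be used, modeled on the ExceptionTest changes further below; the context messages, function names, and the Exceptions.h entry point are illustrative assumptions.

```cpp
#include "velox/common/base/Exceptions.h"

using facebook::velox::ExceptionContextSetter;
using facebook::velox::VeloxException;

// Message callback; 'arg' points at a caller-owned description string.
std::string contextMessage(VeloxException::Type /*type*/, void* arg) {
  return *static_cast<std::string*>(arg);
}

void failWithContextForIllustration() {
  std::string queryInfo = "query my_query_id";
  // Essential frame: kept in the "Additional Context" line even when deeper,
  // non-essential frames are pushed on top of it.
  ExceptionContextSetter queryContext(
      {.messageFunc = contextMessage, .arg = &queryInfo, .isEssential = true});

  std::string operatorInfo = "operator Aggregation";
  // Innermost frame: becomes the regular "Context" line of the error.
  ExceptionContextSetter operatorContext(
      {.messageFunc = contextMessage, .arg = &operatorInfo});

  VELOX_CHECK(false, "forced failure for illustration");
}
```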
diff --git a/velox/common/base/benchmarks/SimdUtilBenchmark.cpp b/velox/common/base/benchmarks/SimdUtilBenchmark.cpp new file mode 100644 index 0000000000000..c358b673c05ae --- /dev/null +++ b/velox/common/base/benchmarks/SimdUtilBenchmark.cpp @@ -0,0 +1,94 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/base/SimdUtil.h" + +#include <folly/Benchmark.h> +#include <folly/init/Init.h> + +#include <random> + +namespace facebook::velox { +namespace { + +#define VELOX_BENCHMARK(_type, _name, ...) \ + [[maybe_unused]] _type _name(FOLLY_PP_STRINGIZE(_name), __VA_ARGS__) + +template <typename T> +class LeadingMask { + public: + LeadingMask(const char* name, std::default_random_engine& gen) { + std::uniform_int_distribution<> dist(0, xsimd::batch<T>::size + 1); + for (int i = 0; i < kSize; ++i) { + inputs_[i] = dist(gen); + } + folly::addBenchmark(__FILE__, name, [this] { return run(); }); + } + + private: + unsigned run() { + xsimd::batch_bool<T> ans = {}; + for (int i = 0; i < kSize; ++i) { + ans = ans ^ simd::leadingMask<T>(inputs_[i]); + } + folly::doNotOptimizeAway(ans); + return kSize; + } + + static constexpr int kSize = 4 << 10; + int8_t inputs_[kSize]; +}; + +template <typename T> +class FromBitMask { + public: + FromBitMask(const char* name, std::default_random_engine& gen) { + std::uniform_int_distribution<uint64_t> dist( + 0, (1ull << xsimd::batch<T>::size) - 1); + for (int i = 0; i < kSize; ++i) { + inputs_[i] = dist(gen); + } + folly::addBenchmark(__FILE__, name, [this] { return run(); }); + } + + private: + unsigned run() { + xsimd::batch_bool<T> ans = {}; + for (int i = 0; i < kSize; ++i) { + ans = ans ^ simd::fromBitMask<T>(inputs_[i]); + } + folly::doNotOptimizeAway(ans); + return kSize; + } + + static constexpr int kSize = 2 << 10; + uint64_t inputs_[kSize]; +}; + +} // namespace +} // namespace facebook::velox + +int main(int argc, char* argv[]) { + using namespace facebook::velox; + folly::Init follyInit(&argc, &argv); + std::default_random_engine gen(std::random_device{}()); + VELOX_BENCHMARK(LeadingMask<int32_t>, leadingMaskInt32, gen); + VELOX_BENCHMARK(LeadingMask<int64_t>, leadingMaskInt64, gen); + VELOX_BENCHMARK(FromBitMask<int32_t>, fromBitMaskInt32, gen); + VELOX_BENCHMARK(FromBitMask<int64_t>, fromBitMaskInt64, gen); + folly::runBenchmarks(); + return 0; +} diff --git a/velox/common/base/tests/AsyncSourceTest.cpp b/velox/common/base/tests/AsyncSourceTest.cpp index 1ed6f90893d17..657a7ba8e08e8 100644 --- a/velox/common/base/tests/AsyncSourceTest.cpp +++ b/velox/common/base/tests/AsyncSourceTest.cpp @@ -18,11 +18,13 @@ #include #include #include +#include #include #include "velox/common/base/Exceptions.h" using namespace facebook::velox; +using namespace std::chrono_literals; // A sample class to be constructed via AsyncSource. struct Gizmo { @@ -39,6 +41,7 @@ TEST(AsyncSourceTest, basic) { auto value = gizmo.move(); EXPECT_FALSE(gizmo.hasValue()); EXPECT_EQ(11, value->id); + EXPECT_EQ(1, gizmo.prepareTiming().count); AsyncSource<Gizmo> error( []() -> std::unique_ptr<Gizmo> { VELOX_USER_FAIL("Testing error"); }); @@ -111,7 +114,7 @@ TEST(AsyncSourceTest, errorsWithThreads) { std::atomic<int32_t> numErrors{0}; for (auto i = 0; i < kNumGizmos; ++i) { gizmos.push_back( - std::make_shared<AsyncSource<Gizmo>>([i]() -> std::unique_ptr<Gizmo> { + std::make_shared<AsyncSource<Gizmo>>([]() -> std::unique_ptr<Gizmo> { std::this_thread::sleep_for(std::chrono::milliseconds(1)); // NOLINT VELOX_USER_FAIL("Testing error"); })); @@ -136,7 +139,7 @@ auto gizmo = gizmos[folly::Random::rand32(rng) % gizmos.size()]->move(); EXPECT_EQ(nullptr, gizmo); - } catch (std::exception& e) { + } catch (std::exception&) { ++numErrors; } } @@ -149,4 +152,147 @@ // There will always be errors since the first to wait for any given // gizmo is sure to get an error.
EXPECT_LT(0, numErrors); + for (auto& source : gizmos) { + source->close(); + } +} + +class DataCounter { + public: + DataCounter() { + objectNumber_ = ++numCreatedDataCounters_; + } + + ~DataCounter() { + ++numDeletedDataCounters_; + } + + static uint64_t numCreatedDataCounters() { + return numCreatedDataCounters_; + } + + static uint64_t numDeletedDataCounters() { + return numDeletedDataCounters_; + } + + static void reset() { + numCreatedDataCounters_ = 0; + numDeletedDataCounters_ = 0; + } + + uint64_t objectNumber() const { + return objectNumber_; + } + + private: + static std::atomic<uint64_t> numCreatedDataCounters_; + static std::atomic<uint64_t> numDeletedDataCounters_; + + uint64_t objectNumber_{0}; +}; + +std::atomic<uint64_t> DataCounter::numCreatedDataCounters_ = 0; + +std::atomic<uint64_t> DataCounter::numDeletedDataCounters_ = 0; + +TEST(AsyncSourceTest, close) { + // If 'prepare()' is not executed within the thread pool, invoking 'close()' + // will set 'make_' to nullptr. The deletion of 'dataCounter' is used as a + // verification for this behavior. + auto dataCounter = std::make_shared<DataCounter>(); + AsyncSource<uint64_t> countAsyncSource([dataCounter]() { + return std::make_unique<uint64_t>(dataCounter->objectNumber()); + }); + dataCounter.reset(); + EXPECT_EQ(DataCounter::numCreatedDataCounters(), 1); + EXPECT_EQ(DataCounter::numDeletedDataCounters(), 0); + + countAsyncSource.close(); + EXPECT_EQ(DataCounter::numCreatedDataCounters(), 1); + EXPECT_EQ(DataCounter::numDeletedDataCounters(), 1); + DataCounter::reset(); + + // If 'prepare()' is executed within the thread pool but 'move()' is not + // invoked, invoking 'close()' will set 'item_' to nullptr. The deletion of + // 'dataCounter' is used as a verification for this behavior. + auto asyncSource = std::make_shared<AsyncSource<DataCounter>>( + []() { return std::make_unique<DataCounter>(); }); + asyncSource->prepare(); + EXPECT_EQ(DataCounter::numCreatedDataCounters(), 1); + EXPECT_EQ(DataCounter::numDeletedDataCounters(), 0); + + asyncSource->close(); + EXPECT_EQ(DataCounter::numCreatedDataCounters(), 1); + EXPECT_EQ(DataCounter::numDeletedDataCounters(), 1); + DataCounter::reset(); + + // If 'prepare()' is currently being executed within the thread pool, + // 'close()' should wait for the completion of 'prepare()' and set 'item_' to + // nullptr. + folly::Baton<> baton; + auto sleepAsyncSource = + std::make_shared<AsyncSource<DataCounter>>([&baton]() { + baton.post(); + return std::make_unique<DataCounter>(); + }); + auto thread1 = + std::thread([&sleepAsyncSource] { sleepAsyncSource->prepare(); }); + EXPECT_TRUE(baton.try_wait_for(1s)); + sleepAsyncSource->close(); + EXPECT_EQ(DataCounter::numCreatedDataCounters(), 1); + EXPECT_EQ(DataCounter::numDeletedDataCounters(), 1); + thread1.join(); +}
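For orientation, the lifecycle these tests exercise looks roughly like this from calling code; the header path and executor wiring are illustrative assumptions, and only the AsyncSource prepare()/move()/close() API shown in this file is assumed.

```cpp
#include <folly/executors/CPUThreadPoolExecutor.h>

#include <memory>
#include <string>

#include "velox/common/base/AsyncSource.h"

using facebook::velox::AsyncSource;

void asyncSourceLifecycleForIllustration() {
  auto source = std::make_shared<AsyncSource<std::string>>(
      []() { return std::make_unique<std::string>("expensive result"); });

  // A background thread may run the factory ahead of time...
  folly::CPUThreadPoolExecutor executor(1);
  executor.add([source]() { source->prepare(); });

  // ...while the consumer blocks on the prepared item, or makes it inline if
  // no one has started it yet.
  std::unique_ptr<std::string> item = source->move();

  // close() is the cleanup path: it drops the factory (and any unconsumed
  // item) so captured state is released even if move() is never called.
  source->close();
}
```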
+ +void verifyContexts( + const std::string& expectedPoolName, + const std::string& expectedTaskId) { + EXPECT_EQ(process::GetThreadDebugInfo()->taskId_, expectedTaskId); +} + +TEST(AsyncSourceTest, emptyContexts) { + EXPECT_EQ(process::GetThreadDebugInfo(), nullptr); + + AsyncSource<bool> src([]() { + // The Contexts at the time this was created were null so we should inherit + // them from the caller. + verifyContexts("test", "task_id"); + + return std::make_unique<bool>(true); + }); + + process::ThreadDebugInfo debugInfo{"query_id", "task_id", nullptr}; + process::ScopedThreadDebugInfo scopedDebugInfo(debugInfo); + + verifyContexts("test", "task_id"); + + ASSERT_TRUE(*src.move()); + + verifyContexts("test", "task_id"); +} + +TEST(AsyncSourceTest, setContexts) { + process::ThreadDebugInfo debugInfo1{"query_id1", "task_id1", nullptr}; + + std::unique_ptr<AsyncSource<bool>> src; + process::ScopedThreadDebugInfo scopedDebugInfo1(debugInfo1); + + verifyContexts("test1", "task_id1"); + + src = std::make_unique<AsyncSource<bool>>(([]() { + // The Contexts at the time this was created were set so we should have + // the same contexts when this is executed. + verifyContexts("test1", "task_id1"); + + return std::make_unique<bool>(true); + })); + + process::ThreadDebugInfo debugInfo2{"query_id2", "task_id2", nullptr}; + process::ScopedThreadDebugInfo scopedDebugInfo2(debugInfo2); + + verifyContexts("test2", "task_id2"); + + ASSERT_TRUE(*src->move()); + + verifyContexts("test2", "task_id2"); } diff --git a/velox/common/base/tests/BitUtilTest.cpp b/velox/common/base/tests/BitUtilTest.cpp index 36d63f0dd1a81..c0c31464d3b51 100644 --- a/velox/common/base/tests/BitUtilTest.cpp +++ b/velox/common/base/tests/BitUtilTest.cpp @@ -461,12 +461,29 @@ TEST_F(BitUtilTest, forEachBit) { } TEST_F(BitUtilTest, hash) { - std::unordered_set<uint64_t> hashes; - const char* text = "Forget the night, come live with us in forests of azure"; - for (int32_t i = 0; i < strlen(text); ++i) { - hashes.insert(hashBytes(1, text, i)); + std::unordered_map<uint64_t, int32_t> hashes; + std::string text = + "Forget the night, come live with us in forests of azure, " + "for we have constructed pyramids in honor of our escaping..."; + for (int32_t i = 0; i < text.size(); ++i) { + // Starts hashing at unaligned addresses. + int32_t offset = i > 3 && i < text.size() - 3 ? i % 3 : 0; + auto hash = hashBytes(1, text.data() + offset, i); + if (i + offset < text.size() - 1) { + ++text[i + offset]; + // Change the first byte after the hashed range and check that the hash + // function does not overread its range.
+ EXPECT_EQ(hash, hashBytes(1, text.data() + offset, i)); + --text[i + offset]; + } + auto it = hashes.find(hash); + if (it == hashes.end()) { + hashes[hash] = i; + } else { + EXPECT_TRUE(false) << "Duplicate hash at " << i; + } } - EXPECT_EQ(hashes.size(), strlen(text)); + EXPECT_EQ(hashes.size(), text.size()); } TEST_F(BitUtilTest, nextPowerOfTwo) { @@ -482,6 +499,9 @@ TEST_F(BitUtilTest, nextPowerOfTwo) { EXPECT_EQ(nextPowerOfTwo(31), 32); EXPECT_EQ(nextPowerOfTwo(32), 32); EXPECT_EQ(nextPowerOfTwo(33), 64); + EXPECT_EQ(nextPowerOfTwo(1ULL << 32), 1ULL << 32); + EXPECT_EQ(nextPowerOfTwo((1ULL << 32) + 1), 1ULL << 33); + EXPECT_EQ(nextPowerOfTwo((1ULL << 62) + 1), 1ULL << 63); } TEST_F(BitUtilTest, isPowerOfTwo) { @@ -833,6 +853,84 @@ TEST_F(BitUtilTest, countLeadingZeros) { EXPECT_EQ( countLeadingZeros<__uint128_t>(HugeInt::build(0x08FFFFFFFFFFFFFF, 0)), 4); } + +TEST_F(BitUtilTest, storeBitsToByte) { + uint8_t bytes[3]{}; + storeBitsToByte<8>(0xAA, bytes, 0); + ASSERT_EQ(bytes[0], 0xAA); + ASSERT_EQ(bytes[1], 0); + ASSERT_EQ(bytes[2], 0); + storeBitsToByte<4>(0x5, bytes, 8); + ASSERT_EQ(bytes[0], 0xAA); + ASSERT_EQ(bytes[1], 0x5); + ASSERT_EQ(bytes[2], 0); + storeBitsToByte<4>(0xA, bytes, 12); + ASSERT_EQ(bytes[0], 0xAA); + ASSERT_EQ(bytes[1], 0xA5); + ASSERT_EQ(bytes[2], 0); +} + +TEST_F(BitUtilTest, roundUp) { + struct { + uint64_t value; + uint64_t factor; + uint64_t expected; + + std::string debugString() const { + return fmt::format( + "value: {}, factor: {}, expected: {}", value, factor, expected); + } + } testSettings[] = { + {10, 1, 10}, + {10, 3, 12}, + {10, 4, 12}, + {10, 10, 10}, + {10, 11, 11}, + {10, 20, 20}, + {11, 1, 11}, + {11, 3, 12}, + {11, 4, 12}, + {11, 11, 11}, + {11, 12, 12}, + {11, 23, 23}}; + + for (const auto& testData : testSettings) { + SCOPED_TRACE(testData.debugString()); + ASSERT_EQ( + bits::roundUp(testData.value, testData.factor), testData.expected); + } +} + +TEST_F(BitUtilTest, divRoundUp) { + struct { + uint64_t value; + uint64_t factor; + uint64_t expected; + + std::string debugString() const { + return fmt::format( + "value: {}, factor: {}, expected: {}", value, factor, expected); + } + } testSettings[] = { + {10, 1, 10}, + {10, 3, 4}, + {10, 4, 3}, + {10, 10, 1}, + {10, 11, 1}, + {10, 20, 1}, + {11, 1, 11}, + {11, 3, 4}, + {11, 4, 3}, + {11, 11, 1}, + {11, 12, 1}, + {11, 23, 1}}; + + for (const auto& testData : testSettings) { + SCOPED_TRACE(testData.debugString()); + ASSERT_EQ( + bits::divRoundUp(testData.value, testData.factor), testData.expected); + } +} } // namespace bits } // namespace velox } // namespace facebook diff --git a/velox/common/base/tests/CMakeLists.txt b/velox/common/base/tests/CMakeLists.txt index 4731d686db904..51fa95a14f95f 100644 --- a/velox/common/base/tests/CMakeLists.txt +++ b/velox/common/base/tests/CMakeLists.txt @@ -18,31 +18,42 @@ add_executable( BitUtilTest.cpp BloomFilterTest.cpp CoalesceIoTest.cpp + ConcurrentCounterTest.cpp ExceptionTest.cpp FsTest.cpp RangeTest.cpp RawVectorTest.cpp RuntimeMetricsTest.cpp ScopedLockTest.cpp + ScratchTest.cpp SemaphoreTest.cpp SimdUtilTest.cpp + SpillConfigTest.cpp + SpillStatsTest.cpp StatsReporterTest.cpp + StatusTest.cpp SuccinctPrinterTest.cpp) add_test(velox_base_test velox_base_test) target_link_libraries( velox_base_test - PRIVATE velox_common_base - velox_exception - velox_temp_path - Boost::filesystem - Boost::headers - Folly::folly - fmt::fmt - gflags::gflags - gtest - gtest_main) + PRIVATE + velox_caching + velox_common_base + velox_memory + velox_time + velox_status + 
velox_exception + velox_temp_path + Boost::filesystem + Boost::headers + Folly::folly + fmt::fmt + gflags::gflags + GTest::gtest + GTest::gmock + GTest::gtest_main) add_executable(velox_id_map_test IdMapTest.cpp) @@ -56,11 +67,16 @@ target_link_libraries( Boost::headers gflags::gflags glog::glog - gtest - gtest_main + GTest::gtest + GTest::gtest_main pthread) add_executable(velox_memcpy_meter Memcpy.cpp) target_link_libraries( - velox_memcpy_meter PRIVATE velox_common_base velox_exception velox_time - Folly::folly gflags::gflags) + velox_memcpy_meter + PRIVATE + velox_common_base + velox_exception + velox_time + Folly::folly + gflags::gflags) diff --git a/velox/common/base/tests/ConcurrentCounterTest.cpp b/velox/common/base/tests/ConcurrentCounterTest.cpp new file mode 100644 index 0000000000000..5b5975a2389fe --- /dev/null +++ b/velox/common/base/tests/ConcurrentCounterTest.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/base/ConcurrentCounter.h" + +#include <fmt/format.h> +#include <folly/Random.h> +#include <gtest/gtest.h> +#include <thread> +#include "velox/common/base/tests/GTestUtils.h" + +namespace facebook::velox::common::test { + +class ConcurrentCounterTest : public testing::TestWithParam<bool> { + protected: + void SetUp() override { + setupCounter(); + } + + void update(int64_t delta) { + if (useUpdateFn_) { + counter_->update( + delta, [&](int64_t& counter, int64_t delta, std::mutex& lock) { + std::lock_guard<std::mutex> l(lock); + counter += delta; + return true; + }); + } else { + counter_->update(delta); + } + } + + int64_t read() const { + return counter_->read(); + } + + void setupCounter() { + counter_ = std::make_unique<ConcurrentCounter<int64_t>>( + std::thread::hardware_concurrency()); + } + + const bool useUpdateFn_{GetParam()}; + + std::unique_ptr<ConcurrentCounter<int64_t>> counter_; +}; + +TEST_P(ConcurrentCounterTest, basic) { + ASSERT_EQ(read(), 0); + update(1); + ASSERT_EQ(read(), 1); + update(1); + ASSERT_EQ(read(), 2); + update(-1); + ASSERT_EQ(read(), 1); + update(-3); + ASSERT_EQ(read(), -2); +} + +TEST_P(ConcurrentCounterTest, multithread) { + const int32_t numUpdatesPerThread = 5'000; + std::vector<int> numThreads; + numThreads.push_back(1); + numThreads.push_back(std::thread::hardware_concurrency()); + numThreads.push_back(std::thread::hardware_concurrency() * 2); + for (int numThreads : numThreads) { + SCOPED_TRACE(fmt::format("numThreads: {}", numThreads)); + counter_->testingClear(); + ASSERT_EQ(counter_->read(), 0); + + std::vector<std::thread> threads; + threads.reserve(numThreads); + std::vector<int64_t> counts(numThreads, 0); + for (size_t i = 0; i < numThreads; ++i) { + ASSERT_EQ(counts[i], 0); + threads.emplace_back([&, i]() { + folly::Random::DefaultGenerator rng; + rng.seed(1234 + i); + ASSERT_EQ(counts[i], 0); + for (int j = 0; j < numUpdatesPerThread; ++j) { + const int delta = folly::Random::rand32(rng); + counts[i] += delta; + update(delta); + } + }); + } + + for (auto& th : threads) { + th.join(); + } + int64_t expectedCount{0}; + for (int i = 0; i < numThreads; ++i) {
expectedCount += counts[i]; + } + ASSERT_EQ(read(), expectedCount); + } +} + +VELOX_INSTANTIATE_TEST_SUITE_P( + ConcurrentCounterTestSuite, + ConcurrentCounterTest, + testing::ValuesIn({false, true})); + +} // namespace facebook::velox::common::test diff --git a/velox/common/base/tests/ExceptionTest.cpp b/velox/common/base/tests/ExceptionTest.cpp index 4e5dd6dbaa54d..9386b8cb672e0 100644 --- a/velox/common/base/tests/ExceptionTest.cpp +++ b/velox/common/base/tests/ExceptionTest.cpp @@ -583,11 +583,13 @@ TEST(ExceptionTest, context) { }; { - // Create multi-layer contexts. + // Create multi-layer contexts with top level marked as essential. MessageFunctionArg topLevelTroubleshootingAid{ "Top-level troubleshooting aid.", &callCount}; - facebook::velox::ExceptionContextSetter topLevelContext( - {messageFunction, &topLevelTroubleshootingAid}); + facebook::velox::ExceptionContextSetter additionalContext( + {.messageFunc = messageFunction, + .arg = &topLevelTroubleshootingAid, + .isEssential = true}); MessageFunctionArg midLevelTroubleshootingAid{ "Mid-level troubleshooting aid.", &callCount}; @@ -608,7 +610,7 @@ TEST(ExceptionTest, context) { "\nRetriable: False" "\nExpression: 1 == 3" "\nContext: System error: Inner-level troubleshooting aid." - "\nTop-Level Context: System error: Top-level troubleshooting aid." + "\nAdditional Context: System error: Top-level troubleshooting aid." "\nFunction: operator()" "\nFile: "); @@ -623,13 +625,164 @@ TEST(ExceptionTest, context) { "\nRetriable: False" "\nExpression: 1 == 3" "\nContext: User error: Inner-level troubleshooting aid." - "\nTop-Level Context: User error: Top-level troubleshooting aid." + "\nAdditional Context: User error: Top-level troubleshooting aid." "\nFunction: operator()" "\nFile: "); EXPECT_EQ(4, callCount); } + { + callCount = 0; + // Create multi-layer contexts with middle level marked as essential. + MessageFunctionArg topLevelTroubleshootingAid{ + "Top-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter additionalContext( + {.messageFunc = messageFunction, .arg = &topLevelTroubleshootingAid}); + + MessageFunctionArg midLevelTroubleshootingAid{ + "Mid-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter midLevelContext( + {.messageFunc = messageFunction, + .arg = &midLevelTroubleshootingAid, + .isEssential = true}); + + MessageFunctionArg innerLevelTroubleshootingAid{ + "Inner-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter innerLevelContext( + {messageFunction, &innerLevelTroubleshootingAid}); + + verifyVeloxException( + [&]() { VELOX_CHECK_EQ(1, 3); }, + "Exception: VeloxRuntimeError" + "\nError Source: RUNTIME" + "\nError Code: INVALID_STATE" + "\nReason: (1 vs. 3)" + "\nRetriable: False" + "\nExpression: 1 == 3" + "\nContext: System error: Inner-level troubleshooting aid." + "\nAdditional Context: System error: Mid-level troubleshooting aid." + "\nFunction: operator()" + "\nFile: "); + + EXPECT_EQ(2, callCount); + + verifyVeloxException( + [&]() { VELOX_USER_CHECK_EQ(1, 3); }, + "Exception: VeloxUserError" + "\nError Source: USER" + "\nError Code: INVALID_ARGUMENT" + "\nReason: (1 vs. 3)" + "\nRetriable: False" + "\nExpression: 1 == 3" + "\nContext: User error: Inner-level troubleshooting aid." + "\nAdditional Context: User error: Mid-level troubleshooting aid." + "\nFunction: operator()" + "\nFile: "); + + EXPECT_EQ(4, callCount); + } + + { + callCount = 0; + // Create multi-layer contexts with none marked as essential. 
+ MessageFunctionArg topLevelTroubleshootingAid{ + "Top-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter additionalContext( + {.messageFunc = messageFunction, .arg = &topLevelTroubleshootingAid}); + + MessageFunctionArg midLevelTroubleshootingAid{ + "Mid-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter midLevelContext( + {.messageFunc = messageFunction, .arg = &midLevelTroubleshootingAid}); + + MessageFunctionArg innerLevelTroubleshootingAid{ + "Inner-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter innerLevelContext( + {messageFunction, &innerLevelTroubleshootingAid}); + + verifyVeloxException( + [&]() { VELOX_CHECK_EQ(1, 3); }, + "Exception: VeloxRuntimeError" + "\nError Source: RUNTIME" + "\nError Code: INVALID_STATE" + "\nReason: (1 vs. 3)" + "\nRetriable: False" + "\nExpression: 1 == 3" + "\nContext: System error: Inner-level troubleshooting aid." + "\nFunction: operator()" + "\nFile: "); + + EXPECT_EQ(1, callCount); + + verifyVeloxException( + [&]() { VELOX_USER_CHECK_EQ(1, 3); }, + "Exception: VeloxUserError" + "\nError Source: USER" + "\nError Code: INVALID_ARGUMENT" + "\nReason: (1 vs. 3)" + "\nRetriable: False" + "\nExpression: 1 == 3" + "\nContext: User error: Inner-level troubleshooting aid." + "\nFunction: operator()" + "\nFile: "); + + EXPECT_EQ(2, callCount); + } + + { + callCount = 0; + // Create multi-layer contexts with all ancestors marked as essential. + MessageFunctionArg topLevelTroubleshootingAid{ + "Top-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter additionalContext( + {.messageFunc = messageFunction, + .arg = &topLevelTroubleshootingAid, + .isEssential = true}); + + MessageFunctionArg midLevelTroubleshootingAid{ + "Mid-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter midLevelContext( + {.messageFunc = messageFunction, + .arg = &midLevelTroubleshootingAid, + .isEssential = true}); + + MessageFunctionArg innerLevelTroubleshootingAid{ + "Inner-level troubleshooting aid.", &callCount}; + facebook::velox::ExceptionContextSetter innerLevelContext( + {messageFunction, &innerLevelTroubleshootingAid}); + + verifyVeloxException( + [&]() { VELOX_CHECK_EQ(1, 3); }, + "Exception: VeloxRuntimeError" + "\nError Source: RUNTIME" + "\nError Code: INVALID_STATE" + "\nReason: (1 vs. 3)" + "\nRetriable: False" + "\nExpression: 1 == 3" + "\nContext: System error: Inner-level troubleshooting aid." + "\nAdditional Context: System error: Mid-level troubleshooting aid. System error: Top-level troubleshooting aid." + "\nFunction: operator()" + "\nFile: "); + + EXPECT_EQ(3, callCount); + + verifyVeloxException( + [&]() { VELOX_USER_CHECK_EQ(1, 3); }, + "Exception: VeloxUserError" + "\nError Source: USER" + "\nError Code: INVALID_ARGUMENT" + "\nReason: (1 vs. 3)" + "\nRetriable: False" + "\nExpression: 1 == 3" + "\nContext: User error: Inner-level troubleshooting aid." + "\nAdditional Context: User error: Mid-level troubleshooting aid. User error: Top-level troubleshooting aid." + "\nFunction: operator()" + "\nFile: "); + + EXPECT_EQ(6, callCount); + } + // Different context. { callCount = 0; @@ -649,7 +802,6 @@ TEST(ExceptionTest, context) { "\nRetriable: False" "\nExpression: 1 == 3" "\nContext: System error: Debugging info." - "\nTop-Level Context: Same as context." 
"\nFunction: operator()" "\nFile: "); @@ -664,7 +816,6 @@ TEST(ExceptionTest, context) { "\nRetriable: False" "\nExpression: 1 == 3" "\nContext: User error: Debugging info." - "\nTop-Level Context: Same as context." "\nFunction: operator()" "\nFile: "); @@ -709,7 +860,6 @@ TEST(ExceptionTest, context) { "\nRetriable: False" "\nExpression: 1 == 3" "\nContext: Failed to produce additional context." - "\nTop-Level Context: Same as context." "\nFunction: operator()" "\nFile: "); @@ -743,7 +893,7 @@ TEST(ExceptionTest, wrappedException) { ASSERT_EQ(ve.message(), "This is a test."); ASSERT_TRUE(ve.isUserError()); ASSERT_EQ(ve.context(), ""); - ASSERT_EQ(ve.topLevelContext(), ""); + ASSERT_EQ(ve.additionalContext(), ""); ASSERT_THROW( std::rethrow_exception(ve.wrappedException()), std::invalid_argument); } @@ -755,7 +905,7 @@ TEST(ExceptionTest, wrappedException) { ASSERT_EQ(ve.message(), "This is a test."); ASSERT_FALSE(ve.isUserError()); ASSERT_EQ(ve.context(), ""); - ASSERT_EQ(ve.topLevelContext(), ""); + ASSERT_EQ(ve.additionalContext(), ""); ASSERT_THROW( std::rethrow_exception(ve.wrappedException()), std::invalid_argument); } @@ -784,7 +934,7 @@ TEST(ExceptionTest, wrappedExceptionWithContext) { std::string data = "lakes"; facebook::velox::ExceptionContextSetter context( - {messageFunction, data.data()}); + {messageFunction, data.data(), true}); try { throw std::invalid_argument("This is a test."); @@ -793,7 +943,7 @@ TEST(ExceptionTest, wrappedExceptionWithContext) { ASSERT_EQ(ve.message(), "This is a test."); ASSERT_TRUE(ve.isUserError()); ASSERT_EQ(ve.context(), "User error: lakes"); - ASSERT_EQ(ve.topLevelContext(), "Same as context."); + ASSERT_EQ(ve.additionalContext(), ""); ASSERT_THROW( std::rethrow_exception(ve.wrappedException()), std::invalid_argument); } @@ -805,7 +955,7 @@ TEST(ExceptionTest, wrappedExceptionWithContext) { ASSERT_EQ(ve.message(), "This is a test."); ASSERT_FALSE(ve.isUserError()); ASSERT_EQ(ve.context(), "System error: lakes"); - ASSERT_EQ(ve.topLevelContext(), "Same as context."); + ASSERT_EQ(ve.additionalContext(), ""); ASSERT_THROW( std::rethrow_exception(ve.wrappedException()), std::invalid_argument); } @@ -821,7 +971,7 @@ TEST(ExceptionTest, wrappedExceptionWithContext) { ASSERT_EQ(ve.message(), "This is a test."); ASSERT_TRUE(ve.isUserError()); ASSERT_EQ(ve.context(), "User error: mountains"); - ASSERT_EQ(ve.topLevelContext(), "User error: lakes"); + ASSERT_EQ(ve.additionalContext(), "User error: lakes"); ASSERT_THROW( std::rethrow_exception(ve.wrappedException()), std::invalid_argument); } @@ -833,7 +983,7 @@ TEST(ExceptionTest, wrappedExceptionWithContext) { ASSERT_EQ(ve.message(), "This is a test."); ASSERT_FALSE(ve.isUserError()); ASSERT_EQ(ve.context(), "System error: mountains"); - ASSERT_EQ(ve.topLevelContext(), "System error: lakes"); + ASSERT_EQ(ve.additionalContext(), "System error: lakes"); ASSERT_THROW( std::rethrow_exception(ve.wrappedException()), std::invalid_argument); } diff --git a/velox/common/base/tests/FsTest.cpp b/velox/common/base/tests/FsTest.cpp index a9dbd822c0faf..10e4dafee6792 100644 --- a/velox/common/base/tests/FsTest.cpp +++ b/velox/common/base/tests/FsTest.cpp @@ -24,7 +24,8 @@ namespace facebook::velox::common { class FsTest : public testing::Test {}; TEST_F(FsTest, createDirectory) { - auto rootPath = exec::test::TempDirectoryPath::createTempDirectory(); + auto dir = exec::test::TempDirectoryPath::create(); + auto rootPath = dir->getPath(); auto tmpDirectoryPath = rootPath + "/first/second/third"; // First time should 
generate directory successfully. EXPECT_FALSE(fs::exists(tmpDirectoryPath.c_str())); @@ -34,7 +35,7 @@ TEST_F(FsTest, createDirectory) { // Directory already exist, not creating but should return success. EXPECT_TRUE(generateFileDirectory(tmpDirectoryPath.c_str())); EXPECT_TRUE(fs::exists(tmpDirectoryPath.c_str())); - boost::filesystem::remove_all(rootPath); + dir.reset(); EXPECT_FALSE(fs::exists(rootPath.c_str())); } diff --git a/velox/common/base/tests/GTestUtils.h b/velox/common/base/tests/GTestUtils.h index dbb7a213585a5..0ed61bf9b15d9 100644 --- a/velox/common/base/tests/GTestUtils.h +++ b/velox/common/base/tests/GTestUtils.h @@ -32,16 +32,69 @@ #define VELOX_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P #endif -#define VELOX_ASSERT_THROW(expression, errorMessage) \ - try { \ - (expression); \ - FAIL() << "Expected an exception"; \ - } catch (const facebook::velox::VeloxException& e) { \ - ASSERT_TRUE(e.message().find(errorMessage) != std::string::npos) \ - << "Expected error message to contain '" << errorMessage \ - << "', but received '" << e.message() << "'."; \ +// The void static cast supresses the "unused expression result" warning in +// clang. +#define VELOX_ASSERT_THROW_IMPL(_type, _expression, _errorMessage) \ + try { \ + static_cast(_expression); \ + FAIL() << "Expected an exception"; \ + } catch (const _type& e) { \ + ASSERT_TRUE(e.message().find(_errorMessage) != std::string::npos) \ + << "Expected error message to contain '" << (_errorMessage) \ + << "', but received '" << e.message() << "'."; \ } +#define VELOX_ASSERT_THROW(_expression, _errorMessage) \ + VELOX_ASSERT_THROW_IMPL( \ + facebook::velox::VeloxException, _expression, _errorMessage) + +#define VELOX_ASSERT_USER_THROW(_expression, _errorMessage) \ + VELOX_ASSERT_THROW_IMPL( \ + facebook::velox::VeloxUserError, _expression, _errorMessage) + +#define VELOX_ASSERT_RUNTIME_THROW(_expression, _errorMessage) \ + VELOX_ASSERT_THROW_IMPL( \ + facebook::velox::VeloxRuntimeError, _expression, _errorMessage) + +#define VELOX_ASSERT_ERROR_STATUS(_expression, _statusCode, _errorMessage) \ + const auto status = (_expression); \ + ASSERT_TRUE(status.code() == _statusCode) \ + << "Expected error code to be '" << toString(_statusCode) \ + << "', but received '" << toString(status.code()) << "'."; \ + ASSERT_TRUE(status.message().find(_errorMessage) != std::string::npos) \ + << "Expected error message to contain '" << (_errorMessage) \ + << "', but received '" << status.message() << "'." 
+ +#define VELOX_ASSERT_ERROR_CODE_IMPL( \ + _type, _expression, _errorCode, _errorMessage) \ + try { \ + (_expression); \ + FAIL() << "Expected an exception"; \ + } catch (const _type& e) { \ + ASSERT_TRUE(e.errorCode() == _errorCode) \ + << "Expected error code to be '" << _errorCode << "', but received '" \ + << e.errorCode() << "'."; \ + ASSERT_TRUE(e.message().find(_errorMessage) != std::string::npos) \ + << "Expected error message to contain '" << (_errorMessage) \ + << "', but received '" << e.message() << "'."; \ + } + +#define VELOX_ASSERT_THROW_CODE(_expression, _errorCode, _errorMessage) \ + VELOX_ASSERT_ERROR_CODE_IMPL( \ + facebook::velox::VeloxException, _expression, _errorCode, _errorMessage) + +#define VELOX_ASSERT_USER_THROW_CODE(_expression, _errorCode, _errorMessage) \ + VELOX_ASSERT_ERROR_CODE_IMPL( \ + facebook::velox::VeloxUserError, _expression, _errorCode, _errorMessage) + +#define VELOX_ASSERT_RUNTIME_THROW_CODE( \ + _expression, _errorCode, _errorMessage) \ + VELOX_ASSERT_ERROR_CODE_IMPL( \ + facebook::velox::VeloxRuntimeError, \ + _expression, \ + _errorCode, \ + _errorMessage) + #ifndef NDEBUG #define DEBUG_ONLY_TEST(test_fixture, test_name) TEST(test_fixture, test_name) #define DEBUG_ONLY_TEST_F(test_fixture, test_name) \ diff --git a/velox/common/base/tests/IdMapTest.cpp b/velox/common/base/tests/IdMapTest.cpp index e57ef50f81c19..3b0b157321f63 100644 --- a/velox/common/base/tests/IdMapTest.cpp +++ b/velox/common/base/tests/IdMapTest.cpp @@ -76,8 +76,12 @@ class IdMapTest : public testing::Test { int64_t n4; }; + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + void SetUp() override { - root_ = memory::MemoryManager::getInstance().addRootPool("IdMapRoot"); + root_ = memory::memoryManager()->addRootPool("IdMapRoot"); pool_ = root_->addLeafChild("IdMapLeakLeaf"); } diff --git a/velox/common/base/tests/Memcpy.cpp b/velox/common/base/tests/Memcpy.cpp index 236612db45e20..2ab51f87815e1 100644 --- a/velox/common/base/tests/Memcpy.cpp +++ b/velox/common/base/tests/Memcpy.cpp @@ -41,10 +41,10 @@ uint64_t sum(uint64_t* data, int32_t size) { } struct CopyCallable { - void* FOLLY_NULLABLE source; - void* FOLLY_NULLABLE destination; + void* source; + void* destination; int64_t size; - Semaphore* FOLLY_NULLABLE sem; + Semaphore* sem; void operator()() { if (FLAGS_system_memcpy) { @@ -58,7 +58,7 @@ struct CopyCallable { int main(int argc, char** argv) { constexpr int32_t kAlignment = folly::hardware_destructive_interference_size; - folly::init(&argc, &argv); + folly::Init init{&argc, &argv}; auto chunk = bits::roundUp( std::max(FLAGS_bytes / FLAGS_threads, kAlignment), kAlignment); int64_t bytes = chunk * FLAGS_threads; diff --git a/velox/common/base/tests/PointersTest.cpp b/velox/common/base/tests/PointersTest.cpp new file mode 100644 index 0000000000000..4c016dfae6dc1 --- /dev/null +++ b/velox/common/base/tests/PointersTest.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/base/Pointers.h" + +#include +#include "velox/common/base/tests/GTestUtils.h" + +using namespace facebook::velox; +namespace { +TEST(PointersTest, uniquePtrConvert) { + class Foo1 { + public: + explicit Foo1(int32_t a) : a_(a) {} + virtual ~Foo1() = default; + + int32_t a() const { + return a_; + } + + protected: + int32_t a_; + }; + + class Foo2 : public Foo1 { + public: + Foo2(int32_t a, int32_t b) : Foo1(a), b_(b) {} + + int32_t b() const { + return b_; + } + + private: + int32_t b_; + }; + + Foo2* rawFoo2 = new Foo2(10, 20); + ASSERT_EQ(rawFoo2->a(), 10); + ASSERT_EQ(rawFoo2->b(), 20); + std::unique_ptr foo1(rawFoo2); + std::unique_ptr foo2; + castUniquePointer(std::move(foo1), foo2); + ASSERT_TRUE(foo2 != nullptr); + ASSERT_EQ(foo2->a(), 10); + ASSERT_EQ(foo2->b(), 20); + + class Foo3 { + public: + explicit Foo3(int32_t c) : c_(c) {} + + private: + int32_t c_; + }; + std::unique_ptr foo3; + + ASSERT_ANY_THROW(castUniquePointer(std::move(foo2), foo3)); + ASSERT_TRUE(foo3 == nullptr); +} +} // namespace diff --git a/velox/common/base/tests/RangeTest.cpp b/velox/common/base/tests/RangeTest.cpp index e3aa5ccbb6d04..a5d3507644325 100644 --- a/velox/common/base/tests/RangeTest.cpp +++ b/velox/common/base/tests/RangeTest.cpp @@ -18,30 +18,35 @@ #include -namespace facebook { -namespace velox { +namespace facebook::velox { TEST(RangeTest, ranges) { std::vector bits(10); uint64_t* data = &bits[0]; + Range readable(data, 11, 511); MutableRange writable(data, 9, 509); Range readableBytes(&bits[0], 1, 79); MutableRange writableBytes(&bits[0], 0, 80); - // Bit 13 appears as bit 2 in readable and as bit 4 in writable. + + // Bit 13 appears as bit 2 in 'readable' and as bit 4 in 'writable'. bits::setBit(data, 13); EXPECT_TRUE(readable[2]); EXPECT_TRUE(writable[4]); + + // Bit 21 appears as bit 11 in 'readable' and as bit 13 in 'writable'. writable[13] = true; EXPECT_TRUE(readable[11]); + writable[13] = false; EXPECT_FALSE(readable[11]); + // Byte 10 (ie, bit[80~87]), which corresponds to byte 9 in 'readableBytes' + // and bit[69~76] in 'readable'. writableBytes[10] = 123; EXPECT_EQ(readableBytes[9], 123); // Bit 80 is set. EXPECT_TRUE(readable[69]); } -} // namespace velox -} // namespace facebook +} // namespace facebook::velox diff --git a/velox/common/base/tests/ScratchTest.cpp b/velox/common/base/tests/ScratchTest.cpp new file mode 100644 index 0000000000000..36bdd2da2f814 --- /dev/null +++ b/velox/common/base/tests/ScratchTest.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/common/base/Scratch.h" + +#include + +using namespace facebook::velox; + +TEST(ScratchTest, basic) { + Scratch scratch; + { + ScratchPtr ints(scratch); + ScratchPtr longs(scratch); + auto tempInts = ints.get(1000); + auto tempLongs = longs.get(2000); + std::fill(tempInts, tempInts + 1000, -1); + std::fill(tempLongs, tempLongs + 2000, -1); + EXPECT_EQ(0, scratch.retainedSize()); + } + EXPECT_EQ(20352, scratch.retainedSize()); + { + ScratchPtr ints(scratch); + ScratchPtr longs(scratch); + auto tempLongs = longs.get(2000); + auto tempInts = ints.get(1000); + std::fill(tempInts, tempInts + 1000, -1); + std::fill(tempInts, tempInts + 2000, -1); + EXPECT_EQ(0, scratch.retainedSize()); + } + // The scratch vectors were acquired in a different order, so the smaller got + // resized to the larger size. + EXPECT_EQ(32640, scratch.retainedSize()); + scratch.trim(); + EXPECT_EQ(0, scratch.retainedSize()); + { + ScratchPtr ints(scratch); + // The size is the inline size, nothing gets returned to 'scratch'. + auto temp = ints.get(10); + temp[0] = 1; + } + EXPECT_EQ(0, scratch.retainedSize()); +} + +TEST(ScratchTest, large) { + constexpr int32_t kSize = 100; + Scratch scratch; + std::vector>> pointers; + for (auto i = 0; i < kSize; ++i) { + pointers.push_back(std::make_unique>(scratch)); + pointers.back()->get(1000); + } + pointers.clear(); + // 100 times 1000 bytes returned. + EXPECT_LT(100'000, scratch.retainedSize()); + for (auto i = 0; i < kSize; ++i) { + pointers.push_back(std::make_unique>(scratch)); + pointers.back()->get(1000); + } + EXPECT_EQ(0, scratch.retainedSize()); +} diff --git a/velox/common/base/tests/SimdUtilTest.cpp b/velox/common/base/tests/SimdUtilTest.cpp index ba481f11d0485..ba389780b1cba 100644 --- a/velox/common/base/tests/SimdUtilTest.cpp +++ b/velox/common/base/tests/SimdUtilTest.cpp @@ -16,6 +16,8 @@ #include "velox/common/base/SimdUtil.h" #include +#include "velox/common/base/RawVector.h" +#include "velox/common/time/Timer.h" #include @@ -207,6 +209,58 @@ TEST_F(SimdUtilTest, gatherBits) { EXPECT_EQ(bits::isBitSet(&bits, i), bits::isBitSet(data, vindex.get(i))); } EXPECT_FALSE(bits::isBitSet(&bits, N - 1)); + + uint64_t source = 0x123456789abcdefLU; + raw_vector bitIndices; + uint64_t result = 0; + for (auto i = 61; i >= 0; i -= 2) { + bitIndices.push_back(i); + } + simd::gatherBits( + &source, + folly::Range(bitIndices.data(), bitIndices.size()), + &result); + for (auto i = 0; i < bitIndices.size(); ++i) { + EXPECT_EQ( + bits::isBitSet(&source, bitIndices[i]), bits::isBitSet(&result, i)); + } +} + +TEST_F(SimdUtilTest, transpose) { + constexpr int32_t kMaxSize = 100; + std::vector data32(kMaxSize); + std::vector data64(kMaxSize); + raw_vector indices(kMaxSize); + constexpr int64_t kMagic = 0x4fe12LU; + // indices are scattered over 0..kMaxSize - 1. + for (auto i = 0; i < kMaxSize; ++i) { + indices[i] = ((i * kMagic) & 0xffffff) % indices.size(); + data32[i] = i; + data64[i] = static_cast(i) << 32; + } + for (auto size = 1; size < kMaxSize; ++size) { + std::vector result32(kMaxSize + 1, -1); + simd::transpose( + data32.data(), + folly::Range(indices.data(), size), + result32.data()); + for (auto i = 0; i < size; ++i) { + EXPECT_EQ(data32[indices[i]], result32[i]); + } + // See that there is no write past 'size'. 
+ EXPECT_EQ(-1, result32[size]); + + std::vector result64(kMaxSize + 1, -1); + simd::transpose( + data64.data(), + folly::Range(indices.data(), size), + result64.data()); + for (auto i = 0; i < size; ++i) { + EXPECT_EQ(data64[indices[i]], result64[i]); + } + // See that there is no write past 'size'. + EXPECT_EQ(-1, result64[size]); + } } namespace { @@ -376,7 +430,7 @@ TEST_F(SimdUtilTest, reinterpretBatch) { validateReinterpretBatch(); } -TEST_F(SimdUtilTest, memEqual) { +TEST_F(SimdUtilTest, memEqualUnsafe) { constexpr int32_t kSize = 132; struct { char x[kSize]; @@ -399,4 +453,42 @@ TEST_F(SimdUtilTest, memEqual) { EXPECT_FALSE(simd::memEqualUnsafe(&data.x[1], &data.y[1], 67)); } +TEST_F(SimdUtilTest, memcpyTime) { + constexpr int64_t kMaxMove = 128; + constexpr int64_t kSize = (128 << 20) + kMaxMove; + constexpr uint64_t kSizeMask = (128 << 20) - 1; + constexpr int32_t kMoveMask = kMaxMove - 1; + constexpr uint64_t kMagic1 = 0x5231871; + constexpr uint64_t kMagic3 = 0xfae1; + constexpr uint64_t kMagic2 = 0x817952491; + std::vector dataV(kSize); + + auto data = dataV.data(); + uint64_t simd = 0; + uint64_t sys = 0; + { + MicrosecondTimer t(&simd); + for (auto ctr = 0; ctr < 100; ++ctr) { + for (auto i = 0; i < 10000; ++i) { + char* from = data + ((i * kMagic1) & kSizeMask); + char* to = data + ((i * kMagic2) & kSizeMask); + int32_t size = (i * kMagic3) % kMoveMask; + simd::memcpy(to, from, size); + } + } + } + { + MicrosecondTimer t(&sys); + for (auto ctr = 0; ctr < 100; ++ctr) { + for (auto i = 0; i < 10000; ++i) { + char* from = data + ((i * kMagic1) & kSizeMask); + char* to = data + ((i * kMagic2) & kSizeMask); + int32_t size = (i * kMagic3) % kMoveMask; + ::memcpy(to, from, size); + } + } + } + LOG(INFO) << "simd=" << simd << " sys=" << sys; +} + } // namespace diff --git a/velox/common/testutil/tests/SpillConfigTest.cpp b/velox/common/base/tests/SpillConfigTest.cpp similarity index 89% rename from velox/common/testutil/tests/SpillConfigTest.cpp rename to velox/common/base/tests/SpillConfigTest.cpp index 3a1ea251f39fe..837f0098c47c5 100644 --- a/velox/common/testutil/tests/SpillConfigTest.cpp +++ b/velox/common/base/tests/SpillConfigTest.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "velox/common/config/SpillConfig.h" +#include "velox/common/base/SpillConfig.h" #include #include "velox/common/base/tests/GTestUtils.h" #include "velox/exec/HashBitRange.h" @@ -26,6 +26,8 @@ TEST(SpillConfig, spillLevel) { const uint8_t kInitialBitOffset = 16; const uint8_t kNumPartitionsBits = 3; const SpillConfig config( + []() -> std::string_view { return ""; }, + [&](uint64_t) {}, "fakeSpillPath", 0, 0, @@ -36,7 +38,6 @@ TEST(SpillConfig, spillLevel) { kInitialBitOffset, kNumPartitionsBits, 0, - false, 0, 0, "none"); @@ -62,10 +63,9 @@ TEST(SpillConfig, spillLevel) { for (const auto& testData : testSettings) { SCOPED_TRACE(testData.debugString()); if (testData.expectedLevel == -1) { - ASSERT_ANY_THROW(config.joinSpillLevel(testData.bitOffset)); + ASSERT_ANY_THROW(config.spillLevel(testData.bitOffset)); } else { - ASSERT_EQ( - config.joinSpillLevel(testData.bitOffset), testData.expectedLevel); + ASSERT_EQ(config.spillLevel(testData.bitOffset), testData.expectedLevel); } } } @@ -76,7 +76,7 @@ TEST(SpillConfig, spillLevelLimit) { int32_t numBits; uint8_t bitOffset; int32_t maxSpillLevel; - int32_t expectedExceeds; + bool expectedExceeds; std::string debugString() const { return fmt::format( @@ -111,6 +111,8 @@ TEST(SpillConfig, spillLevelLimit) { const HashBitRange partitionBits( testData.startBitOffset, testData.startBitOffset + testData.numBits); const SpillConfig config( + []() -> std::string_view { return ""; }, + [&](uint64_t) {}, "fakeSpillPath", 0, 0, @@ -120,15 +122,14 @@ TEST(SpillConfig, spillLevelLimit) { 0, testData.startBitOffset, testData.numBits, - 0, - false, testData.maxSpillLevel, 0, + 0, "none"); ASSERT_EQ( testData.expectedExceeds, - config.exceedJoinSpillLevelLimit(testData.bitOffset)); + config.exceedSpillLevelLimit(testData.bitOffset)); } } @@ -151,12 +152,13 @@ TEST(SpillConfig, spillableReservationPercentages) { {50, 100, true}, {1, 50, true}, {1, 1, false}}; - for (const auto& testData : testSettings) { SCOPED_TRACE(testData.debugString()); auto createConfigFn = [&]() { const SpillConfig config( + [&]() -> std::string_view { return ""; }, + [&](uint64_t) {}, "spillableReservationPercentages", 0, 0, @@ -167,8 +169,7 @@ TEST(SpillConfig, spillableReservationPercentages) { 0, 0, 0, - false, - 0, + 1'000'000, 0, "none"); }; diff --git a/velox/common/base/tests/SpillStatsTest.cpp b/velox/common/base/tests/SpillStatsTest.cpp new file mode 100644 index 0000000000000..114c728fe4f02 --- /dev/null +++ b/velox/common/base/tests/SpillStatsTest.cpp @@ -0,0 +1,141 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/common/base/SpillStats.h" +#include +#include "velox/common/base/VeloxException.h" +#include "velox/common/base/tests/GTestUtils.h" + +using namespace facebook::velox::common; + +TEST(SpillStatsTest, spillStats) { + SpillStats stats1; + ASSERT_TRUE(stats1.empty()); + stats1.spillRuns = 100; + stats1.spilledInputBytes = 2048; + stats1.spilledBytes = 1024; + stats1.spilledPartitions = 1024; + stats1.spilledFiles = 1023; + stats1.spillWriteTimeNanos = 1023; + stats1.spillFlushTimeNanos = 1023; + stats1.spillWrites = 1023; + stats1.spillSortTimeNanos = 1023; + stats1.spillFillTimeNanos = 1023; + stats1.spilledRows = 1023; + stats1.spillSerializationTimeNanos = 1023; + stats1.spillMaxLevelExceededCount = 3; + stats1.spillReadBytes = 1024; + stats1.spillReads = 10; + stats1.spillReadTimeNanos = 100; + stats1.spillDeserializationTimeNanos = 100; + ASSERT_FALSE(stats1.empty()); + SpillStats stats2; + stats2.spillRuns = 100; + stats2.spilledInputBytes = 2048; + stats2.spilledBytes = 1024; + stats2.spilledPartitions = 1025; + stats2.spilledFiles = 1026; + stats2.spillWriteTimeNanos = 1026; + stats2.spillFlushTimeNanos = 1027; + stats2.spillWrites = 1028; + stats2.spillSortTimeNanos = 1029; + stats2.spillFillTimeNanos = 1030; + stats2.spilledRows = 1031; + stats2.spillSerializationTimeNanos = 1032; + stats2.spillMaxLevelExceededCount = 4; + stats2.spillReadBytes = 2048; + stats2.spillReads = 10; + stats2.spillReadTimeNanos = 100; + stats2.spillDeserializationTimeNanos = 100; + ASSERT_TRUE(stats1 < stats2); + ASSERT_TRUE(stats1 <= stats2); + ASSERT_FALSE(stats1 > stats2); + ASSERT_FALSE(stats1 >= stats2); + ASSERT_TRUE(stats1 != stats2); + ASSERT_FALSE(stats1 == stats2); + + ASSERT_TRUE(stats1 == stats1); + ASSERT_FALSE(stats1 != stats1); + ASSERT_FALSE(stats1 > stats1); + ASSERT_TRUE(stats1 >= stats1); + ASSERT_FALSE(stats1 < stats1); + ASSERT_TRUE(stats1 <= stats1); + + SpillStats delta = stats2 - stats1; + ASSERT_EQ(delta.spilledInputBytes, 0); + ASSERT_EQ(delta.spilledBytes, 0); + ASSERT_EQ(delta.spilledPartitions, 1); + ASSERT_EQ(delta.spilledFiles, 3); + ASSERT_EQ(delta.spillWriteTimeNanos, 3); + ASSERT_EQ(delta.spillFlushTimeNanos, 4); + ASSERT_EQ(delta.spillWrites, 5); + ASSERT_EQ(delta.spillSortTimeNanos, 6); + ASSERT_EQ(delta.spillFillTimeNanos, 7); + ASSERT_EQ(delta.spilledRows, 8); + ASSERT_EQ(delta.spillSerializationTimeNanos, 9); + ASSERT_EQ(delta.spillReadBytes, 1024); + ASSERT_EQ(delta.spillReads, 0); + ASSERT_EQ(delta.spillReadTimeNanos, 0); + ASSERT_EQ(delta.spillDeserializationTimeNanos, 0); + delta = stats1 - stats2; + ASSERT_EQ(delta.spilledInputBytes, 0); + ASSERT_EQ(delta.spilledBytes, 0); + ASSERT_EQ(delta.spilledPartitions, -1); + ASSERT_EQ(delta.spilledFiles, -3); + ASSERT_EQ(delta.spillWriteTimeNanos, -3); + ASSERT_EQ(delta.spillFlushTimeNanos, -4); + ASSERT_EQ(delta.spillWrites, -5); + ASSERT_EQ(delta.spillSortTimeNanos, -6); + ASSERT_EQ(delta.spillFillTimeNanos, -7); + ASSERT_EQ(delta.spilledRows, -8); + ASSERT_EQ(delta.spillSerializationTimeNanos, -9); + ASSERT_EQ(delta.spillMaxLevelExceededCount, -1); + ASSERT_EQ(delta.spillReadBytes, -1024); + ASSERT_EQ(delta.spillReads, 0); + ASSERT_EQ(delta.spillReadTimeNanos, 0); + ASSERT_EQ(delta.spillDeserializationTimeNanos, 0); + stats1.spilledInputBytes = 2060; + stats1.spilledBytes = 1030; + stats1.spillReadBytes = 4096; + VELOX_ASSERT_THROW(stats1 < stats2, ""); + VELOX_ASSERT_THROW(stats1 > stats2, ""); + VELOX_ASSERT_THROW(stats1 <= stats2, ""); + VELOX_ASSERT_THROW(stats1 >= stats2, ""); + 
ASSERT_TRUE(stats1 != stats2); + ASSERT_FALSE(stats1 == stats2); + const SpillStats zeroStats; + stats1.reset(); + ASSERT_EQ(zeroStats, stats1); + ASSERT_EQ( + stats2.toString(), + "spillRuns[100] spilledInputBytes[2.00KB] spilledBytes[1.00KB] " + "spilledRows[1031] spilledPartitions[1025] spilledFiles[1026] " + "spillFillTimeNanos[1.03us] spillSortTimeNanos[1.03us] " + "spillSerializationTimeNanos[1.03us] spillWrites[1028] spillFlushTimeNanos[1.03us] " + "spillWriteTimeNanos[1.03us] maxSpillExceededLimitCount[4] " + "spillReadBytes[2.00KB] spillReads[10] spillReadTimeNanos[100ns] " + "spillReadDeserializationTimeNanos[100ns]"); + ASSERT_EQ( + fmt::format("{}", stats2), + "spillRuns[100] spilledInputBytes[2.00KB] spilledBytes[1.00KB] " + "spilledRows[1031] spilledPartitions[1025] spilledFiles[1026] " + "spillFillTimeNanos[1.03us] spillSortTimeNanos[1.03us] " + "spillSerializationTimeNanos[1.03us] spillWrites[1028] " + "spillFlushTimeNanos[1.03us] spillWriteTimeNanos[1.03us] " + "maxSpillExceededLimitCount[4] " + "spillReadBytes[2.00KB] spillReads[10] spillReadTimeNanos[100ns] " + "spillReadDeserializationTimeNanos[100ns]"); +} diff --git a/velox/common/base/tests/StatsReporterTest.cpp b/velox/common/base/tests/StatsReporterTest.cpp index d70c00e5f4342..bf268d1a78ef2 100644 --- a/velox/common/base/tests/StatsReporterTest.cpp +++ b/velox/common/base/tests/StatsReporterTest.cpp @@ -20,32 +20,43 @@ #include #include #include +#include +#include "velox/common/base/Counters.h" +#include "velox/common/base/PeriodicStatsReporter.h" +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/common/caching/AsyncDataCache.h" +#include "velox/common/caching/CacheTTLController.h" +#include "velox/common/caching/SsdCache.h" +#include "velox/common/memory/MmapAllocator.h" namespace facebook::velox { -class StatsReporterTest : public testing::Test { - protected: - void SetUp() override {} - void TearDown() override {} -}; - class TestReporter : public BaseStatsReporter { public: - mutable std::unordered_map counterMap; + mutable std::mutex m; + mutable std::map counterMap; mutable std::unordered_map statTypeMap; mutable std::unordered_map> histogramPercentilesMap; - void addStatExportType(const char* key, StatType statType) const override { + void clear() { + std::lock_guard l(m); + counterMap.clear(); + statTypeMap.clear(); + histogramPercentilesMap.clear(); + } + + void registerMetricExportType(const char* key, StatType statType) + const override { statTypeMap[key] = statType; } - void addStatExportType(folly::StringPiece key, StatType statType) + void registerMetricExportType(folly::StringPiece key, StatType statType) const override { statTypeMap[key.str()] = statType; } - void addHistogramExportPercentiles( + void registerHistogramMetricExportType( const char* key, int64_t /* bucketWidth */, int64_t /* min */, @@ -54,7 +65,7 @@ class TestReporter : public BaseStatsReporter { histogramPercentilesMap[key] = pcts; } - void addHistogramExportPercentiles( + void registerHistogramMetricExportType( folly::StringPiece key, int64_t /* bucketWidth */, int64_t /* min */, @@ -63,64 +74,539 @@ class TestReporter : public BaseStatsReporter { histogramPercentilesMap[key.str()] = pcts; } - void addStatValue(const std::string& key, const size_t value) const override { + void addMetricValue(const std::string& key, const size_t value) + const override { + std::lock_guard l(m); counterMap[key] += value; } - void addStatValue(const char* key, const size_t value) const override { + void addMetricValue(const char* 
key, const size_t value) const override { + std::lock_guard l(m); counterMap[key] += value; } - void addStatValue(folly::StringPiece key, size_t value) const override { + void addMetricValue(folly::StringPiece key, size_t value) const override { + std::lock_guard l(m); counterMap[key.str()] += value; } - void addHistogramValue(const std::string& key, size_t value) const override { + void addHistogramMetricValue(const std::string& key, size_t value) + const override { + std::lock_guard l(m); counterMap[key] = std::max(counterMap[key], value); } - void addHistogramValue(const char* key, size_t value) const override { + void addHistogramMetricValue(const char* key, size_t value) const override { + std::lock_guard l(m); counterMap[key] = std::max(counterMap[key], value); } - void addHistogramValue(folly::StringPiece key, size_t value) const override { + void addHistogramMetricValue(folly::StringPiece key, size_t value) + const override { + std::lock_guard l(m); counterMap[key.str()] = std::max(counterMap[key.str()], value); } + + std::string fetchMetrics() override { + std::stringstream ss; + ss << "["; + auto sep = ""; + for (const auto& [key, value] : counterMap) { + ss << sep << key << ":" << value; + sep = ","; + } + ss << "]"; + return ss.str(); + } }; -TEST_F(StatsReporterTest, trivialReporter) { - auto reporter = std::dynamic_pointer_cast( - folly::Singleton::try_get()); +class StatsReporterTest : public testing::Test { + protected: + void SetUp() override { + reporter_ = std::dynamic_pointer_cast( + folly::Singleton::try_get()); + reporter_->clear(); + } + void TearDown() override { + reporter_->clear(); + } - REPORT_ADD_STAT_EXPORT_TYPE("key1", StatType::COUNT); - REPORT_ADD_STAT_EXPORT_TYPE("key2", StatType::SUM); - REPORT_ADD_STAT_EXPORT_TYPE("key3", StatType::RATE); - REPORT_ADD_HISTOGRAM_EXPORT_PERCENTILE("key4", 10, 0, 100, 50, 99, 100); + std::shared_ptr reporter_; +}; - EXPECT_EQ(StatType::COUNT, reporter->statTypeMap["key1"]); - EXPECT_EQ(StatType::SUM, reporter->statTypeMap["key2"]); - EXPECT_EQ(StatType::RATE, reporter->statTypeMap["key3"]); +TEST_F(StatsReporterTest, trivialReporter) { + DEFINE_METRIC("key1", StatType::COUNT); + DEFINE_METRIC("key2", StatType::SUM); + DEFINE_METRIC("key3", StatType::RATE); + DEFINE_HISTOGRAM_METRIC("key4", 10, 0, 100, 50, 99, 100); + + EXPECT_EQ(StatType::COUNT, reporter_->statTypeMap["key1"]); + EXPECT_EQ(StatType::SUM, reporter_->statTypeMap["key2"]); + EXPECT_EQ(StatType::RATE, reporter_->statTypeMap["key3"]); std::vector expected = {50, 99, 100}; - EXPECT_EQ(expected, reporter->histogramPercentilesMap["key4"]); + EXPECT_EQ(expected, reporter_->histogramPercentilesMap["key4"]); EXPECT_TRUE( - reporter->statTypeMap.find("key5") == reporter->statTypeMap.end()); - - REPORT_ADD_STAT_VALUE("key1", 10); - REPORT_ADD_STAT_VALUE("key1", 11); - REPORT_ADD_STAT_VALUE("key1", 15); - REPORT_ADD_STAT_VALUE("key2", 1001); - REPORT_ADD_STAT_VALUE("key2", 1200); - REPORT_ADD_STAT_VALUE("key3"); - REPORT_ADD_STAT_VALUE("key3", 1100); - REPORT_ADD_HISTOGRAM_VALUE("key4", 50); - REPORT_ADD_HISTOGRAM_VALUE("key4", 100); - - EXPECT_EQ(36, reporter->counterMap["key1"]); - EXPECT_EQ(2201, reporter->counterMap["key2"]); - EXPECT_EQ(1101, reporter->counterMap["key3"]); - EXPECT_EQ(100, reporter->counterMap["key4"]); + reporter_->statTypeMap.find("key5") == reporter_->statTypeMap.end()); + + RECORD_METRIC_VALUE("key1", 10); + RECORD_METRIC_VALUE("key1", 11); + RECORD_METRIC_VALUE("key1", 15); + RECORD_METRIC_VALUE("key2", 1001); + RECORD_METRIC_VALUE("key2", 
1200); + RECORD_METRIC_VALUE("key3"); + RECORD_METRIC_VALUE("key3", 1100); + RECORD_HISTOGRAM_METRIC_VALUE("key4", 50); + RECORD_HISTOGRAM_METRIC_VALUE("key4", 100); + + EXPECT_EQ(36, reporter_->counterMap["key1"]); + EXPECT_EQ(2201, reporter_->counterMap["key2"]); + EXPECT_EQ(1101, reporter_->counterMap["key3"]); + EXPECT_EQ(100, reporter_->counterMap["key4"]); + + EXPECT_EQ( + "[key1:36,key2:2201,key3:1101,key4:100]", reporter_->fetchMetrics()); +}; + +class PeriodicStatsReporterTest : public StatsReporterTest {}; + +class TestStatsReportMmapAllocator : public memory::MmapAllocator { + public: + TestStatsReportMmapAllocator( + memory::MachinePageCount numMapped, + memory::MachinePageCount numAllocated, + memory::MachinePageCount numMallocBytes, + memory::MachinePageCount numExternalMapped) + : memory::MmapAllocator({.capacity = 1024}), + numMapped_(numMapped), + numAllocated_(numAllocated), + numMallocBytes_(numMallocBytes), + numExternalMapped_(numExternalMapped) {} + + memory::MachinePageCount numMapped() const override { + return numMapped_; + } + + memory::MachinePageCount numAllocated() const override { + return numAllocated_; + } + + uint64_t numMallocBytes() const { + return numMallocBytes_; + } + + memory::MachinePageCount numExternalMapped() const { + return numExternalMapped_; + } + + private: + memory::MachinePageCount numMapped_; + memory::MachinePageCount numAllocated_; + memory::MachinePageCount numMallocBytes_; + memory::MachinePageCount numExternalMapped_; +}; + +class TestStatsReportAsyncDataCache : public cache::AsyncDataCache { + public: + TestStatsReportAsyncDataCache(cache::CacheStats stats) + : cache::AsyncDataCache(nullptr, nullptr), stats_(stats) {} + + cache::CacheStats refreshStats() const override { + std::lock_guard l(mutex_); + return stats_; + } + + void updateStats(cache::CacheStats stats) { + std::lock_guard l(mutex_); + stats_ = stats; + } + + private: + mutable std::mutex mutex_; + cache::CacheStats stats_; +}; + +class TestStatsReportMemoryArbitrator : public memory::MemoryArbitrator { + public: + explicit TestStatsReportMemoryArbitrator( + memory::MemoryArbitrator::Stats stats) + : memory::MemoryArbitrator({}), stats_(stats) {} + + ~TestStatsReportMemoryArbitrator() override = default; + + void updateStats(memory::MemoryArbitrator::Stats stats) { + std::lock_guard l(mutex_); + stats_ = stats; + } + + std::string kind() const override { + return "test"; + } + + void addPool(const std::shared_ptr& /*unused*/) override { + } + + void removePool(memory::MemoryPool* /*unused*/) override {} + + bool growCapacity(memory::MemoryPool* /*unused*/, uint64_t /*unused*/) + override { + return false; + } + + uint64_t shrinkCapacity(memory::MemoryPool* /*unused*/, uint64_t /*unused*/) + override { + return 0; + } + + uint64_t shrinkCapacity(uint64_t /*unused*/, bool /*unused*/, bool /*unused*/) + override { + return 0; + } + + Stats stats() const override { + std::lock_guard l(mutex_); + return stats_; + } + + std::string toString() const override { + return "TestStatsReportMemoryArbitrator::toString()"; + } + + private: + mutable std::mutex mutex_; + memory::MemoryArbitrator::Stats stats_; +}; + +class TestMemoryPool : public memory::MemoryPool { + public: + explicit TestMemoryPool() : MemoryPool("", Kind::kAggregate, nullptr, {}) {} + + void* allocate(int64_t size) override { + return nullptr; + } + + void* allocateZeroFilled(int64_t /* unused */, int64_t /* unused */) + override { + return nullptr; + } + + void* reallocate( + void* /* unused */, + int64_t /* 
unused */, + int64_t /* unused */) override { + return nullptr; + } + + void free(void* /* unused */, int64_t /* unused */) override {} + + void allocateNonContiguous( + memory::MachinePageCount /* unused */, + memory::Allocation& /* unused */, + memory::MachinePageCount /* unused */) override {} + + void freeNonContiguous(memory::Allocation& /* unused */) override {} + + memory::MachinePageCount largestSizeClass() const override { + return 0; + } + + const std::vector& sizeClasses() const override { + static std::vector sizeClasses; + return sizeClasses; + } + + void allocateContiguous( + memory::MachinePageCount /* unused */, + memory::ContiguousAllocation& /* unused */, + memory::MachinePageCount /* unused */) override {} + + void freeContiguous(memory::ContiguousAllocation& /* unused */) override {} + + void growContiguous( + memory::MachinePageCount /* unused */, + memory::ContiguousAllocation& /* unused */) override {} + + int64_t capacity() const override { + return 0; + } + + int64_t usedBytes() const override { + return 0; + } + + int64_t peakBytes() const override { + return 0; + } + + int64_t availableReservation() const override { + return 0; + } + + int64_t releasableReservation() const override { + return 0; + } + + int64_t reservedBytes() const override { + return 0; + } + + bool maybeReserve(uint64_t /* unused */) override { + return false; + } + + void release() override {} + + uint64_t freeBytes() const override { + return 0; + } + + uint64_t shrink(uint64_t /* unused */) override { + return 0; + } + + bool grow(uint64_t /* unused */, uint64_t /* unused */) override { + return false; + } + + void setReclaimer( + std::unique_ptr /* unused */) override {} + memory::MemoryReclaimer* reclaimer() const override { + return nullptr; + } + + void enterArbitration() override {} + + void leaveArbitration() noexcept override {} + + std::optional reclaimableBytes() const override { + return std::nullopt; + } + + uint64_t reclaim( + uint64_t /* unused */, + uint64_t /* unused */, + memory::MemoryReclaimer::Stats& /* unused */) override { + return 0; + } + + void abort(const std::exception_ptr& /* unused */) override {} + + bool aborted() const override { + return false; + } + + std::string toString() const override { + return ""; + } + + std::string treeMemoryUsage(bool /* unused */) const override { + return ""; + } + + std::shared_ptr genChild( + std::shared_ptr /* unused */, + const std::string& /* unused */, + Kind /* unused */, + bool /* unused */, + std::unique_ptr /* unused */) override { + return nullptr; + } + + Stats stats() const override { + return Stats(); + } }; +TEST_F(PeriodicStatsReporterTest, basic) { + TestStatsReportMmapAllocator allocator(1, 1, 1, 1); + TestStatsReportAsyncDataCache cache( + {.ssdStats = std::make_shared()}); + cache::CacheTTLController::create(cache); + TestStatsReportMemoryArbitrator arbitrator({}); + TestMemoryPool spillMemoryPool; + PeriodicStatsReporter::Options options; + options.cache = &cache; + options.cacheStatsIntervalMs = 4'000; + options.allocator = &allocator; + options.allocatorStatsIntervalMs = 4'000; + options.arbitrator = &arbitrator; + options.arbitratorStatsIntervalMs = 4'000; + options.spillMemoryPool = &spillMemoryPool; + options.spillStatsIntervalMs = 4'000; + PeriodicStatsReporter periodicReporter(options); + + periodicReporter.start(); + std::this_thread::sleep_for(std::chrono::milliseconds(2'000)); + + // Check snapshot stats + const auto& counterMap = reporter_->counterMap; + { + std::lock_guard l(reporter_->m); + 
ASSERT_EQ(counterMap.count(kMetricArbitratorFreeCapacityBytes.str()), 1); + ASSERT_EQ( + counterMap.count(kMetricArbitratorFreeReservedCapacityBytes.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumEntries.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumEmptyEntries.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumSharedEntries.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumExclusiveEntries.str()), 1); + ASSERT_EQ( + counterMap.count(kMetricMemoryCacheNumPrefetchedEntries.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheTotalTinyBytes.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheTotalLargeBytes.str()), 1); + ASSERT_EQ( + counterMap.count(kMetricMemoryCacheTotalTinyPaddingBytes.str()), 1); + ASSERT_EQ( + counterMap.count(kMetricMemoryCacheTotalLargePaddingBytes.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheTotalPrefetchBytes.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheCachedEntries.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheCachedRegions.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheCachedBytes.str()), 1); + ASSERT_EQ(counterMap.count(kMetricCacheMaxAgeSecs.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMappedMemoryBytes.str()), 1); + ASSERT_EQ(counterMap.count(kMetricAllocatedMemoryBytes.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMmapDelegatedAllocBytes.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMmapExternalMappedBytes.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSpillMemoryBytes.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSpillPeakMemoryBytes.str()), 1); + // Check deltas are not reported + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumHits.str()), 0); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheHitBytes.str()), 0); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumNew.str()), 0); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumEvicts.str()), 0); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumSavableEvicts.str()), 0); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumEvictChecks.str()), 0); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumWaitExclusive.str()), 0); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumAllocClocks.str()), 0); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumAgedOutEntries.str()), 0); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheSumEvictScore.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheReadEntries.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheReadBytes.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheWrittenEntries.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheWrittenBytes.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheOpenSsdErrors.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheOpenCheckpointErrors.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheOpenLogErrors.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheDeleteCheckpointErrors.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheGrowFileErrors.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheWriteSsdErrors.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheWriteSsdDropped.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheWriteCheckpointErrors.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheReadSsdErrors.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheReadCorruptions.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheReadCheckpointErrors.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheCheckpointsRead.str()), 0); + 
ASSERT_EQ(counterMap.count(kMetricSsdCacheCheckpointsWritten.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheRegionsEvicted.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheAgedOutEntries.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheAgedOutRegions.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheRecoveredEntries.str()), 0); + ASSERT_EQ(counterMap.count(kMetricSsdCacheReadWithoutChecksum.str()), 0); + ASSERT_EQ(counterMap.size(), 22); + } + + // Update stats + auto newSsdStats = std::make_shared(); + newSsdStats->entriesWritten = 10; + newSsdStats->bytesWritten = 10; + newSsdStats->checkpointsWritten = 10; + newSsdStats->entriesRead = 10; + newSsdStats->bytesRead = 10; + newSsdStats->checkpointsRead = 10; + newSsdStats->entriesAgedOut = 10; + newSsdStats->regionsAgedOut = 10; + newSsdStats->regionsEvicted = 10; + newSsdStats->numPins = 10; + newSsdStats->openFileErrors = 10; + newSsdStats->openCheckpointErrors = 10; + newSsdStats->openLogErrors = 10; + newSsdStats->deleteCheckpointErrors = 10; + newSsdStats->growFileErrors = 10; + newSsdStats->writeSsdErrors = 10; + newSsdStats->writeSsdDropped = 10; + newSsdStats->writeCheckpointErrors = 10; + newSsdStats->readSsdErrors = 10; + newSsdStats->readSsdCorruptions = 10; + newSsdStats->readCheckpointErrors = 10; + newSsdStats->readWithoutChecksumChecks = 10; + newSsdStats->entriesRecovered = 10; + cache.updateStats( + {.numHit = 10, + .hitBytes = 10, + .numNew = 10, + .numEvict = 10, + .numSavableEvict = 10, + .numEvictChecks = 10, + .numWaitExclusive = 10, + .numAgedOut = 10, + .allocClocks = 10, + .sumEvictScore = 10, + .ssdStats = newSsdStats}); + arbitrator.updateStats(memory::MemoryArbitrator::Stats( + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10)); + std::this_thread::sleep_for(std::chrono::milliseconds(4'000)); + + // Stop right after sufficient wait to ensure the following reads from main + // thread does not trigger TSAN failures. 
+ periodicReporter.stop(); + + // Check delta stats are reported + { + std::lock_guard l(reporter_->m); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumHits.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheHitBytes.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumNew.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumEvicts.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumSavableEvicts.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumEvictChecks.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumWaitExclusive.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumAllocClocks.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheNumAgedOutEntries.str()), 1); + ASSERT_EQ(counterMap.count(kMetricMemoryCacheSumEvictScore.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheReadEntries.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheReadBytes.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheWrittenEntries.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheWrittenBytes.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheOpenSsdErrors.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheOpenCheckpointErrors.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheOpenLogErrors.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheDeleteCheckpointErrors.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheGrowFileErrors.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheWriteSsdErrors.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheWriteSsdDropped.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheWriteCheckpointErrors.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheReadSsdErrors.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheReadCorruptions.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheReadCheckpointErrors.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheCheckpointsRead.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheCheckpointsWritten.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheRegionsEvicted.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheAgedOutEntries.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheAgedOutRegions.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheRecoveredEntries.str()), 1); + ASSERT_EQ(counterMap.count(kMetricSsdCacheReadWithoutChecksum.str()), 1); + ASSERT_EQ(counterMap.size(), 54); + } +} + +TEST_F(PeriodicStatsReporterTest, globalInstance) { + TestStatsReportMemoryArbitrator arbitrator({}); + PeriodicStatsReporter::Options options; + PeriodicStatsReporter periodicReporter(options); + ASSERT_NO_THROW(periodicReporter.start()); + std::this_thread::sleep_for(std::chrono::milliseconds(4'000)); + ASSERT_NO_THROW(periodicReporter.stop()); +} + +TEST_F(PeriodicStatsReporterTest, allNullOption) { + PeriodicStatsReporter::Options options; + VELOX_ASSERT_THROW( + stopPeriodicStatsReporter(), "No periodic stats reporter to stop."); + ASSERT_NO_THROW(startPeriodicStatsReporter(options)); + VELOX_ASSERT_THROW( + startPeriodicStatsReporter(options), + "The periodic stats reporter has already started."); + ASSERT_NO_THROW(stopPeriodicStatsReporter()); +} + // Registering to folly Singleton with intended reporter type folly::Singleton reporter([]() { return new TestReporter(); @@ -130,7 +616,7 @@ folly::Singleton reporter([]() { int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); - folly::init(&argc, &argv, false); + folly::Init init{&argc, &argv, false}; 
facebook::velox::BaseStatsReporter::registered = true; return RUN_ALL_TESTS(); } diff --git a/velox/common/base/tests/StatusTest.cpp b/velox/common/base/tests/StatusTest.cpp new file mode 100644 index 0000000000000..8c0019e76ce72 --- /dev/null +++ b/velox/common/base/tests/StatusTest.cpp @@ -0,0 +1,158 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/base/Status.h" +#include +#include +#include "velox/common/base/Exceptions.h" +#include "velox/common/base/tests/GTestUtils.h" + +namespace facebook::velox::test { +namespace { + +TEST(StatusTest, testCodeAndMessage) { + Status ok = Status::OK(); + ASSERT_EQ(StatusCode::kOK, ok.code()); + ASSERT_EQ("", ok.message()); + ASSERT_EQ("OK", ok.codeAsString()); + ASSERT_TRUE(ok.ok()); + ASSERT_FALSE(ok.isIOError()); + + Status fileError = Status::IOError("file error"); + ASSERT_EQ(StatusCode::kIOError, fileError.code()); + ASSERT_EQ("file error", fileError.message()); +} + +TEST(StatusTest, testNoMessage) { + Status fileError = Status::IOError(); + ASSERT_EQ(StatusCode::kIOError, fileError.code()); + ASSERT_EQ("", fileError.message()); + ASSERT_EQ("IOError: ", fileError.toString()); + ASSERT_EQ("IOError", fileError.codeAsString()); +} + +TEST(StatusTest, testToString) { + Status fileError = Status::IOError("file error"); + ASSERT_EQ("IOError: file error", fileError.toString()); + ASSERT_EQ("IOError", fileError.codeAsString()); + + std::stringstream ss; + ss << fileError; + ASSERT_EQ(fileError.toString(), ss.str()); + + // Check that fmt has the right specializations. + ASSERT_EQ(fileError.toString(), fmt::format("{}", fileError)); + ASSERT_EQ("Unknown error", fmt::format("{}", StatusCode::kUnknownError)); +} + +TEST(StatusTest, andStatus) { + Status a = Status::OK(); + Status b = Status::OK(); + Status c = Status::Invalid("invalid value"); + Status d = Status::IOError("file error"); + + Status res; + res = a & b; + ASSERT_TRUE(res.ok()); + res = a & c; + ASSERT_TRUE(res.isInvalid()); + res = d & c; + ASSERT_TRUE(res.isIOError()); + + res = Status::OK(); + res &= c; + ASSERT_TRUE(res.isInvalid()); + res &= d; + ASSERT_TRUE(res.isInvalid()); + + // With rvalues. 
+ res = Status::OK() & Status::Invalid("foo"); + ASSERT_TRUE(res.isInvalid()); + res = Status::Invalid("foo") & Status::OK(); + ASSERT_TRUE(res.isInvalid()); + res = Status::Invalid("foo") & Status::IOError("bar"); + ASSERT_TRUE(res.isInvalid()); + + res = Status::OK(); + res &= Status::OK(); + ASSERT_TRUE(res.ok()); + res &= Status::Invalid("foo"); + ASSERT_TRUE(res.isInvalid()); + res &= Status::IOError("bar"); + ASSERT_TRUE(res.isInvalid()); +} + +TEST(StatusTest, testEquality) { + ASSERT_EQ(Status(), Status::OK()); + ASSERT_EQ(Status::Invalid("error"), Status::Invalid("error")); + + ASSERT_NE(Status::Invalid("error"), Status::OK()); + ASSERT_NE(Status::Invalid("error"), Status::Invalid("other error")); +} + +TEST(StatusTest, testAbort) { + Status a = Status::Invalid("will abort process"); + ASSERT_DEATH(a.abort(), ""); +} + +Status returnIf(bool cond) { + VELOX_RETURN_IF(cond, Status::Invalid("error")); + return Status::OK(); +} + +Status returnNotOk(Status s) { + VELOX_RETURN_NOT_OK(s); + return Status::Invalid("invalid"); +} + +TEST(StatusTest, macros) { + ASSERT_EQ(returnIf(true), Status::Invalid("error")); + ASSERT_EQ(returnIf(false), Status::OK()); + + ASSERT_EQ(returnNotOk(Status::UserError("user")), Status::UserError("user")); + ASSERT_EQ(returnNotOk(Status::OK()), Status::Invalid("invalid")); + + VELOX_CHECK_OK(Status::OK()); // does not throw. + + bool didThrow = false; + try { + VELOX_CHECK_OK(Status::Invalid("invalid")); + } catch (const VeloxRuntimeError&) { + didThrow = true; + } + ASSERT_TRUE(didThrow) << "VELOX_CHECK_OK did not throw"; +} + +Expected modulo(int a, int b) { + if (b == 0) { + return folly::makeUnexpected(Status::UserError("division by zero")); + } + + return a % b; +} + +TEST(StatusTest, expected) { + auto result = modulo(10, 3); + EXPECT_TRUE(result.hasValue()); + EXPECT_EQ(result.value(), 1); + + result = modulo(10, 0); + EXPECT_TRUE(result.hasError()); + EXPECT_EQ(result.error(), Status::UserError("division by zero")); +} + +} // namespace +} // namespace facebook::velox::test diff --git a/velox/common/caching/AsyncDataCache.cpp b/velox/common/caching/AsyncDataCache.cpp index 2d8015816ac2a..6da669daed66e 100644 --- a/velox/common/caching/AsyncDataCache.cpp +++ b/velox/common/caching/AsyncDataCache.cpp @@ -17,11 +17,23 @@ #include "velox/common/caching/AsyncDataCache.h" #include "velox/common/caching/FileIds.h" #include "velox/common/caching/SsdCache.h" +#include "velox/common/caching/SsdFile.h" -#include +#include "velox/common/base/Counters.h" +#include "velox/common/base/Exceptions.h" +#include "velox/common/base/StatsReporter.h" #include "velox/common/base/SuccinctPrinter.h" #include "velox/common/caching/FileIds.h" +#define VELOX_CACHE_ERROR(errorMessage) \ + _VELOX_THROW( \ + ::facebook::velox::VeloxRuntimeError, \ + ::facebook::velox::error_source::kErrorSourceRuntime.c_str(), \ + ::facebook::velox::error_code::kNoCacheSpace.c_str(), \ + /* isRetriable */ true, \ + "{}", \ + errorMessage); + namespace facebook::velox::cache { using memory::MachinePageCount; @@ -35,7 +47,7 @@ AsyncDataCacheEntry::~AsyncDataCacheEntry() { shard_->cache()->allocator()->freeNonContiguous(data_); } -void AsyncDataCacheEntry::setExclusiveToShared() { +void AsyncDataCacheEntry::setExclusiveToShared(bool ssdSavable) { VELOX_CHECK(isExclusive()); numPins_ = 1; std::unique_ptr> promise; @@ -51,14 +63,19 @@ void AsyncDataCacheEntry::setExclusiveToShared() { // The entry may now have other readers, It is safe to do read-only ops like // integrity and notifying SSD cache of 
another candidate. + // + // NOTE: this is only used by test for now. const auto& hook = shard_->cache()->verifyHook(); if (hook != nullptr) { hook(*this); } - if ((ssdFile_ == nullptr) && (shard_->cache()->ssdCache() != nullptr)) { - auto* ssdCache = shard_->cache()->ssdCache(); - assert(ssdCache); // for lint only. + if (!ssdSavable) { + return; + } + + auto* ssdCache = shard_->cache()->ssdCache(); + if ((ssdCache != nullptr) && (ssdFile_ == nullptr)) { if (ssdCache->groupStats().shouldSaveToSsd(groupId_, trackingId_)) { ssdSaveable_ = true; shard_->cache()->possibleSsdSave(size_); @@ -112,13 +129,10 @@ void AsyncDataCacheEntry::initialize(FileCacheKey key) { } else { // No memory to cover 'this'. release(); - _VELOX_THROW( - VeloxRuntimeError, - error_source::kErrorSourceRuntime.c_str(), - error_code::kNoCacheSpace.c_str(), - /* isRetriable */ true, - "Failed to allocate {} bytes for cache", - size_); + VELOX_CACHE_ERROR(fmt::format( + "Failed to allocate {} bytes for cache: {}", + size_, + cache->allocator()->getAndClearFailureMessage())); } } } @@ -158,28 +172,28 @@ CachePin CacheShard::findOrCreate( ++eventCounter_; auto it = entryMap_.find(key); if (it != entryMap_.end()) { - auto* found = it->second; - if (found->isExclusive()) { + auto* foundEntry = it->second; + if (foundEntry->isExclusive()) { ++numWaitExclusive_; - if (wait == nullptr) { - return CachePin(); + if (wait != nullptr) { + *wait = foundEntry->getFuture(); } - *wait = found->getFuture(); return CachePin(); } - if (found->size() >= size) { - found->touch(); + + if (foundEntry->size() >= size) { + foundEntry->touch(); // The entry is in a readable state. Add a pin. - if (found->isPrefetch()) { - found->isFirstUse_ = true; - found->setPrefetch(false); + if (foundEntry->isPrefetch()) { + foundEntry->isFirstUse_ = true; + foundEntry->setPrefetch(false); } else { ++numHit_; - hitBytes_ += found->size(); + hitBytes_ += foundEntry->size(); } - ++found->numPins_; + ++foundEntry->numPins_; CachePin pin; - pin.setEntry(found); + pin.setEntry(foundEntry); return pin; } @@ -188,11 +202,14 @@ CachePin CacheShard::findOrCreate( // This can happen if different load quanta apply to access via different // connectors. This is not an error but still worth logging. VELOX_CACHE_LOG_EVERY_MS(WARNING, 1'000) - << "Requested larger entry. Found size " << found->size() + << "Requested larger entry. Found size " << foundEntry->size() << " requested size " << size; // The old entry is superseded. Possible readers of the old entry still // retain a valid read pin. 
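// Detaching the entry below (clearing its key and erasing its map slot) hides it from new lookups; its memory is reclaimed once the last pin is released.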
- found->key_.fileNum.clear(); + RECORD_METRIC_VALUE(kMetricMemoryCacheNumStaleEntries); + ++numStales_; + foundEntry->key_.fileNum.clear(); + entryMap_.erase(it); } auto newEntry = getFreeEntry(); @@ -217,6 +234,15 @@ CachePin CacheShard::findOrCreate( return initEntry(key, entryToInit); } +void CacheShard::makeEvictable(RawFileCacheKey key) { + std::lock_guard l(mutex_); + auto it = entryMap_.find(key); + if (it == entryMap_.end()) { + return; + } + it->second->makeEvictable(); +} + bool CacheShard::exists(RawFileCacheKey key) const { std::lock_guard l(mutex_); auto it = entryMap_.find(key); @@ -248,7 +274,9 @@ CoalescedLoad::~CoalescedLoad() { setEndState(State::kCancelled); } -bool CoalescedLoad::loadOrFuture(folly::SemiFuture* wait) { +bool CoalescedLoad::loadOrFuture( + folly::SemiFuture* wait, + bool ssdSavable) { { std::lock_guard l(mutex_); if (state_ == State::kCancelled || state_ == State::kLoaded) { @@ -268,20 +296,21 @@ bool CoalescedLoad::loadOrFuture(folly::SemiFuture* wait) { VELOX_CHECK_EQ(State::kPlanned, state_); state_ = State::kLoading; } + // Outside of 'mutex_'. try { - const auto pins = loadData(!wait); + const auto pins = loadData(/*prefetch=*/wait == nullptr); for (const auto& pin : pins) { auto* entry = pin.checkedEntry(); VELOX_CHECK(entry->key().fileNum.hasValue()); VELOX_CHECK(entry->isExclusive()); - entry->setExclusiveToShared(); + entry->setExclusiveToShared(ssdSavable); } setEndState(State::kLoaded); - } catch (std::exception& e) { + } catch (std::exception&) { try { setEndState(State::kCancelled); - } catch (std::exception& inner) { + } catch (std::exception&) { // May not throw from inside catch. } throw; @@ -290,11 +319,14 @@ bool CoalescedLoad::loadOrFuture(folly::SemiFuture* wait) { } void CoalescedLoad::setEndState(State endState) { - std::lock_guard l(mutex_); - state_ = endState; - if (promise_ != nullptr) { - promise_->setValue(true); - promise_.reset(); + std::unique_ptr> promise; + { + std::lock_guard l(mutex_); + state_ = endState; + promise.swap(promise_); + } + if (promise != nullptr) { + promise->setValue(true); } } @@ -308,14 +340,13 @@ std::unique_ptr> CacheShard::removeEntry( } void CacheShard::removeEntryLocked(AsyncDataCacheEntry* entry) { - if (!entry->key_.fileNum.hasValue()) { - return; + if (entry->key_.fileNum.hasValue()) { + const auto it = entryMap_.find( + RawFileCacheKey{entry->key_.fileNum.id(), entry->key_.offset}); + VELOX_CHECK(it != entryMap_.end()); + entryMap_.erase(it); + entry->key_.fileNum.clear(); } - const auto it = entryMap_.find( - RawFileCacheKey{entry->key_.fileNum.id(), entry->key_.offset}); - VELOX_CHECK(it != entryMap_.end()); - entryMap_.erase(it); - entry->key_.fileNum.clear(); entry->setSsdFile(nullptr, 0); if (entry->isPrefetch()) { entry->setPrefetch(false); @@ -329,25 +360,29 @@ void CacheShard::removeEntryLocked(AsyncDataCacheEntry* entry) { cache_->incrementCachedPages(-numPages); cache_->allocator()->freeNonContiguous(entry->data()); } + entry->tinyData_.clear(); + entry->tinyData_.shrink_to_fit(); + entry->size_ = 0; } -void CacheShard::evict( +uint64_t CacheShard::evict( uint64_t bytesToFree, bool evictAllUnpinned, - int32_t pagesToAcquire, + MachinePageCount pagesToAcquire, memory::Allocation& acquired) { - int64_t tinyFreed = 0; - int64_t largeFreed = 0; - int32_t evictSaveableSkipped = 0; - auto ssdCache = cache_->ssdCache(); - bool skipSsdSaveable = ssdCache && ssdCache->writeInProgress(); + auto* ssdCache = cache_->ssdCache(); + const bool skipSsdSaveable = + (ssdCache != nullptr) && 
ssdCache->writeInProgress(); auto now = accessTime(); std::vector toFree; + int64_t tinyEvicted = 0; + int64_t largeEvicted = 0; + int32_t evictSaveableSkipped = 0; { std::lock_guard l(mutex_); - int size = entries_.size(); - if (!size) { - return; + const size_t size = entries_.size(); + if (size == 0) { + return 0; } int32_t counter = 0; int32_t numChecked = 0; @@ -360,13 +395,15 @@ void CacheShard::evict( } else { ++entryIndex; } + ++numEvictChecks_; + ++clockHand_; auto candidate = iter->get(); - if (!candidate) { + if (candidate == nullptr) { continue; } + ++numChecked; - ++clockHand_; if (evictionThreshold_ == kNoThreshold || eventCounter_ > entries_.size() / 4 || numChecked > entries_.size() / 8) { @@ -375,17 +412,21 @@ void CacheShard::evict( numChecked = 0; eventCounter_ = 0; } + int32_t score = 0; if (candidate->numPins_ == 0 && (!candidate->key_.fileNum.hasValue() || evictAllUnpinned || (score = candidate->score(now)) >= evictionThreshold_)) { - if (skipSsdSaveable && candidate->ssdSaveable_ && !evictAllUnpinned) { + if (skipSsdSaveable && candidate->ssdSaveable() && !evictAllUnpinned) { ++evictSaveableSkipped; continue; } - largeFreed += candidate->data_.byteSize(); + if (candidate->ssdSaveable()) { + ++numSavableEvict_; + } + largeEvicted += candidate->data_.byteSize(); if (pagesToAcquire > 0) { - auto candidatePages = candidate->data().numPages(); + const auto candidatePages = candidate->data().numPages(); pagesToAcquire = candidatePages > pagesToAcquire ? 0 : pagesToAcquire - candidatePages; @@ -394,36 +435,43 @@ void CacheShard::evict( } else { toFree.push_back(std::move(candidate->data())); } - removeEntryLocked(candidate); - emptySlots_.push_back(entryIndex); - tinyFreed += candidate->tinyData_.size(); + tinyEvicted += candidate->tinyData_.size(); candidate->tinyData_.clear(); candidate->tinyData_.shrink_to_fit(); candidate->size_ = 0; + + removeEntryLocked(candidate); + emptySlots_.push_back(entryIndex); tryAddFreeEntry(std::move(*iter)); ++numEvict_; - if (score) { + if (score > 0) { sumEvictScore_ += score; } - if (largeFreed + tinyFreed > bytesToFree) { + if (largeEvicted + tinyEvicted > bytesToFree) { break; } } } } + ClockTimer t(allocClocks_); freeAllocations(toFree); cache_->incrementCachedPages( - -largeFreed / static_cast(memory::AllocationTraits::kPageSize)); - if (evictSaveableSkipped && ssdCache && ssdCache->startWrite()) { - // Rare. May occur if SSD is unusually slow. Useful for diagnostics. - VELOX_SSD_CACHE_LOG(INFO) - << "Start save for old saveable, skipped " << cache_->numSkippedSaves(); - cache_->numSkippedSaves() = 0; - cache_->saveToSsd(); - } else if (evictSaveableSkipped) { - ++cache_->numSkippedSaves(); + -memory::AllocationTraits::numPages(largeEvicted)); + if (evictSaveableSkipped) { + VELOX_CHECK_NOT_NULL(ssdCache); + if (ssdCache->startWrite()) { + // Rare. May occur if SSD is unusually slow. Useful for diagnostics. 
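+ // startWrite() acquired the writer slot, so flush the entries that earlier evictions skipped.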
+ VELOX_SSD_CACHE_LOG(INFO) << "Start save for old saveable, skipped " + << cache_->numSkippedSaves(); + cache_->numSkippedSaves() = 0; + cache_->saveToSsd(); + } else { + ++cache_->numSkippedSaves(); + } } + + return largeEvicted + tinyEvicted; } void CacheShard::tryAddFreeEntry(std::unique_ptr&& entry) { @@ -471,8 +519,12 @@ void CacheShard::updateStats(CacheStats& stats) { ++stats.numEmptyEntries; continue; } else if (entry->isExclusive()) { + stats.exclusivePinnedBytes += + entry->data().byteSize() + entry->tinyData_.capacity(); ++stats.numExclusive; } else if (entry->isShared()) { + stats.sharedPinnedBytes += + entry->data().byteSize() + entry->tinyData_.capacity(); ++stats.numShared; } if (entry->isPrefetch_) { @@ -482,29 +534,37 @@ void CacheShard::updateStats(CacheStats& stats) { ++stats.numEntries; stats.tinySize += entry->tinyData_.size(); stats.tinyPadding += entry->tinyData_.capacity() - entry->tinyData_.size(); - stats.largeSize += entry->size_; - stats.largePadding += entry->data_.byteSize() - entry->size_; + if (entry->tinyData_.empty()) { + stats.largeSize += entry->size_; + stats.largePadding += entry->data_.byteSize() - entry->size_; + } } stats.numHit += numHit_; stats.hitBytes += hitBytes_; stats.numNew += numNew_; stats.numEvict += numEvict_; + stats.numSavableEvict += numSavableEvict_; stats.numEvictChecks += numEvictChecks_; stats.numWaitExclusive += numWaitExclusive_; + stats.numAgedOut += numAgedOut_; + stats.numStales += numStales_; stats.sumEvictScore += sumEvictScore_; stats.allocClocks += allocClocks_; } -void CacheShard::appendSsdSaveable(std::vector& pins) { +void CacheShard::appendSsdSaveable(bool saveAll, std::vector& pins) { std::lock_guard l(mutex_); - // Do not add more than 70% of entries to a write batch.If SSD save - // is slower than storage read, we must not have a situation where - // SSD save pins everything and stops reading. - const int32_t limit = (entries_.size() * 100) / 70; + // Do not add entries to a write batch more than maxWriteRatio_. If SSD save + // is slower than storage read, we must not have a situation where SSD save + // pins everything and stops reading. + const int32_t limit = saveAll + ? 
std::numeric_limits::max() + : static_cast( + static_cast(entries_.size()) * maxWriteRatio_); VELOX_CHECK(cache_->ssdCache()->writeInProgress()); for (auto& entry : entries_) { - if (entry && !entry->ssdFile_ && !entry->isExclusive() && - entry->ssdSaveable_) { + if (entry && (entry->ssdFile_ == nullptr) && !entry->isExclusive() && + entry->ssdSaveable()) { CachePin pin; ++entry->numPins_; pin.setEntry(entry.get()); @@ -518,22 +578,105 @@ void CacheShard::appendSsdSaveable(std::vector& pins) { } } +bool CacheShard::removeFileEntries( + const folly::F14FastSet& filesToRemove, + folly::F14FastSet& filesRetained) { + if (filesToRemove.empty()) { + VELOX_CACHE_LOG(INFO) << "Removed 0 AsyncDataCache entry."; + return true; + } + + int64_t pagesRemoved = 0; + std::vector toFree; + { + std::lock_guard l(mutex_); + + auto entryIndex = -1; + for (auto& cacheEntry : entries_) { + entryIndex++; + if (!cacheEntry || !cacheEntry->key_.fileNum.hasValue()) { + continue; + } + if (filesToRemove.count(cacheEntry->key_.fileNum.id()) == 0) { + continue; + } + if (cacheEntry->isExclusive() || cacheEntry->isShared()) { + filesRetained.insert(cacheEntry->key_.fileNum.id()); + continue; + } + + numAgedOut_++; + pagesRemoved += (int64_t)cacheEntry->data().numPages(); + + toFree.push_back(std::move(cacheEntry->data())); + removeEntryLocked(cacheEntry.get()); + emptySlots_.push_back(entryIndex); + tryAddFreeEntry(std::move(cacheEntry)); + cacheEntry = nullptr; + } + } + VELOX_CACHE_LOG(INFO) << "Removed " << toFree.size() + << " AsyncDataCache entries."; + + // Free the memory allocation out of the cache shard lock. + ClockTimer t(allocClocks_); + freeAllocations(toFree); + cache_->incrementCachedPages(-pagesRemoved); + + return true; +} + +CacheStats CacheStats::operator-(const CacheStats& other) const { + CacheStats result; + result.numHit = numHit - other.numHit; + result.hitBytes = hitBytes - other.hitBytes; + result.numNew = numNew - other.numNew; + result.numEvict = numEvict - other.numEvict; + result.numSavableEvict = numSavableEvict - other.numSavableEvict; + result.numEvictChecks = numEvictChecks - other.numEvictChecks; + result.numWaitExclusive = numWaitExclusive - other.numWaitExclusive; + result.numAgedOut = numAgedOut - other.numAgedOut; + result.numStales = numStales - other.numStales; + result.allocClocks = allocClocks - other.allocClocks; + result.sumEvictScore = sumEvictScore - other.sumEvictScore; + if (ssdStats != nullptr) { + if (other.ssdStats != nullptr) { + result.ssdStats = + std::make_shared(*ssdStats - *other.ssdStats); + } else { + result.ssdStats = std::make_shared(*ssdStats); + } + } + return result; +} + AsyncDataCache::AsyncDataCache( memory::MemoryAllocator* allocator, std::unique_ptr ssdCache) - : allocator_(allocator), ssdCache_(std::move(ssdCache)), cachedPages_(0) { + : AsyncDataCache({}, allocator, std::move(ssdCache)){}; + +AsyncDataCache::AsyncDataCache( + const Options& options, + memory::MemoryAllocator* allocator, + std::unique_ptr ssdCache) + : opts_(options), + allocator_(allocator), + ssdCache_(std::move(ssdCache)), + cachedPages_(0) { for (auto i = 0; i < kNumShards; ++i) { - shards_.push_back(std::make_unique(this)); + shards_.push_back(std::make_unique(this, opts_.maxWriteRatio)); } } -AsyncDataCache::~AsyncDataCache() {} +AsyncDataCache::~AsyncDataCache() = default; // static std::shared_ptr AsyncDataCache::create( memory::MemoryAllocator* allocator, - std::unique_ptr ssdCache) { - auto cache = std::make_shared(allocator, std::move(ssdCache)); + std::unique_ptr 
ssdCache, + const AsyncDataCache::Options& options) { + auto cache = + std::make_shared(options, allocator, std::move(ssdCache)); allocator->registerCache(cache); return cache; } @@ -555,6 +698,9 @@ AsyncDataCache** AsyncDataCache::getInstancePtr() { } void AsyncDataCache::shutdown() { + if (ssdCache_) { + ssdCache_->shutdown(); + } for (auto& shard : shards_) { shard->shutdown(); } @@ -573,6 +719,11 @@ CachePin AsyncDataCache::findOrCreate( return shards_[shard]->findOrCreate(key, size, wait); } +void AsyncDataCache::makeEvictable(RawFileCacheKey key) { + const int shard = std::hash()(key) & (kShardMask); + return shards_[shard]->makeEvictable(key); +} + bool AsyncDataCache::exists(RawFileCacheKey key) const { int shard = std::hash()(key) & (kShardMask); return shards_[shard]->exists(key); @@ -659,7 +810,7 @@ bool AsyncDataCache::makeSpace( // with 'evictAllUnpinned' set to true. shards_[shardCounter_ & (kShardMask)]->evict( memory::AllocationTraits::pageBytes( - std::max(kMinEvictPages, numPages) * sizeMultiplier), + std::max(kMinEvictPages, numPages) * sizeMultiplier), nthAttempt >= kNumShards, numPagesToAcquire, acquired); @@ -667,9 +818,51 @@ bool AsyncDataCache::makeSpace( sizeMultiplier *= 2; } } + memory::setCacheFailureMessage( + fmt::format("Failed to evict from cache state: {}", toString(false))); return false; } +uint64_t AsyncDataCache::shrink(uint64_t targetBytes) { + VELOX_CHECK_GT(targetBytes, 0); + + RECORD_METRIC_VALUE(kMetricCacheShrinkCount); + LOG(INFO) << "Try to shrink cache to free up " + << velox::succinctBytes(targetBytes) << " memory"; + + const uint64_t minBytesToEvict = 8UL << 20; + uint64_t evictedBytes{0}; + uint64_t shrinkTimeUs{0}; + { + MicrosecondTimer timer(&shrinkTimeUs); + for (int shard = 0; shard < shards_.size(); ++shard) { + memory::Allocation unused; + evictedBytes += shards_[shardCounter_++ & (kShardMask)]->evict( + std::max(minBytesToEvict, targetBytes - evictedBytes), + // Cache shrink is triggered when server is under low memory pressure + // so need to free up memory as soon as possible. So we always avoid + // triggering ssd save to accelerate the cache evictions. + true, + 0, + unused); + VELOX_CHECK(unused.empty()); + if (evictedBytes >= targetBytes) { + break; + } + } + // Call unmap to free up to 'targetBytes' unused memory space back to + // operating system after shrink. 
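+ // (The unmap request is capped at the shrink target, converted to machine pages.)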
+ allocator_->unmap(memory::AllocationTraits::numPages(targetBytes)); + } + + RECORD_HISTOGRAM_METRIC_VALUE(kMetricCacheShrinkTimeMs, shrinkTimeUs / 1'000); + LOG(INFO) << "Freed " << velox::succinctBytes(evictedBytes) + << " cache memory, spent " << velox::succinctMicros(shrinkTimeUs) + << "\n" + << toString(); + return evictedBytes; +} + bool AsyncDataCache::canTryAllocate( int32_t numPages, const memory::Allocation& acquired) const { @@ -683,11 +876,11 @@ bool AsyncDataCache::canTryAllocate( void AsyncDataCache::backoff(int32_t counter) { size_t seed = folly::hasher()(++backoffCounter_); - const auto usec = (seed & 0xfff) * (counter & 0x1f); + const auto usecs = (seed & 0xfff) * (counter & 0x1f); VELOX_CACHE_LOG_EVERY_MS(INFO, 1'000) - << "Backoff in allocation contention for " << succinctMicros(usec); + << "Backoff in allocation contention for " << succinctMicros(usecs); - std::this_thread::sleep_for(std::chrono::microseconds(usec)); // NOLINT + std::this_thread::sleep_for(std::chrono::microseconds(usecs)); // NOLINT } void AsyncDataCache::incrementNew(uint64_t size) { @@ -705,14 +898,17 @@ void AsyncDataCache::incrementNew(uint64_t size) { } void AsyncDataCache::possibleSsdSave(uint64_t bytes) { - constexpr int32_t kMinSavePages = 4096; // Save at least 16MB at a time. if (ssdCache_ == nullptr) { return; } ssdSaveable_ += bytes; if (memory::AllocationTraits::numPages(ssdSaveable_) > - std::max(kMinSavePages, cachedPages_ / 8)) { + std::max( + static_cast( + memory::AllocationTraits::numPages(opts_.minSsdSavableBytes)), + static_cast( + static_cast(cachedPages_) * opts_.ssdSavableRatio))) { // Do not start a new save if another one is in progress. if (!ssdCache_->startWrite()) { return; @@ -721,16 +917,37 @@ void AsyncDataCache::possibleSsdSave(uint64_t bytes) { } } -void AsyncDataCache::saveToSsd() { +void AsyncDataCache::saveToSsd(bool saveAll) { std::vector pins; VELOX_CHECK(ssdCache_->writeInProgress()); ssdSaveable_ = 0; for (auto& shard : shards_) { - shard->appendSsdSaveable(pins); + shard->appendSsdSaveable(saveAll, pins); } ssdCache_->write(std::move(pins)); } +bool AsyncDataCache::removeFileEntries( + const folly::F14FastSet& filesToRemove, + folly::F14FastSet& filesRetained) { + bool success = true; + + for (auto& shard : shards_) { + try { + success &= shard->removeFileEntries(filesToRemove, filesRetained); + } catch (const std::exception&) { + VELOX_CACHE_LOG(ERROR) + << "Error removing file entries from AsyncDataCache shard."; + success = false; + } + } + + if (ssdCache_) { + success &= ssdCache_->removeFileEntries(filesToRemove, filesRetained); + } + return success; +} + CacheStats AsyncDataCache::refreshStats() const { CacheStats stats; for (auto& shard : shards_) { @@ -744,26 +961,40 @@ CacheStats AsyncDataCache::refreshStats() const { void AsyncDataCache::clear() { for (auto& shard : shards_) { - memory::Allocation acquired; - shard->evict(std::numeric_limits::max(), true, 0, acquired); - VELOX_CHECK(acquired.empty()); + memory::Allocation unused; + shard->evict(std::numeric_limits::max(), true, 0, unused); + VELOX_CHECK(unused.empty()); } } -std::string AsyncDataCache::toString() const { +std::string AsyncDataCache::toString(bool details) const { auto stats = refreshStats(); std::stringstream out; out << "AsyncDataCache:\n" << stats.toString() << "\n" << "Allocated pages: " << allocator_->numAllocated() << " cached pages: " << cachedPages_ << "\n"; - out << "Backing: " << allocator_->toString(); - if (ssdCache_) { - out << "\nSSD: " << ssdCache_->toString(); + if 
(details) { + out << "Backing: " << allocator_->toString(); + if (ssdCache_) { + out << "\nSSD: " << ssdCache_->toString(); + } } return out.str(); } +std::vector AsyncDataCache::testingCacheEntries() const { + std::vector totalEntries; + for (const auto& shard : shards_) { + const auto shardEntries = shard->testingCacheEntries(); + std::copy( + shardEntries.begin(), + shardEntries.end(), + std::back_inserter(totalEntries)); + } + return totalEntries; +} + std::string CacheStats::toString() const { std::stringstream out; // Cache size stats. @@ -775,13 +1006,17 @@ std::string CacheStats::toString() const { // Cache entries << "Cache entries: " << numEntries << " read pins: " << numShared << " write pins: " << numExclusive + << " pinned shared: " << succinctBytes(sharedPinnedBytes) + << " pinned exclusive: " << succinctBytes(exclusivePinnedBytes) << "\n" << " num write wait: " << numWaitExclusive << " empty entries: " << numEmptyEntries << "\n" // Cache access stats. << "Cache access miss: " << numNew << " hit: " << numHit << " hit bytes: " << succinctBytes(hitBytes) << " eviction: " << numEvict - << " eviction checks: " << numEvictChecks + << " savable eviction: " << numSavableEvict + << " eviction checks: " << numEvictChecks << " aged out: " << numAgedOut + << " stales: " << numStales << "\n" // Cache prefetch stats. << "Prefetch entries: " << numPrefetch @@ -841,4 +1076,14 @@ CoalesceIoStats readPins( std::move(readFunc)); } +std::vector CacheShard::testingCacheEntries() const { + std::vector entries; + std::lock_guard l(mutex_); + entries.reserve(entries_.size()); + for (const auto& entry : entries_) { + entries.push_back(entry.get()); + } + return entries; +} + } // namespace facebook::velox::cache diff --git a/velox/common/caching/AsyncDataCache.h b/velox/common/caching/AsyncDataCache.h index 4f34306ae2ab0..b1e3c6a409c03 100644 --- a/velox/common/caching/AsyncDataCache.h +++ b/velox/common/caching/AsyncDataCache.h @@ -19,9 +19,11 @@ #include #include +#include #include +#include #include -#include "folly/GLog.h" + #include "velox/common/base/BitUtil.h" #include "velox/common/base/CoalesceIo.h" #include "velox/common/base/Portability.h" @@ -30,6 +32,7 @@ #include "velox/common/caching/ScanTracker.h" #include "velox/common/caching/StringIdMap.h" #include "velox/common/file/File.h" +#include "velox/common/memory/Memory.h" #include "velox/common/memory/MemoryAllocator.h" namespace facebook::velox::cache { @@ -42,7 +45,7 @@ namespace facebook::velox::cache { class AsyncDataCache; class CacheShard; class SsdCache; -class SsdCacheStats; +struct SsdCacheStats; class SsdFile; // Type for tracking last access. This is based on CPU clock and @@ -60,8 +63,8 @@ inline AccessTime accessTime() { } struct AccessStats { - AccessTime lastUse{0}; - int32_t numUses{0}; + tsan_atomic lastUse{0}; + tsan_atomic numUses{0}; // Retention score. A higher number means less worth retaining. This // works well with a typical formula of time over use count going to @@ -76,9 +79,9 @@ struct AccessStats { return (now - lastUse) / (1 + numUses); } - // Resets the access tracking to not accessed. This is used after - // evicting the previous contents of the entry, so that the new data - // does not inherit the history of the previous. + // Resets the access tracking to not accessed. This is used after evicting the + // previous contents of the entry, so that the new data does not inherit the + // history of the previous. 
void reset() { lastUse = accessTime(); numUses = 0; @@ -132,17 +135,17 @@ struct hash<::facebook::velox::cache::RawFileCacheKey> { namespace facebook::velox::cache { -// Represents a contiguous range of bytes cached from a file. This -// is the primary unit of access. These are typically owned via -// CachePin and can be in shared or exclusive mode. 'numPins_' -// counts the shared leases, the special value kExclusive means that -// this is being written to by another thread. It is possible to -// wait for the exclusive mode to finish, at which time one can -// retry getting access. Entries belong to one CacheShard at a -// time. The CacheShard serializes the mapping from a key to the -// entry and the setting entries to exclusive mode. An unpinned -// entry is evictable. CacheShard decides the eviction policy and -// serializes eviction with other access. +/// Represents a contiguous range of bytes cached from a file. This +/// is the primary unit of access. These are typically owned via +/// CachePin and can be in shared or exclusive mode. 'numPins_' +/// counts the shared leases, the special value kExclusive means that +/// this is being written to by another thread. It is possible to +/// wait for the exclusive mode to finish, at which time one can +/// retry getting access. Entries belong to one CacheShard at a +/// time. The CacheShard serializes the mapping from a key to the +/// entry and the setting entries to exclusive mode. An unpinned +/// entry is evictable. CacheShard decides the eviction policy and +/// serializes eviction with other access. class AsyncDataCacheEntry { public: static constexpr int32_t kExclusive = -10000; @@ -151,9 +154,9 @@ class AsyncDataCacheEntry { explicit AsyncDataCacheEntry(CacheShard* shard); ~AsyncDataCacheEntry(); - // Sets the key and allocates the entry's memory. Resets - // all other state. The entry must be held exclusively and must - // hold no memory when calling this. + /// Sets the key and allocates the entry's memory. Resets + /// all other state. The entry must be held exclusively and must + /// hold no memory when calling this. void initialize(FileCacheKey key); memory::Allocation& data() { @@ -221,7 +224,9 @@ class AsyncDataCacheEntry { return value; } - void setExclusiveToShared(); + /// If 'ssdSavable' is true, marks the loaded cache entry as ssdSavable if it + /// is not loaded from ssd. + void setExclusiveToShared(bool ssdSavable = true); void setSsdFile(SsdFile* file, uint64_t offset) { ssdFile_ = file; @@ -237,6 +242,10 @@ class AsyncDataCacheEntry { return ssdOffset_; } + bool ssdSaveable() const { + return ssdSaveable_; + } + void setTrackingId(TrackingId id) { trackingId_ = id; } @@ -248,15 +257,23 @@ class AsyncDataCacheEntry { /// Sets access stats so that this is immediately evictable. void makeEvictable(); - // Moves the promise out of 'this'. Used in order to handle the - // promise within the lock of the cache shard, so not within private - // methods of 'this'. + /// Moves the promise out of 'this'. Used in order to handle the + /// promise within the lock of the cache shard, so not within private + /// methods of 'this'. std::unique_ptr> movePromise() { return std::move(promise_); } std::string toString() const; + const AccessStats& testingAccessStats() const { + return accessStats_; + } + + bool testingFirstUse() const { + return isFirstUse_; + } + private: void release(); void addReference(); @@ -290,9 +307,9 @@ class AsyncDataCacheEntry { AccessStats accessStats_; - // True if 'this' is speculatively loaded. 
This is reset on first - // hit. Allows catching a situation where prefetched entries get - // evicted before they are hit. + // True if 'this' is speculatively loaded. This is reset on first hit. Allows + // catching a situation where prefetched entries get evicted before they are + // hit. bool isPrefetch_{false}; // Sets after first use of a prefetched entry. Cleared by @@ -421,8 +438,9 @@ class CoalescedLoad { /// load of the entries that are not yet present. If another thread is in the /// process of doing this and 'wait' is null, returns immediately. If another /// thread is in the process of doing this and 'wait' is not null, waits for - /// the other thread to be done. - bool loadOrFuture(folly::SemiFuture* wait); + /// the other thread to be done. If 'ssdSavable' is true, marks the loaded + /// entries as ssdsavable. + bool loadOrFuture(folly::SemiFuture* wait, bool ssdSavable = true); State state() const { tsan_lock_guard l(mutex_); @@ -441,14 +459,13 @@ class CoalescedLoad { } protected: - // Makes entries for 'keys_' and loads their content. Elements of - // 'keys_' that are already loaded or loading are expected to be left - // out. The returned pins are expected to be exclusive with data - // loaded. The caller will set them to shared state on success. If - // loadData() throws, the pins it may have made will be destructed in - // their exclusive state so that they do not become visible to other - // users of the cache. - virtual std::vector loadData(bool isPrefetch) = 0; + // Makes entries for 'keys_' and loads their content. Elements of 'keys_' that + // are already loaded or loading are expected to be left out. The returned + // pins are expected to be exclusive with data loaded. The caller will set + // them to shared state on success. If loadData() throws, the pins it may have + // made will be destructed in their exclusive state so that they do not become + // visible to other users of the cache. + virtual std::vector loadData(bool prefetch) = 0; // Sets a final state and resumes waiting threads. void setEndState(State endState); @@ -465,53 +482,72 @@ class CoalescedLoad { std::vector sizes_; }; -// Struct for CacheShard stats. Stats from all shards are added into -// this struct to provide a snapshot of state. +/// Struct for CacheShard stats. Stats from all shards are added into +/// this struct to provide a snapshot of state. struct CacheStats { - // Total size in 'tinyData_' + /// ============= Snapshot stats ============= + + /// Total size in 'tinyData_' int64_t tinySize{0}; - // Total size in 'data_' + /// Total size in 'data_' int64_t largeSize{0}; - // Unused capacity in 'tinyData_'. + /// Unused capacity in 'tinyData_'. int64_t tinyPadding{0}; - // Unused capacity in 'data_'. + /// Unused capacity in 'data_'. int64_t largePadding{0}; - // Total number of entries. + /// Total number of entries. int32_t numEntries{0}; - // Number of entries that do not cache anything. + /// Number of entries that do not cache anything. int32_t numEmptyEntries{0}; - // Number of entries pinned for shared access. + /// Number of entries pinned for shared access. int32_t numShared{0}; - // Number of entries pinned for exclusive access. + /// Number of entries pinned for exclusive access. int32_t numExclusive{0}; - // Number of entries that are being or have been prefetched but have not been - // hit. + /// Number of entries that are being or have been prefetched but have not been + /// hit. int32_t numPrefetch{0}; - // Total size of entries in prefetch state. 
+ /// Total size of entries in prefetch state. int64_t prefetchBytes{0}; - // Number of hits (saved IO). The first hit to a prefetched entry does not - // count. + /// Total size of shared/exclusive pinned entries. + int64_t sharedPinnedBytes{0}; + int64_t exclusivePinnedBytes{0}; + + /// ============= Cumulative stats ============= + + /// Number of hits (saved IO). The first hit to a prefetched entry does not + /// count. int64_t numHit{0}; - // Sum of sizes of entries counted in 'numHit'. + /// Sum of sizes of entries counted in 'numHit'. int64_t hitBytes{0}; - // Number of new entries created. + /// Number of new entries created. int64_t numNew{0}; - // Number of times a valid entry was removed in order to make space. + /// Number of times a valid entry was removed in order to make space. int64_t numEvict{0}; - // Number of entries considered for evicting. + /// Number of times a valid entry was removed in order to make space but has + /// not been saved to SSD yet. + int64_t numSavableEvict{0}; + /// Number of entries considered for evicting. int64_t numEvictChecks{0}; - // Number of times a user waited for an entry to transit from exclusive to - // shared mode. + /// Number of times a user waited for an entry to transit from exclusive to + /// shared mode. int64_t numWaitExclusive{0}; - // Cumulative clocks spent in allocating or freeing memory for backing cache - // entries. + /// Total number of entries that are aged out and beyond TTL. + int64_t numAgedOut{0}; + /// Total number of entries that are stale because of cache request size + /// mismatch. + int64_t numStales{0}; + /// Cumulative clocks spent in allocating or freeing memory for backing cache + /// entries. uint64_t allocClocks{0}; - // Sum of scores of evicted entries. This serves to infer an average - // lifetime for entries in cache. + /// Sum of scores of evicted entries. This serves to infer an average + /// lifetime for entries in cache. int64_t sumEvictScore{0}; + /// Ssd cache stats that include both snapshot and cumulative stats. std::shared_ptr ssdStats = nullptr; + CacheStats operator-(const CacheStats& other) const; + std::string toString() const; }; @@ -521,7 +557,8 @@ struct CacheStats { /// and other housekeeping. class CacheShard { public: - explicit CacheShard(AsyncDataCache* cache) : cache_(cache) {} + CacheShard(AsyncDataCache* cache, double maxWriteRatio) + : cache_(cache), maxWriteRatio_(maxWriteRatio) {} /// See AsyncDataCache::findOrCreate. CachePin findOrCreate( @@ -529,6 +566,9 @@ class CacheShard { uint64_t size, folly::SemiFuture* readyFuture); + /// Marks the cache entry with given cache 'key' as immediate evictable. + void makeEvictable(RawFileCacheKey key); + /// Returns true if there is an entry for 'key'. Updates access time. bool exists(RawFileCacheKey key) const; @@ -544,39 +584,51 @@ class CacheShard { /// graceful shutdown. The shard will no longer be valid after this call. void shutdown(); - // removes 'bytesToFree' worth of entries or as many entries as are - // not pinned. This favors first removing older and less frequently - // used entries. If 'evictAllUnpinned' is true, anything that is - // not pinned is evicted at first sight. This is for out of memory - // emergencies. If 'pagesToAcquire' is set, up to this amount is added to - // 'allocation'. A smaller amount can be added if not enough evictable data is - // found. - void evict( + /// Removes 'bytesToFree' worth of entries or as many entries as are not + /// pinned. 
This favors first removing older and less frequently used entries. + /// If 'evictAllUnpinned' is true, anything that is not pinned is evicted at + /// first sight. This is for out of memory emergencies. If 'pagesToAcquire' is + /// set, up to this amount is added to 'allocation'. A smaller amount can be + /// added if not enough evictable data is found. The function returns the + /// total evicted bytes. + uint64_t evict( uint64_t bytesToFree, bool evictAllUnpinned, - int32_t pagesToAcquire, + memory::MachinePageCount pagesToAcquire, memory::Allocation& acquiredAllocation); - // Removes 'entry' from 'this'. Removes a possible promise from the entry - // inside the shard mutex and returns it so that it can be realized outside of - // the mutex. + /// Removes 'entry' from 'this'. Removes a possible promise from the entry + /// inside the shard mutex and returns it so that it can be realized outside + /// of the mutex. std::unique_ptr> removeEntry( AsyncDataCacheEntry* entry); - // Adds the stats of 'this' to 'stats'. + /// Adds the stats of 'this' to 'stats'. void updateStats(CacheStats& stats); - // Appends a batch of non-saved SSD saveable entries in 'this' to - // 'pins'. This may have to be called several times since this keeps - // limits on the batch to write at one time. The saveable entries - // are pinned for read. 'pins' should be written or dropped before - // calling this a second time. - void appendSsdSaveable(std::vector& pins); + /// Appends a batch of non-saved SSD savable entries in 'this' to 'pins'. This + /// may have to be called several times since this keeps limits on the batch + /// to write at one time. The savable entries are pinned for read. 'pins' + /// should be written or dropped before calling this a second time. If 'saveAll' + /// is true, then appends all the non-saved SSD savable entries without the + /// batch limit check. 'saveAll' is set to true for the Prestissimo worker + /// operation use case. + void appendSsdSaveable(bool saveAll, std::vector& pins); + + /// Remove cache entries from this shard for files in the fileNum set + /// 'filesToRemove'. If successful, return true, and 'filesRetained' contains + /// entries that should not be removed, e.g., in exclusive mode or in shared + /// mode. Otherwise, return false and 'filesRetained' could be ignored. + bool removeFileEntries( + const folly::F14FastSet& filesToRemove, + folly::F14FastSet& filesRetained); auto& allocClocks() { return allocClocks_; } + std::vector testingCacheEntries() const; + private: static constexpr uint32_t kMaxFreeEntries = 1 << 10; static constexpr int32_t kNoThreshold = std::numeric_limits::max(); @@ -598,6 +650,7 @@ void tryAddFreeEntry(std::unique_ptr&& entry); AsyncDataCache* const cache_; + const double maxWriteRatio_; mutable std::mutex mutex_; folly::F14FastMap entryMap_; @@ -617,27 +670,66 @@ int32_t evictionThreshold_{kNoThreshold}; // Cumulative count of cache hits. uint64_t numHit_{0}; - // Sum of bytes in cache hits. + // Cumulative sum of bytes in cache hits. uint64_t hitBytes_{0}; // Cumulative count of hits on entries held in exclusive mode. uint64_t numWaitExclusive_{0}; // Cumulative count of new entry creation. uint64_t numNew_{0}; - // Count of entries evicted. + // Cumulative count of entries evicted. uint64_t numEvict_{0}; - // Count of entries considered for eviction. This divided by + // Cumulative count of evicted entries that have not been saved to SSD yet.
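+ // (Surfaced as CacheStats::numSavableEvict through updateStats().)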
+ uint64_t numSavableEvict_{0}; + // Cumulative count of entries considered for eviction. This divided by // 'numEvict_' measured efficiency of eviction. uint64_t numEvictChecks_{0}; - // Sum of evict scores. This divided by 'numEvict_' correlates to + // Cumulative count of entries aged out due to TTL. + uint64_t numAgedOut_{0}; + // Cumulative count of stale entries because of cache request size mismatch. + uint64_t numStales_{0}; + // Cumulative sum of evict scores. This divided by 'numEvict_' correlates to // time data stays in cache. uint64_t sumEvictScore_{0}; - // Tracker of time spent in allocating/freeing MemoryAllocator space - // for backing cached data. + // Tracker of cumulative time spent in allocating/freeing MemoryAllocator + // space for backing cached data. std::atomic allocClocks_{0}; }; class AsyncDataCache : public memory::Cache { public: + struct Options { + Options( + double _maxWriteRatio = 0.7, + double _ssdSavableRatio = 0.125, + int32_t _minSsdSavableBytes = 1 << 24) + : maxWriteRatio(_maxWriteRatio), + ssdSavableRatio(_ssdSavableRatio), + minSsdSavableBytes(_minSsdSavableBytes){}; + + /// The max ratio of the number of in-memory cache entries being written to + /// SSD cache over the total number of cache entries. This is to control SSD + /// cache write rate, and once the ratio exceeds this threshold, then we + /// stop writing to SSD cache. + double maxWriteRatio; + + /// The min ratio of SSD savable (in-memory) cache space over the total + /// cache space. Once the ratio exceeds this limit, we start writing SSD + /// savable cache entries into SSD cache. + double ssdSavableRatio; + + /// Min SSD savable (in-memory) cache space to start writing SSD savable + /// cache entries into SSD cache. + /// + /// NOTE: we only write to SSD cache when both above conditions satisfy. The + /// default is 16MB. + int32_t minSsdSavableBytes; + }; + + AsyncDataCache( + const Options& options, + memory::MemoryAllocator* allocator, + std::unique_ptr ssdCache = nullptr); + AsyncDataCache( memory::MemoryAllocator* allocator, std::unique_ptr ssdCache = nullptr); @@ -646,7 +738,8 @@ class AsyncDataCache : public memory::Cache { static std::shared_ptr create( memory::MemoryAllocator* allocator, - std::unique_ptr ssdCache = nullptr); + std::unique_ptr ssdCache = nullptr, + const AsyncDataCache::Options& = {}); static AsyncDataCache* getInstance(); @@ -667,6 +760,8 @@ class AsyncDataCache : public memory::Cache { memory::MachinePageCount numPages, std::function allocate) override; + uint64_t shrink(uint64_t targetBytes) override; + memory::MemoryAllocator* allocator() const override { return allocator_; } @@ -687,12 +782,19 @@ class AsyncDataCache : public memory::Cache { uint64_t size, folly::SemiFuture* waitFuture = nullptr); + /// Marks the cache entry with given cache 'key' as immediate evictable. + void makeEvictable(RawFileCacheKey key); + /// Returns true if there is an entry for 'key'. Updates access time. bool exists(RawFileCacheKey key) const; - CacheStats refreshStats() const; + /// Returns snapshot of the aggregated stats from all shards and the stats of + /// SSD cache if used. + virtual CacheStats refreshStats() const; - std::string toString() const; + /// If 'details' is true, returns the stats of the backing memory allocator + /// and ssd cache. Otherwise, only returns the cache stats. + std::string toString(bool details = true) const; memory::MachinePageCount incrementCachedPages(int64_t pages) { // The counter is unsigned and the increment is signed. 
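Taken together, the new `Options` knobs gate SSD admission: a save is kicked off only once the savable bytes exceed both `minSsdSavableBytes` and `ssdSavableRatio` of the cached pages, and each write batch stops at `maxWriteRatio` of a shard's entries. A minimal construction sketch against the API added above; the allocator parameter and the choice to run without an SsdCache are illustrative, not part of this diff:

```
#include "velox/common/caching/AsyncDataCache.h"

using namespace facebook::velox;

std::shared_ptr<cache::AsyncDataCache> makeCache(
    memory::MemoryAllocator* allocator) {
  // Tighten the per-batch write cap and require 32MB of savable data
  // before SSD saves start; keep the default savable ratio.
  cache::AsyncDataCache::Options options(
      /*_maxWriteRatio=*/0.35,
      /*_ssdSavableRatio=*/0.125,
      /*_minSsdSavableBytes=*/32 << 20);
  // No SSD tier in this sketch; pass an SsdCache as the second argument
  // to enable one.
  return cache::AsyncDataCache::create(allocator, nullptr, options);
}
```

Note that the defaults (0.7, 0.125, 1 << 24) reproduce the constants previously hard-coded in `appendSsdSaveable` and `possibleSsdSave`.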
@@ -729,16 +831,15 @@ class AsyncDataCache : public memory::Cache { return verifyHook_; } - // Looks up a pin for each in 'keys' and skips all loading or - // loaded pins. Calls processPin for each exclusive - // pin. processPin must move its argument if it wants to use it - // afterwards. sizeFunc(i) returns the size of the ith item in - // 'keys'. + /// Looks up a pin for each in 'keys' and skips all loading or loaded pins. + /// Calls processPin for each exclusive pin. processPin must move its argument + /// if it wants to use it afterwards. sizeFunc(i) returns the size of the ith + /// item in 'keys'. template void makePins( const std::vector& keys, - SizeFunc sizeFunc, - ProcessPin processPin) { + const SizeFunc& sizeFunc, + const ProcessPin& processPin) { for (auto i = 0; i < keys.size(); ++i) { auto pin = findOrCreate(keys[i], sizeFunc(i), nullptr); if (pin.empty() || pin.checkedEntry()->isShared()) { @@ -748,16 +849,37 @@ class AsyncDataCache : public memory::Cache { } } - // Drops all unpinned entries. Pins stay valid. - void clear(); - - // Saves all entries with 'ssdSaveable_' to 'ssdCache_'. - void saveToSsd(); + /// Saves entries in 'ssdSaveable_' to 'ssdCache_'. If 'saveAll' is true, then + /// write them all in 'ssdSaveable_'. + void saveToSsd(bool saveAll = false); tsan_atomic& numSkippedSaves() { return numSkippedSaves_; } + /// Remove cache entries from all shards for files in the fileNum set + /// 'filesToRemove'. If successful, return true, and 'filesRetained' contains + /// entries that should not be removed, ex., in exclusive mode or in shared + /// mode. Otherwise, return false and 'filesRetained' could be ignored. + bool removeFileEntries( + const folly::F14FastSet& filesToRemove, + folly::F14FastSet& filesRetained); + + /// Drops all unpinned entries. Pins stay valid. + /// + /// NOTE: it is used by testing and Prestissimo server operation. + void clear(); + + std::vector testingCacheEntries() const; + + uint64_t testingSsdSavable() const { + return ssdSaveable_; + } + + int32_t testingNumShards() const { + return shards_.size(); + } + private: static constexpr int32_t kNumShards = 4; // Must be power of 2. static constexpr int32_t kShardMask = kNumShards - 1; @@ -772,6 +894,7 @@ class AsyncDataCache : public memory::Cache { // Waits a pseudorandom delay times 'counter'. void backoff(int32_t counter); + const Options opts_; memory::MemoryAllocator* const allocator_; std::unique_ptr ssdCache_; std::vector> shards_; @@ -808,9 +931,8 @@ class AsyncDataCache : public memory::Cache { std::atomic numThreadsInAllocate_{0}; }; -// Samples a set of values T from 'numSamples' calls of -// 'iter'. Returns the value where 'percent' of the samples are less than the -// returned value. +/// Samples a set of values T from 'numSamples' calls of 'iter'. Returns the +/// value where 'percent' of the samples are less than the returned value. template T percentile(Next next, int32_t numSamples, int percent) { std::vector values; @@ -822,24 +944,22 @@ T percentile(Next next, int32_t numSamples, int percent) { return values.empty() ? 0 : values[(values.size() * percent) / 100]; } -// Utility function for loading multiple pins with coalesced -// IO. 'pins' is a vector of CachePins to fill. 'maxGap' is the -// largest allowed distance in bytes between the end of one entry and -// the start of the next. If the gap is larger or the next is before -// the end of the previous, the entries will be fetched separately. 
-// -//'offsetFunc' returns the starting offset of the data in the -// file given a pin and the pin's index in 'pins'. The pins are expected to be -// sorted by this offset. 'readFunc' reads from the appropriate media. It gets -// the 'pins' and the index of the first pin included in the read and the -// index of the first pin not included. It gets the starting offset of the -// read and a vector of memory ranges to fill by ReadFile::preadv or a similar -// function. -// The caller is responsible for calling setValid on the pins after a -// successful read. -// -// Returns the number of distinct IOs, the number of bytes loaded into pins -// and the number of extra bytes read. +/// Utility function for loading multiple pins with coalesced IO. 'pins' is a +/// vector of CachePins to fill. 'maxGap' is the largest allowed distance in +/// bytes between the end of one entry and the start of the next. If the gap is +/// larger or the next is before the end of the previous, the entries will be +/// fetched separately. +/// +/// 'offsetFunc' returns the starting offset of the data in the file given a pin +/// and the pin's index in 'pins'. The pins are expected to be sorted by this +/// offset. 'readFunc' reads from the appropriate media. It gets the 'pins' and +/// the index of the first pin included in the read and the index of the first +/// pin not included. It gets the starting offset of the read and a vector of +/// memory ranges to fill by ReadFile::preadv or a similar function. The caller +/// is responsible for calling setValid on the pins after a successful read. +/// +/// Returns the number of distinct IOs, the number of bytes loaded into pins +/// and the number of extra bytes read. CoalesceIoStats readPins( const std::vector& pins, int32_t maxGap, @@ -859,7 +979,7 @@ struct fmt::formatter : formatter { auto format( facebook::velox::cache::CoalescedLoad::State s, - format_context& ctx) { + format_context& ctx) const { return formatter::format(static_cast(s), ctx); } }; diff --git a/velox/common/caching/CMakeLists.txt b/velox/common/caching/CMakeLists.txt index f30696ac77e58..76d7694b0a805 100644 --- a/velox/common/caching/CMakeLists.txt +++ b/velox/common/caching/CMakeLists.txt @@ -12,21 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_library( +velox_add_library( velox_caching - FileIds.cpp - StringIdMap.cpp AsyncDataCache.cpp + CacheTTLController.cpp + FileIds.cpp ScanTracker.cpp SsdCache.cpp SsdFile.cpp - SsdFileTracker.cpp) -target_link_libraries( + SsdFileTracker.cpp + StringIdMap.cpp) +velox_link_libraries( velox_caching PUBLIC velox_common_base velox_exception velox_file velox_memory + velox_process + velox_time Folly::folly fmt::fmt gflags::gflags diff --git a/velox/common/caching/CacheTTLController.cpp b/velox/common/caching/CacheTTLController.cpp new file mode 100644 index 0000000000000..1a731420c5191 --- /dev/null +++ b/velox/common/caching/CacheTTLController.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/caching/CacheTTLController.h" + +#include "velox/common/caching/AsyncDataCache.h" + +namespace facebook::velox::cache { + +std::unique_ptr CacheTTLController::instance_ = nullptr; + +bool CacheTTLController::addOpenFileInfo( + uint64_t fileNum, + int64_t openTimeSec) { + auto lockedFileMap = fileInfoMap_.wlock(); + auto it = lockedFileMap->find(fileNum); + if (it == lockedFileMap->end() || it->second.removeInProgress) { + lockedFileMap->insert_or_assign(fileNum, RawFileInfo{openTimeSec, false}); + return true; + } + return false; +} + +CacheAgeStats CacheTTLController::getCacheAgeStats() const { + auto lockedFileMap = fileInfoMap_.rlock(); + + if (lockedFileMap->empty()) { + return CacheAgeStats{.maxAgeSecs = 0}; + } + + // Use the oldest file open time to calculate the max possible age of cache + // entries loaded from the files. + int64_t minOpenTime = std::numeric_limits::max(); + for (auto it = lockedFileMap->cbegin(); it != lockedFileMap->cend(); it++) { + minOpenTime = std::min(minOpenTime, it->second.openTimeSec); + } + + int64_t maxAge = getCurrentTimeSec() - minOpenTime; + return CacheAgeStats{.maxAgeSecs = std::max(maxAge, 0)}; +} + +void CacheTTLController::applyTTL(int64_t ttlSecs) { + int64_t maxOpenTime = getCurrentTimeSec() - ttlSecs; + + folly::F14FastSet filesToRemove = + getAndMarkAgedOutFiles(maxOpenTime); + if (filesToRemove.empty()) { + LOG(INFO) << "No cache entry is out of TTL " << ttlSecs << "."; + return; + } + + folly::F14FastSet filesRetained; + bool success = cache_.removeFileEntries(filesToRemove, filesRetained); + + LOG(INFO) << (success ? "Succeeded" : "Failed") << " applying cache TTL of " + << ttlSecs << " seconds. Entries from " << filesToRemove.size() + << " files are to be removed, while " << filesRetained.size() + << " files are retained"; + if (success) { + cleanUp(filesRetained); + } else { + reset(); + } +} + +folly::F14FastSet CacheTTLController::getAndMarkAgedOutFiles( + int64_t maxOpenTimeSecs) { + auto lockedFileMap = fileInfoMap_.wlock(); + + folly::F14FastSet fileNums; + + for (auto it = lockedFileMap->begin(); it != lockedFileMap->end(); it++) { + if (it->second.removeInProgress || + it->second.openTimeSec < maxOpenTimeSecs) { + fileNums.insert(it->first); + it->second.removeInProgress = true; + } + } + + return fileNums; +} + +void CacheTTLController::cleanUp( + const folly::F14FastSet& filesToRetain) { + fileInfoMap_.withWLock([&](auto& fileMap) { + auto it = fileMap.begin(); + while (it != fileMap.end()) { + if (!it->second.removeInProgress) { + it++; + continue; + } + if (filesToRetain.count(it->first) > 0) { + it->second.removeInProgress = false; + it++; + continue; + } + it = fileMap.erase(it); + } + }); +} + +void CacheTTLController::reset() { + fileInfoMap_.withWLock([](auto& fileMap) { + for (auto& [_, fileInfo] : fileMap) { + fileInfo.removeInProgress = false; + } + }); +} + +} // namespace facebook::velox::cache diff --git a/velox/common/caching/CacheTTLController.h b/velox/common/caching/CacheTTLController.h new file mode 100644 index 0000000000000..f81e596ed3b02 --- /dev/null +++ b/velox/common/caching/CacheTTLController.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "velox/common/time/Timer.h" + +#include "folly/Synchronized.h" +#include "folly/container/F14Map.h" +#include "folly/container/F14Set.h" + +namespace facebook::velox::cache { + +class AsyncDataCache; + +struct RawFileInfo { + int64_t openTimeSec; + bool removeInProgress; + + bool operator==(const RawFileInfo& other) { + return openTimeSec == other.openTimeSec && + removeInProgress == other.removeInProgress; + } +}; + +struct CacheAgeStats { + // Age in seconds of the oldest opened file loaded to the caches. + int64_t maxAgeSecs{0}; +}; + +/// A process-wide singleton to handle TTL of AsyncDataCache and SsdCache. +class CacheTTLController { + public: + /// Create and return a singleton instance of CacheTTLController. + static CacheTTLController* create(AsyncDataCache& cache) { + if (instance_ == nullptr) { + instance_ = + std::unique_ptr(new CacheTTLController(cache)); + } + return instance_.get(); + } + + /// Return the process-wide singleton instance of CacheTTLController if it has + /// been created. Otherwise, return nullptr. + static CacheTTLController* getInstance() { + if (instance_ == nullptr) { + return nullptr; + } + return instance_.get(); + } + + static void testingClear() { + instance_ = nullptr; + } + + /// Add file opening info for a file identified by fileNum. Return true if a + /// new file entry is inserted, or if the existing file entry is updated + /// while cache deletion for the file is in progress. Return false otherwise + /// if the existing file entry is not updated. + bool addOpenFileInfo( + uint64_t fileNum, + int64_t openTimeSec = getCurrentTimeSec()); + + /// Compute age related stats of the cached files. + CacheAgeStats getCacheAgeStats() const; + + void applyTTL(int64_t ttlSecs); + + private: + /// A process-wide singleton instance of CacheTTLController. + static std::unique_ptr instance_; + + private: + // Prevent creating a random instance of CacheTTLController. + explicit CacheTTLController(AsyncDataCache& cache) : cache_(cache) {} + + folly::F14FastSet getAndMarkAgedOutFiles(int64_t maxOpenTimeSecs); + + /// Clean up file entries with removeInProgress true but keep entries for + /// fileNums in filesToRetain. + void cleanUp(const folly::F14FastSet& filesToRetain); + + void reset(); + + AsyncDataCache& cache_; + + /// A Map of fileNum to RawFileInfo. + folly::Synchronized> fileInfoMap_; +}; + +} // namespace facebook::velox::cache diff --git a/velox/common/caching/CachedFactory.h b/velox/common/caching/CachedFactory.h index 0e02627d2b6bc..22d266ec6235e 100644 --- a/velox/common/caching/CachedFactory.h +++ b/velox/common/caching/CachedFactory.h @@ -31,6 +31,7 @@ #include #include +#include #include #include "folly/container/F14Set.h" @@ -39,75 +40,226 @@ namespace facebook::velox { -// CachedFactory provides a thread-safe way of backing a keyed generator -// (e.g. the key is filename, and the value is the file data) by a cache. -// -// Generator should take a single Key argument and return a Value; -// The Value should be either a value type or should manage its own lifecycle -// (shared_ptr). 
If it is not thread-safe it must do its own internal locking. -template +/// A smart pointer that represents data that may be in a cache and is thus not +/// owned, or is owned like a unique_ptr. We could also implement this by a +/// unique_ptr with a custom deleter. +template < + typename Key, + typename Value, + typename Comparator = std::equal_to, + typename Hash = std::hash> +class CachedPtr { + public: + /// Nullptr case. + CachedPtr(); + + /// Data is not in cache, ownership taken by *this. + explicit CachedPtr(Value* value); + + /// Data is in the provided cache referenced by the given key. The cache is + /// not guarded by a mutex. + CachedPtr( + bool cached, + Value* value, + SimpleLRUCache* cache, + std::unique_ptr key); + + /// Same as above, but the cache is guarded by a mutex. + CachedPtr( + bool cached, + Value* value, + SimpleLRUCache* cache, + std::unique_ptr key, + std::mutex* cacheMu); + + /// The destructor handles the in-cache and non-in-cache cases appropriately. + ~CachedPtr(); + + /// Move allowed, copy disallowed. Moving a new value into a non-null + /// CachedPtr will clear the previous value. + CachedPtr(CachedPtr&&); + CachedPtr& operator=(CachedPtr&&); + CachedPtr(const CachedPtr&) = delete; + CachedPtr& operator=(const CachedPtr&) = delete; + + /// Whether this value is load from cache. If we had to wait for a generation + /// (whether the actual generation was done in this thread or another) then + /// this is false. Has no effect on this behavior, but may be useful for + /// monitoring cache hit rates/etc. + bool fromCache() const { + return fromCache_; + } + + /// Indicates if this value is cached or not. + bool cached() const { + return cache_ != nullptr; + } + + Value* operator->() const { + return value_; + } + Value& operator*() const { + return *value_; + } + Value* get() const { + return value_; + } + + void testingClear() { + clear(); + key_.reset(); + value_ = nullptr; + cache_ = nullptr; + cacheMu_ = nullptr; + } + + private: + // Delete or release owned value. + void clear(); + + bool fromCache_; + std::unique_ptr key_; + Value* value_; + std::mutex* cacheMu_; + // If 'value_' is in cache, 'cache_' and 'key_' will be non-null, and + // 'cacheMu_' may be non-null. If cacheMu_ is non-null, we use it to protect + // our operations to 'cache_'. + SimpleLRUCache* cache_; +}; + +template +struct DefaultSizer { + int64_t operator()(const Value& value) const { + return 1; + } +}; + +/// CachedFactory provides a thread-safe way of backing a keyed generator +/// (e.g. the key is filename, and the value is the file data) by a cache. +/// +/// Generator should take a single Key argument and return a unique_ptr; +/// If it is not thread-safe it must do its own internal locking. +/// Sizer takes a Value and returns how much cache space it will occupy. The +/// DefaultSizer says each value occupies 1 space. +template < + typename Key, + typename Value, + typename Generator, + typename Properties = void, + typename Sizer = DefaultSizer, + typename Comparator = std::equal_to, + typename Hash = std::hash> class CachedFactory { public: - // It is generally expected that most inserts into the cache will succeed, - // i.e. the cache is large compared to the size of the elements and the number - // of elements that are pinned. Everything should still work if this is not - // true, but performance will suffer. + /// It is generally expected that most inserts into the cache will succeed, + /// i.e. 
+/// CachedFactory provides a thread-safe way of backing a keyed generator
+/// (e.g. the key is filename, and the value is the file data) by a cache.
+///
+/// Generator should take a single Key argument and return a unique_ptr<Value>;
+/// If it is not thread-safe it must do its own internal locking.
+/// Sizer takes a Value and returns how much cache space it will occupy. The
+/// DefaultSizer says each value occupies 1 space.
+template <
+    typename Key,
+    typename Value,
+    typename Generator,
+    typename Properties = void,
+    typename Sizer = DefaultSizer<Value>,
+    typename Comparator = std::equal_to<Key>,
+    typename Hash = std::hash<Key>>
 class CachedFactory {
  public:
-  // It is generally expected that most inserts into the cache will succeed,
-  // i.e. the cache is large compared to the size of the elements and the number
-  // of elements that are pinned. Everything should still work if this is not
-  // true, but performance will suffer.
+  /// It is generally expected that most inserts into the cache will succeed,
+  /// i.e. the cache is large compared to the size of the elements and the
+  /// number of elements that are pinned. Everything should still work if this
+  /// is not true, but performance will suffer. If 'cache' is nullptr, the
+  /// cache is disabled and 'generator' is invoked directly in the 'generate'
+  /// function.
   CachedFactory(
-      std::unique_ptr<SimpleLRUCache<Key, Value>> cache,
+      std::unique_ptr<SimpleLRUCache<Key, Value, Comparator, Hash>> cache,
       std::unique_ptr<Generator> generator)
-      : cache_(std::move(cache)), generator_(std::move(generator)) {}
-
-  // Returns the generator's output on the given key. If the output is
-  // in the cache, returns immediately. Otherwise, blocks until the output
-  // is ready.
-  // The function returns a pair. The boolean in the pair indicates whether a
-  // cache hit or miss. The Value is the generator output for the key if cache
-  // miss, or Value in the cache if cache hit.
-  std::pair<bool, Value> generate(const Key& key);
-
-  // Advanced function taking in a group of keys. Separates those keys into
-  // one's present in the cache (returning CachedPtrs for them) and those not
-  // in the cache. Does NOT call the Generator for any key.
+      : generator_(std::move(generator)), cache_(std::move(cache)) {}
+
+  CachedFactory(std::unique_ptr<Generator> generator)
+      : CachedFactory(nullptr, std::move(generator)) {}
+
+  /// Returns the generator's output on the given key. If the output is in the
+  /// cache, returns immediately. Otherwise, blocks until the output is ready.
+  /// For a given key we will only ever be running the Generator function once.
+  /// E.g., if N threads ask for the same key at once, the generator will be
+  /// fired once and all N will receive a pointer from the cache.
+  ///
+  /// Actually the last sentence is not quite true in the edge case where
+  /// inserts into the cache fail; in that case we will re-run the generator
+  /// repeatedly, handing off the results to one thread at a time until all
+  /// pending requests are satisfied or a cache insert succeeds. This will
+  /// probably mess with your memory model, so really try to avoid it.
+  CachedPtr<Key, Value, Comparator, Hash> generate(
+      const Key& key,
+      const Properties* properties = nullptr);
+
+  /// Looks up the cache entry of the given key if it exists, otherwise returns
+  /// null.
+  CachedPtr<Key, Value, Comparator, Hash> get(const Key& key);
+
+  /// Advanced function taking in a group of keys. Separates those keys into
+  /// ones present in the cache (returning CachedPtrs for them) and those not
+  /// in the cache. Does NOT call the Generator for any key.
   void retrieveCached(
       const std::vector<Key>& keys,
-      std::vector<std::pair<Key, Value>>* cached,
-      std::vector<Key>* missing);
+      std::vector<std::pair<Key, CachedPtr<Key, Value, Comparator, Hash>>>&
+          cached,
+      std::vector<Key>& missing);
 
-  // Total size of elements cached (NOT the maximum size/limit).
+  /// Total size of elements cached (NOT the maximum size/limit).
   int64_t currentSize() const {
+    if (cache_ == nullptr) {
+      return 0;
+    }
     return cache_->currentSize();
   }
 
-  // The maximum size of the underlying cache.
+  /// The maximum size of the underlying cache.
   int64_t maxSize() const {
+    if (cache_ == nullptr) {
+      return 0;
+    }
     return cache_->maxSize();
   }
 
   SimpleLRUCacheStats cacheStats() {
+    if (cache_ == nullptr) {
+      return {};
+    }
     std::lock_guard<std::mutex> l(cacheMu_);
-    return cache_->getStats();
+    return cache_->stats();
   }
 
-  // Clear the cache and return the current cache status
+  // Clears the cache and returns the resulting cache status.
   SimpleLRUCacheStats clearCache() {
+    if (cache_ == nullptr) {
+      return {};
+    }
     std::lock_guard<std::mutex> l(cacheMu_);
-    cache_->clear();
-    return cache_->getStats();
+    cache_->free(cache_->maxSize());
+    return cache_->stats();
   }
 
-  // Move allowed, copy disallowed.
+  /// Move allowed, copy disallowed.
CachedFactory(CachedFactory&&) = default; CachedFactory& operator=(CachedFactory&&) = default; CachedFactory(const CachedFactory&) = delete; CachedFactory& operator=(const CachedFactory&) = delete; private: - std::unique_ptr> cache_; + void removePending(const Key& key) { + std::lock_guard pendingLock(pendingMu_); + pending_.erase(key); + } + + bool addCache(const Key& key, Value* value, int64_t size) { + std::lock_guard cacheLock(cacheMu_); + return cache_->addPinned(key, value, size); + } + + Value* getCache(const Key& key) { + std::lock_guard cacheLock(cacheMu_); + return getCacheLocked(key); + } + + Value* getCacheLocked(const Key& key) { + return cache_->get(key); + } + std::unique_ptr generator_; - folly::F14FastSet pending_; std::mutex cacheMu_; + std::unique_ptr> cache_; + std::mutex pendingMu_; + folly::F14FastSet pending_; std::condition_variable pendingCv_; }; @@ -115,74 +267,227 @@ class CachedFactory { // End of public API. Implementation follows. // -template -std::pair CachedFactory::generate( - const Key& key) { +template +CachedPtr::CachedPtr() + : fromCache_(false), + key_(nullptr), + value_(nullptr), + cacheMu_(nullptr), + cache_(nullptr) {} + +template +CachedPtr::CachedPtr(Value* value) + : fromCache_(false), + key_(nullptr), + value_(value), + cacheMu_(nullptr), + cache_(nullptr) {} + +template +CachedPtr::CachedPtr( + bool cached, + Value* value, + SimpleLRUCache* cache, + std::unique_ptr key) + : fromCache_(cached), + key_(std::move(key)), + value_(value), + cacheMu_(nullptr), + cache_(cache) {} + +template +CachedPtr::CachedPtr( + bool cached, + Value* value, + SimpleLRUCache* cache, + std::unique_ptr key, + std::mutex* cacheMu) + : fromCache_(cached), + key_(std::move(key)), + value_(value), + cacheMu_(cacheMu), + cache_(cache) {} + +template +CachedPtr::~CachedPtr() { + clear(); +} + +template +CachedPtr::CachedPtr(CachedPtr&& other) { + fromCache_ = other.fromCache_; + value_ = other.value_; + key_ = std::move(other.key_); + cache_ = other.cache_; + cacheMu_ = other.cacheMu_; + other.value_ = nullptr; +} + +template +CachedPtr& +CachedPtr::operator=(CachedPtr&& other) { + clear(); + fromCache_ = other.fromCache_; + value_ = other.value_; + key_ = std::move(other.key_); + cache_ = other.cache_; + cacheMu_ = other.cacheMu_; + other.value_ = nullptr; + return *this; +} + +template +void CachedPtr::clear() { + if (value_ == nullptr) { + return; + } + if (cache_ == nullptr) { + delete value_; + return; + } + if (cacheMu_ != nullptr) { + std::lock_guard l(*cacheMu_); + cache_->release(*key_); + } else { + cache_->release(*key_); + } +} + +template < + typename Key, + typename Value, + typename Generator, + typename Properties, + typename Sizer, + typename Comparator, + typename Hash> +CachedPtr +CachedFactory:: + generate(const Key& key, const Properties* properties) { process::TraceContext trace("CachedFactory::generate"); - std::unique_lock pending_lock(pendingMu_); + if (cache_ == nullptr) { + return CachedPtr{ + /*fromCache=*/false, + (*generator_)(key, properties).release(), + nullptr, + std::make_unique(key)}; + } + + std::unique_lock pendingLock(pendingMu_); { - std::lock_guard cache_lock(cacheMu_); - auto value = cache_->get(key); - if (value) { - return std::make_pair(true, value.value()); + if (Value* value = getCache(key)) { + return CachedPtr( + /*fromCache=*/true, + value, + cache_.get(), + std::make_unique(key), + &cacheMu_); } } - if (pending_.contains(key)) { - pendingCv_.wait(pending_lock, [&]() { return !pending_.contains(key); }); + 
pendingCv_.wait(pendingLock, [&]() { return !pending_.contains(key); }); // Will normally hit the cache now. - { - std::lock_guard cache_lock(cacheMu_); - auto value = cache_->get(key); - if (value) { - return std::make_pair(true, value.value()); - } - } - pending_lock.unlock(); - return generate(key); // Regenerate in the edge case. - } else { - pending_.insert(key); - pending_lock.unlock(); - Value generatedValue; - // TODO: consider using folly/ScopeGuard here. - try { - generatedValue = (*generator_)(key); - } catch (const std::exception& e) { - { - std::lock_guard pending_lock(pendingMu_); - pending_.erase(key); - } - pendingCv_.notify_all(); - throw; - } - cacheMu_.lock(); - cache_->add(key, generatedValue); - cacheMu_.unlock(); - - // TODO: this code is exception unsafe and can leave pending_ in an - // inconsistent state. Eventually this code should move to - // folly:synchronized and rewritten with better primitives. - { - std::lock_guard pending_lock(pendingMu_); - pending_.erase(key); + if (Value* value = getCache(key)) { + return CachedPtr( + /*fromCache=*/false, + value, + cache_.get(), + std::make_unique(key), + &cacheMu_); } + pendingLock.unlock(); + // Regenerates in the edge case. + return generate(key, properties); + } + + pending_.insert(key); + pendingLock.unlock(); + + SCOPE_EXIT { + removePending(key); pendingCv_.notify_all(); - return std::make_pair(false, generatedValue); + }; + + std::unique_ptr generatedValue = (*generator_)(key, properties); + const uint64_t valueSize = Sizer()(*generatedValue); + Value* rawValue = generatedValue.release(); + const bool inserted = addCache(key, rawValue, valueSize); + + CachedPtr result; + if (inserted) { + result = CachedPtr( + /*fromCache=*/false, + rawValue, + cache_.get(), + std::make_unique(key), + &cacheMu_); + } else { + FB_LOG_EVERY_MS(WARNING, 60'000) << "Unable to insert into cache!"; + result = CachedPtr(rawValue); } + return result; } -template -void CachedFactory::retrieveCached( - const std::vector& keys, - std::vector>* cached, - std::vector* missing) { - std::lock_guard cache_lock(cacheMu_); +template < + typename Key, + typename Value, + typename Generator, + typename Properties, + typename Sizer, + typename Comparator, + typename Hash> +CachedPtr +CachedFactory::get( + const Key& key) { + if (cache_ == nullptr) { + return {}; + } + std::lock_guard l(cacheMu_); + Value* value = getCacheLocked(key); + if (value == nullptr) { + return {}; + } + return CachedPtr( + /*fromCache=*/true, + value, + cache_.get(), + std::make_unique(key), + &cacheMu_); +} + +template < + typename Key, + typename Value, + typename Generator, + typename Properties, + typename Sizer, + typename Comparator, + typename Hash> +void CachedFactory:: + retrieveCached( + const std::vector& keys, + std::vector>>& + cached, + std::vector& missing) { + if (cache_ == nullptr) { + missing.insert(missing.end(), keys.begin(), keys.end()); + return; + } + + std::lock_guard l(cacheMu_); for (const Key& key : keys) { - auto value = cache_->get(key); - if (value) { - cached->emplace_back(key, value.value()); + Value* value = getCacheLocked(key); + if (value != nullptr) { + cached.emplace_back( + key, + CachedPtr( + /*fromCache=*/true, + value, + cache_.get(), + std::make_unique(key), + &cacheMu_)); } else { - missing->push_back(key); + missing.push_back(key); } } } diff --git a/velox/common/caching/ScanTracker.cpp b/velox/common/caching/ScanTracker.cpp index 3d9410c951101..9bae234d87b0d 100644 --- a/velox/common/caching/ScanTracker.cpp +++ 
b/velox/common/caching/ScanTracker.cpp @@ -21,8 +21,8 @@ namespace facebook::velox::cache { -// Marks that 'bytes' worth of data may be accessed in the -// future. See TrackingData for meaning of quantum. +// Marks that 'bytes' worth of data may be accessed in the future. See +// TrackingData for meaning of quantum. void ScanTracker::recordReference( const TrackingId id, uint64_t bytes, @@ -52,8 +52,9 @@ void ScanTracker::recordRead( std::string ScanTracker::toString() const { std::stringstream out; out << "ScanTracker for " << id_ << std::endl; - for (auto& pair : data_) { - int pct = 100 * pair.second.readBytes / (1 + pair.second.referencedBytes); + for (const auto& pair : data_) { + const int pct = + 100 * pair.second.readBytes / (1 + pair.second.referencedBytes); out << pair.first.id() << ": " << pct << "% " << pair.second.readBytes << "/" << pair.second.numReads << std::endl; } diff --git a/velox/common/caching/ScanTracker.h b/velox/common/caching/ScanTracker.h index 8103339f76655..4281175e2b8f1 100644 --- a/velox/common/caching/ScanTracker.h +++ b/velox/common/caching/ScanTracker.h @@ -25,11 +25,11 @@ namespace facebook::velox::cache { -// Represents a stream in a table, e.g. nulls/lengths/data of a -// particular column. Column-level access tracking uses this to -// identify the column within a file or partition. The low 5 bits are -// the stream kind, e.g. nulls, data etc. The high 27 bits are the node -// number in the file schema tree, i.e. the column. +/// Represents a stream in a table, e.g. nulls/lengths/data of a particular +/// column. Column-level access tracking uses this to identify the column within +/// a file or partition. The low 5 bits are the stream kind, e.g. nulls, data +/// etc. The high 27 bits are the node number in the file schema tree, i.e. the +/// column. class TrackingId { public: TrackingId() : id_(-1) {} @@ -71,22 +71,22 @@ namespace facebook::velox::cache { class FileGroupStats; -// Records references and actual uses of a stream. +/// Records references and actual uses of a stream. struct TrackingData { int64_t referencedBytes{}; int64_t readBytes{}; int32_t numReferences{}; int32_t numReads{}; - // Marks that 'bytes' worth of data in the tracked object has been - // referenced and may later be accessed. If 'bytes' is larger than a single - // 'oadQuantum', the reference counts for as many accesses as are needed to - // cover 'bytes'. When reading a large object, we will get a read per quantum. - // So then if the referenced and read counts match, we know that the object is - // densely read. + /// Marks that 'bytes' worth of data in the tracked object has been referenced + /// and may later be accessed. If 'bytes' is larger than a single + /// 'loadQuantum', the reference counts for as many accesses as are needed to + /// cover 'bytes'. When reading a large object, we will get a read per + /// quantum. So then if the referenced and read counts match, we know that the + /// object is densely read. void incrementReference(uint64_t bytes, int32_t loadQuantum) { referencedBytes += bytes; - if (!loadQuantum) { + if (loadQuantum == 0) { ++numReferences; } else { numReferences += bits::roundUp(bytes, loadQuantum) / loadQuantum; @@ -99,29 +99,28 @@ struct TrackingData { } }; -// Tracks column access frequency during execution of a query. A -// ScanTracker is created at the level of a Task/TableScan, so that -// all threads of a scan report in the same tracker. The same -// ScanTracker tracks all reads of all partitions of the scan. 
The -// groupId argument identifies the file group (e.g. partition) a -// tracking event pertains to, since a single ScanTracker can range -// over multiple partitions. +/// Tracks column access frequency during execution of a query. A ScanTracker is +/// created at the level of a Task/TableScan, so that all threads of a scan +/// report in the same tracker. The same ScanTracker tracks all reads of all +/// partitions of the scan. The groupId argument identifies the file group (e.g. +/// partition) a tracking event pertains to, since a single ScanTracker can +/// range over multiple partitions. class ScanTracker { public: - ScanTracker() : loadQuantum_(1 /*not used*/) {} + ScanTracker() : ScanTracker({}, nullptr, 1) {} - // Constructs a tracker with 'id'. The tracker will be owned by - // shared_ptr and will be referenced from a map from id to weak_ptr - // to 'this'. 'unregisterer' is supplied so that the destructor can - // remove the weak_ptr from the map of pending trackers. 'loadQuantum' is the - // largest single IO size for read. + /// Constructs a tracker with 'id'. The tracker will be owned by shared_ptr + /// and will be referenced from a map from id to weak_ptr to 'this'. + /// 'unregisterer' is supplied so that the destructor can remove the weak_ptr + /// from the map of pending trackers. 'loadQuantum' is the largest single IO + /// size for read. ScanTracker( std::string_view id, - std::function unregisterer, + std::function unregisterer, int32_t loadQuantum, - FileGroupStats* FOLLY_NULLABLE fileGroupStats = nullptr) + FileGroupStats* fileGroupStats = nullptr) : id_(id), - unregisterer_(unregisterer), + unregisterer_(std::move(unregisterer)), loadQuantum_(loadQuantum), fileGroupStats_(fileGroupStats) {} @@ -131,33 +130,33 @@ class ScanTracker { } } - // Records that a scan references 'bytes' bytes of the stream given - // by 'id'. This is called when preparing to read a stripe. + /// Records that a scan references 'bytes' bytes of the stream given by 'id'. + /// This is called when preparing to read a stripe. void recordReference( const TrackingId id, uint64_t bytes, uint64_t fileId, uint64_t groupId); - // Records that 'bytes' bytes have actually been read from the stream - // given by 'id'. + /// Records that 'bytes' bytes have actually been read from the stream given + /// by 'id'. void recordRead( const TrackingId id, uint64_t bytes, uint64_t fileId, uint64_t groupId); - // True if 'trackingId' is read at least 'minReadPct' % of the time. + /// True if 'trackingId' is read at least 'minReadPct' % of the time. bool shouldPrefetch(TrackingId id, int32_t minReadPct) { return readPct(id) >= minReadPct; } - // Returns the percentage of referenced columns that are actually read. 100% - // if no data. + /// Returns the percentage of referenced columns that are actually read. 100% + /// if no data. int32_t readPct(TrackingId id) { std::lock_guard l(mutex_); const auto& data = data_[id]; - if (!data.numReferences) { + if (data.numReferences == 0) { return 100; } return (100 * data.numReads) / data.numReferences; @@ -172,25 +171,25 @@ class ScanTracker { return id_; } - FileGroupStats* FOLLY_NULLABLE fileGroupStats() const { + FileGroupStats* fileGroupStats() const { return fileGroupStats_; } std::string toString() const; private: - std::mutex mutex_; // Id of query + scan operator to track. const std::string id_; - std::function unregisterer_; + const std::function unregisterer_{nullptr}; + // Maximum size of a read. 10MB would count as two references if the quantum + // were 8MB. 
At the same time this would count as a single 10MB reference for + // 'fileGroupStats_'. 0 means the read size is unlimited. + const int32_t loadQuantum_; + FileGroupStats* const fileGroupStats_; + + std::mutex mutex_; folly::F14FastMap data_; TrackingData sum_; - // Maximum size of a read. 10MB would count as two references - // if the quantum were 8MB. At the same time this would count as a - // single 10MB reference for 'fileGroupStats_'. 0 means the read - // size is unlimited. - const int32_t loadQuantum_; - FileGroupStats* FOLLY_NULLABLE fileGroupStats_; }; } // namespace facebook::velox::cache diff --git a/velox/common/caching/SimpleLRUCache.h b/velox/common/caching/SimpleLRUCache.h index cd420995f6413..5812a30c99a91 100644 --- a/velox/common/caching/SimpleLRUCache.h +++ b/velox/common/caching/SimpleLRUCache.h @@ -18,147 +18,379 @@ #include #include #include -#include #include -#include "folly/container/EvictingCacheMap.h" +#include "folly/IntrusiveList.h" +#include "folly/container/F14Map.h" +#include "velox/common/base/Exceptions.h" +#include "velox/common/time/Timer.h" namespace facebook::velox { struct SimpleLRUCacheStats { SimpleLRUCacheStats( size_t _maxSize, + size_t _expireDurationMs, size_t _curSize, + size_t _pinnedSize, + size_t _numElements, size_t _numHits, size_t _numLookups) : maxSize{_maxSize}, + expireDurationMs(_expireDurationMs), curSize{_curSize}, + pinnedSize{_pinnedSize}, + numElements{_numElements}, numHits{_numHits}, - numLookups{_numLookups}, - numElements{curSize}, - pinnedSize{curSize} {} + numLookups{_numLookups} {} - // Capacity of the cache. - const size_t maxSize; + SimpleLRUCacheStats() = default; - // Current cache size used. - const size_t curSize; + /// Capacity of the cache. + size_t maxSize{0}; - // Total number of cache hits since server start. - const size_t numHits; + size_t expireDurationMs{0}; - // Total number of cache lookups since server start. - const size_t numLookups; + /// Current cache size used. + size_t curSize{0}; - // TODO: These 2 are unused, but open source Presto depends on them - // Remove the usage in open source presto and get rid of them. - const size_t numElements; - const size_t pinnedSize; + /// Current cache size used by pinned entries. + size_t pinnedSize{0}; + + /// Total number of elements in the cache. + size_t numElements{0}; + + /// Total number of cache hits since server start. + size_t numHits{0}; + + /// Total number of cache lookups since server start. + size_t numLookups{0}; std::string toString() const { return fmt::format( "{{\n" " maxSize: {}\n" + " expireDurationMs: {}\n" " curSize: {}\n" + " pinnedSize: {}\n" + " numElements: {}\n" " numHits: {}\n" " numLookups: {}\n" "}}\n", maxSize, + expireDurationMs, curSize, + pinnedSize, + numElements, numHits, numLookups); } - bool operator==(const SimpleLRUCacheStats& rhs) const { - return std::tie(curSize, maxSize, numHits, numLookups) == - std::tie(rhs.curSize, rhs.maxSize, rhs.numHits, rhs.numLookups); + bool operator==(const SimpleLRUCacheStats& other) const { + return std::tie( + curSize, + expireDurationMs, + maxSize, + pinnedSize, + numElements, + numHits, + numLookups) == + std::tie( + other.curSize, + other.expireDurationMs, + other.maxSize, + other.pinnedSize, + other.numElements, + other.numHits, + other.numLookups); } }; -/// A simple wrapper on top of the folly::EvictingCacheMap that tracks -/// hit/miss counters. Key/Value evicted are immediately destructed. -/// So the Key/Value should be a value type or self managing lifecycle -/// shared_ptr. 
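Since the replacement class declared below changes the ownership and pinning contract, here is a minimal sketch of the add/get/release protocol; the int payload and the sizes are invented for illustration:

```cpp
// Hypothetical use of the rewritten SimpleLRUCache declared below. The cache
// owns heap-allocated values; get() pins an entry until release().
#include <string>
#include "velox/common/caching/SimpleLRUCache.h"

facebook::velox::SimpleLRUCache<std::string, int> cache(/*maxSize=*/16);

void simpleLruExample() {
  // On success the cache takes ownership of the pointer; this entry is
  // declared to occupy 1 unit of the 16-unit capacity.
  if (cache.add("answer", new int(42), /*size=*/1)) {
    if (int* v = cache.get("answer")) { // Pins the entry.
      // ... use *v. The class is not thread-safe: callers lock externally.
      cache.release("answer"); // Required after every non-null get().
    }
  }
}
```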
+/// A simple LRU cache that allows each element to occupy an arbitrary amount
+/// of space in the cache. Useful when the size of the cached elements can vary
+/// a lot; if they are all roughly the same size something that only tracks the
+/// number of elements in the cache like common/datastruct/LRUCacheMap.h may be
+/// better.
 ///
 /// NOTE:
-/// 1. NOT Thread-Safe: All the public calls modify internal structures
-/// and hence require external write locks if used from multiple threads.
-template <typename Key, typename Value>
+/// 1. NOT Thread-Safe: All the public calls modify internal structures and
+/// hence require external write locks if used from multiple threads.
+/// 2. 'Key' is required to be copyable and movable.
+template <
+    typename Key,
+    typename Value,
+    typename Comparator = std::equal_to<Key>,
+    typename Hash = std::hash<Key>>
 class SimpleLRUCache {
  public:
-  /// Constructs a cache of the specified size. The maxSize represents the
-  /// number of entries in the cache. clearSize represents the number of entries
-  /// to evict in a given time, when the cache is full.
-  explicit SimpleLRUCache(size_t maxSize, size_t clearSize = 1);
+  /// Constructs a cache of the specified size. This size can represent
+  /// whatever you want -- slots, bytes, etc.; you provide the size of each
+  /// element whenever you add a new value to the cache. If 'expireDurationMs'
+  /// is not zero, then a cache value will be evicted from the cache once
+  /// 'expireDurationMs' has passed since its insertion, whether or not it has
+  /// been accessed.
+  explicit SimpleLRUCache(size_t maxSize, size_t expireDurationMs = 0);
+
+  /// Frees all owned data. Check-fails if any element remains pinned.
+  ~SimpleLRUCache();
+
+  /// Adds a key-value pair that will occupy the provided size, evicting
+  /// older elements repeatedly until enough room is available in the cache.
+  /// Returns whether insertion succeeded. If it did, the cache takes
+  /// ownership of |value|. Insertion will fail in two cases:
+  /// 1) There isn't enough room in the cache even after all unpinned
+  ///    elements are freed.
+  /// 2) The key you are adding is already present in the cache. In
+  ///    this case the element currently existing in the cache remains
+  ///    totally unchanged.
+  ///
+  /// If you use size to represent in-memory size, keep in mind that the
+  /// total space used per entry is roughly 2 * key_size + value_size + 30 bytes
+  /// (nonexact because we use a hash map internally, so the ratio of reserved
+  /// slots to used slots will vary).
+  bool add(Key key, Value* value, size_t size);
 
-  /// Add an item to the cache. Returns true if the item is successfully
-  /// added, false otherwise.
-  bool add(const Key& key, const Value& value);
+  /// Same as add(), but the value starts pinned. Saves a map lookup if you
+  /// would otherwise do add() then get(). Keep in mind that if insertion
+  /// fails the key's pin count has NOT been incremented.
+  bool addPinned(Key key, Value* value, size_t size);
 
-  /// Gets value associated with key.
-  /// returns std::nullopt when the key is missing
-  /// returns the cached value, when the key is present.
-  std::optional<Value> get(const Key& key);
+  /// Gets an unowned pointer to the value associated with key.
+  /// Returns nullptr if the key is not present in the cache.
+  /// Once you are done using the returned non-null *value, you must call
+  /// release with the same key you passed to get.
+  ///
+  /// The returned pointer is guaranteed to remain valid until release
+  /// is called.
+  ///
+  /// Note that we return a non-const pointer, and multiple callers
+  /// can lease the same object, so if you're mutating it you need
+  /// to manage your own locking.
+  Value* get(const Key& key);
 
-  void clear();
+  /// Unpins a key. You MUST call release on every key you have
+  /// get'd once you are done using the value or bad things will
+  /// happen (namely, memory leaks).
+  void release(const Key& key);
 
   /// Total size of elements in the cache (NOT the maximum size/limit).
   size_t currentSize() const {
-    return lru_.size();
+    return curSize_;
   }
 
   /// The maximum size of the cache.
   size_t maxSize() const {
-    return lru_.getMaxSize();
+    return maxSize_;
   }
 
-  SimpleLRUCacheStats getStats() const {
+  SimpleLRUCacheStats stats() const {
     return {
-        lru_.getMaxSize(),
-        lru_.size(),
+        maxSize_,
+        expireDurationMs_,
+        curSize_,
+        pinnedSize_,
+        lruList_.size(),
         numHits_,
         numLookups_,
     };
   }
 
+  /// Removes unpinned elements until at least size space is freed. Returns
+  /// the size actually freed, which may be less than requested if the
+  /// remaining are all pinned.
+  size_t free(size_t size);
+
  private:
+  struct Element {
+    Key key;
+    Value* value;
+    size_t size;
+    uint32_t numPins;
+    size_t expireTimeMs;
+    folly::IntrusiveListHook lruEntry;
+    folly::IntrusiveListHook expireEntry;
+  };
+  using LruList = folly::IntrusiveList<Element, &Element::lruEntry>;
+  using ExpireList = folly::IntrusiveList<Element, &Element::expireEntry>;
+
+  bool addInternal(Key key, Value* value, size_t size, bool pinned);
+
+  // Removes the expired and unpinned cache entries from the cache. The
+  // function is invoked upon cache lookup, cache insertion and cache entry
+  // release.
+  void removeExpiredEntries();
+
+  // Removes entry 'e' from the cache by unlinking it from 'lruList_' and
+  // 'expireList_', and destroys the object at the end.
+  size_t freeEntry(Element* e);
+
+  const size_t maxSize_;
+  const size_t expireDurationMs_;
+  size_t curSize_{0};
+  size_t pinnedSize_{0};
   size_t numHits_{0};
   size_t numLookups_{0};
 
-  folly::EvictingCacheMap<Key, Value> lru_;
+  // Elements get newer as we evict from lruList_.begin() to lruList_.end().
+  LruList lruList_;
+  ExpireList expireList_;
+  folly::F14FastMap<Key, Element*, Hash, Comparator> keys_;
 };
 
-//
-// End of public API. Imlementation follows.
-//
-
-template <typename Key, typename Value>
-inline SimpleLRUCache<Key, Value>::SimpleLRUCache(
+///
+/// End of public API. Implementation follows.
+///
+template <typename Key, typename Value, typename Comparator, typename Hash>
+inline SimpleLRUCache<Key, Value, Comparator, Hash>::SimpleLRUCache(
     size_t maxSize,
-    size_t clearSize)
-    : lru_(maxSize, clearSize) {}
-
-template <typename Key, typename Value>
-inline bool SimpleLRUCache<Key, Value>::add(
-    const Key& key,
-    const Value& value) {
-  return lru_.insert(key, value).second;
+    size_t expireDurationMs)
+    : maxSize_(maxSize), expireDurationMs_(expireDurationMs) {}
+
+template <typename Key, typename Value, typename Comparator, typename Hash>
+inline SimpleLRUCache<Key, Value, Comparator, Hash>::~SimpleLRUCache() {
+  VELOX_CHECK_EQ(pinnedSize_, 0);
+  // We could be more optimal than calling free here, but in
+  // general this destructor will never get called during normal
+  // usage so we don't bother.
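+  // With pinnedSize_ == 0 (checked above), freeing up to the full capacity
+  // must drop every remaining entry; the checks below verify the bookkeeping.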
+ free(maxSize_); + VELOX_CHECK(lruList_.empty()); + VELOX_CHECK(expireList_.empty()); + VELOX_CHECK(keys_.empty()); + VELOX_CHECK_EQ(curSize_, 0); } -template -inline std::optional SimpleLRUCache::get(const Key& key) { - ++numLookups_; - auto it = lru_.find(key); - if (it == lru_.end()) { - return std::nullopt; +template +inline bool SimpleLRUCache::add( + Key key, + Value* value, + size_t size) { + return addInternal(key, value, size, /*pinned=*/false); +} + +template +inline bool SimpleLRUCache::addPinned( + Key key, + Value* value, + size_t size) { + return addInternal(key, value, size, /*pinned=*/true); +} + +template +inline void +SimpleLRUCache::removeExpiredEntries() { + if (expireDurationMs_ == 0) { + return; + } + const auto currentTimeMs = getCurrentTimeMs(); + auto it = expireList_.begin(); + while (it != expireList_.end()) { + if (it->expireTimeMs > currentTimeMs) { + return; + } + if (it->numPins > 0) { + ++it; + continue; + } + Element* expiredEntry = &*it; + it = expireList_.erase(it); + freeEntry(expiredEntry); } +} + +template +inline bool SimpleLRUCache::addInternal( + Key key, + Value* value, + size_t size, + bool pinned) { + removeExpiredEntries(); + + if (keys_.find(key) != keys_.end()) { + return false; + } + if (pinnedSize_ + size > maxSize_) { + return false; + } + const int64_t spaceNeeded = curSize_ + size - maxSize_; + if (spaceNeeded > 0) { + free(spaceNeeded); + } + + Element* e = new Element; + e->key = std::move(key); + e->value = value; + e->size = size; + e->numPins = !!pinned; + if (pinned) { + pinnedSize_ += size; + } + keys_.emplace(e->key, e); + lruList_.push_back(*e); + if (expireDurationMs_ != 0) { + e->expireTimeMs = getCurrentTimeMs() + expireDurationMs_; + expireList_.push_back(*e); + } + curSize_ += size; + return true; +} +template +inline Value* SimpleLRUCache::get( + const Key& key) { + removeExpiredEntries(); + + ++numLookups_; + auto it = keys_.find(key); + if (it == keys_.end()) { + return nullptr; + } + Element* entry = it->second; + if (entry->numPins++ == 0) { + pinnedSize_ += entry->size; + } + VELOX_DCHECK(entry->lruEntry.is_linked()); + entry->lruEntry.unlink(); + lruList_.push_back(*entry); ++numHits_; - return it->second; + return it->second->value; +} + +template +inline void SimpleLRUCache::release( + const Key& key) { + Element* e = keys_[key]; + if (--e->numPins == 0) { + pinnedSize_ -= e->size; + } + removeExpiredEntries(); +} + +template +inline size_t SimpleLRUCache::free(size_t size) { + auto it = lruList_.begin(); + size_t freed = 0; + while (it != lruList_.end() && freed < size) { + if (it->numPins == 0) { + Element* evictedEntry = &*it; + it = lruList_.erase(it); + freed += freeEntry(evictedEntry); + } else { + ++it; + } + } + return freed; } -template -inline void SimpleLRUCache::clear() { - lru_.clear(); +template +inline size_t SimpleLRUCache::freeEntry( + Element* e) { + VELOX_CHECK_EQ(e->numPins, 0); + // NOTE: the list hook dtor will unlink the entry from list so we don't need + // to explicitly unlink here. 
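+  // Read 'size' and 'key' out of 'e' before the deletes below invalidate it;
+  // the cache owns 'e->value', so it is deleted together with the element.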
+ const auto freedSize = e->size; + curSize_ -= freedSize; + keys_.erase(e->key); + delete e->value; + delete e; + return freedSize; } } // namespace facebook::velox diff --git a/velox/common/caching/SsdCache.cpp b/velox/common/caching/SsdCache.cpp index 16ebf0f539c89..29b44e8abd4b9 100644 --- a/velox/common/caching/SsdCache.cpp +++ b/velox/common/caching/SsdCache.cpp @@ -16,47 +16,60 @@ #include "velox/common/caching/SsdCache.h" #include #include +#include "velox/common/base/Exceptions.h" #include "velox/common/caching/FileIds.h" #include "velox/common/file/FileSystems.h" +#include "velox/common/testutil/TestValue.h" #include "velox/common/time/Timer.h" #include #include +using facebook::velox::common::testutil::TestValue; + namespace facebook::velox::cache { -SsdCache::SsdCache( - std::string_view filePrefix, - uint64_t maxBytes, - int32_t numShards, - folly::Executor* executor, - int64_t checkpointIntervalBytes, - bool disableFileCow) - : filePrefix_(filePrefix), - numShards_(numShards), +SsdCache::SsdCache(const Config& config) + : filePrefix_(config.filePrefix), + numShards_(config.numShards), groupStats_(std::make_unique()), - executor_(executor) { + executor_(config.executor) { // Make sure the given path of Ssd files has the prefix for local file system. // Local file system would be derived based on the prefix. VELOX_CHECK( - filePrefix_.find("/") == 0, + filePrefix_.find('/') == 0, "Ssd path '{}' does not start with '/' that points to local file system.", filePrefix_); + VELOX_CHECK_NOT_NULL(executor_); + + VELOX_SSD_CACHE_LOG(INFO) << "SSD cache config: " << config.toString(); + + auto checksumReadVerificationEnabled = config.checksumReadVerificationEnabled; + if (config.checksumReadVerificationEnabled && !config.checksumEnabled) { + VELOX_SSD_CACHE_LOG(WARNING) + << "Checksum read has been disabled as checksum is not enabled."; + checksumReadVerificationEnabled = false; + } filesystems::getFileSystem(filePrefix_, nullptr) - ->mkdir(std::filesystem::path(filePrefix).parent_path().string()); + ->mkdir(std::filesystem::path(filePrefix_).parent_path().string()); files_.reserve(numShards_); // Cache size must be a multiple of this so that each shard has the same max // size. - uint64_t sizeQuantum = numShards_ * SsdFile::kRegionSize; - int32_t fileMaxRegions = bits::roundUp(maxBytes, sizeQuantum) / sizeQuantum; + const uint64_t sizeQuantum = numShards_ * SsdFile::kRegionSize; + const int32_t fileMaxRegions = + bits::roundUp(config.maxBytes, sizeQuantum) / sizeQuantum; for (auto i = 0; i < numShards_; ++i) { - files_.push_back(std::make_unique( + const auto fileConfig = SsdFile::Config( fmt::format("{}{}", filePrefix_, i), i, fileMaxRegions, - checkpointIntervalBytes / numShards, - disableFileCow)); + config.checkpointIntervalBytes / config.numShards, + config.disableFileCow, + config.checksumEnabled, + checksumReadVerificationEnabled, + executor_); + files_.push_back(std::make_unique(fileConfig)); } } @@ -66,20 +79,21 @@ SsdFile& SsdCache::file(uint64_t fileId) { } bool SsdCache::startWrite() { - if (isShutdown_) { - return false; - } - if (writesInProgress_.fetch_add(numShards_) == 0) { + std::lock_guard l(mutex_); + checkNotShutdownLocked(); + if (writesInProgress_ == 0) { // No write was pending, so now all shards are counted as writing. + writesInProgress_ += numShards_; return true; } - // There were writes in progress, so compensate for the increment. 
- writesInProgress_.fetch_sub(numShards_); + VELOX_CHECK_GE(writesInProgress_, 0); return false; } void SsdCache::write(std::vector pins) { - VELOX_CHECK_LE(numShards_, writesInProgress_); + VELOX_CHECK_EQ(numShards_, writesInProgress_); + + TestValue::adjust("facebook::velox::cache::SsdCache::write", this); const auto startTimeUs = getCurrentTimeMicro(); @@ -97,6 +111,7 @@ void SsdCache::write(std::vector pins) { ++numNoStore; continue; } + struct PinHolder { std::vector pins; @@ -104,8 +119,8 @@ void SsdCache::write(std::vector pins) { : pins(std::move(_pins)) {} }; - // We move the mutable vector of pins to the executor. These must - // be wrapped in a shared struct to be passed via lambda capture. + // We move the mutable vector of pins to the executor. These must be wrapped + // in a shared struct to be passed via lambda capture. auto pinHolder = std::make_shared(std::move(shards[i])); executor_->add([this, i, pinHolder, bytes, startTimeUs]() { try { @@ -116,12 +131,13 @@ void SsdCache::write(std::vector pins) { VELOX_SSD_CACHE_LOG(WARNING) << "Ignoring error in SsdFile::write: " << e.what(); } + pinHolder->pins.clear(); if (--writesInProgress_ == 0) { // Typically occurs every few GB. Allows detecting unusually slow rates // from failing devices. VELOX_SSD_CACHE_LOG(INFO) << fmt::format( - "Wrote {}MB, {} MB/s", - bytes >> 20, + "Wrote {}, {} bytes/s", + succinctBytes(bytes), static_cast(bytes) / (getCurrentTimeMicro() - startTimeUs)); } }); @@ -129,6 +145,38 @@ void SsdCache::write(std::vector pins) { writesInProgress_.fetch_sub(numNoStore); } +void SsdCache::checkpoint() { + VELOX_CHECK_EQ(numShards_, writesInProgress_); + for (auto i = 0; i < numShards_; ++i) { + executor_->add([this, i]() { + files_[i]->checkpoint(/*force=*/true); + --writesInProgress_; + }); + } +} + +bool SsdCache::removeFileEntries( + const folly::F14FastSet& filesToRemove, + folly::F14FastSet& filesRetained) { + if (!startWrite()) { + return false; + } + + bool success = true; + for (auto i = 0; i < numShards_; i++) { + try { + success &= files_[i]->removeFileEntries(filesToRemove, filesRetained); + } catch (const std::exception& e) { + VELOX_SSD_CACHE_LOG(ERROR) + << "Error removing file entries from SSD shard " + << files_[i]->shardId() << ": " << e.what(); + success = false; + } + --writesInProgress_; + } + return success; +} + SsdCacheStats SsdCache::stats() const { SsdCacheStats stats; for (auto& file : files_) { @@ -137,38 +185,68 @@ SsdCacheStats SsdCache::stats() const { return stats; } -void SsdCache::clear() { - for (auto& file : files_) { - file->clear(); - } -} - std::string SsdCache::toString() const { - auto data = stats(); - uint64_t capacity = maxBytes(); + const auto data = stats(); + const uint64_t capacity = maxBytes(); std::stringstream out; - out << "Ssd cache IO: Write " << (data.bytesWritten >> 20) << "MB read " - << (data.bytesRead >> 20) << "MB Size " << (capacity >> 30) - << "GB Occupied " << (data.bytesCached >> 30) << "GB"; - out << (data.entriesCached >> 10) << "K entries."; + out << "Ssd cache IO: Write " << succinctBytes(data.bytesWritten) << " read " + << succinctBytes(data.bytesRead) << " Size " << succinctBytes(capacity) + << " Occupied " << succinctBytes(data.bytesCached); + out << " " << (data.entriesCached >> 10) << "K entries."; out << "\nGroupStats: " << groupStats_->toString(capacity); return out.str(); } -void SsdCache::testingDeleteFiles() { - for (auto& file : files_) { - file->deleteFile(); +void SsdCache::shutdown() { + { + std::lock_guard l(mutex_); + if 
(shutdown_) {
+      VELOX_SSD_CACHE_LOG(INFO) << "SSD cache has already been shutdown";
+      return;
+    }
+    shutdown_ = true;
   }
-}
 
-void SsdCache::shutdown() {
-  isShutdown_ = true;
+  VELOX_SSD_CACHE_LOG(INFO) << "SSD cache is shutting down";
   while (writesInProgress_) {
     std::this_thread::sleep_for(std::chrono::milliseconds(100)); // NOLINT
   }
   for (auto& file : files_) {
     file->checkpoint(true);
   }
+  VELOX_SSD_CACHE_LOG(INFO) << "SSD cache has been shutdown";
+}
+
+void SsdCache::clear() {
+  for (auto& file : files_) {
+    file->clear();
+  }
+}
+
+void SsdCache::testingDeleteFiles() {
+  for (auto& file : files_) {
+    file->testingDeleteFile();
+  }
+}
+
+void SsdCache::testingDeleteCheckpoints() {
+  for (auto& file : files_) {
+    file->deleteCheckpoint();
+  }
+}
+
+uint64_t SsdCache::testingTotalLogEvictionFilesSize() {
+  uint64_t size = 0;
+  for (auto& file : files_) {
+    std::filesystem::path p{file->getEvictLogFilePath()};
+    size += std::filesystem::file_size(p);
+  }
+  return size;
+}
+
+void SsdCache::waitForWriteToFinish() {
+  while (writesInProgress_ != 0) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(100)); // NOLINT
+  }
+}
 } // namespace facebook::velox::cache
diff --git a/velox/common/caching/SsdCache.h b/velox/common/caching/SsdCache.h
index 4bbf8583e736d..386e79ed8eecb 100644
--- a/velox/common/caching/SsdCache.h
+++ b/velox/common/caching/SsdCache.h
@@ -26,30 +26,73 @@ namespace facebook::velox::cache {
 
 class SsdCache {
  public:
+  struct Config {
+    Config() = default;
+
+    Config(
+        const std::string& _filePrefix,
+        uint64_t _maxBytes,
+        int32_t _numShards,
+        folly::Executor* _executor,
+        uint64_t _checkpointIntervalBytes = 0,
+        bool _disableFileCow = false,
+        bool _checksumEnabled = false,
+        bool _checksumReadVerificationEnabled = false)
+        : filePrefix(_filePrefix),
+          maxBytes(_maxBytes),
+          numShards(_numShards),
+          checkpointIntervalBytes(_checkpointIntervalBytes),
+          disableFileCow(_disableFileCow),
+          checksumEnabled(_checksumEnabled),
+          checksumReadVerificationEnabled(_checksumReadVerificationEnabled),
+          executor(_executor) {}
+
+    std::string filePrefix;
+    uint64_t maxBytes;
+    int32_t numShards;
+
+    /// Checkpoint after every 'checkpointIntervalBytes'/'numShards' written
+    /// into each file. 0 means no checkpointing.
+    uint64_t checkpointIntervalBytes;
+
+    /// True if copy on write should be disabled.
+    bool disableFileCow;
+
+    /// If true, checksum write to SSD is enabled.
+    bool checksumEnabled;
+
+    /// If true, checksum read verification from SSD is enabled.
+    bool checksumReadVerificationEnabled;
+
+    /// Executor for async fsync in checkpoint.
+    folly::Executor* executor;
+
+    std::string toString() const {
+      return fmt::format(
+          "{} shards, capacity {}, checkpoint size {}, file cow {}, checksum {}, read verification {}",
+          numShards,
+          succinctBytes(maxBytes),
+          succinctBytes(checkpointIntervalBytes),
+          (disableFileCow ? "DISABLED" : "ENABLED"),
+          (checksumEnabled ? "ENABLED" : "DISABLED"),
+          (checksumReadVerificationEnabled ? "ENABLED" : "DISABLED"));
+    }
+  };
+
   /// Constructs a cache with backing files at path 'filePrefix'.<ordinal>.
   /// <ordinal> ranges from 0 to 'numShards' - 1.
   /// 'maxBytes' is the total capacity of the cache. This is rounded up to the
   /// next multiple of kRegionSize * 'numShards'. This means that all the shards
   /// have an equal number of regions. For 2 shards and 200MB size, the size
   /// rounds up to 256M with 2 shards each of 128M (2 regions).
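A quick arithmetic check of the rounding rule described in the comment above, as a self-contained sketch; kRegionSize mirrors SsdFile's 64MB region size, and the 200MB/2-shard numbers are the comment's own example:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Mirrors SsdCache's sizing: capacity is rounded up to a multiple of
  // kRegionSize * numShards so every shard gets the same number of regions.
  constexpr uint64_t kRegionSize = 1 << 26; // 64MB, per SsdFile::kRegionSize.
  const int32_t numShards = 2;
  const uint64_t maxBytes = 200 << 20; // 200MB requested.

  const uint64_t sizeQuantum = numShards * kRegionSize; // 128MB.
  const uint64_t fileMaxRegions =
      (maxBytes + sizeQuantum - 1) / sizeQuantum; // Round up: 2 regions/shard.
  std::printf(
      "total capacity: %llu MB\n",
      (unsigned long long)((fileMaxRegions * numShards * kRegionSize) >> 20));
  // Prints 256 MB: 2 shards * 2 regions * 64MB each.
  return 0;
}
```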
- /// If 'checkpointIntervalBytes' is non-0, the cache makes a durable + /// If 'checkpointIntervalBytes' is non-zero, the cache makes a durable /// checkpointed state that survives restart after each /// 'checkpointIntervalBytes' written. - /// If 'setNoCowFlagForSsdFiles' is true, the cache sets 'no copy on write' - /// flag to each file. This prevents the cache to go over the 'maxBytes', - /// eventually use up all disk space and stop working. Should be set to true - /// for file systems supporting COW (like brtfs). /// If 'disableFileCow' is true, the cache disables the file COW (copy on /// write) feature if the underlying filesystem (such as brtfs) supports it. /// This prevents the actual cache space usage on disk from exceeding the /// 'maxBytes' limit and stop working. - SsdCache( - std::string_view filePrefix, - uint64_t maxBytes, - int32_t numShards, - folly::Executor* executor, - int64_t checkpointIntervalBytes = 0, - bool disableFileCow = false); + SsdCache(const Config& config); /// Returns the shard corresponding to 'fileId'. 'fileId' is a file id from /// e.g. FileCacheKey. @@ -72,10 +115,27 @@ class SsdCache { /// Stores the entries of 'pins' into the corresponding files. Sets the file /// for the successfully stored entries. May evict existing entries from - /// unpinned regions. startWrite() must have been called first and it must - /// have returned true. + /// unpinned regions. + /// + /// NOTE: startWrite() must have been called first and it must have returned + /// true. void write(std::vector pins); + /// Invoked to write checkpoints to all ssd files. This is used by Prestissimo + /// worker operation. + /// + /// NOTE: startWrite() must have been called first and it must have returned + /// true. + void checkpoint(); + + /// Removes cached entries from all SsdFiles for files in the fileNum set + /// 'filesToRemove'. If successful, return true, and 'filesRetained' contains + /// entries that should not be removed, ex., from pinned regions. Otherwise, + /// return false and 'filesRetained' could be ignored. + bool removeFileEntries( + const folly::F14FastSet& filesToRemove, + folly::F14FastSet& filesRetained); + /// Returns stats aggregated from all shards. SsdCacheStats stats() const; @@ -83,32 +143,54 @@ class SsdCache { return *groupStats_; } + /// Stops writing to the cache files and waits for pending writes to finish. + /// If checkpointing is on, makes a checkpoint. + void shutdown(); + + std::string toString() const; + + const std::string& filePrefix() const { + return filePrefix_; + } + /// Drops all entries. Outstanding pins become invalid but reading them will /// mostly succeed since the files will not be rewritten until new content is /// stored. + /// + /// NOTE: it is used by test and Prestissimo worker operation. void clear(); + /// Waits until the pending ssd cache writes or checkpoints to finish. Used by + /// test and Prestissimo worker operation. + void waitForWriteToFinish(); + /// Deletes backing files. Used in testing. void testingDeleteFiles(); - /// Stops writing to the cache files and waits for pending writes to finish. - /// If checkpointing is on, makes a checkpoint. - void shutdown(); + /// Deletes checkpoint files. Used in testing. + void testingDeleteCheckpoints(); - std::string toString() const; + /// Returns the total size of eviction log files. Used by test only. 
+ uint64_t testingTotalLogEvictionFilesSize(); private: + void checkNotShutdownLocked() { + VELOX_CHECK( + !shutdown_, "Unexpected write after SSD cache has been shutdown"); + } + const std::string filePrefix_; const int32_t numShards_; + // Stats for selecting entries to save from AsyncDataCache. + const std::unique_ptr groupStats_; + folly::Executor* const executor_; + mutable std::mutex mutex_; + std::vector> files_; // Count of shards with unfinished writes. - std::atomic writesInProgress_{0}; - - // Stats for selecting entries to save from AsyncDataCache. - std::unique_ptr groupStats_; - folly::Executor* executor_; - std::atomic isShutdown_{false}; + std::atomic_int32_t writesInProgress_{0}; + bool shutdown_{false}; }; } // namespace facebook::velox::cache diff --git a/velox/common/caching/SsdFile.cpp b/velox/common/caching/SsdFile.cpp index f1b4483386f7e..7f500973917a1 100644 --- a/velox/common/caching/SsdFile.cpp +++ b/velox/common/caching/SsdFile.cpp @@ -15,12 +15,15 @@ */ #include "velox/common/caching/SsdFile.h" + #include #include #include "velox/common/base/AsyncSource.h" +#include "velox/common/base/Crc.h" #include "velox/common/base/SuccinctPrinter.h" #include "velox/common/caching/FileIds.h" #include "velox/common/caching/SsdCache.h" +#include "velox/common/process/TraceContext.h" #include #ifdef linux @@ -32,6 +35,9 @@ #include #include +#include "velox/common/base/Counters.h" +#include "velox/common/base/StatsReporter.h" + DEFINE_bool(ssd_odirect, true, "Use O_DIRECT for SSD cache IO"); DEFINE_bool(ssd_verify_write, false, "Read back data after writing to SSD"); @@ -79,6 +85,14 @@ void addEntryToIovecs(AsyncDataCacheEntry& entry, std::vector& iovecs) { }; } } + +// Returns the number of entries in a cache 'entry'. +uint32_t numIoVectorsFromEntry(AsyncDataCacheEntry& entry) { + if (entry.tinyData() != nullptr) { + return 1; + } + return entry.data().numRuns(); +} } // namespace SsdPin::SsdPin(SsdFile& file, SsdRun run) : file_(&file), run_(run) { @@ -116,24 +130,23 @@ std::string SsdPin::toString() const { run_.size()); } -SsdFile::SsdFile( - const std::string& filename, - int32_t shardId, - int32_t maxRegions, - int64_t checkpointIntervalBytes, - bool disableFileCow, - folly::Executor* executor) - : fileName_(filename), - maxRegions_(maxRegions), - shardId_(shardId), - checkpointIntervalBytes_(checkpointIntervalBytes), - executor_(executor) { +SsdFile::SsdFile(const Config& config) + : fileName_(config.fileName), + maxRegions_(config.maxRegions), + disableFileCow_(config.disableFileCow), + checksumEnabled_(config.checksumEnabled), + checksumReadVerificationEnabled_( + config.checksumEnabled && config.checksumReadVerificationEnabled), + shardId_(config.shardId), + checkpointIntervalBytes_(config.checkpointIntervalBytes), + executor_(config.executor) { + process::TraceContext trace("SsdFile::SsdFile"); int32_t oDirect = 0; #ifdef linux oDirect = FLAGS_ssd_odirect ? O_DIRECT : 0; #endif // linux fd_ = open(fileName_.c_str(), O_CREAT | O_RDWR | oDirect, S_IRUSR | S_IWUSR); - if (FOLLY_UNLIKELY(fd_ < 0)) { + if (fd_ < 0) { ++stats_.openFileErrors; } // TODO: add fault tolerant handling for open file errors. @@ -141,30 +154,28 @@ SsdFile::SsdFile( fd_, 0, "Cannot open or create {}. 
Error: {}", - filename, + fileName_, folly::errnoStr(errno)); - if (disableFileCow) { + if (disableFileCow_) { disableCow(fd_); } readFile_ = std::make_unique(fd_); - uint64_t size = lseek(fd_, 0, SEEK_END); - numRegions_ = size / kRegionSize; - if (numRegions_ > maxRegions_) { - numRegions_ = maxRegions_; - } + const uint64_t size = lseek(fd_, 0, SEEK_END); + numRegions_ = std::min(size / kRegionSize, maxRegions_); fileSize_ = numRegions_ * kRegionSize; - if (size % kRegionSize > 0 || size > numRegions_ * kRegionSize) { - ftruncate(fd_, fileSize_); + if ((size % kRegionSize > 0) || (size > numRegions_ * kRegionSize)) { + ::ftruncate(fd_, fileSize_); } // The existing regions in the file are writable. writableRegions_.resize(numRegions_); std::iota(writableRegions_.begin(), writableRegions_.end(), 0); tracker_.resize(maxRegions_); - regionSizes_.resize(maxRegions_); - regionPins_.resize(maxRegions_); - if (checkpointIntervalBytes_) { + regionSizes_.resize(maxRegions_, 0); + erasedRegionSizes_.resize(maxRegions_, 0); + regionPins_.resize(maxRegions_, 0); + if (checkpointEnabled()) { initializeCheckpoint(); } } @@ -220,7 +231,7 @@ CoalesceIoStats SsdFile::load( if (pins.empty()) { return CoalesceIoStats(); } - int payloadTotal = 0; + size_t totalPayloadBytes = 0; for (auto i = 0; i < pins.size(); ++i) { const auto runSize = ssdPins[i].run().size(); auto* entry = pins[i].checkedEntry(); @@ -231,7 +242,7 @@ CoalesceIoStats SsdFile::load( succinctBytes(runSize), succinctBytes(entry->size())); } - payloadTotal += entry->size(); + totalPayloadBytes += entry->size(); regionRead(regionIndex(ssdPins[i].run().offset()), runSize); ++stats_.entriesRead; stats_.bytesRead += entry->size(); @@ -240,9 +251,9 @@ CoalesceIoStats SsdFile::load( // Do coalesced IO for the pins. For short payloads, the break-even between // discrete pread calls and a single preadv that discards gaps is ~25K per // gap. For longer payloads this is ~50-100K. - auto stats = readPins( + const auto stats = readPins( pins, - payloadTotal / pins.size() < 10000 ? 25000 : 50000, + totalPayloadBytes / pins.size() < 10000 ? 25000 : 50000, // Max ranges in one preadv call. Longest gap + longest cache entry are // under 12 ranges. If a system has a limit of 1K ranges, coalesce limit // of 1000 is safe. 
@@ -258,6 +269,9 @@ CoalesceIoStats SsdFile::load( for (auto i = 0; i < ssdPins.size(); ++i) { pins[i].checkedEntry()->setSsdFile(this, ssdPins[i].run().offset()); + auto* entry = pins[i].checkedEntry(); + auto ssdRun = ssdPins[i].run(); + maybeVerifyChecksum(*entry, ssdRun); } return stats; } @@ -265,6 +279,7 @@ CoalesceIoStats SsdFile::load( void SsdFile::read( uint64_t offset, const std::vector>& buffers) { + process::TraceContext trace("SsdFile::read"); readFile_->preadv(offset, buffers); } @@ -279,7 +294,7 @@ std::optional> SsdFile::getSpace( return std::nullopt; } } - assert(!writableRegions_.empty()); + VELOX_CHECK(!writableRegions_.empty()); const auto region = writableRegions_[0]; const auto offset = regionSizes_[region]; auto available = kRegionSize - offset; @@ -306,6 +321,7 @@ std::optional> SsdFile::getSpace( } bool SsdFile::growOrEvictLocked() { + process::TraceContext trace("SsdFile::growOrEvictLocked"); if (numRegions_ < maxRegions_) { const auto newSize = (numRegions_ + 1) * kRegionSize; const auto rc = ::ftruncate(fd_, newSize); @@ -313,16 +329,17 @@ bool SsdFile::growOrEvictLocked() { fileSize_ = newSize; writableRegions_.push_back(numRegions_); regionSizes_[numRegions_] = 0; + erasedRegionSizes_[numRegions_] = 0; ++numRegions_; return true; } ++stats_.growFileErrors; - LOG(ERROR) << "Failed to grow cache file " << fileName_ << " to " - << newSize; + VELOX_SSD_CACHE_LOG(ERROR) + << "Failed to grow cache file " << fileName_ << " to " << newSize; } - auto candidates = + const auto candidates = tracker_.findEvictionCandidates(3, numRegions_, regionPins_); if (candidates.empty()) { suspended_ = true; @@ -331,6 +348,7 @@ bool SsdFile::growOrEvictLocked() { logEviction(candidates); clearRegionEntriesLocked(candidates); + stats_.regionsEvicted += candidates.size(); writableRegions_ = std::move(candidates); suspended_ = false; return true; @@ -353,68 +371,86 @@ void SsdFile::clearRegionEntriesLocked(const std::vector& regions) { // full, it will get a score boost to be a little ahead of the best. tracker_.regionCleared(region); regionSizes_[region] = 0; + erasedRegionSizes_[region] = 0; } } void SsdFile::write(std::vector& pins) { + process::TraceContext trace("SsdFile::write"); // Sorts the pins by their file/offset. In this way what is adjacent in // storage is likely adjacent on SSD. std::sort(pins.begin(), pins.end()); - uint64_t total = 0; for (const auto& pin : pins) { auto* entry = pin.checkedEntry(); VELOX_CHECK_NULL(entry->ssdFile()); - total += entry->size(); } - int32_t storeIndex = 0; - while (storeIndex < pins.size()) { - auto space = getSpace(pins, storeIndex); + int32_t writeIndex = 0; + while (writeIndex < pins.size()) { + auto space = getSpace(pins, writeIndex); if (!space.has_value()) { // No space can be reclaimed. The pins are freed when the caller is freed. + ++stats_.writeSsdDropped; return; } auto [offset, available] = space.value(); - int32_t numWritten = 0; - int32_t bytes = 0; - std::vector iovecs; - for (auto i = storeIndex; i < pins.size(); ++i) { + int32_t numWrittenEntries = 0; + uint64_t writeOffset = offset; + int32_t writeLength = 0; + std::vector writeIovecs; + for (auto i = writeIndex; i < pins.size(); ++i) { auto* entry = pins[i].checkedEntry(); const auto entrySize = entry->size(); - if (bytes + entrySize > available) { + const auto numIovecs = numIoVectorsFromEntry(*entry); + VELOX_CHECK_LE(numIovecs, IOV_MAX); + if (writeIovecs.size() + numIovecs > IOV_MAX) { + // Writes out the accumulated iovecs if it exceeds IOV_MAX limit. 
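+        // IOV_MAX is the kernel's per-call limit on iovec entries (1024 on
+        // Linux), so flush the accumulated batch before it would overflow.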
+ if (!write(writeOffset, writeLength, writeIovecs)) { + // If write fails, we return without adding the pins to the cache. The + // entries are unchanged. + return; + } + writeIovecs.clear(); + available -= writeLength; + writeOffset += writeLength; + writeLength = 0; + } + if (writeLength + entrySize > available) { break; } - addEntryToIovecs(*entry, iovecs); - bytes += entrySize; - ++numWritten; - } - VELOX_CHECK_GE(fileSize_, offset + bytes); - - const auto rc = folly::pwritev(fd_, iovecs.data(), iovecs.size(), offset); - if (rc != bytes) { - VELOX_SSD_CACHE_LOG(ERROR) - << "Failed to write to SSD, file name: " << fileName_ - << ", fd: " << fd_ << ", size: " << iovecs.size() - << ", offset: " << offset << ", error code: " << errno - << ", error string: " << folly::errnoStr(errno); - ++stats_.writeSsdErrors; - // If write fails, we return without adding the pins to the cache. The - // entries are unchanged. - return; + addEntryToIovecs(*entry, writeIovecs); + writeLength += entrySize; + ++numWrittenEntries; + } + if (writeLength > 0) { + VELOX_CHECK(!writeIovecs.empty()); + if (!write(writeOffset, writeLength, writeIovecs)) { + return; + } + writeIovecs.clear(); + available -= writeLength; + writeOffset += writeLength; + writeLength = 0; } + VELOX_CHECK_GE(fileSize_, writeOffset); { std::lock_guard l(mutex_); - for (auto i = storeIndex; i < storeIndex + numWritten; ++i) { + for (auto i = writeIndex; i < writeIndex + numWrittenEntries; ++i) { auto* entry = pins[i].checkedEntry(); + VELOX_CHECK_NULL(entry->ssdFile()); entry->setSsdFile(this, offset); const auto size = entry->size(); FileCacheKey key = { entry->key().fileNum, static_cast(entry->offset())}; - entries_[std::move(key)] = SsdRun(offset, size); + uint32_t checksum = 0; + if (checksumEnabled_) { + checksum = checksumEntry(*entry); + } + entries_[std::move(key)] = SsdRun(offset, size, checksum); if (FLAGS_ssd_verify_write) { - verifyWrite(*entry, SsdRun(offset, size)); + verifyWrite(*entry, SsdRun(offset, size, checksum)); } offset += size; ++stats_.entriesWritten; @@ -422,15 +458,31 @@ void SsdFile::write(std::vector& pins) { bytesAfterCheckpoint_ += size; } } - storeIndex += numWritten; + writeIndex += numWrittenEntries; } - if ((checkpointIntervalBytes_ > 0) && - (bytesAfterCheckpoint_ >= checkpointIntervalBytes_)) { + if (checkpointEnabled()) { checkpoint(); } } +bool SsdFile::write( + uint64_t offset, + uint64_t length, + const std::vector& iovecs) { + const auto ret = folly::pwritev(fd_, iovecs.data(), iovecs.size(), offset); + if (ret == length) { + return true; + } + VELOX_SSD_CACHE_LOG(ERROR) + << "Failed to write to SSD, file name: " << fileName_ << ", fd: " << fd_ + << ", size: " << iovecs.size() << ", offset: " << offset + << ", error code: " << errno + << ", error string: " << folly::errnoStr(errno); + ++stats_.writeSsdErrors; + return false; +} + namespace { int32_t indexOfFirstMismatch(char* x, char* y, int n) { for (auto i = 0; i < n; ++i) { @@ -443,10 +495,11 @@ int32_t indexOfFirstMismatch(char* x, char* y, int n) { } // namespace void SsdFile::verifyWrite(AsyncDataCacheEntry& entry, SsdRun ssdRun) { + process::TraceContext trace("SsdFile::verifyWrite"); auto testData = std::make_unique(entry.size()); const auto rc = ::pread(fd_, testData.get(), entry.size(), ssdRun.offset()); VELOX_CHECK_EQ(rc, entry.size()); - if (entry.tinyData() != 0) { + if (entry.tinyData() != nullptr) { if (::memcmp(testData.get(), entry.tinyData(), entry.size()) != 0) { VELOX_FAIL("bad read back"); } @@ -457,11 +510,9 @@ void 
SsdFile::verifyWrite(AsyncDataCacheEntry& entry, SsdRun ssdRun) { for (auto i = 0; i < data.numRuns(); ++i) { const auto run = data.runAt(i); const auto compareSize = std::min(bytesLeft, run.numBytes()); - auto badIndex = indexOfFirstMismatch( + const auto badIndex = indexOfFirstMismatch( run.data(), testData.get() + offset, compareSize); - if (badIndex != -1) { - VELOX_FAIL("Bad read back"); - } + VELOX_CHECK_EQ(badIndex, -1, "Bad read back"); bytesLeft -= run.numBytes(); offset += run.numBytes(); if (bytesLeft <= 0) { @@ -473,16 +524,22 @@ void SsdFile::verifyWrite(AsyncDataCacheEntry& entry, SsdRun ssdRun) { void SsdFile::updateStats(SsdCacheStats& stats) const { // Lock only in tsan build. Incrementing the counters has no synchronized - // emantics. + // semantics. std::shared_lock l(mutex_); stats.entriesWritten += stats_.entriesWritten; stats.bytesWritten += stats_.bytesWritten; + stats.checkpointsWritten += stats_.checkpointsWritten; stats.entriesRead += stats_.entriesRead; stats.bytesRead += stats_.bytesRead; + stats.checkpointsRead += stats_.checkpointsRead; stats.entriesCached += entries_.size(); - for (auto& regionSize : regionSizes_) { - stats.bytesCached += regionSize; + stats.regionsCached += numRegions_; + for (auto i = 0; i < numRegions_; i++) { + stats.bytesCached += (regionSizes_[i] - erasedRegionSizes_[i]); } + stats.entriesAgedOut += stats_.entriesAgedOut; + stats.regionsAgedOut += stats_.regionsAgedOut; + stats.regionsEvicted += stats_.regionsEvicted; for (auto pins : regionPins_) { stats.numPins += pins; } @@ -496,17 +553,22 @@ void SsdFile::updateStats(SsdCacheStats& stats) const { stats.writeCheckpointErrors += stats_.writeCheckpointErrors; stats.readSsdErrors += stats_.readSsdErrors; stats.readCheckpointErrors += stats_.readCheckpointErrors; + stats.readSsdCorruptions += stats_.readSsdCorruptions; + stats.readWithoutChecksumChecks += stats_.readWithoutChecksumChecks; } void SsdFile::clear() { std::lock_guard l(mutex_); entries_.clear(); std::fill(regionSizes_.begin(), regionSizes_.end(), 0); + std::fill(erasedRegionSizes_.begin(), erasedRegionSizes_.end(), 0); writableRegions_.resize(numRegions_); std::iota(writableRegions_.begin(), writableRegions_.end(), 0); + tracker_.clear(); } -void SsdFile::deleteFile() { +void SsdFile::testingDeleteFile() { + process::TraceContext trace("SsdFile::testingDeleteFile"); if (fd_) { close(fd_); fd_ = 0; @@ -518,8 +580,85 @@ void SsdFile::deleteFile() { } } +bool SsdFile::removeFileEntries( + const folly::F14FastSet& filesToRemove, + folly::F14FastSet& filesRetained) { + if (filesToRemove.empty()) { + VELOX_SSD_CACHE_LOG(INFO) + << "Removed 0 entry from " << fileName_ << ". 
And erased 0 region with " + << kMaxErasedSizePct << "% entries removed."; + return true; + } + + std::lock_guard l(mutex_); + + int64_t entriesAgedOut = 0; + auto it = entries_.begin(); + while (it != entries_.end()) { + const FileCacheKey& cacheKey = it->first; + const SsdRun& ssdRun = it->second; + + if (!cacheKey.fileNum.hasValue()) { + ++it; + continue; + } + if (filesToRemove.count(cacheKey.fileNum.id()) == 0) { + ++it; + continue; + } + + auto region = regionIndex(ssdRun.offset()); + if (regionPins_[region] > 0) { + filesRetained.insert(cacheKey.fileNum.id()); + ++it; + continue; + } + + ++entriesAgedOut; + erasedRegionSizes_[region] += ssdRun.size(); + + it = entries_.erase(it); + } + + std::vector toFree; + toFree.reserve(numRegions_); + for (auto region = 0; region < numRegions_; ++region) { + if (regionPins_[region] == 0 && + erasedRegionSizes_[region] > + regionSizes_[region] * kMaxErasedSizePct / 100) { + toFree.push_back(region); + } + } + if (toFree.size() > 0) { + VELOX_CHECK(!suspended_); + logEviction(toFree); + clearRegionEntriesLocked(toFree); + writableRegions_.reserve( + std::min(writableRegions_.size() + toFree.size(), numRegions_)); + folly::F14FastSet existingWritableRegions( + writableRegions_.begin(), writableRegions_.end()); + for (int32_t region : toFree) { + if (existingWritableRegions.count(region) == 0) { + writableRegions_.push_back(region); + } + VELOX_CHECK_EQ(regionSizes_[region], 0); + VELOX_CHECK_EQ(erasedRegionSizes_[region], 0); + } + } + + stats_.entriesAgedOut += entriesAgedOut; + stats_.regionsAgedOut += toFree.size(); + stats_.regionsEvicted += toFree.size(); + VELOX_SSD_CACHE_LOG(INFO) + << "Removed " << entriesAgedOut << " entries from " << fileName_ + << ". And erased " << toFree.size() << " regions with " + << kMaxErasedSizePct << "% entries removed, and " << entries_.size() + << " left."; + return true; +} + void SsdFile::logEviction(const std::vector& regions) { - if (checkpointIntervalBytes_ > 0) { + if (checkpointEnabled()) { const int32_t rc = ::write( evictLogFd_, regions.data(), regions.size() * sizeof(regions[0])); if (rc != regions.size() * sizeof(regions[0])) { @@ -544,17 +683,17 @@ void SsdFile::deleteCheckpoint(bool keepLog) { } checkpointDeleted_ = true; - const auto logPath = fileName_ + kLogExtension; + const auto logPath = getEvictLogFilePath(); int32_t logRc = 0; if (!keepLog) { logRc = ::unlink(logPath.c_str()); } - const auto checkpointPath = fileName_ + kCheckpointExtension; + const auto checkpointPath = getCheckpointFilePath(); const auto checkpointRc = ::unlink(checkpointPath.c_str()); if ((logRc != 0) || (checkpointRc != 0)) { ++stats_.deleteCheckpointErrors; VELOX_SSD_CACHE_LOG(ERROR) - << "Error in deleting log and checkpoint. log: " << logRc + << "Error in deleting log and checkpoint. 
log: " << logRc << " checkpoint: " << checkpointRc; } } @@ -580,14 +719,28 @@ inline const char* asChar(const T* ptr) { } // namespace void SsdFile::checkpoint(bool force) { + process::TraceContext trace("SsdFile::checkpoint"); std::lock_guard l(mutex_); - if (!force && (bytesAfterCheckpoint_ < checkpointIntervalBytes_)) { + if (!needCheckpoint(force)) { return; } + VELOX_SSD_CACHE_LOG(INFO) + << "Checkpointing shard " << shardId_ << ", force: " << force + << " bytesAfterCheckpoint: " << succinctBytes(bytesAfterCheckpoint_) + << " checkpointIntervalBytes: " + << succinctBytes(checkpointIntervalBytes_); + checkpointDeleted_ = false; bytesAfterCheckpoint_ = 0; try { + const auto checkRc = [&](int32_t rc, const std::string& errMsg) { + if (rc < 0) { + VELOX_FAIL("{} with rc {} :{}", errMsg, rc, folly::errnoStr(errno)); + } + return rc; + }; + // We schedule the potentially long fsync of the cache file on another // thread of the cache write executor, if available. If there is none, we do // the sync on this thread at the end. @@ -597,53 +750,55 @@ void SsdFile::checkpoint(bool force) { executor_->add([fileSync]() { fileSync->prepare(); }); } - const auto checkRc = [&](int32_t rc, const std::string& errMsg) { - if (rc < 0) { - VELOX_FAIL("{} with rc {} :{}", errMsg, rc, folly::errnoStr(errno)); - } - return rc; - }; - std::ofstream state; - auto checkpointPath = fileName_ + kCheckpointExtension; - state.exceptions(std::ofstream::failbit); - state.open(checkpointPath, std::ios_base::out | std::ios_base::trunc); - // The checkpoint state file contains: - // int32_t The 4 bytes of kCheckpointMagic, - // int32_t maxRegions, - // int32_t numRegions, - // regionScores from the 'tracker_', - // {fileId, fileName} pairs, - // kMapMarker, - // {fileId, offset, SSdRun} triples, - // kEndMarker. - state.write(kCheckpointMagic, sizeof(int32_t)); - state.write(asChar(&maxRegions_), sizeof(maxRegions_)); - state.write(asChar(&numRegions_), sizeof(numRegions_)); - - // Copy the region scores before writing out for tsan. - const auto scoresCopy = tracker_.copyScores(); - state.write(asChar(scoresCopy.data()), maxRegions_ * sizeof(uint64_t)); - std::unordered_set fileNums; - for (const auto& entry : entries_) { - const auto fileNum = entry.first.fileNum.id(); - if (fileNums.insert(fileNum).second) { - state.write(asChar(&fileNum), sizeof(fileNum)); - const auto name = fileIds().string(fileNum); - const int32_t length = name.size(); - state.write(asChar(&length), sizeof(length)); - state.write(name.data(), length); + const auto checkpointPath = getCheckpointFilePath(); + try { + state.exceptions(std::ofstream::failbit); + state.open(checkpointPath, std::ios_base::out | std::ios_base::trunc); + // The checkpoint state file contains: + // int32_t The 4 bytes of checkpoint version, + // int32_t maxRegions, + // int32_t numRegions, + // regionScores from the 'tracker_', + // {fileId, fileName} pairs, + // kMapMarker, + // {fileId, offset, SSdRun} triples, + // kEndMarker. + state.write(checkpointVersion().data(), sizeof(int32_t)); + state.write(asChar(&maxRegions_), sizeof(maxRegions_)); + state.write(asChar(&numRegions_), sizeof(numRegions_)); + + // Copy the region scores before writing out for tsan. 
+ const auto scoresCopy = tracker_.copyScores(); + state.write(asChar(scoresCopy.data()), maxRegions_ * sizeof(uint64_t)); + std::unordered_set fileNums; + for (const auto& entry : entries_) { + const auto fileNum = entry.first.fileNum.id(); + if (fileNums.insert(fileNum).second) { + state.write(asChar(&fileNum), sizeof(fileNum)); + const auto name = fileIds().string(fileNum); + const int32_t length = name.size(); + state.write(asChar(&length), sizeof(length)); + state.write(name.data(), length); + } } - } - const auto mapMarker = kCheckpointMapMarker; - state.write(asChar(&mapMarker), sizeof(mapMarker)); - for (auto& pair : entries_) { - auto id = pair.first.fileNum.id(); - state.write(asChar(&id), sizeof(id)); - state.write(asChar(&pair.first.offset), sizeof(pair.first.offset)); - auto offsetAndSize = pair.second.bits(); - state.write(asChar(&offsetAndSize), sizeof(offsetAndSize)); + const auto mapMarker = kCheckpointMapMarker; + state.write(asChar(&mapMarker), sizeof(mapMarker)); + for (auto& pair : entries_) { + const auto id = pair.first.fileNum.id(); + state.write(asChar(&id), sizeof(id)); + state.write(asChar(&pair.first.offset), sizeof(pair.first.offset)); + const auto offsetAndSize = pair.second.fileBits(); + state.write(asChar(&offsetAndSize), sizeof(offsetAndSize)); + if (checksumEnabled_) { + const auto checksum = pair.second.checksum(); + state.write(asChar(&checksum), sizeof(checksum)); + } + } + } catch (const std::exception& e) { + fileSync->close(); + std::rethrow_exception(std::current_exception()); } // NOTE: we need to ensure cache file data sync update completes before @@ -657,6 +812,8 @@ void SsdFile::checkpoint(bool force) { if (state.bad()) { ++stats_.writeCheckpointErrors; checkRc(-1, "Write of checkpoint file"); + } else { + ++stats_.checkpointsWritten; } state.close(); @@ -665,6 +822,11 @@ void SsdFile::checkpoint(bool force) { const auto checkpointFd = checkRc( ::open(checkpointPath.c_str(), O_WRONLY), "Open of checkpoint file for sync"); + // TODO: add this as file open option after we migrate to use velox + // filesystem for ssd file access. + if (disableFileCow_) { + disableCow(checkpointFd); + } VELOX_CHECK_GE(checkpointFd, 0); checkRc(::fsync(checkpointFd), "Sync of checkpoint file"); ::close(checkpointFd); @@ -674,29 +836,40 @@ void SsdFile::checkpoint(bool force) { // log evictions. The latter might lead to data consistent issue. checkRc(::ftruncate(evictLogFd_, 0), "Truncate of event log"); checkRc(::fsync(evictLogFd_), "Sync of evict log"); + + VELOX_SSD_CACHE_LOG(INFO) + << "Checkpoint persisted with " << entries_.size() << " cache entries"; } catch (const std::exception& e) { try { checkpointError(-1, e.what()); - } catch (const std::exception& inner) { + } catch (const std::exception&) { } // Ignore nested exception. } } void SsdFile::initializeCheckpoint() { - if (checkpointIntervalBytes_ == 0) { + if (!checkpointEnabled()) { return; } + bool hasCheckpoint = true; - std::ifstream state(fileName_ + kCheckpointExtension); + std::ifstream state(getCheckpointFilePath()); if (!state.is_open()) { hasCheckpoint = false; ++stats_.openCheckpointErrors; - VELOX_SSD_CACHE_LOG(INFO) - << "Starting shard " << shardId_ << " without checkpoint"; + VELOX_SSD_CACHE_LOG(WARNING) << fmt::format( + "Starting shard {} without checkpoint, with checksum write {}, read verification {}, checkpoint file {}", + shardId_, + checksumEnabled_ ? "enabled" : "disabled", + checksumReadVerificationEnabled_ ? 
"enabled" : "disabled", + getCheckpointFilePath()); } - const auto logPath = fileName_ + kLogExtension; + const auto logPath = getEvictLogFilePath(); evictLogFd_ = ::open(logPath.c_str(), O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); + if (disableFileCow_) { + disableCow(evictLogFd_); + } if (evictLogFd_ < 0) { ++stats_.openLogErrors; // Failure to open the log at startup is a process terminating error. @@ -719,8 +892,54 @@ void SsdFile::initializeCheckpoint() { << e.what() << ": Starting without checkpoint"; entries_.clear(); deleteCheckpoint(true); - } catch (const std::exception& e) { + } catch (const std::exception&) { + } + } +} + +uint32_t SsdFile::checksumEntry(const AsyncDataCacheEntry& entry) const { + bits::Crc32 crc; + if (entry.tinyData()) { + crc.process_bytes(entry.tinyData(), entry.size()); + } else { + int64_t bytesLeft = entry.size(); + const auto& data = entry.data(); + for (auto i = 0; i < data.numRuns() && bytesLeft > 0; ++i) { + const auto run = data.runAt(i); + const auto bytesToProcess = std::min(bytesLeft, run.numBytes()); + crc.process_bytes(run.data(), bytesToProcess); + bytesLeft -= bytesToProcess; } + VELOX_CHECK_EQ(bytesLeft, 0); + } + return crc.checksum(); +} + +void SsdFile::maybeVerifyChecksum( + const AsyncDataCacheEntry& entry, + const SsdRun& ssdRun) { + if (!checksumReadVerificationEnabled_) { + return; + } + VELOX_DCHECK_EQ(ssdRun.size(), entry.size()); + if (ssdRun.size() != entry.size()) { + ++stats_.readWithoutChecksumChecks; + VELOX_CACHE_LOG_EVERY_MS(WARNING, 1'000) + << "SSD read without checksum due to cache request size mismatch, SSD cache size " + << ssdRun.size() << " request size " << entry.size() + << ", cache request: " << entry.toString(); + return; + } + + // Verifies that the checksum matches after we read from SSD. 
+  const auto checksum = checksumEntry(entry);
+  if (checksum != ssdRun.checksum()) {
+    ++stats_.readSsdCorruptions;
+    VELOX_FAIL(
+        "IOERR: Corrupt SSD cache entry - File: {}, Offset: {}, Size: {}",
+        fileName_,
+        ssdRun.offset(),
+        ssdRun.size());
+  }
+}
@@ -751,28 +970,36 @@ T readNumber(std::ifstream& stream) {
 }
 } // namespace
 
 void SsdFile::readCheckpoint(std::ifstream& state) {
-  char magic[4];
-  state.read(magic, sizeof(magic));
-  VELOX_CHECK_EQ(strncmp(magic, kCheckpointMagic, 4), 0);
+  char versionMagic[4];
+  state.read(versionMagic, sizeof(versionMagic));
+  const auto checkpointHasChecksum =
+      isChecksumEnabledOnCheckpointVersion(std::string(versionMagic, 4));
+  if (checksumEnabled_ && !checkpointHasChecksum) {
+    VELOX_SSD_CACHE_LOG(WARNING) << fmt::format(
+        "Starting shard {} without checkpoint: checksum is enabled but the checkpoint was made without checksum, so skip the checkpoint recovery, checkpoint file {}",
+        shardId_,
+        getCheckpointFilePath());
+    return;
+  }
+
   const auto maxRegions = readNumber<int32_t>(state);
   VELOX_CHECK_EQ(
       maxRegions,
       maxRegions_,
       "Trying to start from checkpoint with a different capacity");
   numRegions_ = readNumber<int32_t>(state);
-  std::vector<uint64_t> scores(maxRegions);
-  state.read(asChar(scores.data()), maxRegions_ * sizeof(uint64_t));
+  std::vector<double> scores(maxRegions);
+  state.read(asChar(scores.data()), maxRegions_ * sizeof(double));
   std::unordered_map<uint64_t, StringIdLease> idMap;
   for (;;) {
-    auto id = readNumber<uint64_t>(state);
+    const auto id = readNumber<uint64_t>(state);
    if (id == kCheckpointMapMarker) {
       break;
     }
     std::string name;
     name.resize(readNumber<int32_t>(state));
     state.read(name.data(), name.size());
-    auto lease = StringIdLease(fileIds(), name);
-    idMap[id] = std::move(lease);
+    idMap[id] = StringIdLease(fileIds(), id, name);
   }
 
   const auto logSize = ::lseek(evictLogFd_, 0, SEEK_END);
@@ -783,22 +1010,49 @@ void SsdFile::readCheckpoint(std::ifstream& state) {
   for (auto region : evicted) {
     evictedMap.insert(region);
   }
+
+  std::vector<uint32_t> regionCacheSizes(numRegions_, 0);
   for (;;) {
-    const uint64_t fileNum = readNumber<uint64_t>(state);
+    const auto fileNum = readNumber<uint64_t>(state);
     if (fileNum == kCheckpointEndMarker) {
       break;
     }
-    const uint64_t offset = readNumber<uint64_t>(state);
-    const auto run = SsdRun(readNumber<uint64_t>(state));
+    const auto offset = readNumber<uint64_t>(state);
+    const auto fileBits = readNumber<uint64_t>(state);
+    uint32_t checksum = 0;
+    if (checkpointHasChecksum) {
+      checksum = readNumber<uint32_t>(state);
+    }
+    const auto run = SsdRun(fileBits, checksum);
+    const auto region = regionIndex(run.offset());
     // Check that the recovered entry does not fall in an evicted region.
-    if (evictedMap.find(regionIndex(run.offset())) == evictedMap.end()) {
-      // The file may have a different id on restore.
-      auto it = idMap.find(fileNum);
-      VELOX_CHECK(it != idMap.end());
-      FileCacheKey key{it->second, offset};
-      entries_[std::move(key)] = run;
+    if (evictedMap.find(region) != evictedMap.end()) {
+      continue;
     }
+    // The file may have a different id on restore.
+    const auto it = idMap.find(fileNum);
+    VELOX_CHECK(it != idMap.end());
+    FileCacheKey key{it->second, offset};
+    entries_[std::move(key)] = run;
+    regionCacheSizes[region] += run.size();
+    regionSizes_[region] = std::max(
+        regionSizes_[region], regionOffset(run.offset()) + run.size());
   }
+
+  // NOTE: we might erase entries from a region for TTL eviction, so we need to
+  // set the region size to the max end offset of the recovered cache entries
+  // from the region. Correspondingly, we subtract the cached size from the
+  // region size to get the erased size.
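To make the bookkeeping in the NOTE above concrete, a tiny worked example with made-up numbers: a region whose recovered entries end at 1MB and 4MB has its size set to 4MB, and the 2MB not covered by cached bytes is treated as erased:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  // Two recovered entries: [0, 1MB) and [3MB, 4MB). The 2MB in between was
  // aged out before the checkpoint was taken.
  const uint64_t entryEnds[2] = {1 << 20, 4 << 20};
  const uint64_t entrySizes[2] = {1 << 20, 1 << 20};

  uint64_t regionSize = 0; // max end offset of any recovered entry
  uint64_t cachedSize = 0; // sum of recovered entry sizes
  for (int i = 0; i < 2; ++i) {
    regionSize = std::max(regionSize, entryEnds[i]);
    cachedSize += entrySizes[i];
  }
  const uint64_t erasedSize = regionSize - cachedSize;
  assert(regionSize == 4 << 20); // region size is 4MB
  assert(erasedSize == 2 << 20); // 2MB counted as erased
  return 0;
}
```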
+ for (auto region = 0; region < numRegions_; ++region) { + VELOX_CHECK_LE(regionSizes_[region], kRegionSize); + VELOX_CHECK_LE(regionCacheSizes[region], regionSizes_[region]); + erasedRegionSizes_[region] = + regionSizes_[region] - regionCacheSizes[region]; + } + + ++stats_.checkpointsRead; + stats_.entriesRecovered += entries_.size(); + // The state is successfully read. Install the access frequency scores and // evicted regions. VELOX_CHECK_EQ(scores.size(), tracker_.regionScores().size()); @@ -808,12 +1062,21 @@ void SsdFile::readCheckpoint(std::ifstream& state) { writableRegions_.push_back(region); } tracker_.setRegionScores(scores); + + uint64_t cachedBytes{0}; + for (const auto regionSize : regionSizes_) { + cachedBytes += regionSize; + } VELOX_SSD_CACHE_LOG(INFO) << fmt::format( - "Starting shard {} from checkpoint with {} entries, {} regions with {} free.", + "Starting shard {} from checkpoint with {} entries, {} cached data, {} regions with {} free, with checksum write {}, read verification {}, checkpoint file {}", shardId_, entries_.size(), + succinctBytes(cachedBytes), numRegions_, - writableRegions_.size()); + writableRegions_.size(), + checksumEnabled_ ? "enabled" : "disabled", + checksumReadVerificationEnabled_ ? "enabled" : "disabled", + getCheckpointFilePath()); } } // namespace facebook::velox::cache diff --git a/velox/common/caching/SsdFile.h b/velox/common/caching/SsdFile.h index 31583cfcd23bf..434d41c342b9e 100644 --- a/velox/common/caching/SsdFile.h +++ b/velox/common/caching/SsdFile.h @@ -16,71 +16,82 @@ #pragma once +#include +#include + #include "velox/common/caching/AsyncDataCache.h" #include "velox/common/caching/SsdFileTracker.h" #include "velox/common/file/File.h" -#include - DECLARE_bool(ssd_odirect); DECLARE_bool(ssd_verify_write); namespace facebook::velox::cache { -// A 64 bit word describing a SSD cache entry in an SsdFile. The low -// 23 bits are the size, for a maximum entry size of 8MB. The high -// bits are the offset. +/// A 64 bit word describing a SSD cache entry in an SsdFile. The low 23 bits +/// are the size, for a maximum entry size of 8MB. The high bits are the offset. class SsdRun { public: static constexpr int32_t kSizeBits = 23; - SsdRun() : bits_(0) {} + SsdRun() : fileBits_(0) {} - SsdRun(uint64_t offset, uint32_t size) - : bits_((offset << kSizeBits) | ((size - 1))) { + SsdRun(uint64_t offset, uint32_t size, uint32_t checksum) + : fileBits_((offset << kSizeBits) | ((size - 1))), checksum_(checksum) { VELOX_CHECK_LT(offset, 1L << (64 - kSizeBits)); - VELOX_CHECK_LT(size - 1, 1 << kSizeBits); + VELOX_CHECK_NE(size, 0); + VELOX_CHECK_LE(size, 1 << kSizeBits); } - SsdRun(uint64_t bits) : bits_(bits) {} + SsdRun(uint64_t fileBits, uint32_t checksum) + : fileBits_(fileBits), checksum_(checksum) {} SsdRun(const SsdRun& other) = default; SsdRun(SsdRun&& other) = default; void operator=(const SsdRun& other) { - bits_ = other.bits_; + fileBits_ = other.fileBits_; + checksum_ = other.checksum_; } void operator=(SsdRun&& other) { - bits_ = other.bits_; + fileBits_ = other.fileBits_; + checksum_ = other.checksum_; } uint64_t offset() const { - return (bits_ >> kSizeBits); + return (fileBits_ >> kSizeBits); } uint32_t size() const { - return (bits_ & ((1 << kSizeBits) - 1)) + 1; + return (fileBits_ & ((1 << kSizeBits) - 1)) + 1; + } + + /// Returns the checksum computed with crc32. + uint32_t checksum() const { + return checksum_; } - // Returns raw bits for serialization. 
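A worked round-trip of the `SsdRun` packing may help here: the high 41 bits carry the offset and the low 23 bits carry `size - 1`, which is why the maximum entry size is 8MB and why the new crc32 travels in a separate `checksum_` field rather than inside the 64-bit word:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  constexpr int32_t kSizeBits = 23;
  const uint64_t offset = 3ULL << 26; // start of region 3 (64MB regions)
  const uint32_t size = 1 << 20;      // a 1MB entry

  // Pack: offset in the high bits, (size - 1) in the low 23 bits.
  const uint64_t fileBits = (offset << kSizeBits) | (size - 1);

  // Unpack and verify the round trip.
  assert((fileBits >> kSizeBits) == offset);
  assert((fileBits & ((1u << kSizeBits) - 1)) + 1 == size);
  return 0;
}
```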
- uint64_t bits() const { - return bits_; + /// Returns raw bits for offset and size for serialization. + uint64_t fileBits() const { + return fileBits_; } private: - uint64_t bits_; + // Contains the file offset and size. + uint64_t fileBits_; + uint32_t checksum_; }; -// Represents an SsdFile entry that is planned for load or being -// loaded. This is destroyed after load. Destruction decrements the -// pin count of the corresponding region of 'file_'. While there are -// pins, the region cannot be evicted. +/// Represents an SsdFile entry that is planned for load or being loaded. This +/// is destroyed after load. Destruction decrements the pin count of the +/// corresponding region of 'file_'. While there are pins, the region cannot be +/// evicted. class SsdPin { public: SsdPin() : file_(nullptr) {} - // Constructs a pin referencing 'run' in 'file'. The region must be - // pinned before constructing the pin. + /// Constructs a pin referencing 'run' in 'file'. The region must be pinned + /// before constructing the pin. SsdPin(SsdFile& file, SsdRun run); SsdPin(const SsdPin& other) = delete; @@ -118,7 +129,7 @@ class SsdPin { SsdRun run_; }; -// Metrics for SSD cache. Maintained by SsdFile and aggregated by SsdCache. +/// Metrics for SSD cache. Maintained by SsdFile and aggregated by SsdCache. struct SsdCacheStats { SsdCacheStats() {} @@ -129,10 +140,17 @@ struct SsdCacheStats { void operator=(const SsdCacheStats& other) { entriesWritten = tsanAtomicValue(other.entriesWritten); bytesWritten = tsanAtomicValue(other.bytesWritten); + checkpointsWritten = tsanAtomicValue(other.checkpointsWritten); entriesRead = tsanAtomicValue(other.entriesRead); + entriesRecovered = tsanAtomicValue(other.entriesRecovered); bytesRead = tsanAtomicValue(other.bytesRead); + checkpointsRead = tsanAtomicValue(other.checkpointsRead); entriesCached = tsanAtomicValue(other.entriesCached); + regionsCached = tsanAtomicValue(other.regionsCached); bytesCached = tsanAtomicValue(other.bytesCached); + entriesAgedOut = tsanAtomicValue(other.entriesAgedOut); + regionsAgedOut = tsanAtomicValue(other.regionsAgedOut); + regionsEvicted = tsanAtomicValue(other.regionsEvicted); numPins = tsanAtomicValue(other.numPins); openFileErrors = tsanAtomicValue(other.openFileErrors); @@ -141,93 +159,172 @@ struct SsdCacheStats { deleteCheckpointErrors = tsanAtomicValue(other.deleteCheckpointErrors); growFileErrors = tsanAtomicValue(other.growFileErrors); writeSsdErrors = tsanAtomicValue(other.writeSsdErrors); + writeSsdDropped = tsanAtomicValue(other.writeSsdDropped); writeCheckpointErrors = tsanAtomicValue(other.writeCheckpointErrors); readSsdErrors = tsanAtomicValue(other.readSsdErrors); readCheckpointErrors = tsanAtomicValue(other.readCheckpointErrors); + readSsdCorruptions = tsanAtomicValue(other.readSsdCorruptions); + readWithoutChecksumChecks = + tsanAtomicValue(other.readWithoutChecksumChecks); } - tsan_atomic entriesWritten{0}; - tsan_atomic bytesWritten{0}; - tsan_atomic entriesRead{0}; - tsan_atomic bytesRead{0}; + SsdCacheStats operator-(const SsdCacheStats& other) const { + SsdCacheStats result; + result.entriesWritten = entriesWritten - other.entriesWritten; + result.bytesWritten = bytesWritten - other.bytesWritten; + result.checkpointsWritten = checkpointsWritten - other.checkpointsWritten; + result.entriesRead = entriesRead - other.entriesRead; + result.entriesRecovered = entriesRecovered - other.entriesRecovered; + result.bytesRead = bytesRead - other.bytesRead; + result.checkpointsRead = checkpointsRead - 
        other.checkpointsRead;
+    result.entriesAgedOut = entriesAgedOut - other.entriesAgedOut;
+    result.regionsAgedOut = regionsAgedOut - other.regionsAgedOut;
+    result.regionsEvicted = regionsEvicted - other.regionsEvicted;
+    result.openFileErrors = openFileErrors - other.openFileErrors;
+    result.openCheckpointErrors =
+        openCheckpointErrors - other.openCheckpointErrors;
+    result.openLogErrors = openLogErrors - other.openLogErrors;
+    result.deleteCheckpointErrors =
+        deleteCheckpointErrors - other.deleteCheckpointErrors;
+    result.growFileErrors = growFileErrors - other.growFileErrors;
+    result.writeSsdErrors = writeSsdErrors - other.writeSsdErrors;
+    result.writeSsdDropped = writeSsdDropped - other.writeSsdDropped;
+    result.writeCheckpointErrors =
+        writeCheckpointErrors - other.writeCheckpointErrors;
+    result.readSsdCorruptions = readSsdCorruptions - other.readSsdCorruptions;
+    result.readSsdErrors = readSsdErrors - other.readSsdErrors;
+    result.readCheckpointErrors =
+        readCheckpointErrors - other.readCheckpointErrors;
+    result.readWithoutChecksumChecks =
+        readWithoutChecksumChecks - other.readWithoutChecksumChecks;
+    return result;
+  }
+
+  void clear() {
+    *this = SsdCacheStats();
+  }
+
+  /// Snapshot stats
   tsan_atomic entriesCached{0};
+  tsan_atomic regionsCached{0};
   tsan_atomic bytesCached{0};
   tsan_atomic numPins{0};
 
+  /// Cumulative stats
+  tsan_atomic entriesWritten{0};
+  tsan_atomic bytesWritten{0};
+  tsan_atomic checkpointsWritten{0};
+  tsan_atomic entriesRead{0};
+  tsan_atomic entriesRecovered{0};
+  tsan_atomic bytesRead{0};
+  tsan_atomic checkpointsRead{0};
+  tsan_atomic entriesAgedOut{0};
+  tsan_atomic regionsAgedOut{0};
+  tsan_atomic regionsEvicted{0};
   tsan_atomic openFileErrors{0};
   tsan_atomic openCheckpointErrors{0};
   tsan_atomic openLogErrors{0};
   tsan_atomic deleteCheckpointErrors{0};
   tsan_atomic growFileErrors{0};
   tsan_atomic writeSsdErrors{0};
+  tsan_atomic writeSsdDropped{0};
   tsan_atomic writeCheckpointErrors{0};
   tsan_atomic readSsdErrors{0};
   tsan_atomic readCheckpointErrors{0};
+  tsan_atomic readSsdCorruptions{0};
+  tsan_atomic readWithoutChecksumChecks{0};
 };
 
-// A shard of SsdCache. Corresponds to one file on SSD. The data
-// backed by each SsdFile is selected on a hash of the storage file
-// number of the cached data. Each file consists of an integer number
-// of 64MB regions. Each region has a pin count and an read
-// count. Cache replacement takes place region by region, preferring
-// regions with a smaller read count. Entries do not span
-// regions. Otherwise entries are consecutive byte ranges inside
-// their region.
+/// A shard of SsdCache. Corresponds to one file on SSD. The data backed by each
+/// SsdFile is selected on a hash of the storage file number of the cached data.
+/// Each file consists of an integer number of 64MB regions. Each region has a
+/// pin count and a read count. Cache replacement takes place region by region,
+/// preferring regions with a smaller read count. Entries do not span regions.
+/// Otherwise entries are consecutive byte ranges inside their region.
class SsdFile { public: + struct Config { + Config( + const std::string& _fileName, + int32_t _shardId, + int32_t _maxRegions, + uint64_t _checkpointIntervalBytes = 0, + bool _disableFileCow = false, + bool _checksumEnabled = false, + bool _checksumReadVerificationEnabled = false, + folly::Executor* _executor = nullptr) + : fileName(_fileName), + shardId(_shardId), + maxRegions(_maxRegions), + checkpointIntervalBytes(_checkpointIntervalBytes), + disableFileCow(_disableFileCow), + checksumEnabled(_checksumEnabled), + checksumReadVerificationEnabled( + _checksumEnabled && _checksumReadVerificationEnabled), + executor(_executor){}; + + /// Name of cache file, used as prefix for checkpoint files. + const std::string fileName; + + /// Shard index within the SsdCache. + const int32_t shardId; + + /// Maximum size of the backing file in kRegionSize units. + const int32_t maxRegions; + + /// Checkpoint after every 'checkpointIntervalBytes' written into this + /// file. 0 means no checkpointing. This is set to 0 if checkpointing fails. + uint64_t checkpointIntervalBytes; + + /// True if copy on write should be disabled. + bool disableFileCow; + + /// If true, checksum write to SSD is enabled. + bool checksumEnabled; + + /// If true, checksum read verification from SSD is enabled. + bool checksumReadVerificationEnabled; + + /// Executor for async fsync in checkpoint. + folly::Executor* executor; + }; + static constexpr uint64_t kRegionSize = 1 << 26; // 64MB - // Constructs a cache backed by filename. Discards any previous - // contents of filename. - SsdFile( - const std::string& filename, - int32_t shardId, - int32_t maxRegions, - int64_t checkpointInternalBytes = 0, - bool disableFileCow = false, - folly::Executor* executor = nullptr); - - // Adds entries of 'pins' to this file. 'pins' must be in read mode and - // those pins that are successfully added to SSD are marked as being on SSD. - // The file of the entries must be a file that is backed by 'this'. + /// Constructs a cache backed by filename. Discards any previous contents of + /// filename. + SsdFile(const Config& config); + + /// Adds entries of 'pins' to this file. 'pins' must be in read mode and + /// those pins that are successfully added to SSD are marked as being on SSD. + /// The file of the entries must be a file that is backed by 'this'. void write(std::vector& pins); - // Finds an entry for 'key'. If no entry is found, the returned pin is empty. + /// Finds an entry for 'key'. If no entry is found, the returned pin is empty. SsdPin find(RawFileCacheKey key); - // Erases 'key' + /// Erases 'key' bool erase(RawFileCacheKey key); - - // Copies the data in 'ssdPins' into 'pins'. Coalesces IO for nearby - // entries if they are in ascending order and near enough. + /// Copies the data in 'ssdPins' into 'pins'. Coalesces IO for nearby + /// entries if they are in ascending order and near enough. CoalesceIoStats load( const std::vector& ssdPins, const std::vector& pins); - // Increments the pin count of the region of 'offset'. + /// Increments the pin count of the region of 'offset'. void pinRegion(uint64_t offset); - // Decrements the pin count of the region of 'offset'. If the pin count goes - // to zero and evict is due, starts the eviction. + /// Decrements the pin count of the region of 'offset'. If the pin count goes + /// to zero and evict is due, starts the eviction. void unpinRegion(uint64_t offset); - // Asserts that the region of 'offset' is pinned. This is called by - // the pin holder. 
The pin count can be read without mutex. + /// Asserts that the region of 'offset' is pinned. This is called by the pin + /// holder. The pin count can be read without mutex. void checkPinned(uint64_t offset) const { tsan_lock_guard l(mutex_); VELOX_CHECK_GT(regionPins_[regionIndex(offset)], 0); } - // Returns the region number corresponding to offset. - static int32_t regionIndex(uint64_t offset) { - return offset / kRegionSize; - } - - // Updates the read count of a region. - void regionRead(int32_t region, int32_t size) { - tracker_.regionRead(region, size); - } - int32_t maxRegions() const { return maxRegions_; } @@ -236,34 +333,99 @@ class SsdFile { return shardId_; } - // Adds 'stats_' to 'stats'. + /// Adds 'stats_' to 'stats'. void updateStats(SsdCacheStats& stats) const; - // Resets this' to a post-construction empty state. See SsdCache::clear(). - void clear(); + /// Remove cached entries of files in the fileNum set 'filesToRemove'. If + /// successful, return true, and 'filesRetained' contains entries that should + /// not be removed, ex., from pinned regions. Otherwise, return false and + /// 'filesRetained' could be ignored. + bool removeFileEntries( + const folly::F14FastSet& filesToRemove, + folly::F14FastSet& filesRetained); + + /// Writes a checkpoint state that can be recovered from. The checkpoint is + /// serialized on 'mutex_'. If 'force' is false, rechecks that at least + /// 'checkpointIntervalBytes_' have been written since last checkpoint and + /// silently returns if not. + void checkpoint(bool force = false); - // Deletes the backing file. Used in testing. - void deleteFile(); + /// Deletes checkpoint files. If 'keepLog' is true, truncates and syncs the + /// eviction log and leaves this open. + void deleteCheckpoint(bool keepLog = false); - // Writes a checkpoint state that can be recovered from. The - // checkpoint is serialized on 'mutex_'. If 'force' is false, - // rechecks that at least 'checkpointIntervalBytes_' have been - // written since last checkpoint and silently returns if not. - void checkpoint(bool force = false); + /// Returns the SSD file path. + const std::string& fileName() const { + return fileName_; + } + + /// Returns the eviction log file path. + std::string getEvictLogFilePath() const { + return fileName_ + kLogExtension; + } + + /// Returns the checkpoint file path. + std::string getCheckpointFilePath() const { + return fileName_ + kCheckpointExtension; + } + + /// Deletes the backing file. Used in testing. + void testingDeleteFile(); + + /// Resets this' to a post-construction empty state. See SsdCache::clear(). + /// + /// NOTE: this is only used by test and Prestissimo worker operation. + void clear(); /// Returns true if copy on write is disabled for this file. Used in testing. bool testingIsCowDisabled() const; + std::vector testingCopyScores() { + return tracker_.copyScores(); + } + + int32_t testingNumWritableRegions() const { + return writableRegions_.size(); + } + + const folly::F14FastMap& testingEntries() { + return entries_; + } + + bool testingChecksumReadVerificationEnabled() const { + return checksumReadVerificationEnabled_; + } + private: - // 4 first bytes of a checkpoint file. Allows distinguishing between format - // versions. - static constexpr const char* kCheckpointMagic = "CPT1"; // Magic number separating file names from cache entry data in checkpoint // file. static constexpr int64_t kCheckpointMapMarker = 0xfffffffffffffffe; // Magic number at end of completed checkpoint file. 
static constexpr int64_t kCheckpointEndMarker = 0xcbedf11e; + static constexpr int kMaxErasedSizePct = 50; + + // Updates the read count of a region. + void regionRead(int32_t region, int32_t size) { + tracker_.regionRead(region, size); + } + + // Returns the region number corresponding to 'offset'. + static int32_t regionIndex(uint64_t offset) { + return offset / kRegionSize; + } + + // Returns the offset within a region corresponding to 'offset'. + static int32_t regionOffset(uint64_t offset) { + return offset % kRegionSize; + } + + // The first 4 bytes of a checkpoint file contains version string to indicate + // if checksum write is enabled or not. + std::string checkpointVersion() const { + return checksumEnabled_ ? "CPT2" : "CPT1"; + } + // Increments the pin count of the region of 'offset'. Caller must hold // 'mutex_'. void pinRegionLocked(uint64_t offset) { @@ -292,13 +454,9 @@ class SsdFile { // Verifies that 'entry' has the data at 'run'. void verifyWrite(AsyncDataCacheEntry& entry, SsdRun run); - // Deletes checkpoint files. If 'keepLog' is true, truncates and syncs the - // eviction log and leaves this open. - void deleteCheckpoint(bool keepLog = false); - - // Reads a checkpoint state file and sets 'this' accordingly if read - // is successful. Return true for successful read. A failed read - // deletes the checkpoint and leaves the log truncated open. + // Reads a checkpoint state file and sets 'this' accordingly if read is + // successful. Return true for successful read. A failed read deletes the + // checkpoint and leaves the log truncated open. void readCheckpoint(std::ifstream& state); // Logs an error message, deletes the checkpoint and stop making new @@ -312,10 +470,41 @@ class SsdFile { // the files for making new checkpoints. void initializeCheckpoint(); - // Synchronously logs that 'regions' are no longer valid in a possibly xisting - // checkpoint. + // Writes 'iovecs' to the SSD file at the 'offset'. Returns true if the write + // succeeds; otherwise, log the error and return false. + bool + write(uint64_t offset, uint64_t length, const std::vector& iovecs); + + // Synchronously logs that 'regions' are no longer valid in a possibly + // existing checkpoint. void logEviction(const std::vector& regions); + // Computes the checksum of data in cache 'entry'. + uint32_t checksumEntry(const AsyncDataCacheEntry& entry) const; + + // Returns true if checkpoint has been enabled. + bool checkpointEnabled() const { + return checkpointIntervalBytes_ > 0; + } + + // Returns true if checkpoint is needed. + bool needCheckpoint(bool force) const { + if (!checkpointEnabled()) { + return false; + } + return force || (bytesAfterCheckpoint_ >= checkpointIntervalBytes_); + } + + void maybeVerifyChecksum( + const AsyncDataCacheEntry& entry, + const SsdRun& ssdRun); + + // Returns true if checksum write is enabled for the given version. + static bool isChecksumEnabledOnCheckpointVersion( + const std::string& checkpointVersion) { + return checkpointVersion == "CPT2"; + } + static constexpr const char* kLogExtension = ".log"; static constexpr const char* kCheckpointExtension = ".cpt"; @@ -325,11 +514,20 @@ class SsdFile { // Maximum size of the backing file in kRegionSize units. const int32_t maxRegions_; - // Serializes access to all private data members. - mutable std::shared_mutex mutex_; + // True if copy on write should be disabled. + const bool disableFileCow_; + + // If true, checksum write to SSD is enabled. 
+ const bool checksumEnabled_; + + // If true, checksum read verification from SSD is enabled. + const bool checksumReadVerificationEnabled_; // Shard index within 'cache_'. - int32_t shardId_; + const int32_t shardId_; + + // Serializes access to all private data members. + mutable std::shared_mutex mutex_; // Number of kRegionSize regions in the file. int32_t numRegions_{0}; @@ -340,10 +538,12 @@ class SsdFile { bool suspended_{false}; // Number of used bytes in each region. A new entry must fit between the - // offset and the end of the region. This is subscripted with the region + // offset and the end of the region. This is sub-scripted with the region // index. The regionIndex times kRegionSize is an offset into the file. std::vector regionSizes_; + std::vector erasedRegionSizes_; + // Indices of regions available for writing new entries. std::vector writableRegions_; @@ -368,9 +568,8 @@ class SsdFile { // Counters. SsdCacheStats stats_; - // Checkpoint after every 'checkpointIntervalBytes_' written into - // this file. 0 means no checkpointing. This is set to 0 if - // checkpointing fails. + // Checkpoint after every 'checkpointIntervalBytes_' written into this file. 0 + // means no checkpointing. This is set to 0 if checkpointing fails. int64_t checkpointIntervalBytes_{0}; // Executor for async fsync in checkpoint. diff --git a/velox/common/caching/SsdFileTracker.cpp b/velox/common/caching/SsdFileTracker.cpp index 32e3c307e352b..9e05695b1248e 100644 --- a/velox/common/caching/SsdFileTracker.cpp +++ b/velox/common/caching/SsdFileTracker.cpp @@ -30,19 +30,19 @@ void SsdFileTracker::fileTouched(int32_t totalEntries) { } void SsdFileTracker::regionFilled(int32_t region) { - const uint64_t best = + const double best = *std::max_element(regionScores_.begin(), regionScores_.end()); - regionScores_[region] = std::max(regionScores_[region], best * 1.1); + regionScores_[region] = std::max(regionScores_[region], best * 1.1); } std::vector SsdFileTracker::findEvictionCandidates( int32_t numCandidates, int32_t numRegions, const std::vector& regionPins) { - // Calculates average score of regions wiht no pins. Returns up to - // 'numCandidates' unpinned regions with score <= average, lowest - // scoring region first. - int64_t scoreSum = 0; + // Calculates average score of regions with no pins. Returns up to + // 'numCandidates' unpinned regions with score <= average, lowest scoring + // region first. + double scoreSum = 0; int32_t numUnpinned = 0; for (int i = 0; i < numRegions; ++i) { if (regionPins[i] > 0) { diff --git a/velox/common/caching/SsdFileTracker.h b/velox/common/caching/SsdFileTracker.h index e8dd2154fea78..5ab825f50b88f 100644 --- a/velox/common/caching/SsdFileTracker.h +++ b/velox/common/caching/SsdFileTracker.h @@ -24,9 +24,9 @@ namespace facebook::velox::cache { -// Tracks reads on an SsdFile. Reads are counted for fixed size regions and -// periodically decayed. Not thread safe, synchronization is the caller's -// responsibility. +/// Tracks reads on an SsdFile. Reads are counted for fixed size regions and +/// periodically decayed. Not thread safe, synchronization is the caller's +/// responsibility. class SsdFileTracker { public: void resize(int32_t numRegions) { @@ -42,9 +42,9 @@ class SsdFileTracker { } // Marks that a region has been filled and transits from writable to - // evictable. Set its score to be at least the best score + - // a small margin so that it gets time to live. Otherwise it has had - // the least time to get hits and would be the first evicted. 
+ // evictable. Set its score to be at least the best score + a small margin so + // that it gets time to live. Otherwise, it has had the least time to get hits + // and would be the first evicted. void regionFilled(int32_t region); // Increments event count and periodically decays @@ -52,21 +52,21 @@ class SsdFileTracker { // tracked file. void fileTouched(int32_t totalEntries); - // Returns up to 'numCandidates' least used regions. 'numRegions' is - // the count of existing regions. This can be less than the size of - // the tracker if the file cannot grow to full size. Regions with a - // non-zero count in 'regionPins' are not considered. + /// Returns up to 'numCandidates' least used regions. 'numRegions' is the + /// count of existing regions. This can be less than the size of the tracker + /// if the file cannot grow to full size. Regions with a non-zero count in + /// 'regionPins' are not considered. std::vector findEvictionCandidates( int32_t numCandidates, int32_t numRegions, const std::vector& regionPins); // Expose the region access data. Used in checkpointing cache state. - std::vector>& regionScores() { + std::vector>& regionScores() { return regionScores_; } - void setRegionScores(const std::vector& scores) { + void setRegionScores(const std::vector& scores) { VELOX_CHECK_EQ(scores.size(), regionScores_.size()); for (auto i = 0; i < scores.size(); ++i) { regionScores_[i] = scores[i]; @@ -76,18 +76,25 @@ class SsdFileTracker { /// Exports a copy of the scores. Tsan will report an error if a /// pointer to atomics is passed to write(). Therefore copy the /// atomics into non-atomics before writing. - std::vector copyScores() { - std::vector scores(regionScores_.size()); + std::vector copyScores() { + std::vector scores(regionScores_.size()); for (auto i = 0; i < scores.size(); ++i) { scores[i] = tsanAtomicValue(regionScores_[i]); } return scores; } + /// Resets scores of all regions. + /// + /// NOTE: this is only used by test and Prestissimo worker operation. + void clear() { + std::fill(regionScores_.begin(), regionScores_.end(), 0); + } + private: static constexpr int32_t kDecayInterval = 1000; - std::vector> regionScores_; + std::vector> regionScores_; // Count of lookups. The scores are decayed every time the count goes // over kDecayInterval or half count of cache entries, whichever comes first. 
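For reference, the selection policy documented above can be written out in a few lines. This is a standalone rewrite of the idea (average the scores of unpinned regions, return the unpinned regions at or below that average, lowest score first), not the tracker's verbatim code:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<int32_t> findEvictionCandidates(
    int32_t numCandidates,
    int32_t numRegions,
    const std::vector<double>& scores,
    const std::vector<int32_t>& regionPins) {
  // Average score over regions with no pins; pinned regions are skipped.
  double scoreSum = 0;
  int32_t numUnpinned = 0;
  for (int32_t i = 0; i < numRegions; ++i) {
    if (regionPins[i] == 0) {
      scoreSum += scores[i];
      ++numUnpinned;
    }
  }
  std::vector<int32_t> candidates;
  if (numUnpinned == 0) {
    return candidates; // Everything is pinned; nothing can be evicted.
  }
  const double avg = scoreSum / numUnpinned;
  for (int32_t i = 0; i < numRegions; ++i) {
    if (regionPins[i] == 0 && scores[i] <= avg) {
      candidates.push_back(i);
    }
  }
  // Lowest score first; keep at most 'numCandidates'.
  std::sort(candidates.begin(), candidates.end(), [&](int32_t a, int32_t b) {
    return scores[a] < scores[b];
  });
  if (candidates.size() > static_cast<size_t>(numCandidates)) {
    candidates.resize(numCandidates);
  }
  return candidates;
}
```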
diff --git a/velox/common/caching/StringIdMap.cpp b/velox/common/caching/StringIdMap.cpp index 9d500ff19aeba..c8c88542da1e9 100644 --- a/velox/common/caching/StringIdMap.cpp +++ b/velox/common/caching/StringIdMap.cpp @@ -29,25 +29,25 @@ uint64_t StringIdMap::id(std::string_view string) { void StringIdMap::release(uint64_t id) { std::lock_guard l(mutex_); - auto it = idToString_.find(id); - if (it != idToString_.end()) { + auto it = idToEntry_.find(id); + if (it != idToEntry_.end()) { VELOX_CHECK_LT( 0, it->second.numInUse, "Extra release of id in StringIdMap"); if (--it->second.numInUse == 0) { pinnedSize_ -= it->second.string.size(); auto strIter = stringToId_.find(it->second.string); - assert(strIter != stringToId_.end()); + VELOX_DCHECK(strIter != stringToId_.end()); stringToId_.erase(strIter); - idToString_.erase(it); + idToEntry_.erase(it); } } } void StringIdMap::addReference(uint64_t id) { std::lock_guard l(mutex_); - auto it = idToString_.find(id); + auto it = idToEntry_.find(id); VELOX_CHECK( - it != idToString_.end(), + it != idToEntry_.end(), "Trying to add a reference to id {} that is not in StringIdMap", id); @@ -58,16 +58,15 @@ uint64_t StringIdMap::makeId(std::string_view string) { std::lock_guard l(mutex_); auto it = stringToId_.find(string); if (it != stringToId_.end()) { - auto entry = idToString_.find(it->second); - VELOX_CHECK(entry != idToString_.end()); + auto entry = idToEntry_.find(it->second); + VELOX_CHECK(entry != idToEntry_.end()); if (++entry->second.numInUse == 1) { pinnedSize_ += entry->second.string.size(); } - return it->second; } Entry entry; - entry.string = std::string(string); + entry.string = string; // Check that we do not use an id twice. In practice this never // happens because the int64 counter would have to wrap around for // this. Even if this happened, the time spent in the loop would @@ -75,13 +74,44 @@ uint64_t StringIdMap::makeId(std::string_view string) { // be in the 100K range. 
do { entry.id = ++lastId_; - } while (idToString_.find(entry.id) != idToString_.end()); + } while (idToEntry_.find(entry.id) != idToEntry_.end()); entry.numInUse = 1; - pinnedSize_ += entry.string.size(); - auto id = entry.id; - auto& entryInTable = idToString_[id] = std::move(entry); - stringToId_[entryInTable.string] = entry.id; + pinnedSize_ += string.size(); + const auto id = entry.id; + idToEntry_[id] = std::move(entry); + stringToId_[string] = id; return lastId_; } +uint64_t StringIdMap::recoverId(uint64_t id, std::string_view string) { + std::lock_guard l(mutex_); + auto it = stringToId_.find(string); + if (it != stringToId_.end()) { + VELOX_CHECK_EQ( + id, it->second, "Multiple recover ids assigned to {}", string); + auto entry = idToEntry_.find(it->second); + VELOX_CHECK(entry != idToEntry_.end()); + if (++entry->second.numInUse == 1) { + pinnedSize_ += entry->second.string.size(); + } + return id; + } + + VELOX_CHECK_EQ( + idToEntry_.count(id), + 0, + "Reused recover id {} assigned to {}", + id, + string); + + Entry entry; + entry.string = string; + entry.id = id; + lastId_ = std::max(lastId_, id); + entry.numInUse = 1; + pinnedSize_ += string.size(); + idToEntry_[id] = std::move(entry); + stringToId_[string] = id; + return id; +} } // namespace facebook::velox diff --git a/velox/common/caching/StringIdMap.h b/velox/common/caching/StringIdMap.h index c5ec74fd4121a..582de4d09c537 100644 --- a/velox/common/caching/StringIdMap.h +++ b/velox/common/caching/StringIdMap.h @@ -28,39 +28,66 @@ class StringIdMap { public: static constexpr uint64_t kNoId = ~0UL; - StringIdMap() {} + StringIdMap() = default; StringIdMap(const StringIdMap& other) = delete; StringIdMap(StringIdMap&& other) = delete; void operator=(const StringIdMap& other) = delete; void operator=(StringIdMap&& other) = delete; - // Returns the id of 'string' or kNoId if the string is not known. + /// Returns the id of 'string' or kNoId if the string is not known. uint64_t id(std::string_view string); - // Returns the total length of strings involved in currently referenced - // mappings. + /// Returns the total length of strings involved in currently referenced + /// mappings. int64_t pinnedSize() const { return pinnedSize_; } - // Returns the id for 'string' and increments its use count. Assigns a - // new id if none exists. must be released with release() when no longer used. + /// Returns the id for 'string' and increments its use count. Assigns a + /// new id if none exists. Must be released with release() when no longer + /// used. uint64_t makeId(std::string_view string); + /// Returns the id for 'string' and increments its use count. Assigns a new id + /// if none exists. Must be released with release() when no longer used. + /// Recovers string id map by assigning 'id' to 'string' and increments its + /// use count. The function returns the recovered 'id'. It throws if 'id' has + /// already been assigned to other string or 'string' has already assigned a + /// different id. Must be released with release() when no longer used. + /// + /// NOTE: this is used by SSD cache to recover the file name to file id + /// mapping. This is to ensure the same file to be mapped to the same SSD file + /// shard after recover. + uint64_t recoverId(uint64_t id, std::string_view string); + // Decrements the use count of id and may free the associated memory if no // uses remain. void release(uint64_t id); - // Increments the use count of 'id'. + /// Increments the use count of 'id'. 
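The new three-argument `StringIdLease` constructor funnels into `recoverId()`, which pins a specific id to a string so that a file maps to the same SSD shard across restarts. A usage sketch (the file path is made up, and error handling is omitted):

```cpp
#include "velox/common/caching/StringIdMap.h"

using namespace facebook::velox;

void recoverFileIds(StringIdMap& ids) {
  // Normal path: a fresh lease gets whatever id happens to be free.
  StringIdLease fresh(ids, "warehouse/orders/part-0.orc"); // hypothetical path

  // Recovery path: re-bind the id persisted in the checkpoint to the name
  // read back from disk. recoverId() throws if the id or the string is
  // already bound to something else.
  const uint64_t persistedId = fresh.id(); // stand-in for a checkpointed id
  StringIdLease recovered(ids, persistedId, "warehouse/orders/part-0.orc");
  // Both leases now pin the same id, so the file hashes to the same shard.
}
```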
void addReference(uint64_t id); - // Returns a copy of the string associated with id or empty string if id has - // no string. + /// Returns a copy of the string associated with id or empty string if id has + /// no string. std::string string(uint64_t id) { std::lock_guard l(mutex_); - auto it = idToString_.find(id); - return it == idToString_.end() ? "" : it->second.string; + auto it = idToEntry_.find(id); + return it == idToEntry_.end() ? "" : it->second.string; + } + + /// Resets StringIdMap. + void testingReset() { + std::lock_guard l(mutex_); + stringToId_.clear(); + idToEntry_.clear(); + lastId_ = 0; + pinnedSize_ = 0; + } + + uint64_t testingLastId() const { + std::lock_guard l(mutex_); + return lastId_; } private: @@ -70,23 +97,26 @@ class StringIdMap { uint32_t numInUse{}; }; - std::mutex mutex_; + mutable std::mutex mutex_; folly::F14FastMap stringToId_; - folly::F14FastMap idToString_; - uint64_t lastId_{}; - uint64_t pinnedSize_{}; + folly::F14FastMap idToEntry_; + uint64_t lastId_{0}; + uint64_t pinnedSize_{0}; }; -// Keeps a string-id association live for the duration of this. +/// Keeps a string-id association live for the duration of this. class StringIdLease { public: StringIdLease() = default; - // Makes a lease for 'string' and makes sure it has an id. + /// Makes a lease for 'string' and makes sure it has an id. StringIdLease(StringIdMap& ids, std::string_view string) : ids_(&ids), id_(ids_->makeId(string)) {} - // Makes a new lease for an id that already references a string. + StringIdLease(StringIdMap& ids, uint64_t id, std::string_view string) + : ids_(&ids), id_(ids_->recoverId(id, string)) {} + + /// Makes a new lease for an id that already references a string. StringIdLease(StringIdMap& ids, uint64_t id) : ids_(&ids), id_(id) { ids_->addReference(id_); } @@ -106,6 +136,15 @@ class StringIdLease { other.id_ = StringIdMap::kNoId; } + void operator=(const StringIdLease& other) { + clear(); + ids_ = other.ids_; + if (ids_ && other.id_ != StringIdMap::kNoId) { + ids_->addReference(other.id_); + } + id_ = other.id_; + } + void operator=(StringIdLease&& other) noexcept { clear(); ids_ = other.ids_; @@ -130,15 +169,6 @@ class StringIdLease { return id_ != StringIdMap::kNoId; } - void operator=(const StringIdLease& other) { - clear(); - ids_ = other.ids_; - if (ids_ && other.id_ != StringIdMap::kNoId) { - ids_->addReference(other.id_); - } - id_ = other.id_; - } - uint64_t id() const { return id_; } diff --git a/velox/common/caching/tests/AsyncDataCacheTest.cpp b/velox/common/caching/tests/AsyncDataCacheTest.cpp index 4c2714db04b19..bb334036cb19f 100644 --- a/velox/common/caching/tests/AsyncDataCacheTest.cpp +++ b/velox/common/caching/tests/AsyncDataCacheTest.cpp @@ -14,12 +14,17 @@ * limitations under the License. 
*/ +#include "folly/experimental/EventCount.h" +#include "velox/common/base/Semaphore.h" #include "velox/common/base/tests/GTestUtils.h" +#include "velox/common/caching/CacheTTLController.h" #include "velox/common/caching/FileIds.h" #include "velox/common/caching/SsdCache.h" #include "velox/common/file/FileSystems.h" #include "velox/common/memory/Memory.h" #include "velox/common/memory/MmapAllocator.h" +#include "velox/common/testutil/ScopedTestTime.h" +#include "velox/common/testutil/TestValue.h" #include "velox/exec/tests/utils/TempDirectoryPath.h" #include @@ -28,11 +33,11 @@ #include #include -#include #include using namespace facebook::velox; using namespace facebook::velox::cache; +using namespace facebook::velox::common::testutil; using facebook::velox::memory::MemoryAllocator; @@ -45,8 +50,19 @@ struct Request { SsdPin ssdPin; }; -class AsyncDataCacheTest : public testing::Test { +struct TestParam { + bool checksumEnabled; + bool checksumVerificationEnabled; +}; + +class AsyncDataCacheTest : public ::testing::TestWithParam { public: + static std::vector getTestParams() { + static std::vector testParams = { + {false, false}, {true, false}, {true, true}}; + return testParams; + } + // Deterministically fills 'allocation' based on 'sequence' static void initializeContents(int64_t sequence, memory::Allocation& alloc) { for (int32_t i = 0; i < alloc.numRuns(); ++i) { @@ -62,25 +78,49 @@ class AsyncDataCacheTest : public testing::Test { protected: static constexpr int32_t kNumFiles = 100; + static constexpr int32_t kNumSsdShards = 4; + + static void SetUpTestCase() { + TestValue::enable(); + } void SetUp() override { filesystems::registerLocalFileSystem(); } void TearDown() override { - if (executor_) { - executor_->join(); - } if (cache_) { - auto ssdCache = cache_->ssdCache(); + cache_->shutdown(); + auto* ssdCache = cache_->ssdCache(); if (ssdCache) { ssdCache->testingDeleteFiles(); } - cache_->shutdown(); } + if (loadExecutor_ != nullptr) { + loadExecutor_->join(); + } + filenames_.clear(); + CacheTTLController::testingClear(); + fileIds().testingReset(); } - void initializeCache(uint64_t maxBytes, int64_t ssdBytes = 0) { + void waitForPendingLoads() { + while (numPendingLoads_ > 0) { + std::this_thread::sleep_for(std::chrono::microseconds(2000)); // NOLINT + } + } + + void initializeCache( + uint64_t maxBytes, + int64_t ssdBytes = 0, + uint64_t checkpointIntervalBytes = 0, + bool eraseCheckpoint = false, + AsyncDataCache::Options cacheOptions = {}) { + if (cache_ != nullptr) { + cache_->shutdown(); + } + cache_.reset(); + std::unique_ptr ssdCache; if (ssdBytes > 0) { // tmpfs does not support O_DIRECT, so turn this off for testing. @@ -88,27 +128,31 @@ class AsyncDataCacheTest : public testing::Test { // Make a new tempDirectory only if one is not already set. The // second creation of cache must find the checkpoint of the // previous one. - if (tempDirectory_ == nullptr) { + if (tempDirectory_ == nullptr || eraseCheckpoint) { tempDirectory_ = exec::test::TempDirectoryPath::create(); } - ssdCache = std::make_unique( - fmt::format("{}/cache", tempDirectory_->path), + SsdCache::Config config( + fmt::format("{}/cache", tempDirectory_->getPath()), ssdBytes, - 4, - executor(), - ssdBytes / 20); - } - - if (cache_ != nullptr) { - cache_->shutdown(); + kNumSsdShards, + ssdExecutor(), + checkpointIntervalBytes > 0 ? 
checkpointIntervalBytes : ssdBytes / 20, + false, + GetParam().checksumEnabled, + GetParam().checksumVerificationEnabled); + ssdCache = std::make_unique(config); } - cache_.reset(); - allocator_.reset(); - memory::MmapAllocator::Options options; - options.capacity = maxBytes; - allocator_ = std::make_shared(options); - cache_ = AsyncDataCache::create(allocator_.get(), std::move(ssdCache)); + memory::MemoryManagerOptions options; + options.useMmapAllocator = true; + options.allocatorCapacity = maxBytes; + options.arbitratorCapacity = maxBytes; + options.arbitratorReservedCapacity = 0; + options.trackDefaultUsage = true; + manager_ = std::make_unique(options); + allocator_ = static_cast(manager_->allocator()); + cache_ = + AsyncDataCache::create(allocator_, std::move(ssdCache), cacheOptions); if (filenames_.empty()) { for (auto i = 0; i < kNumFiles; ++i) { auto name = fmt::format("testing_file_{}", i); @@ -125,9 +169,14 @@ class AsyncDataCacheTest : public testing::Test { void loadOne(uint64_t fileNum, Request& request, bool injectError); // Brings the data for the ranges in 'requests' into cache. The individual - // entries should be accessed with loadOne(). - void - loadBatch(uint64_t fileNum, std::vector& requests, bool injectError); + // entries should be accessed with loadOne(). 'requests' are handled with one + // TestingCoalescedSsdLoad and one TestingCoalescedLoad. Call + // semaphore.acquire() twice if needing to wait for the two loads to finish. + void loadBatch( + uint64_t fileNum, + std::vector& requests, + bool injectError, + Semaphore* semaphore = nullptr); // Gets a pin on each of 'requests' individually. This checks the contents via // cache_'s verifyHook. @@ -140,13 +189,15 @@ class AsyncDataCacheTest : public testing::Test { } } + void loadNFiles(int32_t numFiles, std::vector offsets); + // Loads a sequence of entries from a number of files. Looks up a // number of entries, then loads the ones that nobody else is // loading. Stops after loading 'loadBytes' worth of entries. If // 'errorEveryNBatches' is non-0, every nth load batch will have a // bad read and wil be dropped. The entries of the failed batch read // will still be accessed one by one. If 'largeEveryNBatches' is - // non-0, allocates and freees a single allocation of 'largeBytes' + // non-0, allocates and frees a single allocation of 'largeBytes' // every so many batches. This creates extra memory pressure, as // happens when allocating large hash tables in queries. void loadLoop( @@ -162,7 +213,7 @@ class AsyncDataCacheTest : public testing::Test { std::vector threads; threads.reserve(numThreads); for (int32_t i = 0; i < numThreads; ++i) { - threads.push_back(std::thread([this, i, func]() { func(i); })); + threads.push_back(std::thread([i, func]() { func(i); })); } for (auto& thread : threads) { thread.join(); @@ -192,6 +243,12 @@ class AsyncDataCacheTest : public testing::Test { } } + static void waitForSsdWriteToFinish(const SsdCache* ssdCache) { + while (ssdCache->writeInProgress()) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // NOLINT + } + } + CachePin newEntry(uint64_t offset, int32_t size) { folly::SemiFuture wait(false); try { @@ -206,15 +263,26 @@ class AsyncDataCacheTest : public testing::Test { }; } - folly::IOThreadPoolExecutor* executor() { + folly::IOThreadPoolExecutor* loadExecutor() { static std::mutex mutex; std::lock_guard l(mutex); - if (!executor_) { + if (loadExecutor_ == nullptr) { // We have up to 20 threads. 
Some tests run at max 16 threads so // that there are threads left over for SSD background write. - executor_ = std::make_unique(20); + loadExecutor_ = std::make_unique(20); } - return executor_.get(); + return loadExecutor_.get(); + } + + folly::IOThreadPoolExecutor* ssdExecutor() { + static std::mutex mutex; + std::lock_guard l(mutex); + if (ssdExecutor_ == nullptr) { + // We have up to 20 threads. Some tests run at max 16 threads so + // that there are threads left over for SSD background write. + ssdExecutor_ = std::make_unique(20); + } + return ssdExecutor_.get(); } void clearAllocations(std::deque& allocations) { @@ -225,11 +293,14 @@ class AsyncDataCacheTest : public testing::Test { } std::shared_ptr tempDirectory_; - std::shared_ptr allocator_; + std::unique_ptr manager_; + memory::MemoryAllocator* allocator_; std::shared_ptr cache_; std::vector filenames_; - std::unique_ptr executor_; + std::unique_ptr loadExecutor_; + std::unique_ptr ssdExecutor_; int32_t numLargeRetries_{0}; + std::atomic_int64_t numPendingLoads_{0}; }; class TestingCoalescedLoad : public CoalescedLoad { @@ -382,7 +453,8 @@ void AsyncDataCacheTest::loadOne( void AsyncDataCacheTest::loadBatch( uint64_t fileNum, std::vector& requests, - bool injectError) { + bool injectError, + Semaphore* semaphore) { // Pattern for loading a set of buffers from a file: Divide the requested // ranges between already loaded and loadable from storage. std::vector fromStorage; @@ -415,13 +487,22 @@ void AsyncDataCacheTest::loadBatch( } auto load = std::make_shared( std::move(keys), std::move(sizes), cache_, injectError); - executor()->add([load]() { + ++numPendingLoads_; + loadExecutor()->add([this, load, semaphore]() { + SCOPE_EXIT { + --numPendingLoads_; + }; try { load->loadOrFuture(nullptr); } catch (const std::exception& e) { // Expecting error, ignore. }; + if (semaphore) { + semaphore->release(); + } }); + } else if (semaphore) { + semaphore->release(); } if (!fromSsd.empty()) { @@ -439,13 +520,50 @@ void AsyncDataCacheTest::loadBatch( std::move(ssdPins), cache_, injectError); - executor()->add([load]() { + ++numPendingLoads_; + loadExecutor()->add([this, load, semaphore]() { + SCOPE_EXIT { + --numPendingLoads_; + }; try { load->loadOrFuture(nullptr); } catch (const std::exception& e) { // Expecting error, ignore. }; + if (semaphore) { + semaphore->release(); + } }); + } else if (semaphore) { + semaphore->release(); + } +} + +void AsyncDataCacheTest::loadNFiles( + int32_t numFiles, + std::vector offsets) { + Semaphore semaphore(0); + + std::vector batch; + int32_t numLoads = 0; + for (auto file = 0; file < numFiles; ++file) { + auto fileNum = filenames_[file].id(); + if (auto instance = CacheTTLController::getInstance()) { + instance->addOpenFileInfo(fileNum); + } + for (auto i = 0; i < offsets.size() - 1; i++) { + batch.emplace_back(offsets[i], offsets[i + 1] - offsets[i]); + if (batch.size() == 8 || i == (offsets.size() - 2)) { + loadBatch(fileNum, batch, false, &semaphore); + batch.clear(); + numLoads += + 2; // One TestingCoalescedSsdLoad and one TestingCoalescedLoad. 
+ } + } + + for (auto i = 0; i < numLoads; i++) { + semaphore.acquire(); } } @@ -511,7 +629,7 @@ void AsyncDataCacheTest::loadLoop( } } -TEST_F(AsyncDataCacheTest, pin) { +TEST_P(AsyncDataCacheTest, pin) { constexpr int64_t kSize = 25000; initializeCache(1 << 20); auto& exec = folly::QueuedImmediateExecutor::instance(); @@ -578,14 +696,14 @@ TEST_F(AsyncDataCacheTest, pin) { EXPECT_EQ(0, cache_->incrementPrefetchPages(0)); } -TEST_F(AsyncDataCacheTest, replace) { +TEST_P(AsyncDataCacheTest, replace) { constexpr int64_t kMaxBytes = 64 << 20; FLAGS_velox_exception_user_stacktrace_enabled = false; initializeCache(kMaxBytes); // Load 10x the max size, inject an error every 21 batches. loadLoop(0, kMaxBytes * 10, 21); - if (executor_) { - executor_->join(); + if (loadExecutor_ != nullptr) { + loadExecutor_->join(); } auto stats = cache_->refreshStats(); EXPECT_LT(0, stats.numHit); @@ -596,16 +714,11 @@ TEST_F(AsyncDataCacheTest, replace) { cache_->incrementCachedPages(0)); } -TEST_F(AsyncDataCacheTest, evictAccounting) { +TEST_P(AsyncDataCacheTest, evictAccounting) { constexpr int64_t kMaxBytes = 64 << 20; FLAGS_velox_exception_user_stacktrace_enabled = false; initializeCache(kMaxBytes); - auto memoryManager = - std::make_unique(memory::MemoryManagerOptions{ - .capacity = (int64_t)allocator_->capacity(), - .trackDefaultUsage = true, - .allocator = allocator_.get()}); - auto pool = memoryManager->addLeafPool("test"); + auto pool = manager_->addLeafPool("test"); // We make allocations that we exchange for larger ones later. This will evict // cache. We check that the evictions are not counted on the pool even if they @@ -614,16 +727,17 @@ TEST_F(AsyncDataCacheTest, evictAccounting) { memory::ContiguousAllocation large; pool->allocateNonContiguous(1200, allocation); pool->allocateContiguous(1200, large); - EXPECT_EQ(memory::AllocationTraits::kPageSize * 2400, pool->currentBytes()); + EXPECT_EQ(memory::AllocationTraits::pageBytes(2400), pool->usedBytes()); loadLoop(0, kMaxBytes * 1.1); + waitForPendingLoads(); pool->allocateNonContiguous(2400, allocation); pool->allocateContiguous(2400, large); - EXPECT_EQ(memory::AllocationTraits::kPageSize * 4800, pool->currentBytes()); + EXPECT_EQ(memory::AllocationTraits::pageBytes(4800), pool->usedBytes()); auto stats = cache_->refreshStats(); EXPECT_LT(0, stats.numEvict); } -TEST_F(AsyncDataCacheTest, largeEvict) { +TEST_P(AsyncDataCacheTest, largeEvict) { constexpr int64_t kMaxBytes = 256 << 20; constexpr int32_t kNumThreads = 24; FLAGS_velox_exception_user_stacktrace_enabled = false; @@ -633,8 +747,8 @@ TEST_F(AsyncDataCacheTest, largeEvict) { runThreads(kNumThreads, [&](int32_t /*i*/) { loadLoop(0, kMaxBytes * 1.2, 0, 1, kMaxBytes / 4); }); - if (executor_) { - executor_->join(); + if (loadExecutor_ != nullptr) { + loadExecutor_->join(); } auto stats = cache_->refreshStats(); EXPECT_LT(0, stats.numEvict); @@ -644,7 +758,7 @@ TEST_F(AsyncDataCacheTest, largeEvict) { LOG(INFO) << "Retries after failed evict: " << numLargeRetries_; } -TEST_F(AsyncDataCacheTest, outOfCapacity) { +TEST_P(AsyncDataCacheTest, outOfCapacity) { const int64_t kMaxBytes = 64 << 20; // 64MB as MmapAllocator's min size is 64MB const int32_t kSize = 16 << 10; @@ -695,7 +809,7 @@ void corruptFile(const std::string& path) { } } // namespace -TEST_F(AsyncDataCacheTest, DISABLED_ssd) { +TEST_P(AsyncDataCacheTest, DISABLED_ssd) { #ifdef TSAN_BUILD // NOTE: scale down the test data set to prevent tsan tester from running out // of memory. 
@@ -719,8 +833,8 @@ TEST_F(AsyncDataCacheTest, DISABLED_ssd) { // data may not get written if reading is faster than writing. Error out once // every 11 load batches. // - // Note that executor() must have more threads so that background - // write does not wait for the workload. + // NOTE: loadExecutor() must have more threads so that background write does + // not wait for the workload. runThreads(16, [&](int32_t /*i*/) { loadLoop(0, kSsdBytes, 11); }); LOG(INFO) << "Stats after first pass: " << cache_->toString(); auto ssdStats = cache_->ssdCache()->stats(); @@ -757,7 +871,7 @@ TEST_F(AsyncDataCacheTest, DISABLED_ssd) { cache_->ssdCache()->clear(); // We cut the tail off one of the cache shards. - corruptFile(fmt::format("{}/cache0.cpt", tempDirectory_->path)); + corruptFile(fmt::format("{}/cache0.cpt", tempDirectory_->getPath())); // We open the cache from checkpoint. Reading checks the data integrity, here // we check that more data was read than written. initializeCache(kRamBytes, kSsdBytes); @@ -770,17 +884,18 @@ TEST_F(AsyncDataCacheTest, DISABLED_ssd) { ASSERT_EQ(ssdStatsFromCP.readCheckpointErrors, 1); } -TEST_F(AsyncDataCacheTest, invalidSsdPath) { +TEST_P(AsyncDataCacheTest, invalidSsdPath) { auto testPath = "hdfs:/test/prefix_"; uint64_t ssdBytes = 256UL << 20; + SsdCache::Config config(testPath, ssdBytes, 4, ssdExecutor(), ssdBytes / 20); VELOX_ASSERT_THROW( - SsdCache(testPath, ssdBytes, 4, executor(), ssdBytes / 20), + SsdCache(config), fmt::format( "Ssd path '{}' does not start with '/' that points to local file system.", testPath)); } -TEST_F(AsyncDataCacheTest, cacheStats) { +TEST_P(AsyncDataCacheTest, cacheStats) { CacheStats stats; stats.tinySize = 234; stats.largeSize = 1024; @@ -789,6 +904,8 @@ TEST_F(AsyncDataCacheTest, cacheStats) { stats.numEntries = 100; stats.numExclusive = 20; stats.numShared = 30; + stats.sharedPinnedBytes = 10 << 20; + stats.exclusivePinnedBytes = 10 << 20; stats.numEmptyEntries = 20; stats.numPrefetch = 30; stats.prefetchBytes = 100; @@ -798,29 +915,56 @@ TEST_F(AsyncDataCacheTest, cacheStats) { stats.numEvict = 463; stats.numEvictChecks = 348; stats.numWaitExclusive = 244; + stats.numAgedOut = 10; stats.allocClocks = 1320; stats.sumEvictScore = 123; + stats.numStales = 100; ASSERT_EQ( stats.toString(), "Cache size: 2.56KB tinySize: 257B large size: 2.31KB\n" - "Cache entries: 100 read pins: 30 write pins: 20 num write wait: 244 empty entries: 20\n" - "Cache access miss: 2041 hit: 46 hit bytes: 1.34KB eviction: 463 eviction checks: 348\n" + "Cache entries: 100 read pins: 30 write pins: 20 pinned shared: 10.00MB pinned exclusive: 10.00MB\n" + " num write wait: 244 empty entries: 20\n" + "Cache access miss: 2041 hit: 46 hit bytes: 1.34KB eviction: 463 savable eviction: 0 eviction checks: 348 aged out: 10 stales: 100\n" "Prefetch entries: 30 bytes: 100B\n" "Alloc Megaclocks 0"); + CacheStats statsDelta = stats - stats; + ASSERT_EQ(statsDelta.tinySize, 0); + ASSERT_EQ(statsDelta.largeSize, 0); + ASSERT_EQ(statsDelta.tinyPadding, 0); + ASSERT_EQ(statsDelta.largePadding, 0); + ASSERT_EQ(statsDelta.numEntries, 0); + ASSERT_EQ(statsDelta.numExclusive, 0); + ASSERT_EQ(statsDelta.numShared, 0); + ASSERT_EQ(statsDelta.sharedPinnedBytes, 0); + ASSERT_EQ(statsDelta.exclusivePinnedBytes, 0); + ASSERT_EQ(statsDelta.numEmptyEntries, 0); + ASSERT_EQ(statsDelta.numPrefetch, 0); + ASSERT_EQ(statsDelta.prefetchBytes, 0); + ASSERT_EQ(statsDelta.numHit, 0); + ASSERT_EQ(statsDelta.hitBytes, 0); + ASSERT_EQ(statsDelta.numNew, 0); + ASSERT_EQ(statsDelta.numEvict, 
0); + ASSERT_EQ(statsDelta.numEvictChecks, 0); + ASSERT_EQ(statsDelta.numWaitExclusive, 0); + ASSERT_EQ(statsDelta.numAgedOut, 0); + ASSERT_EQ(statsDelta.allocClocks, 0); + ASSERT_EQ(statsDelta.sumEvictScore, 0); + ASSERT_EQ(statsDelta.numStales, 0); + constexpr uint64_t kRamBytes = 32 << 20; constexpr uint64_t kSsdBytes = 512UL << 20; initializeCache(kRamBytes, kSsdBytes); - ASSERT_EQ( - cache_->toString(), + const std::string expectedDetailedCacheOutput = "AsyncDataCache:\n" "Cache size: 0B tinySize: 0B large size: 0B\n" - "Cache entries: 0 read pins: 0 write pins: 0 num write wait: 0 empty entries: 0\n" - "Cache access miss: 0 hit: 0 hit bytes: 0B eviction: 0 eviction checks: 0\n" + "Cache entries: 0 read pins: 0 write pins: 0 pinned shared: 0B pinned exclusive: 0B\n" + " num write wait: 0 empty entries: 0\n" + "Cache access miss: 0 hit: 0 hit bytes: 0B eviction: 0 savable eviction: 0 eviction checks: 0 aged out: 0 stales: 0\n" "Prefetch entries: 0 bytes: 0B\n" "Alloc Megaclocks 0\n" "Allocated pages: 0 cached pages: 0\n" - "Backing: Memory Allocator[MMAP capacity 16.00KB allocated pages 0 mapped pages 0 external mapped pages 0\n" + "Backing: Memory Allocator[MMAP total capacity 64.00MB free capacity 64.00MB allocated pages 0 mapped pages 0 external mapped pages 0\n" "[size 1: 0(0MB) allocated 0 mapped]\n" "[size 2: 0(0MB) allocated 0 mapped]\n" "[size 4: 0(0MB) allocated 0 mapped]\n" @@ -831,6 +975,574 @@ TEST_F(AsyncDataCacheTest, cacheStats) { "[size 128: 0(0MB) allocated 0 mapped]\n" "[size 256: 0(0MB) allocated 0 mapped]\n" "]\n" - "SSD: Ssd cache IO: Write 0MB read 0MB Size 0GB Occupied 0GB0K entries.\n" - "GroupStats: "); + "SSD: Ssd cache IO: Write 0B read 0B Size 512.00MB Occupied 0B 0K entries.\n" + "GroupStats: "; + ASSERT_EQ(cache_->toString(), expectedDetailedCacheOutput); + ASSERT_EQ(cache_->toString(true), expectedDetailedCacheOutput); + const std::string expectedShortCacheOutput = + "AsyncDataCache:\n" + "Cache size: 0B tinySize: 0B large size: 0B\n" + "Cache entries: 0 read pins: 0 write pins: 0 pinned shared: 0B pinned exclusive: 0B\n" + " num write wait: 0 empty entries: 0\n" + "Cache access miss: 0 hit: 0 hit bytes: 0B eviction: 0 savable eviction: 0 eviction checks: 0 aged out: 0 stales: 0\n" + "Prefetch entries: 0 bytes: 0B\n" + "Alloc Megaclocks 0\n" + "Allocated pages: 0 cached pages: 0\n"; + ASSERT_EQ(cache_->toString(false), expectedShortCacheOutput); } + +TEST_P(AsyncDataCacheTest, cacheStatsWithSsd) { + CacheStats stats; + stats.numHit = 234; + stats.numEvict = 1024; + stats.ssdStats = std::make_shared(); + stats.ssdStats->bytesWritten = 1; + stats.ssdStats->bytesRead = 1; + + const CacheStats otherStats; + const CacheStats deltaStats = stats - otherStats; + ASSERT_EQ(deltaStats.numHit, 234); + ASSERT_EQ(deltaStats.numEvict, 1024); + ASSERT_TRUE(deltaStats.ssdStats != nullptr); + ASSERT_EQ(deltaStats.ssdStats->bytesWritten, 1); + ASSERT_EQ(deltaStats.ssdStats->bytesRead, 1); + const std::string expectedDeltaCacheStats = + "Cache size: 0B tinySize: 0B large size: 0B\nCache entries: 0 read pins: 0 write pins: 0 pinned shared: 0B pinned exclusive: 0B\n num write wait: 0 empty entries: 0\nCache access miss: 0 hit: 234 hit bytes: 0B eviction: 1024 savable eviction: 0 eviction checks: 0 aged out: 0 stales: 0\nPrefetch entries: 0 bytes: 0B\nAlloc Megaclocks 0"; + ASSERT_EQ(deltaStats.toString(), expectedDeltaCacheStats); +} + +TEST_P(AsyncDataCacheTest, staleEntry) { + constexpr uint64_t kRamBytes = 1UL << 30; + // Disable SSD cache to test in-memory cache stale 
entry only. + initializeCache(kRamBytes, 0, 0); + StringIdLease file(fileIds(), std::string_view("staleEntry")); + const uint64_t offset = 1000; + const uint64_t size = 200; + folly::SemiFuture wait(false); + RawFileCacheKey key{file.id(), offset}; + auto pin = cache_->findOrCreate(key, size, &wait); + ASSERT_FALSE(pin.empty()); + ASSERT_TRUE(wait.isReady()); + ASSERT_TRUE(pin.entry()->isExclusive()); + pin.entry()->setExclusiveToShared(); + ASSERT_FALSE(pin.entry()->isExclusive()); + auto stats = cache_->refreshStats(); + ASSERT_EQ(stats.numStales, 0); + ASSERT_EQ(stats.numEntries, 1); + ASSERT_EQ(stats.numHit, 0); + + auto validPin = cache_->findOrCreate(key, size, &wait); + ASSERT_FALSE(validPin.empty()); + ASSERT_TRUE(wait.isReady()); + ASSERT_FALSE(validPin.entry()->isExclusive()); + stats = cache_->refreshStats(); + ASSERT_EQ(stats.numStales, 0); + ASSERT_EQ(stats.numEntries, 1); + ASSERT_EQ(stats.numHit, 1); + + // Stale cache access with large cache size. + auto stalePin = cache_->findOrCreate(key, 2 * size, &wait); + ASSERT_FALSE(stalePin.empty()); + ASSERT_TRUE(wait.isReady()); + ASSERT_TRUE(stalePin.entry()->isExclusive()); + stalePin.entry()->setExclusiveToShared(); + stats = cache_->refreshStats(); + ASSERT_EQ(stats.numStales, 1); + ASSERT_EQ(stats.numEntries, 1); + ASSERT_EQ(stats.numHit, 1); +} + +TEST_P(AsyncDataCacheTest, shrinkCache) { + constexpr uint64_t kRamBytes = 128UL << 20; + constexpr uint64_t kSsdBytes = 512UL << 20; + constexpr int kTinyDataSize = AsyncDataCacheEntry::kTinyDataSize - 1; + const int numEntries{10}; + constexpr int kLargeDataSize = kTinyDataSize * 2; + ASSERT_LE(numEntries * (kTinyDataSize + kLargeDataSize), kRamBytes); + + std::vector tinyCacheKeys; + std::vector largeCacheKeys; + std::vector fileLeases; + for (int i = 0; i < numEntries; ++i) { + fileLeases.emplace_back( + StringIdLease(fileIds(), fmt::format("shrinkCacheFile{}", i))); + tinyCacheKeys.emplace_back(RawFileCacheKey{fileLeases.back().id(), 0}); + largeCacheKeys.emplace_back( + RawFileCacheKey{fileLeases.back().id(), kLargeDataSize}); + } + + struct { + bool shrinkAll; + bool hasSsd; + bool releaseAll; + + std::string debugString() const { + return fmt::format( + "shrinkAll {}, hasSsd {}, releaseAll {}", + shrinkAll, + hasSsd, + releaseAll); + } + } testSettings[] = { + {true, false, false}, + {true, true, false}, + {true, false, true}, + {true, true, true}, + {false, false, true}, + {false, true, true}, + {false, false, false}, + {false, true, false}}; + + for (const auto& testData : testSettings) { + SCOPED_TRACE(testData.debugString()); + + initializeCache(kRamBytes, testData.hasSsd ? 
kSsdBytes : 0); + std::vector pins; + for (int i = 0; i < numEntries; ++i) { + auto tinyPin = cache_->findOrCreate(tinyCacheKeys[i], kTinyDataSize); + ASSERT_FALSE(tinyPin.empty()); + ASSERT_TRUE(tinyPin.entry()->tinyData() != nullptr); + ASSERT_TRUE(tinyPin.entry()->data().empty()); + ASSERT_FALSE(tinyPin.entry()->isPrefetch()); + ASSERT_FALSE(tinyPin.entry()->ssdSaveable()); + pins.push_back(std::move(tinyPin)); + auto largePin = cache_->findOrCreate(largeCacheKeys[i], kLargeDataSize); + ASSERT_FALSE(largePin.entry()->tinyData() != nullptr); + ASSERT_FALSE(largePin.entry()->data().empty()); + ASSERT_FALSE(largePin.entry()->isPrefetch()); + ASSERT_FALSE(largePin.entry()->ssdSaveable()); + pins.push_back(std::move(largePin)); + } + auto stats = cache_->refreshStats(); + ASSERT_EQ(stats.numEntries, numEntries * 2); + ASSERT_EQ(stats.numEmptyEntries, 0); + ASSERT_EQ(stats.numExclusive, numEntries * 2); + ASSERT_EQ(stats.numEvict, 0); + ASSERT_EQ(stats.numHit, 0); + ASSERT_EQ(stats.tinySize, kTinyDataSize * numEntries); + ASSERT_EQ(stats.largeSize, kLargeDataSize * numEntries); + ASSERT_EQ(stats.sharedPinnedBytes, 0); + ASSERT_GE( + stats.exclusivePinnedBytes, + (kTinyDataSize + kLargeDataSize) * numEntries); + ASSERT_EQ(stats.prefetchBytes, 0); + ASSERT_EQ(stats.numPrefetch, 0); + + const auto numMappedPagesBeforeShrink = allocator_->numMapped(); + ASSERT_GT(numMappedPagesBeforeShrink, 0); + + // Everything gets pinned in memory. + VELOX_ASSERT_THROW(cache_->shrink(0), ""); + ASSERT_EQ(cache_->shrink(testData.shrinkAll ? kRamBytes : 1), 0); + + if (!testData.releaseAll) { + for (auto& pin : pins) { + pin.entry()->setExclusiveToShared(); + } + pins.clear(); + if (testData.shrinkAll) { + ASSERT_GE( + cache_->shrink(kRamBytes), + (kLargeDataSize + kTinyDataSize) * numEntries); + } else { + ASSERT_GE(cache_->shrink(2 * kTinyDataSize), kTinyDataSize); + } + } else { + pins.clear(); + // We expect everything to have been freed. + ASSERT_EQ( + cache_->shrink(testData.shrinkAll ? 
kRamBytes : 2 * kTinyDataSize), + 0); + } + stats = cache_->refreshStats(); + const auto numMappedPagesAfterShrink = allocator_->numMapped(); + if (testData.shrinkAll || testData.releaseAll) { + ASSERT_EQ(stats.numEntries, 0); + ASSERT_EQ(stats.numEmptyEntries, 2 * numEntries); + ASSERT_EQ(stats.numExclusive, 0); + ASSERT_EQ(stats.numEvict, 2 * numEntries); + ASSERT_EQ(stats.numHit, 0); + ASSERT_EQ(stats.tinySize, 0); + ASSERT_EQ(stats.largeSize, 0); + ASSERT_EQ(stats.sharedPinnedBytes, 0); + ASSERT_GE(stats.exclusivePinnedBytes, 0); + ASSERT_EQ(stats.prefetchBytes, 0); + ASSERT_EQ(stats.numPrefetch, 0); + if (testData.shrinkAll) { + ASSERT_EQ(numMappedPagesAfterShrink, 0); + } else { + ASSERT_LT(numMappedPagesAfterShrink, numMappedPagesBeforeShrink); + } + } else { + ASSERT_LT(stats.numEntries, 2 * numEntries); + ASSERT_GT(stats.numEntries, 0); + ASSERT_GE(stats.numEmptyEntries, 1); + ASSERT_EQ(stats.numExclusive, 0); + ASSERT_GE(stats.numEvict, 1); + ASSERT_EQ(stats.numHit, 0); + ASSERT_GT(stats.tinySize, 0); + ASSERT_GT(stats.largeSize, 0); + ASSERT_EQ(stats.sharedPinnedBytes, 0); + ASSERT_GE(stats.exclusivePinnedBytes, 0); + ASSERT_EQ(stats.prefetchBytes, 0); + ASSERT_EQ(stats.numPrefetch, 0); + ASSERT_LT(numMappedPagesAfterShrink, numMappedPagesBeforeShrink); + } + } +} + +TEST_P(AsyncDataCacheTest, shutdown) { + constexpr uint64_t kRamBytes = 16 << 20; + constexpr uint64_t kSsdBytes = 64UL << 20; + + for (const auto asyncShutdown : {false, true}) { + SCOPED_TRACE(fmt::format("asyncShutdown {}", asyncShutdown)); + // Initialize cache with a big checkpointIntervalBytes, giving the eviction + // log a chance to survive. + initializeCache( + kRamBytes, + kSsdBytes, + /*checkpointIntervalBytes=*/(1ULL << 30) * kNumSsdShards); + ASSERT_EQ(cache_->ssdCache()->stats().openCheckpointErrors, 4); + + // Write a large amount of data, making sure eviction is triggered and the + // eviction log file is not empty. + loadLoop(0, 16 * kSsdBytes); + ASSERT_EQ(cache_->ssdCache()->stats().checkpointsWritten, 0); + ASSERT_GT(cache_->ssdCache()->stats().regionsEvicted, 0); + ASSERT_GT(cache_->ssdCache()->testingTotalLogEvictionFilesSize(), 0); + + // Shutdown cache. + if (!asyncShutdown) { + waitForSsdWriteToFinish(cache_->ssdCache()); + } + // NOTE: we need to wait for async loads to complete before shutdown as the + // async data cache doesn't handle cache access after shutdown. + if (loadExecutor_ != nullptr) { + loadExecutor_->join(); + loadExecutor_.reset(); + } + const uint64_t bytesWrittenBeforeShutdown = + cache_->ssdCache()->stats().bytesWritten; + cache_->ssdCache()->shutdown(); + const uint64_t bytesWrittenAfterShutdown = + cache_->ssdCache()->stats().bytesWritten; + + if (asyncShutdown) { + // The bytes written before shutdown are not larger than the bytes written + // after shutdown. + ASSERT_LE(bytesWrittenBeforeShutdown, bytesWrittenAfterShutdown); + } else { + // No new data has been written after shutdown. + ASSERT_EQ(bytesWrittenBeforeShutdown, bytesWrittenAfterShutdown); + } + // Eviction log files have been truncated. + ASSERT_EQ(cache_->ssdCache()->testingTotalLogEvictionFilesSize(), 0); + + // Shut down again, making sure no issue is triggered. + cache_->ssdCache()->shutdown(); + + // New cache write attempt is blocked and triggers exception. + VELOX_ASSERT_THROW( + cache_->ssdCache()->startWrite(), + "Unexpected write after SSD cache has been shutdown"); + + // Re-initialize cache. + cache_->ssdCache()->clear(); + initializeCache(kRamBytes, kSsdBytes, kSsdBytes * 10); + // Checkpoint files are intact and readable. 
+ ASSERT_EQ(cache_->ssdCache()->stats().openCheckpointErrors, 0); + ASSERT_EQ(cache_->ssdCache()->stats().readCheckpointErrors, 0); + cache_->ssdCache()->testingDeleteCheckpoints(); + } +} + +DEBUG_ONLY_TEST_P(AsyncDataCacheTest, shrinkWithSsdWrite) { + constexpr uint64_t kRamBytes = 128UL << 20; + constexpr uint64_t kSsdBytes = 512UL << 20; + constexpr int kDataSize = 4096; + initializeCache(kRamBytes, kSsdBytes); + const int numEntries{10}; + std::vector cachePins; + uint64_t offset = 0; + for (int i = 0; i < numEntries; ++i) { + cachePins.push_back(newEntry(offset, kDataSize)); + offset += kDataSize; + } + for (auto& pin : cachePins) { + pin.entry()->setExclusiveToShared(); + } + + std::atomic_bool writeStartFlag{false}; + folly::EventCount writeStartWait; + std::atomic_bool writeWaitFlag{true}; + folly::EventCount writeWait; + SCOPED_TESTVALUE_SET( + "facebook::velox::cache::SsdCache::write", + std::function(([&](const SsdCache* cache) { + writeStartFlag = true; + writeStartWait.notifyAll(); + writeWait.await([&]() { return !writeWaitFlag.load(); }); + }))); + + // Starts a write thread running in the background. + std::thread ssdWriteThread([&]() { + ASSERT_TRUE(cache_->ssdCache()->startWrite()); + cache_->saveToSsd(); + }); + + // Wait for the write thread to start, and block it while we do the cache + // shrink. + writeStartWait.await([&]() { return writeStartFlag.load(); }); + ASSERT_TRUE(cache_->ssdCache()->writeInProgress()); + + cachePins.clear(); + cache_->shrink(kRamBytes); + auto stats = cache_->refreshStats(); + // Shrink can only reclaim some entries but not all, as some of the cache + // entries have been picked up for ssd write and are not evictable. + ASSERT_LT(stats.numEntries, numEntries); + ASSERT_GT(stats.numEmptyEntries, 0); + ASSERT_GT(stats.numEvict, 0); + ASSERT_GT(stats.numShared, 0); + ASSERT_EQ(stats.numExclusive, 0); + ASSERT_EQ(stats.numWaitExclusive, 0); + + // Wait for write to complete. 
+ writeWaitFlag = false; + writeWait.notifyAll(); + ssdWriteThread.join(); + waitForSsdWriteToFinish(cache_->ssdCache()); + + stats = cache_->refreshStats(); + ASSERT_GT(stats.numEntries, stats.numEmptyEntries); + + ASSERT_GT(cache_->shrink(kRamBytes), 0); + stats = cache_->refreshStats(); + ASSERT_EQ(stats.numEntries, 0); + ASSERT_EQ(stats.numEmptyEntries, numEntries); +} + +DEBUG_ONLY_TEST_P(AsyncDataCacheTest, ttl) { + constexpr uint64_t kRamBytes = 32 << 20; + constexpr uint64_t kSsdBytes = 128UL << 20; + + initializeCache(kRamBytes, kSsdBytes); + CacheTTLController::create(*cache_); + + std::vector offsets(32); + std::generate(offsets.begin(), offsets.end(), [&, n = 0]() mutable { + return n += (kRamBytes / kNumFiles / offsets.size()); + }); + + ScopedTestTime stt; + auto loadTime1 = getCurrentTimeSec(); + auto loadTime2 = loadTime1 + 100; + + stt.setCurrentTestTimeSec(loadTime1); + loadNFiles(filenames_.size() * 2 / 3, offsets); + waitForSsdWriteToFinish(cache_->ssdCache()); + auto statsT1 = cache_->refreshStats(); + + stt.setCurrentTestTimeSec(loadTime2); + loadNFiles(filenames_.size(), offsets); + + runThreads(2, [&](int32_t /*i*/) { + CacheTTLController::getInstance()->applyTTL( + getCurrentTimeSec() - loadTime1 - 2); + }); + + auto statsTtl = cache_->refreshStats(); + EXPECT_EQ(statsTtl.numAgedOut, statsT1.numEntries); + EXPECT_EQ(statsTtl.ssdStats->entriesAgedOut, statsT1.ssdStats->entriesCached); +} + +TEST_P(AsyncDataCacheTest, makeEvictable) { + constexpr uint64_t kRamBytes = 128UL << 20; + constexpr uint64_t kSsdBytes = 512UL << 20; + constexpr int kDataSize = 4096; + for (const bool evictable : {false, true}) { + SCOPED_TRACE(fmt::format("evictable: {}", evictable)); + initializeCache(kRamBytes, kSsdBytes); + const int numEntries{10}; + std::vector cachePins; + uint64_t offset = 0; + for (int i = 0; i < numEntries; ++i) { + cachePins.push_back(newEntry(offset, kDataSize)); + offset += kDataSize; + } + for (auto& pin : cachePins) { + pin.entry()->setExclusiveToShared(!evictable); + } + if (evictable) { + std::vector keys; + keys.reserve(cachePins.size()); + for (const auto& pin : cachePins) { + keys.push_back(RawFileCacheKey{ + pin.checkedEntry()->key().fileNum.id(), + pin.checkedEntry()->key().offset}); + } + cachePins.clear(); + for (const auto& key : keys) { + cache_->makeEvictable(key); + } + } + const auto cacheEntries = cache_->testingCacheEntries(); + for (const auto& cacheEntry : cacheEntries) { + ASSERT_EQ(cacheEntry->ssdSaveable(), !evictable); + ASSERT_EQ(cacheEntry->testingAccessStats().numUses, 0); + if (evictable) { + ASSERT_EQ(cacheEntry->testingAccessStats().lastUse, 0); + } else { + ASSERT_NE(cacheEntry->testingAccessStats().lastUse, 0); + } + } + auto* ssdCache = cache_->ssdCache(); + if (ssdCache == nullptr) { + continue; + } + ssdCache->waitForWriteToFinish(); + if (evictable) { + ASSERT_EQ(ssdCache->stats().entriesCached, 0); + } else { + if (cache_->testingSsdSavable() == 0) { + ASSERT_GT(ssdCache->stats().entriesCached, 0); + } else { + // Ssd write only gets triggered after a certain ssd space usage + // threshold. + ASSERT_GE(ssdCache->stats().entriesCached, 0); + } + } + } +} + +TEST_P(AsyncDataCacheTest, ssdWriteOptions) { + constexpr uint64_t kRamBytes = 16UL << 20; // 16 MB + constexpr uint64_t kSsdBytes = 64UL << 20; // 64 MB + + // Tests ssd write behavior with different settings. 
+ struct { + double maxWriteRatio; + double ssdSavableRatio; + int32_t minSsdSavableBytes; + bool expectedSaveToSsd; + + std::string debugString() const { + return fmt::format( + "maxWriteRatio {}, ssdSavableRatio {}, minSsdSavableBytes {}, expectedSaveToSsd {}", + maxWriteRatio, + ssdSavableRatio, + minSsdSavableBytes, + expectedSaveToSsd); + } + } testSettings[] = { + {0.8, 0.95, 32UL << 20, false}, + {0.8, 0.95, 4UL << 20, false}, + {0.8, 0.3, 32UL << 20, false}, + {0.8, 0.3, 4UL << 20, true}, + {0.0, 0.95, 0, true}}; + + for (const auto& testData : testSettings) { + SCOPED_TRACE(testData.debugString()); + initializeCache( + kRamBytes, + kSsdBytes, + 0, + true, + {testData.maxWriteRatio, + testData.ssdSavableRatio, + testData.minSsdSavableBytes}); + // Load data worth half of the in-memory capacity. + loadLoop(0, kRamBytes / 2); + waitForPendingLoads(); + auto stats = cache_->refreshStats(); + if (testData.expectedSaveToSsd) { + EXPECT_GT(stats.ssdStats->entriesWritten, 0); + } else { + EXPECT_EQ(stats.ssdStats->entriesWritten, 0); + } + if (testData.maxWriteRatio < 0.0001) { + // SSD cache write stops right after the first entry in each shard. + // Only a few entries can be written. + EXPECT_LE(stats.ssdStats->entriesWritten, 20); + } + } +} + +TEST_P(AsyncDataCacheTest, appendSsdSaveable) { + constexpr uint64_t kRamBytes = 64UL << 20; // 64 MB + constexpr uint64_t kSsdBytes = 128UL << 20; // 128 MB + + // Tests ssd write behavior with different settings. + struct { + double maxWriteRatio; + double ssdSavableRatio; + int32_t minSsdSavableBytes; + bool appendAll; + + std::string debugString() const { + return fmt::format( + "maxWriteRatio {}, ssdSavableRatio {}, minSsdSavableBytes {}, appendAll {}", + maxWriteRatio, + ssdSavableRatio, + minSsdSavableBytes, + appendAll); + } + } testSettings[] = { + {0.0, 10000.0, 1ULL << 30, true}, {0.0, 10000.0, 1UL << 30, false}}; + + for (const auto& testData : testSettings) { + SCOPED_TRACE(testData.debugString()); + + initializeCache( + kRamBytes, + kSsdBytes, + /*checkpointIntervalBytes=*/1UL << 30, + /*eraseCheckpoint=*/true, + {testData.maxWriteRatio, + testData.ssdSavableRatio, + testData.minSsdSavableBytes}); + // Load data worth half of the in-memory capacity. + loadLoop(0, kRamBytes / 2); + waitForPendingLoads(); + auto stats = cache_->refreshStats(); + + ASSERT_TRUE(cache_->ssdCache()->startWrite()); + cache_->saveToSsd(testData.appendAll); + + cache_->ssdCache()->waitForWriteToFinish(); + stats = cache_->refreshStats(); + if (testData.appendAll) { + // There might be some cache evictions. + ASSERT_GE(stats.ssdStats->entriesWritten, stats.numEntries); + } else { + ASSERT_EQ(stats.ssdStats->entriesWritten, cache_->testingNumShards()); + } + } +} + +TEST_P(AsyncDataCacheTest, checkpoint) { + constexpr uint64_t kRamBytes = 16UL << 20; // 16 MB + constexpr uint64_t kSsdBytes = 64UL << 20; // 64 MB + + initializeCache( + kRamBytes, + kSsdBytes, + /*checkpointIntervalBytes=*/1ULL << 30, + /*eraseCheckpoint=*/true); + // Load data worth half of the in-memory capacity. + loadLoop(0, kRamBytes / 2); + waitForPendingLoads(); + auto stats = cache_->refreshStats(); + ASSERT_EQ(stats.ssdStats->checkpointsWritten, 0); + ASSERT_TRUE(cache_->ssdCache()->startWrite()); + cache_->ssdCache()->checkpoint(); + cache_->ssdCache()->waitForWriteToFinish(); + stats = cache_->refreshStats(); + ASSERT_EQ(stats.ssdStats->checkpointsWritten, kNumSsdShards); +} + +// TODO: add concurrent fuzzer test. 
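For reference, the checksum knobs toggled by `getTestParams()` reach the cache through `SsdCache::Config`, whose eight-argument construction appears in `initializeCache()` above. The sketch below shows that same construction in isolation; it is an editorial illustration rather than part of the patch, and the helper name, path, and sizes are invented for the example:

```
#include <memory>

#include "folly/executors/IOThreadPoolExecutor.h"
#include "velox/common/caching/SsdCache.h"

using namespace facebook::velox::cache;

// Hypothetical helper mirroring the test fixture's SsdCache setup. The
// argument order follows the SsdCache::Config construction in
// initializeCache() above.
std::unique_ptr<SsdCache> makeSsdCache(
    folly::IOThreadPoolExecutor* ssdExecutor,
    bool checksumEnabled,
    bool checksumVerificationEnabled) {
  constexpr int64_t kSsdBytes = 512L << 20; // 512MB split across shards.
  SsdCache::Config config(
      "/tmp/velox_test/cache", // Path prefix; one file per shard.
      kSsdBytes,
      4, // Shard count (kNumSsdShards in the fixture).
      ssdExecutor, // Runs background SSD writes.
      kSsdBytes / 20, // Checkpoint interval in written bytes.
      false, // Keep file copy-on-write enabled (disableFileCow = false).
      checksumEnabled,
      checksumVerificationEnabled);
  return std::make_unique<SsdCache>(config);
}
```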
+ +INSTANTIATE_TEST_SUITE_P( + AsyncDataCacheTest, + AsyncDataCacheTest, + ::testing::ValuesIn(AsyncDataCacheTest::getTestParams())); diff --git a/velox/common/caching/tests/CMakeLists.txt b/velox/common/caching/tests/CMakeLists.txt index afe42f2b7087f..cc7fafe2a752e 100644 --- a/velox/common/caching/tests/CMakeLists.txt +++ b/velox/common/caching/tests/CMakeLists.txt @@ -14,24 +14,43 @@ add_executable(simple_lru_cache_test SimpleLRUCacheTest.cpp) add_test(simple_lru_cache_test simple_lru_cache_test) -target_link_libraries(simple_lru_cache_test PRIVATE Folly::folly glog::glog - gtest gtest_main) +target_link_libraries( + simple_lru_cache_test + PRIVATE + Folly::folly + velox_time + glog::glog + GTest::gtest + GTest::gtest_main) -add_executable(velox_cache_test StringIdMapTest.cpp AsyncDataCacheTest.cpp - SsdFileTest.cpp SsdFileTrackerTest.cpp) +add_executable( + velox_cache_test + AsyncDataCacheTest.cpp + CacheTTLControllerTest.cpp + SsdFileTest.cpp + SsdFileTrackerTest.cpp + StringIdMapTest.cpp) add_test(velox_cache_test velox_cache_test) target_link_libraries( velox_cache_test - PRIVATE velox_caching - velox_file - velox_memory - velox_temp_path - Folly::folly - glog::glog - gtest - gtest_main) + PRIVATE + velox_caching + velox_file + velox_memory + velox_temp_path + Folly::folly + glog::glog + GTest::gtest + GTest::gtest_main) add_executable(cached_factory_test CachedFactoryTest.cpp) add_test(cached_factory_test cached_factory_test) -target_link_libraries(cached_factory_test PRIVATE velox_process Folly::folly - glog::glog gtest gtest_main) +target_link_libraries( + cached_factory_test + PRIVATE + velox_process + Folly::folly + velox_time + glog::glog + GTest::gtest + GTest::gtest_main) diff --git a/velox/common/caching/tests/CacheTTLControllerTest.cpp b/velox/common/caching/tests/CacheTTLControllerTest.cpp new file mode 100644 index 0000000000000..6f7f39454af90 --- /dev/null +++ b/velox/common/caching/tests/CacheTTLControllerTest.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/common/caching/CacheTTLController.h" + +#include "gtest/gtest.h" +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/common/caching/AsyncDataCache.h" +#include "velox/common/caching/SsdCache.h" +#include "velox/common/memory/MmapAllocator.h" + +using namespace facebook::velox; +using namespace facebook::velox::memory; + +namespace facebook::velox::cache { + +class CacheTTLControllerTest : public ::testing::Test { + protected: + void SetUp() override { + allocator_ = std::make_shared( + MmapAllocator::Options{.capacity = 1024L * 1024L}); + cache_ = AsyncDataCache::create(allocator_.get()); + } + + std::shared_ptr allocator_; + std::shared_ptr cache_; +}; + +TEST_F(CacheTTLControllerTest, addOpenFileInfo) { + CacheTTLController::create(*cache_); + + EXPECT_TRUE(CacheTTLController::getInstance()->addOpenFileInfo(123L)); + EXPECT_FALSE(CacheTTLController::getInstance()->addOpenFileInfo(123L)); + + EXPECT_TRUE(CacheTTLController::getInstance()->addOpenFileInfo(456L)); +} + +TEST_F(CacheTTLControllerTest, getCacheAgeStats) { + CacheTTLController::create(*cache_); + + int64_t fileOpenTime = getCurrentTimeSec(); + for (auto i = 0; i < 1000; i++) { + CacheTTLController::getInstance()->addOpenFileInfo(i, fileOpenTime + i); + } + + int64_t current = getCurrentTimeSec(); + EXPECT_GE( + CacheTTLController::getInstance()->getCacheAgeStats().maxAgeSecs, + current - fileOpenTime); +} +} // namespace facebook::velox::cache diff --git a/velox/common/caching/tests/CachedFactoryTest.cpp b/velox/common/caching/tests/CachedFactoryTest.cpp index 4a04bf533735d..e8161a9792569 100644 --- a/velox/common/caching/tests/CachedFactoryTest.cpp +++ b/velox/common/caching/tests/CachedFactoryTest.cpp @@ -16,140 +16,167 @@ #include "velox/common/caching/CachedFactory.h" +#include "folly/Random.h" #include "folly/executors/EDFThreadPoolExecutor.h" #include "folly/executors/thread_factory/NamedThreadFactory.h" #include "folly/synchronization/Latch.h" #include "gtest/gtest.h" +#include "velox/common/base/tests/GTestUtils.h" using namespace facebook::velox; + namespace { struct DoublerGenerator { - int operator()(const int& value) { - ++generated_; - return value * 2; + std::unique_ptr operator()( + const int& value, + const void* properties = nullptr) { + ++generated; + return std::make_unique(value * 2); } - std::atomic generated_ = 0; + std::atomic generated = 0; }; -template -T getCachedValue(std::pair& value) { - return value.second; -} - -template -bool isCached(std::pair& value) { - return value.first; -} - -template -std::pair cacheHit(const T& value) { - return std::make_pair(true, value); -} - -template -std::pair cacheMiss(const T& value) { - return std::make_pair(false, value); -} - +struct IdentityGenerator { + std::unique_ptr operator()( + const int& value, + const void* properties = nullptr) { + return std::make_unique(value); + } +}; } // namespace TEST(CachedFactoryTest, basicGeneration) { auto generator = std::make_unique(); - auto* generated = &generator->generated_; + auto* generated = &generator->generated; CachedFactory factory( std::make_unique>(1000), std::move(generator)); - EXPECT_EQ(factory.maxSize(), 1000); + ASSERT_EQ(factory.maxSize(), 1000); + ASSERT_EQ(factory.currentSize(), 0); + { auto val1 = factory.generate(1); - EXPECT_EQ(val1, cacheMiss(2)); - EXPECT_EQ(*generated, 1); - + ASSERT_EQ(*val1, 2); + ASSERT_EQ(*generated, 1); + ASSERT_FALSE(val1.fromCache()); auto val2 = factory.generate(1); - EXPECT_EQ(val2, cacheHit(2)); - EXPECT_EQ(*generated, 1); - 
EXPECT_EQ(factory.currentSize(), 1); + ASSERT_EQ(*val2, 2); + ASSERT_EQ(*generated, 1); + ASSERT_TRUE(val2.fromCache()); + ASSERT_EQ(factory.currentSize(), 1); + ASSERT_EQ(factory.cacheStats().pinnedSize, 1); } + ASSERT_EQ(factory.cacheStats().pinnedSize, 0); + { auto val3 = factory.generate(1); - EXPECT_EQ(val3, cacheHit(2)); - EXPECT_EQ(*generated, 1); - + ASSERT_EQ(*val3, 2); + ASSERT_EQ(*generated, 1); + ASSERT_TRUE(val3.fromCache()); auto val4 = factory.generate(2); - EXPECT_EQ(val4, cacheMiss(4)); - EXPECT_EQ(*generated, 2); - + ASSERT_EQ(*val4, 4); + ASSERT_EQ(*generated, 2); + ASSERT_FALSE(val4.fromCache()); auto val5 = factory.generate(3); - EXPECT_EQ(val5, cacheMiss(6)); - EXPECT_EQ(*generated, 3); - EXPECT_EQ(factory.currentSize(), 3); + ASSERT_EQ(*val5, 6); + ASSERT_EQ(*generated, 3); + ASSERT_FALSE(val5.fromCache()); + ASSERT_EQ(factory.currentSize(), 3); + ASSERT_EQ(factory.cacheStats().pinnedSize, 3); } + ASSERT_EQ(factory.cacheStats().pinnedSize, 0); - auto val6 = factory.generate(1); - EXPECT_EQ(val6, cacheHit(2)); - EXPECT_EQ(*generated, 3); - - auto val7 = factory.generate(4); - EXPECT_EQ(val7, cacheMiss(8)); - EXPECT_EQ(*generated, 4); + { + auto val6 = factory.generate(1); + ASSERT_EQ(*val6, 2); + ASSERT_EQ(*generated, 3); + ASSERT_TRUE(val6.fromCache()); + auto val7 = factory.generate(4); + ASSERT_EQ(*val7, 8); + ASSERT_EQ(*generated, 4); + ASSERT_FALSE(val7.fromCache()); + auto val8 = factory.generate(3); + ASSERT_EQ(*val8, 6); + ASSERT_EQ(*generated, 4); + ASSERT_TRUE(val8.fromCache()); + ASSERT_EQ(factory.currentSize(), 4); + ASSERT_EQ(factory.cacheStats().pinnedSize, 3); + } + ASSERT_EQ(factory.cacheStats().pinnedSize, 0); - auto val8 = factory.generate(3); - EXPECT_EQ(val8, cacheHit(6)); - EXPECT_EQ(*generated, 4); - EXPECT_EQ(factory.currentSize(), 4); + factory.clearCache(); + ASSERT_EQ(factory.currentSize(), 0); + ASSERT_EQ(factory.cacheStats().curSize, 0); + ASSERT_EQ(factory.cacheStats().pinnedSize, 0); } struct DoublerWithExceptionsGenerator { - int operator()(const int& value) { + std::unique_ptr operator()( + const int& value, + const void* properties = nullptr) { if (value == 3) { - throw std::invalid_argument("3 is bad"); + VELOX_FAIL("3 is bad"); } - ++generated_; - return value * 2; + ++generated; + return std::make_unique(value * 2); } - int generated_ = 0; + int generated = 0; }; TEST(CachedFactoryTest, clearCache) { auto generator = std::make_unique(); CachedFactory factory( std::make_unique>(1000), std::move(generator)); - EXPECT_EQ(factory.maxSize(), 1000); + ASSERT_EQ(factory.maxSize(), 1000); { auto val1 = factory.generate(1); - EXPECT_EQ(val1, cacheMiss(2)); + ASSERT_FALSE(val1.fromCache()); } factory.clearCache(); - EXPECT_EQ(factory.currentSize(), 0); - EXPECT_EQ(factory.generate(1), cacheMiss(2)); + ASSERT_EQ(factory.currentSize(), 0); + + ASSERT_FALSE(factory.generate(1).fromCache()); + auto cachedValue = factory.generate(1); + ASSERT_TRUE(cachedValue.fromCache()); + ASSERT_FALSE(factory.generate(2).fromCache()); + ASSERT_EQ(factory.cacheStats().pinnedSize, 1); + ASSERT_EQ(factory.cacheStats().curSize, 2); + + factory.clearCache(); + ASSERT_EQ(factory.currentSize(), 1); + ASSERT_EQ(factory.cacheStats().pinnedSize, 1); + + cachedValue.testingClear(); + ASSERT_EQ(factory.currentSize(), 1); + ASSERT_EQ(factory.cacheStats().pinnedSize, 0); + + factory.clearCache(); + ASSERT_EQ(factory.currentSize(), 0); + ASSERT_EQ(factory.cacheStats().pinnedSize, 0); } TEST(CachedFactoryTest, basicExceptionHandling) { auto generator = std::make_unique(); - 
int* generated = &generator->generated_; + int* generated = &generator->generated; CachedFactory factory( std::make_unique>(1000), std::move(generator)); auto val1 = factory.generate(1); - EXPECT_EQ(getCachedValue(val1), 2); - EXPECT_EQ(*generated, 1); - try { - auto val2 = factory.generate(3); - FAIL() << "Factory generation should have failed"; - } catch (const std::invalid_argument& e) { - // Expected. - } + ASSERT_EQ(*val1, 2); + ASSERT_EQ(*generated, 1); + VELOX_ASSERT_THROW(factory.generate(3), "3 is bad"); + val1 = factory.generate(4); - EXPECT_EQ(getCachedValue(val1), 8); - EXPECT_EQ(*generated, 2); + ASSERT_EQ(*val1, 8); + ASSERT_EQ(*generated, 2); val1 = factory.generate(1); - EXPECT_EQ(getCachedValue(val1), 2); - EXPECT_EQ(*generated, 2); + ASSERT_EQ(*val1, 2); + ASSERT_EQ(*generated, 2); } TEST(CachedFactoryTest, multiThreadedGeneration) { auto generator = std::make_unique(); - auto* generated = &generator->generated_; + auto* generated = &generator->generated; CachedFactory factory( std::make_unique>(1000), std::move(generator)); folly::EDFThreadPoolExecutor pool( @@ -157,31 +184,31 @@ TEST(CachedFactoryTest, multiThreadedGeneration) { const int numValues = 5; const int requestsPerValue = 10; folly::Latch latch(numValues * requestsPerValue); - for (int i = 0; i < requestsPerValue; i++) { - for (int j = 0; j < numValues; j++) { + for (int i = 0; i < requestsPerValue; ++i) { + for (int j = 0; j < numValues; ++j) { pool.add([&, j]() { auto value = factory.generate(j); - EXPECT_EQ(getCachedValue(value), 2 * j); + CHECK_EQ(*value, 2 * j); latch.count_down(); }); } } latch.wait(); - EXPECT_EQ(*generated, numValues); + ASSERT_EQ(*generated, numValues); } -// Same as above, but we keep the returned CachedPtrs till the end -// of the function. +// Same as above, but we keep the returned CachedPtrs till the end of the +// function. 
TEST(CachedFactoryTest, multiThreadedGenerationAgain) { auto generator = std::make_unique(); - auto* generated = &generator->generated_; + auto* generated = &generator->generated; CachedFactory factory( std::make_unique>(1000), std::move(generator)); folly::EDFThreadPoolExecutor pool( 100, std::make_shared("test_pool")); const int numValues = 5; const int requestsPerValue = 10; - std::vector> cachedValues(numValues * requestsPerValue); + std::vector> cachedValues(numValues * requestsPerValue); folly::Latch latch(numValues * requestsPerValue); for (int i = 0; i < requestsPerValue; i++) { for (int j = 0; j < numValues; j++) { @@ -195,33 +222,289 @@ TEST(CachedFactoryTest, multiThreadedGenerationAgain) { ASSERT_EQ(*generated, numValues); for (int i = 0; i < requestsPerValue; i++) { for (int j = 0; j < numValues; j++) { - EXPECT_EQ(getCachedValue(cachedValues[i * numValues + j]), 2 * j); + ASSERT_EQ(*cachedValues[i * numValues + j], 2 * j); } } } +TEST(CachedFactoryTest, lruCacheEviction) { + auto generator = std::make_unique(); + CachedFactory factory( + std::make_unique>(3), std::move(generator)); + ASSERT_EQ(factory.maxSize(), 3); + ASSERT_EQ(factory.currentSize(), 0); + + auto val1 = factory.generate(1); + ASSERT_FALSE(val1.fromCache()); + ASSERT_TRUE(val1.cached()); + auto val2 = factory.generate(2); + ASSERT_FALSE(val2.fromCache()); + ASSERT_TRUE(val2.cached()); + auto val3 = factory.generate(3); + ASSERT_FALSE(val3.fromCache()); + ASSERT_TRUE(val3.cached()); + ASSERT_EQ(factory.currentSize(), 3); + ASSERT_EQ(factory.cacheStats().pinnedSize, 3); + auto val4 = factory.generate(4); + ASSERT_FALSE(val4.fromCache()); + ASSERT_FALSE(val4.cached()); + + { + auto val = factory.generate(4); + ASSERT_FALSE(val.fromCache()); + ASSERT_FALSE(val.cached()); + val = factory.generate(1); + ASSERT_TRUE(val.fromCache()); + ASSERT_TRUE(val.cached()); + val = factory.generate(2); + ASSERT_TRUE(val.fromCache()); + ASSERT_TRUE(val.cached()); + val = factory.generate(3); + ASSERT_TRUE(val.fromCache()); + ASSERT_TRUE(val.cached()); + } + { + auto val = factory.generate(1); + ASSERT_TRUE(val.fromCache()); + ASSERT_TRUE(val.cached()); + } + val1.testingClear(); + val2.testingClear(); + val3.testingClear(); + + val4 = factory.generate(4); + ASSERT_FALSE(val4.fromCache()); + ASSERT_TRUE(val4.cached()); + ASSERT_EQ(factory.cacheStats().curSize, 3); + { + auto val = factory.generate(4); + ASSERT_TRUE(val.fromCache()); + ASSERT_TRUE(val.cached()); + val = factory.generate(1); + ASSERT_TRUE(val.fromCache()); + ASSERT_TRUE(val.cached()); + // Cache entry 2 should be selected for eviction. + val = factory.generate(2); + ASSERT_FALSE(val.fromCache()); + ASSERT_TRUE(val.cached()); + // Cache entry 2 insertion caused cache entry 3 eviction. 
+ val = factory.generate(3); + ASSERT_FALSE(val.fromCache()); + ASSERT_TRUE(val.cached()); + } + ASSERT_EQ(factory.currentSize(), 3); + ASSERT_EQ(factory.cacheStats().pinnedSize, 1); +} + +TEST(CachedFactoryTest, cacheExpiration) { + auto generator = std::make_unique(); + CachedFactory factory( + std::make_unique>(3, 1'000), + std::move(generator)); + ASSERT_EQ(factory.maxSize(), 3); + ASSERT_EQ(factory.currentSize(), 0); + + auto val1 = factory.generate(1); + ASSERT_FALSE(val1.fromCache()); + ASSERT_TRUE(val1.cached()); + auto val2 = factory.generate(2); + ASSERT_FALSE(val2.fromCache()); + ASSERT_TRUE(val2.cached()); + auto val3 = factory.generate(3); + ASSERT_FALSE(val3.fromCache()); + ASSERT_TRUE(val3.cached()); + ASSERT_EQ(factory.currentSize(), 3); + ASSERT_EQ(factory.cacheStats().pinnedSize, 3); + auto val4 = factory.generate(4); + ASSERT_FALSE(val4.fromCache()); + ASSERT_FALSE(val4.cached()); + + std::this_thread::sleep_for(std::chrono::milliseconds{1'500}); + ASSERT_EQ(factory.currentSize(), 3); + ASSERT_EQ(factory.cacheStats().pinnedSize, 3); + + val4 = factory.generate(4); + ASSERT_FALSE(val4.fromCache()); + ASSERT_FALSE(val4.cached()); + ASSERT_EQ(factory.currentSize(), 3); + ASSERT_EQ(factory.cacheStats().pinnedSize, 3); + + val1.testingClear(); + ASSERT_EQ(factory.currentSize(), 2); + ASSERT_EQ(factory.cacheStats().pinnedSize, 2); + + val4 = factory.generate(4); + ASSERT_FALSE(val4.fromCache()); + ASSERT_TRUE(val4.cached()); + ASSERT_EQ(factory.currentSize(), 3); + ASSERT_EQ(factory.cacheStats().pinnedSize, 3); + + val2.testingClear(); + val3.testingClear(); + ASSERT_EQ(factory.currentSize(), 1); + ASSERT_EQ(factory.cacheStats().pinnedSize, 1); + val4.testingClear(); + ASSERT_EQ(factory.currentSize(), 1); + ASSERT_EQ(factory.cacheStats().pinnedSize, 0); + + std::this_thread::sleep_for(std::chrono::milliseconds{1'500}); + + val1 = factory.generate(1); + ASSERT_FALSE(val1.fromCache()); + ASSERT_TRUE(val1.cached()); + ASSERT_EQ(factory.currentSize(), 1); + ASSERT_EQ(factory.cacheStats().pinnedSize, 1); +} + TEST(CachedFactoryTest, retrievedCached) { auto generator = std::make_unique(); - auto* generated = &generator->generated_; + auto* generated = &generator->generated; CachedFactory factory( std::make_unique>(1000), std::move(generator)); - for (int i = 0; i < 10; i += 2) + for (int i = 0; i < 10; i += 2) { factory.generate(i); - EXPECT_EQ(*generated, 5); + } + ASSERT_EQ(*generated, 5); + ASSERT_EQ(factory.cacheStats().pinnedSize, 0); + ASSERT_EQ(factory.cacheStats().curSize, 5); + std::vector keys(10); - for (int i = 0; i < 10; i += 1) + for (int i = 0; i < 10; ++i) { keys[i] = i; - std::vector> cached; + if (i % 2 == 0) { + ASSERT_EQ(*factory.get(keys[i]), i * 2) << i; + } else { + ASSERT_EQ(factory.get(keys[i]).get(), nullptr); + } + } + std::vector>> cached; std::vector missing; - factory.retrieveCached(keys, &cached, &missing); - ASSERT_EQ(5, cached.size()); + factory.retrieveCached(keys, cached, missing); + ASSERT_EQ(cached.size(), 5); + ASSERT_EQ(factory.cacheStats().pinnedSize, 5); + ASSERT_EQ(factory.cacheStats().curSize, 5); + for (int i = 0; i < 5; ++i) { - EXPECT_EQ(cached[i].first, 2 * i); - EXPECT_EQ(cached[i].second, 4 * i); + ASSERT_EQ(cached[i].first, 2 * i); + ASSERT_EQ(*cached[i].second, 4 * i); + ASSERT_TRUE(cached[i].second.fromCache()); } - ASSERT_EQ(5, missing.size()); + ASSERT_EQ(missing.size(), 5); + for (int i = 0; i < 5; ++i) { - EXPECT_EQ(missing[i], 2 * i + 1); + ASSERT_EQ(missing[i], 2 * i + 1); + } + ASSERT_EQ(*generated, 5); +} + 
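The tests above lean on `CachedPtr`'s pinning semantics: a value returned by `generate()` or `retrieveCached()` stays pinned, counted in `pinnedSize` and safe from eviction, until the `CachedPtr` is destroyed or cleared. A condensed usage sketch follows; it is illustrative only and assumes the `<int, int, DoublerGenerator>` template arguments that the stripped angle brackets in this diff no longer show:

```
#include <atomic>
#include <memory>

#include "velox/common/caching/CachedFactory.h"
#include "velox/common/caching/SimpleLRUCache.h"

using namespace facebook::velox;

// Same shape as the test generator: doubles the key, counts generations.
struct DoublerGenerator {
  std::unique_ptr<int> operator()(const int& value, const void* = nullptr) {
    ++generated;
    return std::make_unique<int>(value * 2);
  }
  std::atomic<int> generated = 0;
};

int main() {
  CachedFactory<int, int, DoublerGenerator> factory(
      std::make_unique<SimpleLRUCache<int, int>>(1000),
      std::make_unique<DoublerGenerator>());
  {
    auto miss = factory.generate(7); // Generates 14; entry is now pinned.
    auto hit = factory.generate(7); // Cache hit; adds a second pin.
    // miss.fromCache() is false, hit.fromCache() is true, *hit == 14.
  }
  // Both CachedPtrs are gone: the entry stays cached but is evictable,
  // so factory.cacheStats().pinnedSize drops back to 0.
  return 0;
}
```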
+TEST(CachedFactoryTest, clearCacheWithManyEntries) { + auto generator = std::make_unique(); + CachedFactory factory( + std::make_unique>(1000), std::move(generator)); + for (auto i = 0; i < 1000; ++i) { + factory.generate(i); + } + std::vector keys(500); + for (int i = 0; i < 500; ++i) { + keys[i] = i; + } + { + std::vector>> cached; + std::vector missing; + factory.retrieveCached(keys, cached, missing); + ASSERT_EQ(cached.size(), 500); + auto cacheStats = factory.clearCache(); + ASSERT_EQ(cacheStats.numElements, 500); + ASSERT_EQ(cacheStats.pinnedSize, 500); + } + auto cacheStats = factory.cacheStats(); + ASSERT_EQ(cacheStats.numElements, 500); + ASSERT_EQ(cacheStats.pinnedSize, 0); + + cacheStats = factory.clearCache(); + ASSERT_EQ(cacheStats.numElements, 0); + ASSERT_EQ(cacheStats.pinnedSize, 0); +} + +TEST(CachedFactoryTest, disableCache) { + auto generator = std::make_unique(); + auto* generated = &generator->generated; + CachedFactory factory(std::move(generator)); + + auto val1 = factory.generate(1); + ASSERT_FALSE(val1.fromCache()); + ASSERT_EQ(*generated, 1); + ASSERT_EQ(factory.currentSize(), 0); + ASSERT_EQ(factory.cacheStats().curSize, 0); + ASSERT_EQ(factory.cacheStats().pinnedSize, 0); + + auto val2 = factory.generate(1); + ASSERT_FALSE(val2.fromCache()); + EXPECT_EQ(*generated, 2); + ASSERT_EQ(factory.currentSize(), 0); + ASSERT_EQ(factory.cacheStats().curSize, 0); + ASSERT_EQ(factory.cacheStats().pinnedSize, 0); + ASSERT_EQ(factory.cacheStats().expireDurationMs, 0); + + ASSERT_EQ(factory.maxSize(), 0); + + EXPECT_EQ(factory.cacheStats(), SimpleLRUCacheStats{}); + + EXPECT_EQ(factory.clearCache(), SimpleLRUCacheStats{}); + + std::vector keys(10); + for (int i = 0; i < 10; ++i) { + keys[i] = i; + } + + std::vector>> cached; + std::vector missing; + factory.retrieveCached(keys, cached, missing); + ASSERT_EQ(cached.size(), 0); + ASSERT_EQ(missing.size(), 10); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(missing[i], i); + } +} + +TEST(CachedFactoryTest, fuzzer) { + const int numThreads = 32; + const int testDurationMs = 5'000; + const size_t expirationDurationMs = 1; + for (const bool expireCache : {false, true}) { + SCOPED_TRACE(fmt::format("expireCache: {}", expireCache)); + auto generator = std::make_unique(); + CachedFactory factory( + std::make_unique>( + 128, expireCache ? 
expirationDurationMs : 0), + std::move(generator)); + + std::vector threads; + threads.reserve(numThreads); + for (int i = 0; i < numThreads; ++i) { + threads.emplace_back([&factory, i]() { + folly::Random::DefaultGenerator rng(23 + i); + const auto startTimeMs = getCurrentTimeMs(); + while (startTimeMs + testDurationMs > getCurrentTimeMs()) { + const auto key = folly::Random::rand32(rng) % 256; + const auto val = factory.generate(key); + if (val.fromCache()) { + ASSERT_TRUE(val.cached()); + ASSERT_EQ(*val, key); + } + if (folly::Random::oneIn(4)) { + std::this_thread::sleep_for(std::chrono::microseconds{100}); + } + } + }); + } + for (auto& thread : threads) { + thread.join(); + } + ASSERT_EQ(factory.cacheStats().pinnedSize, 0); + ASSERT_LE(factory.cacheStats().curSize, 128); + ASSERT_LE(factory.cacheStats().numElements, 128); + ASSERT_GT(factory.cacheStats().numHits, 0); + ASSERT_GT(factory.cacheStats().numLookups, 0); } - EXPECT_EQ(*generated, 5); } diff --git a/velox/common/caching/tests/SimpleLRUCacheTest.cpp b/velox/common/caching/tests/SimpleLRUCacheTest.cpp index c7e5b4a9a1001..9193d34d0fc0c 100644 --- a/velox/common/caching/tests/SimpleLRUCacheTest.cpp +++ b/velox/common/caching/tests/SimpleLRUCacheTest.cpp @@ -15,75 +15,238 @@ */ #include "velox/common/caching/SimpleLRUCache.h" -#include #include "gtest/gtest.h" using namespace facebook::velox; -namespace { -void verifyCacheStats( - const SimpleLRUCacheStats& actual, - size_t maxSize, - size_t curSize, - size_t numHits, - size_t numLookups) { - SimpleLRUCacheStats expectedStats{maxSize, curSize, numHits, numLookups}; - EXPECT_EQ(actual, expectedStats) << " Actual " << actual.toString() - << " Expected " << expectedStats.toString(); -} -} // namespace - TEST(SimpleLRUCache, basicCaching) { SimpleLRUCache cache(1000); - EXPECT_FALSE(cache.get(1).has_value()); - EXPECT_FALSE(cache.get(2).has_value()); - - verifyCacheStats(cache.getStats(), 1000, 0, 0, 2); + ASSERT_TRUE(cache.add(1, new int(11), 1)); + int* value = cache.get(1); + ASSERT_NE(value, nullptr); + ASSERT_EQ(*value, 11); + cache.release(1); - int firstValue = 11; - ASSERT_TRUE(cache.add(1, firstValue)); - auto value = cache.get(1); - ASSERT_EQ(value, std::make_optional(11)); - - int secondValue = 22; - ASSERT_TRUE(cache.add(2, secondValue)); - - verifyCacheStats(cache.getStats(), 1000, 2, 1, 3); + int* secondValue = new int(22); + ASSERT_TRUE(cache.addPinned(2, secondValue, 1)); + *secondValue += 5; + cache.release(2); value = cache.get(1); - ASSERT_EQ(value, std::make_optional(11)); + ASSERT_NE(value, nullptr); + ASSERT_EQ(*value, 11); + cache.release(1); value = cache.get(2); - ASSERT_EQ(value, std::make_optional(22)); + ASSERT_NE(value, nullptr); + ASSERT_EQ(*value, 27); + cache.release(2); value = cache.get(1); - ASSERT_EQ(value, std::make_optional(11)); + ASSERT_NE(value, nullptr); + ASSERT_EQ(*value, 11); + secondValue = cache.get(1); + ASSERT_EQ(value, secondValue); + cache.release(1); + cache.release(1); + + ASSERT_EQ( + cache.stats().toString(), + "{\n maxSize: 1000\n expireDurationMs: 0\n curSize: 2\n pinnedSize: 0\n numElements: 2\n numHits: 5\n numLookups: 5\n}\n"); +} - value = cache.get(2); - ASSERT_EQ(value, std::make_optional(22)); - verifyCacheStats(cache.getStats(), 1000, 2, 5, 7); +TEST(SimpleLRUCache, lruEviction) { + SimpleLRUCache cache(3); - cache.clear(); - verifyCacheStats(cache.getStats(), 1000, 0, 5, 7); - EXPECT_FALSE(cache.get(1).has_value()); - EXPECT_FALSE(cache.get(2).has_value()); + for (int i = 0; i < 3; ++i) { + ASSERT_TRUE(cache.add(i, 
new int(i), 1)); + } + ASSERT_EQ(cache.stats().numElements, 3); + ASSERT_EQ(*cache.get(0), 0); + cache.release(0); + + ASSERT_TRUE(cache.add(3, new int(3), 1)); + ASSERT_EQ(*cache.get(0), 0); + cache.release(0); + ASSERT_EQ(cache.get(1), nullptr); + ASSERT_EQ(*cache.get(3), 3); + cache.release(3); + ASSERT_EQ(cache.stats().numElements, 3); } TEST(SimpleLRUCache, eviction) { SimpleLRUCache cache(1000); for (int i = 0; i < 1010; ++i) { - ASSERT_TRUE(cache.add(i, i)); + ASSERT_TRUE(cache.add(i, new int(i), 1)); } for (int i = 0; i < 10; ++i) { - ASSERT_FALSE(cache.get(i).has_value()); + ASSERT_EQ(cache.get(i), nullptr); } - for (int i = 10; i < 1010; ++i) { - auto value = cache.get(i); - ASSERT_EQ(value, std::make_optional(i)); + int* value = cache.get(i); + ASSERT_NE(value, nullptr); + ASSERT_EQ(*value, i); + cache.release(i); + } +} + +TEST(SimpleLRUCache, pinnedEviction) { + SimpleLRUCache cache(100); + + for (int i = 0; i < 10; ++i) { + ASSERT_TRUE(cache.addPinned(i, new int(i), 1)); + } + for (int i = 10; i < 110; ++i) { + ASSERT_TRUE(cache.add(i, new int(i), 1)); + } + + for (int i = 0; i < 10; ++i) { + int* value = cache.get(i); + ASSERT_NE(value, nullptr); + ASSERT_EQ(*value, i); + cache.release(i); + cache.release(i); // Release the original pin too. + } + for (int i = 10; i < 20; ++i) { + ASSERT_EQ(cache.get(i), nullptr); + } + for (int i = 20; i < 110; ++i) { + int* value = cache.get(i); + ASSERT_NE(value, nullptr); + ASSERT_EQ(*value, i); + cache.release(i); + } +} + +TEST(SimpleLRUCache, fullyPinned) { + SimpleLRUCache cache(10); + + for (int i = 0; i < 10; ++i) { + ASSERT_TRUE(cache.addPinned(i, new int(i), 1)); + } + for (int i = 10; i < 20; ++i) { + int* value = new int(i); + ASSERT_FALSE(cache.add(i, value, 1)); + delete value; + } + for (int i = 20; i < 30; ++i) { + int* value = new int(i); + ASSERT_FALSE(cache.addPinned(i, value, 1)); + delete value; + } + + for (int i = 0; i < 10; ++i) { + int* value = cache.get(i); + ASSERT_NE(value, nullptr); + ASSERT_EQ(*value, i); + cache.release(i); + cache.release(i); // Release the original pin too. + } + for (int i = 10; i < 30; ++i) { + ASSERT_EQ(cache.get(i), nullptr); + } +} + +TEST(SimpleLRUCache, size) { + SimpleLRUCache cache(10); + ASSERT_EQ(cache.maxSize(), 10); + + for (int i = 0; i < 5; ++i) { + ASSERT_TRUE(cache.addPinned(i, new int(i), 2)); + ASSERT_EQ(cache.currentSize(), 2 * (i + 1)); + } + int* value = new int(5); + ASSERT_FALSE(cache.addPinned(5, value, 1)); + + for (int i = 0; i < 5; ++i) { + cache.release(i); + } + ASSERT_TRUE(cache.addPinned(5, value, 10)); + ASSERT_EQ(cache.currentSize(), 10); + + for (int i = 0; i < 5; ++i) { + ASSERT_EQ(cache.get(i), nullptr); + } + cache.release(5); +} + +TEST(SimpleLRUCache, insertLargerThanCacheFails) { + SimpleLRUCache cache(10); + + int* value = new int(42); + ASSERT_FALSE(cache.add(123, value, 11)); + delete value; +} + +TEST(SimpleLRUCache, expiredCacheEntries) { + SimpleLRUCache cache(100, 1'000); + + // Expires on inserting a new entry. + int* value1 = new int(42); + ASSERT_TRUE(cache.add(123, value1, 11)); + ASSERT_EQ(cache.currentSize(), 11); + ASSERT_EQ(cache.get(123), value1); + cache.release(123); + + std::this_thread::sleep_for(std::chrono::seconds{2}); + ASSERT_EQ(cache.currentSize(), 11); + + int* value2 = new int(32); + ASSERT_TRUE(cache.add(122, value2, 22)); + ASSERT_EQ(cache.currentSize(), 22); + ASSERT_EQ(cache.get(123), nullptr); + ASSERT_EQ(cache.get(122), value2); + cache.release(122); + + // Expires when getting a cache entry. 
+ std::this_thread::sleep_for(std::chrono::seconds{2}); + ASSERT_EQ(cache.currentSize(), 22); + ASSERT_EQ(cache.get(123), nullptr); + ASSERT_EQ(cache.currentSize(), 0); + ASSERT_EQ(cache.get(122), nullptr); + ASSERT_EQ(cache.currentSize(), 0); + + // Expires when getting the same cache entry. + value2 = new int(33); + ASSERT_TRUE(cache.add(124, value2, 11)); + ASSERT_EQ(cache.currentSize(), 11); + ASSERT_EQ(cache.get(124), value2); + cache.release(124); + ASSERT_EQ(cache.currentSize(), 11); + std::this_thread::sleep_for(std::chrono::seconds{2}); + ASSERT_EQ(cache.currentSize(), 11); + ASSERT_EQ(cache.get(124), nullptr); + ASSERT_EQ(cache.currentSize(), 0); + + // Adds multiple entries. + int expectedCacheSize{0}; + for (int i = 0; i < 10; ++i) { + int* value = new int(i); + ASSERT_TRUE(cache.add(i, value, i)); + ASSERT_EQ(cache.get(i), value); + cache.release(i); + expectedCacheSize += i; + ASSERT_EQ(cache.currentSize(), expectedCacheSize); } + std::this_thread::sleep_for(std::chrono::seconds{2}); + ASSERT_EQ(cache.currentSize(), expectedCacheSize); + ASSERT_EQ(cache.get(0), nullptr); + ASSERT_EQ(cache.currentSize(), 0); + + // Expires on release. + value2 = new int(64); + ASSERT_TRUE(cache.addPinned(124, value2, 11)); + ASSERT_EQ(cache.currentSize(), 11); + std::this_thread::sleep_for(std::chrono::seconds{2}); + ASSERT_EQ(cache.currentSize(), 11); + ASSERT_EQ(*cache.get(124), 64); + cache.release(124); + ASSERT_EQ(cache.currentSize(), 11); + cache.release(124); + ASSERT_EQ(cache.currentSize(), 0); + ASSERT_EQ(cache.get(124), nullptr); } diff --git a/velox/common/caching/tests/SsdFileTest.cpp b/velox/common/caching/tests/SsdFileTest.cpp index 9e6dd666d48b6..fd2835596ad34 100644 --- a/velox/common/caching/tests/SsdFileTest.cpp +++ b/velox/common/caching/tests/SsdFileTest.cpp @@ -14,10 +14,13 @@ * limitations under the License. */ +#include "velox/common/base/tests/GTestUtils.h" #include "velox/common/caching/FileIds.h" #include "velox/common/caching/SsdCache.h" +#include "velox/common/memory/Memory.h" #include "velox/exec/tests/utils/TempDirectoryPath.h" +#include #include #include #include @@ -41,32 +44,62 @@ class SsdFileTest : public testing::Test { protected: static constexpr int64_t kMB = 1 << 20; + void SetUp() override { + memory::MemoryManager::testingSetInstance({}); + } + void TearDown() override { if (ssdFile_) { - ssdFile_->deleteFile(); + ssdFile_->testingDeleteFile(); } if (cache_) { cache_->shutdown(); } + fileIds().testingReset(); } void initializeCache( - int64_t maxBytes, int64_t ssdBytes = 0, - bool setNoCowFlag = false) { + uint64_t checkpointIntervalBytes = 0, + bool checksumEnabled = false, + bool checksumReadVerificationEnabled = false, + bool disableFileCow = false) { // tmpfs does not support O_DIRECT, so turn this off for testing.
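+ // (O_DIRECT requests unbuffered I/O straight to the backing device; a
+ // RAM-backed tmpfs has no such device and rejects the flag, so the tests
+ // fall back to buffered I/O.)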
FLAGS_ssd_odirect = false; - cache_ = AsyncDataCache::create(MemoryAllocator::getInstance()); - + cache_ = AsyncDataCache::create(memory::memoryManager()->allocator()); fileName_ = StringIdLease(fileIds(), "fileInStorage"); - tempDirectory_ = exec::test::TempDirectoryPath::create(); - ssdFile_ = std::make_unique( - fmt::format("{}/ssdtest", tempDirectory_->path), + initializeSsdFile( + ssdBytes, + checkpointIntervalBytes, + checksumEnabled, + checksumReadVerificationEnabled, + disableFileCow); + } + + void initializeSsdFile( + int64_t ssdBytes = 0, + uint64_t checkpointIntervalBytes = 0, + bool checksumEnabled = false, + bool checksumReadVerificationEnabled = false, + bool disableFileCow = false) { + SsdFile::Config config( + fmt::format("{}/ssdtest", tempDirectory_->getPath()), 0, // shardId bits::roundUp(ssdBytes, SsdFile::kRegionSize) / SsdFile::kRegionSize, - 0, // checkpointInternalBytes - setNoCowFlag); + checkpointIntervalBytes, + disableFileCow, + checksumEnabled, + checksumReadVerificationEnabled); + ssdFile_ = std::make_unique(config); + } + + // Corrupts the file by invalidating the last 1/10th of its content. + void corruptSsdFile(const std::string& path) { + const auto fd = ::open(path.c_str(), O_WRONLY); + const auto size = ::lseek(fd, 0, SEEK_END); + ASSERT_EQ(ftruncate(fd, size / 10 * 9), 0); + ASSERT_EQ(ftruncate(fd, size), 0); + } static void initializeContents(int64_t sequence, memory::Allocation& alloc) { @@ -89,7 +122,10 @@ class SsdFileTest : public testing::Test { // Checks that the contents are consistent with what is set in // initializeContents. - static void checkContents(const memory::Allocation& alloc, int32_t numBytes) { + static void checkContents( + const memory::Allocation& alloc, + int32_t numBytes, + bool expectEqual = true) { bool first = true; int64_t sequence; int32_t bytesChecked = sizeof(int64_t); @@ -107,18 +143,22 @@ class SsdFileTest : public testing::Test { if (bytesChecked >= numBytes) { return; } - ASSERT_EQ(ptr[offset], offset + sequence); + if (expectEqual) { + ASSERT_EQ(ptr[offset], offset + sequence); + } else { + ASSERT_NE(ptr[offset], offset + sequence); + } } } } } - // Gets consecutive entries from file 'fileId' starting at 'startOffset' with + // Gets consecutive entries from file 'fileId' starting at 'startOffset' with // sizes between 'minSize' and 'maxSize'. Sizes start at 'minSize' and double // each time and go back to 'minSize' after exceeding 'maxSize'. This stops // after the total size has exceeded 'totalSize'. The entries are returned as // pins. The pins are exclusive for newly created entries and shared for - // existing ones. New entries are deterministically initialized from 'fileId' + // existing ones. New entries are deterministically initialized from 'fileId' // and the entry's offset. std::vector makePins( uint64_t fileId, @@ -166,8 +206,8 @@ class SsdFileTest : public testing::Test { std::vector ssdPins; ssdPins.reserve(pins.size()); for (auto& pin : pins) { - ssdPins.push_back(ssdFile_->find( - RawFileCacheKey{fileName_.id(), pin.entry()->key().offset})); + ssdPins.push_back(ssdFile_->find(RawFileCacheKey{ + pin.entry()->key().fileNum.id(), pin.entry()->key().offset})); EXPECT_FALSE(ssdPins.back().empty()); } ssdFile_->load(ssdPins, pins); @@ -224,6 +264,33 @@ class SsdFileTest : public testing::Test { readAndCheckPins(pins); } + // Reads back the found entries and checks their contents, returning the + // number of entries found.
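+ // (A missing entry is not an error here: its region may have been evicted
+ // or aged out, so callers compare the returned count against what they
+ // expect instead of requiring every entry to be resident.)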
+ int32_t checkEntries( + const std::vector& entries, + bool expectEqual = true) { + int32_t numFound = 0; + for (auto& entry : entries) { + std::vector pins; + pins.push_back(cache_->findOrCreate( + RawFileCacheKey{entry.key.fileNum.id(), entry.key.offset}, + entry.size, + nullptr)); + if (pins.back().entry()->isExclusive()) { + std::vector ssdPins; + ssdPins.push_back(ssdFile_->find( + RawFileCacheKey{entry.key.fileNum.id(), entry.key.offset})); + if (!ssdPins.back().empty()) { + ++numFound; + ssdFile_->load(ssdPins, pins); + checkContents( + pins[0].entry()->data(), pins[0].entry()->size(), expectEqual); + } + } + } + return numFound; + } + std::shared_ptr tempDirectory_; std::shared_ptr cache_; @@ -235,7 +302,7 @@ class SsdFileTest : public testing::Test { TEST_F(SsdFileTest, writeAndRead) { constexpr int64_t kSsdSize = 16 * SsdFile::kRegionSize; std::vector allEntries; - initializeCache(128 * kMB, kSsdSize); + initializeCache(kSsdSize); FLAGS_ssd_verify_write = true; for (auto startOffset = 0; startOffset <= kSsdSize - SsdFile::kRegionSize; startOffset += SsdFile::kRegionSize) { @@ -275,9 +342,8 @@ TEST_F(SsdFileTest, writeAndRead) { } } - // We check howmany entries are found. The earliest writes will have been + // We check how many entries are found. The earliest writes will have been // evicted. We read back the found entries and check their contents. - int32_t numFound = 0; for (auto& entry : allEntries) { std::vector pins; @@ -291,24 +357,452 @@ TEST_F(SsdFileTest, writeAndRead) { ssdPins.push_back( ssdFile_->find(RawFileCacheKey{fileName_.id(), entry.key.offset})); if (!ssdPins.back().empty()) { - ++numFound; ssdFile_->load(ssdPins, pins); checkContents(pins[0].entry()->data(), pins[0].entry()->size()); } } } + + // Test cache writes with different iobufs sizes. + for (int numPins : {0, 1, IOV_MAX - 1, IOV_MAX, IOV_MAX + 1}) { + SCOPED_TRACE(fmt::format("numPins: {}", numPins)); + auto pins = makePins(fileName_.id(), 0, 4096, 4096, 4096 * numPins); + EXPECT_EQ(pins.size(), numPins); + ssdFile_->write(pins); + readAndCheckPins(pins); + pins.clear(); + } +} + +TEST_F(SsdFileTest, checkpoint) { + constexpr int64_t kSsdSize = 16 * SsdFile::kRegionSize; + const uint64_t checkpointIntervalBytes = 5 * SsdFile::kRegionSize; + const auto fileNameAlt = StringIdLease(fileIds(), "fileInStorageAlt"); + FLAGS_ssd_verify_write = true; + initializeCache(kSsdSize, checkpointIntervalBytes); + + std::vector allEntries; + for (auto startOffset = 0; startOffset <= kSsdSize - SsdFile::kRegionSize; + startOffset += SsdFile::kRegionSize) { + auto pins = + makePins(fileName_.id(), startOffset, 4096, 2048 * 1025, 62 * kMB); + // Each region has one entry from `fileNameAlt`. + pins.push_back(cache_->findOrCreate( + RawFileCacheKey{fileNameAlt.id(), (uint64_t)startOffset}, + 1024, + nullptr)); + ssdFile_->write(pins); + for (auto& pin : pins) { + EXPECT_EQ(ssdFile_.get(), pin.entry()->ssdFile()); + allEntries.emplace_back( + pin.entry()->key(), pin.entry()->ssdOffset(), pin.entry()->size()); + }; + readAndCheckPins(pins); + } + const auto originalRegionScores = ssdFile_->testingCopyScores(); + EXPECT_EQ(originalRegionScores.size(), 16); + + // Re-initialize SSD file from checkpoint. 
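+ // checkpoint(true) presumably forces a full checkpoint to disk right away,
+ // so that the re-initialized SsdFile below can rebuild its entry map and
+ // region scores from the checkpoint file.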
+ ssdFile_->checkpoint(true); + initializeSsdFile(kSsdSize, checkpointIntervalBytes); + const auto recoveredRegionScores = ssdFile_->testingCopyScores(); + EXPECT_EQ(recoveredRegionScores.size(), 16); + EXPECT_EQ(originalRegionScores, recoveredRegionScores); + + // Reconstruct cache pins and check the recovered content from cache file. + for (auto startOffset = 0; startOffset <= kSsdSize - SsdFile::kRegionSize; + startOffset += SsdFile::kRegionSize) { + auto pins = + makePins(fileName_.id(), startOffset, 4096, 2048 * 1025, 62 * kMB); + pins.push_back(cache_->findOrCreate( + RawFileCacheKey{fileNameAlt.id(), (uint64_t)startOffset}, + 1024, + nullptr)); + readAndCheckPins(pins); + } + // All entries can be found. + auto numEntriesFound = checkEntries(allEntries); + EXPECT_EQ(numEntriesFound, allEntries.size()); + + // Test removeFileEntries. + folly::F14FastSet filesToRemove{fileName_.id()}; + folly::F14FastSet filesRetained{}; + SsdCacheStats stats; + ssdFile_->updateStats(stats); + EXPECT_EQ(stats.entriesAgedOut, 0); + EXPECT_EQ(stats.regionsAgedOut, 0); + EXPECT_EQ(stats.regionsEvicted, 0); + + // Block eviction. + auto ssdPins = pinAllRegions(allEntries); + ssdFile_->removeFileEntries(filesToRemove, filesRetained); + EXPECT_EQ(ssdFile_->testingNumWritableRegions(), 0); + EXPECT_EQ(filesRetained.size(), 1); + numEntriesFound = checkEntries(allEntries); + EXPECT_EQ(numEntriesFound, allEntries.size()); + auto prevStats = stats; + stats.clear(); + ssdFile_->updateStats(stats); + EXPECT_EQ(stats.entriesAgedOut - prevStats.entriesAgedOut, 0); + EXPECT_EQ(stats.regionsAgedOut - prevStats.regionsAgedOut, 0); + EXPECT_EQ(stats.regionsEvicted - prevStats.regionsEvicted, 0); + + // Unblock eviction. + ssdPins.clear(); + filesRetained.clear(); + ssdFile_->removeFileEntries(filesToRemove, filesRetained); + // All regions have been evicted and marked as writable. + EXPECT_EQ(ssdFile_->testingNumWritableRegions(), 16); + EXPECT_EQ(filesRetained.size(), 0); + numEntriesFound = checkEntries(allEntries); + EXPECT_EQ(numEntriesFound, 0); + prevStats = stats; + stats.clear(); + ssdFile_->updateStats(stats); + EXPECT_EQ( + stats.entriesAgedOut - prevStats.entriesAgedOut, allEntries.size() - 16); + EXPECT_EQ(stats.regionsAgedOut - prevStats.regionsAgedOut, 16); + EXPECT_EQ(stats.regionsEvicted - prevStats.regionsEvicted, 16); + + // Re-initialize SSD file from checkpoint. Since all regions were evicted, no + // entries should be found. + initializeSsdFile(kSsdSize, checkpointIntervalBytes); + numEntriesFound = checkEntries(allEntries); + EXPECT_EQ(numEntriesFound, 0); +} + +TEST_F(SsdFileTest, fileCorruption) { + constexpr int64_t kSsdSize = 16 * SsdFile::kRegionSize; + const uint64_t checkpointIntervalBytes = 5 * SsdFile::kRegionSize; + FLAGS_ssd_verify_write = true; + + const auto populateCache = [&](std::vector& entries) { + entries.clear(); + for (auto startOffset = 0; startOffset <= kSsdSize - SsdFile::kRegionSize; + startOffset += SsdFile::kRegionSize) { + auto pins = + makePins(fileName_.id(), startOffset, 4096, 2048 * 1025, 62 * kMB); + ssdFile_->write(pins); + for (auto& pin : pins) { + EXPECT_EQ(ssdFile_.get(), pin.entry()->ssdFile()); + entries.emplace_back( + pin.entry()->key(), pin.entry()->ssdOffset(), pin.entry()->size()); + }; + } + }; + + // Initialize cache with checksum write enabled. + initializeCache(kSsdSize, checkpointIntervalBytes, true, true); + std::vector allEntries; + populateCache(allEntries); + // All entries can be found. 
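+ // (Nothing has been corrupted yet, so every entry should still pass its
+ // checksum verification.)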
+ EXPECT_EQ(checkEntries(allEntries), allEntries.size()); + SsdCacheStats stats; + ssdFile_->updateStats(stats); + EXPECT_EQ(stats.readSsdCorruptions, 0); + + // Corrupt the SSD file, initialize the cache from checkpoint without read + // verification. + ssdFile_->checkpoint(true); + corruptSsdFile(fmt::format("{}/ssdtest", tempDirectory_->getPath())); + initializeSsdFile(kSsdSize, checkpointIntervalBytes, true, false); + // Cache can be loaded but the data of the last part is corrupted. + EXPECT_EQ(checkEntries({allEntries.begin(), allEntries.begin() + 100}), 100); + EXPECT_EQ( + checkEntries({allEntries.end() - 100, allEntries.end()}, false), 100); + // Corrupt the SSD file, initialize the cache from checkpoint with read + // verification enabled. + ssdFile_->checkpoint(true); + initializeSsdFile(kSsdSize, checkpointIntervalBytes, true, true); + // Entries at the front are still loadable. + EXPECT_EQ(checkEntries({allEntries.begin(), allEntries.begin() + 100}), 100); + stats.clear(); + ssdFile_->updateStats(stats); + EXPECT_EQ(stats.readSsdCorruptions, 0); + // The last 1/10 entries are corrupted and cannot be loaded. + VELOX_ASSERT_THROW(checkEntries(allEntries), "Corrupt SSD cache entry"); + stats.clear(); + ssdFile_->updateStats(stats); + EXPECT_GT(stats.readSsdCorruptions, 0); + // New entries can be written. + populateCache(allEntries); + + // Corrupt the Checkpoint file. Cache cannot be recovered. All entries are + // lost. + ssdFile_->checkpoint(true); + corruptSsdFile(ssdFile_->getCheckpointFilePath()); + stats.clear(); + ssdFile_->updateStats(stats); + EXPECT_EQ(stats.readCheckpointErrors, 0); + initializeSsdFile(kSsdSize, checkpointIntervalBytes, true, true); + EXPECT_EQ(checkEntries(allEntries), 0); + stats.clear(); + ssdFile_->updateStats(stats); + EXPECT_EQ(stats.readCheckpointErrors, 1); + // New entries can be written. + populateCache(allEntries); +} + +TEST_F(SsdFileTest, recoverFromCheckpointWithChecksum) { + constexpr int64_t kSsdSize = 4 * SsdFile::kRegionSize; + const uint64_t checkpointIntervalBytes = 3 * SsdFile::kRegionSize; + FLAGS_ssd_verify_write = true; + + // Test if cache data can be recovered with different settings. 
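+ // Each row pairs the checksum write/read-verification flags used when the
+ // cache is first populated with the flags used on recovery, plus the
+ // expected effective settings and whether the checkpoint should survive.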
+ struct { + bool writeEnabled; + bool readVerificationEnabled; + bool writeEnabledOnRecovery; + bool readVerificationEnabledOnRecovery; + bool expectedReadVerificationEnabled; + bool expectedReadVerificationEnabledOnRecovery; + bool expectedCheckpointOnRecovery; + + std::string debugString() const { + return fmt::format( + "writeEnabled {}, readVerificationEnabled {}, writeEnabledOnRecovery {}, readVerificationEnabledOnRecovery {}, expectedReadVerificationEnabled {}, expectedReadVerificationEnabledOnRecovery {}, expectedCheckpointOnRecovery {}", + writeEnabled, + readVerificationEnabled, + writeEnabledOnRecovery, + readVerificationEnabledOnRecovery, + expectedReadVerificationEnabled, + expectedReadVerificationEnabledOnRecovery, + expectedCheckpointOnRecovery); + } + } testSettings[] = { + {false, false, false, false, false, false, true}, + {false, false, false, true, false, false, true}, + {false, false, true, false, false, false, false}, + {false, false, true, true, false, true, false}, + {false, true, false, false, false, false, true}, + {false, true, false, true, false, false, true}, + {false, true, true, false, false, false, false}, + {false, true, true, true, false, true, false}, + {true, false, false, false, false, false, true}, + {true, false, false, true, false, false, true}, + {true, false, true, false, false, false, true}, + {true, false, true, true, false, true, true}, + {true, true, false, false, true, false, true}, + {true, true, false, true, true, false, true}, + {true, true, true, false, true, false, true}, + {true, true, true, true, true, true, true}}; + + for (const auto& testData : testSettings) { + SCOPED_TRACE(testData.debugString()); + // Initialize cache with checksum write enabled/disabled. + initializeCache( + kSsdSize, + checkpointIntervalBytes, + testData.writeEnabled, + testData.readVerificationEnabled); + EXPECT_EQ( + ssdFile_->testingChecksumReadVerificationEnabled(), + testData.expectedReadVerificationEnabled); + + // Populate the cache with some entries. + std::vector allEntries; + for (auto startOffset = 0; startOffset <= kSsdSize - SsdFile::kRegionSize; + startOffset += SsdFile::kRegionSize) { + auto pins = + makePins(fileName_.id(), startOffset, 4096, 2048 * 1025, 62 * kMB); + ssdFile_->write(pins); + for (auto& pin : pins) { + EXPECT_EQ(ssdFile_.get(), pin.entry()->ssdFile()); + allEntries.emplace_back( + pin.entry()->key(), pin.entry()->ssdOffset(), pin.entry()->size()); + }; + } + // All entries can be found. + EXPECT_EQ(checkEntries(allEntries), allEntries.size()); + + SsdCacheStats stats; + ssdFile_->updateStats(stats); + VELOX_CHECK_GT(stats.bytesCached, 0); + VELOX_CHECK_GT(stats.regionsCached, 0); + VELOX_CHECK_GT(stats.entriesCached, 0); + + // Try reinitializing cache from checkpoint with read verification + // enabled/disabled. 
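+ // Based on the expectations table above, a checkpoint appears to be
+ // recoverable only when the checksum-write setting on recovery is
+ // compatible with the setting it was written under; otherwise recovery
+ // starts from an empty cache.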
+ ssdFile_->checkpoint(true); + + SsdCacheStats statsAfterCheckpoint; + ssdFile_->updateStats(statsAfterCheckpoint); + ASSERT_EQ(statsAfterCheckpoint.bytesCached, stats.bytesCached); + ASSERT_EQ(statsAfterCheckpoint.regionsCached, stats.regionsCached); + ASSERT_EQ(statsAfterCheckpoint.entriesCached, stats.entriesCached); + + initializeSsdFile( + kSsdSize, + checkpointIntervalBytes, + testData.writeEnabledOnRecovery, + testData.readVerificationEnabledOnRecovery); + + SsdCacheStats statsAfterRecover; + ssdFile_->updateStats(statsAfterRecover); + if (testData.expectedCheckpointOnRecovery) { + ASSERT_EQ(statsAfterRecover.bytesCached, stats.bytesCached); + ASSERT_EQ(statsAfterRecover.regionsCached, stats.regionsCached); + ASSERT_EQ(statsAfterRecover.entriesCached, stats.entriesCached); + } else { + ASSERT_EQ(statsAfterRecover.bytesCached, 0); + ASSERT_EQ(statsAfterRecover.regionsCached, stats.regionsCached); + ASSERT_EQ(statsAfterRecover.entriesCached, 0); + } + + EXPECT_EQ( + ssdFile_->testingChecksumReadVerificationEnabled(), + testData.expectedReadVerificationEnabledOnRecovery); + + // Check if cache data is recoverable as expected. + if (testData.expectedCheckpointOnRecovery) { + EXPECT_EQ(checkEntries(allEntries), allEntries.size()); + } else { + EXPECT_EQ(checkEntries(allEntries), 0); + } + cache_->shutdown(); + memory::MemoryManager::testingSetInstance({}); + } +} + +TEST_F(SsdFileTest, recoverWithEvictedEntries) { + constexpr int64_t kSsdSize = 16 * SsdFile::kRegionSize; + const uint64_t checkpointIntervalBytes = 5 * SsdFile::kRegionSize; + const auto retainFile = + StringIdLease(fileIds(), "recoverWithEvictedEntries.Retained"); + const auto evictFile = + StringIdLease(fileIds(), "recoverWithEvictedEntries.Evicted"); + initializeCache(kSsdSize, checkpointIntervalBytes); + + std::vector allEntries; + uint32_t retainedCacheEntries{0}; + uint64_t retainedCacheSize{0}; + for (auto startOffset = 0; startOffset <= kSsdSize / 2 - SsdFile::kRegionSize; + startOffset += SsdFile::kRegionSize) { + auto pins = makePins( + retainFile.id(), + startOffset, + 4096, + 2048 * 1025, + SsdFile::kRegionSize / 2); + for (const auto& pin : pins) { + ++retainedCacheEntries; + retainedCacheSize += pin.entry()->size(); + } + ssdFile_->write(pins); + readAndCheckPins(pins); + } + + uint32_t evictedCacheEntries{0}; + uint64_t evictedCacheSize{0}; + for (auto startOffset = kSsdSize / 2; + startOffset <= kSsdSize - SsdFile::kRegionSize; + startOffset += SsdFile::kRegionSize) { + auto pins = makePins( + evictFile.id(), + startOffset + SsdFile::kRegionSize, + 4096, + 2048 * 1025, + SsdFile::kRegionSize / 2); + for (const auto& pin : pins) { + ++evictedCacheEntries; + evictedCacheSize += pin.entry()->size(); + } + ssdFile_->write(pins); + readAndCheckPins(pins); + } + + SsdCacheStats stats; + ssdFile_->updateStats(stats); + ASSERT_EQ(stats.bytesCached, retainedCacheSize + evictedCacheSize); + ASSERT_EQ(stats.regionsCached, 9); + ASSERT_EQ(stats.entriesCached, retainedCacheEntries + evictedCacheEntries); + + // Remove one file from the ssd cache. + folly::F14FastSet retainedFileIds; + ssdFile_->removeFileEntries({evictFile.id()}, retainedFileIds); + ASSERT_TRUE(retainedFileIds.empty()); + + stats.clear(); + ssdFile_->updateStats(stats); + // NOTE: removing file entries may erase regions whose space utilization + // falls below a certain threshold.
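+ // The retained stats below are therefore checked as upper bounds
+ // (ASSERT_LE) rather than exact values.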
+ ASSERT_LE(stats.bytesCached, retainedCacheSize); + ASSERT_LE(stats.regionsCached, 9); + ASSERT_LE(stats.entriesCached, retainedCacheEntries); + + // Re-initialize SSD file from checkpoint. + ssdFile_->checkpoint(true); + initializeSsdFile(kSsdSize, checkpointIntervalBytes); + + SsdCacheStats statsAfterRecovery; + ssdFile_->updateStats(statsAfterRecovery); + ASSERT_EQ(statsAfterRecovery.bytesCached, stats.bytesCached); + ASSERT_EQ(statsAfterRecovery.regionsCached, stats.regionsCached); + ASSERT_EQ(statsAfterRecovery.entriesCached, stats.entriesCached); +} + +TEST_F(SsdFileTest, ssdReadWithoutChecksumCheck) { + constexpr int64_t kSsdSize = 16 * SsdFile::kRegionSize; + + // Initialize cache with checksum read/write enabled. + initializeCache(kSsdSize, 0, true, true); + + // Test with one SSD cache entry only. + auto pins = makePins(fileName_.id(), 0, 4096, 4096, 4096); + ssdFile_->write(pins); + ASSERT_EQ(pins.size(), 1); + pins.back().entry()->setExclusiveToShared(); + SsdCacheStats stats; + ssdFile_->updateStats(stats); + ASSERT_EQ(stats.readWithoutChecksumChecks, 0); + + std::vector entries; + for (auto& pin : pins) { + ASSERT_EQ(ssdFile_.get(), pin.entry()->ssdFile()); + entries.emplace_back( + pin.entry()->key(), pin.entry()->ssdOffset(), pin.entry()->size()); + }; + std::vector shortEntries; + for (auto& pin : pins) { + ASSERT_EQ(ssdFile_.get(), pin.entry()->ssdFile()); + shortEntries.emplace_back( + pin.entry()->key(), pin.entry()->ssdOffset(), pin.entry()->size() / 2); + }; + + pins.clear(); + cache_->clear(); + ASSERT_EQ(cache_->refreshStats().numEntries, 0); + + ASSERT_EQ(checkEntries(entries), entries.size()); + stats.clear(); + ssdFile_->updateStats(stats); + ASSERT_EQ(stats.readWithoutChecksumChecks, 0); + + cache_->clear(); + ASSERT_EQ(cache_->refreshStats().numEntries, 0); + + stats.clear(); +#ifndef NDEBUG + VELOX_ASSERT_THROW(checkEntries(shortEntries), ""); + ssdFile_->updateStats(stats); + ASSERT_EQ(stats.readWithoutChecksumChecks, 0); +#else + ASSERT_EQ(checkEntries(shortEntries), shortEntries.size()); + ssdFile_->updateStats(stats); + ASSERT_EQ(stats.readWithoutChecksumChecks, 1); +#endif } #ifdef VELOX_SSD_FILE_TEST_SET_NO_COW_FLAG TEST_F(SsdFileTest, disabledCow) { constexpr int64_t kSsdSize = 16 * SsdFile::kRegionSize; - initializeCache(128 * kMB, kSsdSize, true); + initializeCache(kSsdSize, 0, false, false, true); EXPECT_TRUE(ssdFile_->testingIsCowDisabled()); } TEST_F(SsdFileTest, notDisabledCow) { constexpr int64_t kSsdSize = 16 * SsdFile::kRegionSize; - initializeCache(128 * kMB, kSsdSize, false); + initializeCache(kSsdSize, 0, false, false, false); EXPECT_FALSE(ssdFile_->testingIsCowDisabled()); } #endif // VELOX_SSD_FILE_TEST_SET_NO_COW_FLAG diff --git a/velox/common/caching/tests/SsdFileTrackerTest.cpp b/velox/common/caching/tests/SsdFileTrackerTest.cpp index eb21084a32298..851c72773747d 100644 --- a/velox/common/caching/tests/SsdFileTrackerTest.cpp +++ b/velox/common/caching/tests/SsdFileTrackerTest.cpp @@ -48,4 +48,23 @@ TEST(SsdFileTrackerTest, tracker) { tracker.findEvictionCandidates(kNumRegions, kNumRegions, pins); std::vector expected{0, 1, 4, 5, 6, 7, 8, 9}; EXPECT_EQ(candidates, expected); + + // Test large region scores. 
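+ // Repeated reads with very large sizes drive the per-region scores up
+ // until the floating-point accumulators overflow to +inf (checked via
+ // std::isinf below); eviction-candidate selection should still work in
+ // that state.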
+ tracker.clear(); + for (auto region = 0; region < kNumRegions; ++region) { + tracker.regionRead(region, INT32_MAX); + tracker.regionRead(region, region * 100'000'000); + } + for (int i = 0; i < 999; ++i) { + for (auto region = 0; region < kNumRegions; ++region) { + tracker.regionFilled(region); + } + } + for (const auto score : tracker.copyScores()) { + EXPECT_TRUE(std::isinf(score)); + } + // Mark all regions to be evictable. + std::fill(pins.begin(), pins.end(), 0); + candidates = tracker.findEvictionCandidates(3, kNumRegions, pins); + EXPECT_EQ(candidates.size(), 3); } diff --git a/velox/common/caching/tests/StringIdMapTest.cpp b/velox/common/caching/tests/StringIdMapTest.cpp index 3c1d319fc8855..59d95af2e88a1 100644 --- a/velox/common/caching/tests/StringIdMapTest.cpp +++ b/velox/common/caching/tests/StringIdMapTest.cpp @@ -17,6 +17,7 @@ #include "velox/common/caching/StringIdMap.h" #include "gtest/gtest.h" +#include "velox/common/base/tests/GTestUtils.h" using namespace facebook::velox; @@ -53,3 +54,52 @@ TEST(StringIdMapTest, rehash) { EXPECT_EQ(ids[i].id(), StringIdLease(map, name).id()); } } + +TEST(StringIdMapTest, recover) { + constexpr const char* kRecoverFile1 = "file_1"; + constexpr const char* kRecoverFile2 = "file_2"; + constexpr const char* kRecoverFile3 = "file_3"; + StringIdMap map; + const uint64_t recoverId1{10}; + const uint64_t recoverId2{20}; + { + StringIdLease lease(map, recoverId1, kRecoverFile1); + ASSERT_TRUE(lease.hasValue()); + ASSERT_EQ(map.pinnedSize(), ::strlen(kRecoverFile1)); + ASSERT_EQ(map.testingLastId(), recoverId1); + VELOX_ASSERT_THROW( + std::make_unique(map, recoverId1, kRecoverFile2), + "(1 vs. 0) Reused recover id 10 assigned to file_2"); + VELOX_ASSERT_THROW( + std::make_unique(map, recoverId2, kRecoverFile1), + "(20 vs. 10) Multiple recover ids assigned to file_1"); + } + ASSERT_EQ(map.pinnedSize(), 0); + + StringIdLease lease1(map, kRecoverFile1); + ASSERT_EQ(map.pinnedSize(), ::strlen(kRecoverFile1)); + ASSERT_EQ(map.testingLastId(), recoverId1 + 1); + + { + StringIdLease lease(map, recoverId2, kRecoverFile2); + ASSERT_TRUE(lease.hasValue()); + ASSERT_EQ(lease.id(), recoverId2); + ASSERT_EQ( + map.pinnedSize(), ::strlen(kRecoverFile1) + ::strlen(kRecoverFile2)); + ASSERT_EQ(map.testingLastId(), recoverId2); + VELOX_ASSERT_THROW( + std::make_unique(map, recoverId2, kRecoverFile3), + "(1 vs. 0) Reused recover id 20 assigned to file_3"); + VELOX_ASSERT_THROW( + std::make_unique(map, recoverId2, kRecoverFile1), + "(20 vs. 
11) Multiple recover ids assigned to file_1"); + StringIdLease dupLease(map, recoverId2, kRecoverFile2); + ASSERT_TRUE(lease.hasValue()); + ASSERT_EQ(lease.id(), recoverId2); + ASSERT_EQ( + map.pinnedSize(), ::strlen(kRecoverFile1) + ::strlen(kRecoverFile2)); + } + + ASSERT_EQ(map.testingLastId(), recoverId2); + ASSERT_EQ(map.pinnedSize(), ::strlen(kRecoverFile1)); +} diff --git a/velox/common/compression/CMakeLists.txt b/velox/common/compression/CMakeLists.txt index e429485151e35..25835c26c4bbd 100644 --- a/velox/common/compression/CMakeLists.txt +++ b/velox/common/compression/CMakeLists.txt @@ -16,8 +16,8 @@ if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) endif() -add_library(velox_common_compression Compression.cpp LzoDecompressor.cpp) -target_link_libraries( +velox_add_library(velox_common_compression Compression.cpp LzoDecompressor.cpp) +velox_link_libraries( velox_common_compression PUBLIC Folly::folly PRIVATE velox_exception) diff --git a/velox/common/compression/Compression.cpp b/velox/common/compression/Compression.cpp index 3843f5902a980..e17ff941ba99f 100644 --- a/velox/common/compression/Compression.cpp +++ b/velox/common/compression/Compression.cpp @@ -56,7 +56,8 @@ CompressionKind codecTypeToCompressionKind(folly::io::CodecType type) { case folly::io::CodecType::GZIP: return CompressionKind_GZIP; default: - VELOX_UNSUPPORTED("Not support folly codec type {}", type); + VELOX_UNSUPPORTED( + "Not support folly codec type {}", folly::to(type)); } } diff --git a/velox/common/compression/Compression.h b/velox/common/compression/Compression.h index 317f9717a2ae5..70f804f2e8611 100644 --- a/velox/common/compression/Compression.h +++ b/velox/common/compression/Compression.h @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -46,3 +47,14 @@ CompressionKind stringToCompressionKind(const std::string& kind); constexpr uint64_t DEFAULT_COMPRESSION_BLOCK_SIZE = 256 * 1024; } // namespace facebook::velox::common + +template <> +struct fmt::formatter + : fmt::formatter { + auto format( + const facebook::velox::common::CompressionKind& s, + format_context& ctx) const { + return formatter::format( + facebook::velox::common::compressionKindToString(s), ctx); + } +}; diff --git a/velox/common/compression/tests/CMakeLists.txt b/velox/common/compression/tests/CMakeLists.txt index d7200934b9fd8..3b75b6fea6981 100644 --- a/velox/common/compression/tests/CMakeLists.txt +++ b/velox/common/compression/tests/CMakeLists.txt @@ -17,4 +17,5 @@ add_test(velox_common_compression_test velox_common_compression_test) target_link_libraries( velox_common_compression_test PUBLIC velox_link_libs - PRIVATE velox_common_compression velox_exception gtest gtest_main) + PRIVATE velox_common_compression velox_exception GTest::gtest + GTest::gtest_main) diff --git a/velox/common/config/CMakeLists.txt b/velox/common/config/CMakeLists.txt index f6f4df0f1992d..7780665a29251 100644 --- a/velox/common/config/CMakeLists.txt +++ b/velox/common/config/CMakeLists.txt @@ -12,6 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-add_library(velox_spill_config SpillConfig.cpp) -target_link_libraries(velox_spill_config Folly::folly velox_exception - velox_common_compression) +if (${VELOX_BUILD_TESTING}) + add_subdirectory(tests) +endif () + +velox_add_library(velox_common_config Config.cpp) +velox_link_libraries( + velox_common_config + PUBLIC velox_common_base + velox_exception + PRIVATE re2::re2) diff --git a/velox/common/config/Config.cpp b/velox/common/config/Config.cpp new file mode 100644 index 0000000000000..0582b1e755e40 --- /dev/null +++ b/velox/common/config/Config.cpp @@ -0,0 +1,150 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "velox/common/config/Config.h" + +namespace facebook::velox::config { + +double toBytesPerCapacityUnit(CapacityUnit unit) { + switch (unit) { + case CapacityUnit::BYTE: + return 1; + case CapacityUnit::KILOBYTE: + return exp2(10); + case CapacityUnit::MEGABYTE: + return exp2(20); + case CapacityUnit::GIGABYTE: + return exp2(30); + case CapacityUnit::TERABYTE: + return exp2(40); + case CapacityUnit::PETABYTE: + return exp2(50); + default: + VELOX_USER_FAIL("Invalid capacity unit '{}'", (int)unit); + } +} + +CapacityUnit valueOfCapacityUnit(const std::string& unitStr) { + std::stringstream ss; + for (const char c : unitStr) { + ss << static_cast(std::tolower(c)); + } + const auto lowerUnitStr = ss.str(); + if (lowerUnitStr == "b") { + return CapacityUnit::BYTE; + } + if (lowerUnitStr == "kb") { + return CapacityUnit::KILOBYTE; + } + if (lowerUnitStr == "mb") { + return CapacityUnit::MEGABYTE; + } + if (lowerUnitStr == "gb") { + return CapacityUnit::GIGABYTE; + } + if (lowerUnitStr == "tb") { + return CapacityUnit::TERABYTE; + } + if (lowerUnitStr == "pb") { + return CapacityUnit::PETABYTE; + } + VELOX_USER_FAIL("Invalid capacity unit '{}'", unitStr); +} + +uint64_t toCapacity(const std::string& from, CapacityUnit to) { + static const RE2 kPattern(R"(^\s*(\d+(?:\.\d+)?)\s*([a-zA-Z]+)\s*$)"); + double value; + std::string unit; + if (!RE2::FullMatch(from, kPattern, &value, &unit)) { + VELOX_USER_FAIL("Invalid capacity string '{}'", from); + } + + return value * + (toBytesPerCapacityUnit(valueOfCapacityUnit(unit)) / + toBytesPerCapacityUnit(to)); +} + +std::chrono::duration toDuration(const std::string& str) { + static const RE2 kPattern(R"(^\s*(\d+(?:\.\d+)?)\s*([a-zA-Z]+)\s*)"); + + double value; + std::string unit; + if (!RE2::FullMatch(str, kPattern, &value, &unit)) { + VELOX_USER_FAIL("Invalid duration '{}'", str); + } + if (unit == "ns") { + return std::chrono::duration(value); + } else if (unit == "us") { + return std::chrono::duration(value); + } else if (unit == "ms") { + return std::chrono::duration(value); + } else if (unit == "s") { + return std::chrono::duration(value); + } else if (unit == "m") { + return std::chrono::duration>(value); + } else if (unit == "h") { + return std::chrono::duration>(value); + } else if (unit == "d") { + return std::chrono::duration>(value); + } + 
VELOX_USER_FAIL("Invalid duration '{}'", str); +} + +ConfigBase& ConfigBase::set(const std::string& key, const std::string& val) { + VELOX_CHECK(mutable_, "Cannot set in immutable config"); + std::unique_lock l(mutex_); + configs_[key] = val; + return *this; +} + +ConfigBase& ConfigBase::reset() { + VELOX_CHECK(mutable_, "Cannot reset in immutable config"); + std::unique_lock l(mutex_); + configs_.clear(); + return *this; +} + +bool ConfigBase::valueExists(const std::string& key) const { + std::shared_lock l(mutex_); + return configs_.find(key) != configs_.end(); +}; + +const std::unordered_map& ConfigBase::rawConfigs() + const { + VELOX_CHECK( + !mutable_, + "Mutable config cannot return unprotected reference to raw configs."); + return configs_; +} + +std::unordered_map ConfigBase::rawConfigsCopy() + const { + std::shared_lock l(mutex_); + return configs_; +} + +folly::Optional ConfigBase::get(const std::string& key) const { + folly::Optional val; + std::shared_lock l(mutex_); + auto it = configs_.find(key); + if (it != configs_.end()) { + val = it->second; + } + return val; +} +} // namespace facebook::velox::config diff --git a/velox/common/config/Config.h b/velox/common/config/Config.h index dd5f37c55b73a..96f77d59cd7b8 100644 --- a/velox/common/config/Config.h +++ b/velox/common/config/Config.h @@ -18,84 +18,144 @@ #include #include +#include #include #include "folly/Conv.h" #include "velox/common/base/Exceptions.h" -namespace facebook::velox::common { -// The concrete config class would inherit the config base -// and then just define all the entries. -template +namespace facebook::velox::config { + +enum class CapacityUnit { + BYTE, + KILOBYTE, + MEGABYTE, + GIGABYTE, + TERABYTE, + PETABYTE +}; + +double toBytesPerCapacityUnit(CapacityUnit unit); + +CapacityUnit valueOfCapacityUnit(const std::string& unitStr); + +/// Convert capacity string with unit to the capacity number in the specified +/// units +uint64_t toCapacity(const std::string& from, CapacityUnit to); + +std::chrono::duration toDuration(const std::string& str); + +/// The concrete config class should inherit the config base and define all the +/// entries. class ConfigBase { public: template - class Entry { - private: + struct Entry { Entry( - const std::string& key, - const T& val, - std::function toStr = + const std::string& _key, + const T& _val, + std::function _toStr = [](const T& val) { return folly::to(val); }, - std::function toT = - [](const std::string& key, const std::string& val) { - auto converted = folly::tryTo(val); + std::function _toT = + [](const std::string& k, const std::string& v) { + auto converted = folly::tryTo(v); VELOX_CHECK( converted.hasValue(), fmt::format( "Invalid configuration for key '{}'. 
Value '{}' cannot be converted to type {}.", - key, - val, + k, + v, folly::demangle(typeid(T)))); return converted.value(); }) - : key_{key}, default_{val}, toStr_{toStr}, toT_{toT} {} + : key{_key}, defaultVal{_val}, toStr{_toStr}, toT{_toT} {} - public: - const std::string& configKey() const { - return key_; - } + const std::string key; + const T defaultVal; + const std::function toStr; + const std::function toT; + }; - private: - const std::string key_; - const T default_; - const std::function toStr_; - const std::function toT_; + ConfigBase( + std::unordered_map&& configs, + bool _mutable = false) + : configs_(std::move(configs)), mutable_(_mutable) {} - friend ConfigBase; - friend ConcreteConfig; - }; + virtual ~ConfigBase() {} template ConfigBase& set(const Entry& entry, const T& val) { - configs_[entry.key_] = entry.toStr_(val); + VELOX_CHECK(mutable_, "Cannot set in immutable config"); + std::unique_lock l(mutex_); + configs_[entry.key] = entry.toStr(val); return *this; } + ConfigBase& set(const std::string& key, const std::string& val); + template ConfigBase& unset(const Entry& entry) { - configs_.erase(entry.key_); + VELOX_CHECK(mutable_, "Cannot unset in immutable config"); + std::unique_lock l(mutex_); + configs_.erase(entry.key); return *this; } - ConfigBase& reset() { - configs_.clear(); - return *this; - } + ConfigBase& reset(); template T get(const Entry& entry) const { - auto iter = configs_.find(entry.key_); - return iter != configs_.end() ? entry.toT_(entry.key_, iter->second) - : entry.default_; + std::shared_lock l(mutex_); + auto iter = configs_.find(entry.key); + return iter != configs_.end() ? entry.toT(entry.key, iter->second) + : entry.defaultVal; + } + + template + folly::Optional get( + const std::string& key, + std::function toT = [](auto /* unused */, + auto value) { + return folly::to(value); + }) const { + auto val = get(key); + if (val.hasValue()) { + return toT(key, val.value()); + } else { + return folly::none; + } } - std::map toSerdeParams() { - return std::map{configs_.cbegin(), configs_.cend()}; + template + T get( + const std::string& key, + const T& defaultValue, + std::function toT = [](auto /* unused */, + auto value) { + return folly::to(value); + }) const { + auto val = get(key); + if (val.hasValue()) { + return toT(key, val.value()); + } else { + return defaultValue; + } } + bool valueExists(const std::string& key) const; + + const std::unordered_map& rawConfigs() const; + + std::unordered_map rawConfigsCopy() const; + protected: + mutable std::shared_mutex mutex_; std::unordered_map configs_; + + private: + folly::Optional get(const std::string& key) const; + + const bool mutable_; }; -} // namespace facebook::velox::common +} // namespace facebook::velox::config diff --git a/velox/common/config/SpillConfig.cpp b/velox/common/config/SpillConfig.cpp deleted file mode 100644 index 05936c0325d46..0000000000000 --- a/velox/common/config/SpillConfig.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "velox/common/config/SpillConfig.h" -#include "velox/common/base/Exceptions.h" - -namespace facebook::velox::common { -SpillConfig::SpillConfig( - const std::string& _filePath, - uint64_t _maxFileSize, - uint64_t _writeBufferSize, - uint64_t _minSpillRunSize, - folly::Executor* _executor, - int32_t _minSpillableReservationPct, - int32_t _spillableReservationGrowthPct, - uint8_t _startPartitionBit, - uint8_t _joinPartitionBits, - uint8_t _aggregationPartitionBits, - bool _aggregationSpillAll, - int32_t _maxSpillLevel, - int32_t _testSpillPct, - const std::string& _compressionKind) - : filePath(_filePath), - maxFileSize( - _maxFileSize == 0 ? std::numeric_limits::max() - : _maxFileSize), - writeBufferSize(_writeBufferSize), - minSpillRunSize(_minSpillRunSize), - executor(_executor), - minSpillableReservationPct(_minSpillableReservationPct), - spillableReservationGrowthPct(_spillableReservationGrowthPct), - startPartitionBit(_startPartitionBit), - joinPartitionBits(_joinPartitionBits), - aggregationPartitionBits(_aggregationPartitionBits), - aggregationSpillAll(_aggregationSpillAll), - maxSpillLevel(_maxSpillLevel), - testSpillPct(_testSpillPct), - compressionKind(common::stringToCompressionKind(_compressionKind)) { - VELOX_USER_CHECK_GE( - spillableReservationGrowthPct, - minSpillableReservationPct, - "Spillable memory reservation growth pct should not be lower than minimum available pct"); -} - -int32_t SpillConfig::joinSpillLevel(uint8_t startBitOffset) const { - const auto numPartitionBits = joinPartitionBits; - VELOX_CHECK_LE( - startBitOffset + numPartitionBits, - 64, - "startBitOffset:{} numPartitionsBits:{}", - startBitOffset, - numPartitionBits); - const int32_t deltaBits = startBitOffset - startPartitionBit; - VELOX_CHECK_GE(deltaBits, 0, "deltaBits:{}", deltaBits); - VELOX_CHECK_EQ( - deltaBits % numPartitionBits, - 0, - "deltaBits:{} numPartitionsBits{}", - deltaBits, - numPartitionBits); - return deltaBits / numPartitionBits; -} - -bool SpillConfig::exceedJoinSpillLevelLimit(uint8_t startBitOffset) const { - if (startBitOffset + joinPartitionBits > 64) { - return true; - } - if (maxSpillLevel == -1) { - return false; - } - return joinSpillLevel(startBitOffset) > maxSpillLevel; -} -} // namespace facebook::velox::common diff --git a/velox/common/config/SpillConfig.h b/velox/common/config/SpillConfig.h deleted file mode 100644 index 5c20f677642be..0000000000000 --- a/velox/common/config/SpillConfig.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include -#include - -#include -#include "velox/common/compression/Compression.h" - -namespace facebook::velox::common { -/// Specifies the config for spilling. 
-struct SpillConfig { - SpillConfig( - const std::string& _filePath, - uint64_t _maxFileSize, - uint64_t _writeBufferSize, - uint64_t _minSpillRunSize, - folly::Executor* _executor, - int32_t _minSpillableReservationPct, - int32_t _spillableReservationGrowthPct, - uint8_t _startPartitionBit, - uint8_t _joinPartitionBits, - uint8_t _aggregationPartitionBits, - bool _aggregationSpillAll, - int32_t _maxSpillLevel, - int32_t _testSpillPct, - const std::string& _compressionKind); - - /// Returns the hash join spilling level with given 'startBitOffset'. - /// - /// NOTE: we advance (or right shift) the partition bit offset when goes to - /// the next level of recursive spilling. - int32_t joinSpillLevel(uint8_t startBitOffset) const; - - /// Checks if the given 'startBitOffset' has exceeded the max hash join - /// spill limit. - bool exceedJoinSpillLevelLimit(uint8_t startBitOffset) const; - - /// Filesystem path for spill files. - std::string filePath; - - /// The max spill file size. If it is zero, there is no limit on the spill - /// file size. - uint64_t maxFileSize; - - /// Specifies the size to buffer the serialized spill data before write to - /// storage system for io efficiency. - uint64_t writeBufferSize; - - /// The min spill run size (bytes) limit used to select partitions for - /// spilling. The spiller tries to spill a previously spilled partitions if - /// its data size exceeds this limit, otherwise it spills the partition with - /// most data. If the limit is zero, then the spiller always spill a - /// previously spilled partition if it has any data. This is to avoid spill - /// from a partition wigth a small amount of data which might result in - /// generating too many small spilled files. - uint64_t minSpillRunSize; - - /// Executor for spilling. If nullptr spilling writes on the Driver's thread. - folly::Executor* executor; // Not owned. - - /// The minimal spillable memory reservation in percentage of the current - /// memory usage. - int32_t minSpillableReservationPct; - - /// The spillable memory reservation growth in percentage of the current - /// memory usage. - int32_t spillableReservationGrowthPct; - - /// Used to calculate spill partition number. - uint8_t startPartitionBit; - - /// Used to calculate the spill hash partition number for hash join with - /// 'startPartitionBit'. - uint8_t joinPartitionBits; - - /// Used to calculate the spill hash partition number for aggregation with - /// 'startPartitionBit'. - uint8_t aggregationPartitionBits; - - /// If true and spilling has been triggered during the input processing, the - /// spiller will spill all the remaining in-memory state to disk before output - /// processing. This is to simplify the aggregation query OOM prevention in - /// output processing stage. - bool aggregationSpillAll; - - /// The max allowed spilling level with zero being the initial spilling - /// level. This only applies for hash build spilling which needs recursive - /// spilling when the build table is too big. If it is set to -1, then there - /// is no limit and then some extreme large query might run out of spilling - /// partition bits at the end. - int32_t maxSpillLevel; - - /// Percentage of input batches to be spilled for testing. 0 means no - /// spilling for test. - int32_t testSpillPct; - - /// CompressionKind when spilling, CompressionKind_NONE means no compression. 
- common::CompressionKind compressionKind; -}; -} // namespace facebook::velox::common diff --git a/velox/common/config/tests/CMakeLists.txt b/velox/common/config/tests/CMakeLists.txt new file mode 100644 index 0000000000000..83de01821eea5 --- /dev/null +++ b/velox/common/config/tests/CMakeLists.txt @@ -0,0 +1,20 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_executable(velox_config_test ConfigTest.cpp) +add_test(velox_config_test velox_config_test) +target_link_libraries( + velox_config_test + PUBLIC Folly::folly + PRIVATE velox_common_config GTest::gtest GTest::gtest_main) diff --git a/velox/common/config/tests/ConfigTest.cpp b/velox/common/config/tests/ConfigTest.cpp new file mode 100644 index 0000000000000..b97e2df3528b0 --- /dev/null +++ b/velox/common/config/tests/ConfigTest.cpp @@ -0,0 +1,268 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/common/config/Config.h" + +namespace facebook::velox::config { + +class ConfigTest : public ::testing::Test {}; + +enum TestEnum { ENUM_0 = 0, ENUM_1 = 1, ENUM_2 = 2, UNKNOWN = 3 }; + +class TestConfig : public ConfigBase { + public: + template + using Entry = ConfigBase::Entry; + + static Entry kInt32Entry; + static Entry kUint64Entry; + static Entry kBoolEntry; + static Entry kStringEntry; + static Entry kEnumEntry; + + TestConfig( + std::unordered_map&& configs, + bool _mutable) + : ConfigBase(std::move(configs), _mutable) {} +}; + +// Definition needs to be outside of class +TestConfig::Entry TestConfig::kInt32Entry("int32_entry", -32); +TestConfig::Entry TestConfig::kUint64Entry("uint64_entry", 64); +TestConfig::Entry TestConfig::kBoolEntry("bool_entry", true); +TestConfig::Entry TestConfig::kStringEntry( + "string_entry", + "default.string.value"); +TestConfig::Entry TestConfig::kEnumEntry( + "enum_entry", + TestEnum::ENUM_0, + [](const TestEnum& value) { + if (value == TestEnum::ENUM_0) { + return "ENUM_0"; + } + if (value == TestEnum::ENUM_1) { + return "ENUM_1"; + } + if (value == TestEnum::ENUM_2) { + return "ENUM_2"; + } + return "UNKNOWN"; + }, + [](const std::string& /* unused */, const std::string& v) { + if (v == "ENUM_0") { + return TestEnum::ENUM_0; + } + if (v == "ENUM_1") { + return TestEnum::ENUM_1; + } + if (v == "ENUM_2") { + return TestEnum::ENUM_2; + } + return TestEnum::UNKNOWN; + }); + +TEST_F(ConfigTest, creation) { + { + std::unordered_map rawConfigs{}; + auto config = std::make_shared(std::move(rawConfigs), false); + ASSERT_EQ(config->rawConfigs().size(), 0); + ASSERT_EQ(config->rawConfigsCopy().size(), 0); + } + + { + std::unordered_map rawConfigs{}; + rawConfigs.emplace("int32_entry", "-3200"); + auto config = std::make_shared(std::move(rawConfigs), true); + ASSERT_EQ(config->rawConfigsCopy().size(), 1); + VELOX_ASSERT_THROW( + config->rawConfigs(), + "Mutable config cannot return unprotected reference to raw configs."); + } +} + +TEST_F(ConfigTest, immutableConfig) { + // Testing default values + auto config = std::make_shared( + std::unordered_map(), false); + ASSERT_EQ(config->get(TestConfig::kInt32Entry), -32); + ASSERT_EQ(config->get(TestConfig::kUint64Entry), 64); + ASSERT_EQ(config->get(TestConfig::kBoolEntry), true); + ASSERT_EQ(config->get(TestConfig::kStringEntry), "default.string.value"); + ASSERT_EQ(config->get(TestConfig::kEnumEntry), TestEnum::ENUM_0); + + std::unordered_map rawConfigs{ + {TestConfig::kInt32Entry.key, "-3200"}, + {TestConfig::kUint64Entry.key, "6400"}, + {TestConfig::kStringEntry.key, "not.default.string.value"}, + {TestConfig::kBoolEntry.key, "false"}, + {TestConfig::kEnumEntry.key, "ENUM_2"}, + }; + + auto expectedRawConfigs = rawConfigs; + + config = std::make_shared(std::move(rawConfigs), false); + + // Testing behavior when trying to modify the immutable config + VELOX_ASSERT_THROW(config->set(TestConfig::kInt32Entry, 100), "Cannot set"); + VELOX_ASSERT_THROW( + config->set(TestConfig::kInt32Entry.key, "100"), "Cannot set"); + VELOX_ASSERT_THROW(config->unset(TestConfig::kInt32Entry), "Cannot unset"); + VELOX_ASSERT_THROW(config->reset(), "Cannot reset"); + + // Ensure values are unchanged after attempted modifications + ASSERT_EQ(config->get(TestConfig::kInt32Entry), -3200); + ASSERT_EQ(config->get(TestConfig::kUint64Entry), 6400); + ASSERT_EQ(config->get(TestConfig::kBoolEntry), false); + 
ASSERT_EQ(config->get(TestConfig::kStringEntry), "not.default.string.value"); + ASSERT_EQ(config->get(TestConfig::kEnumEntry), TestEnum::ENUM_2); + ASSERT_EQ( + config->get( + TestConfig::kInt32Entry.key, TestConfig::kInt32Entry.defaultVal), + -3200); + ASSERT_EQ( + config->get( + TestConfig::kUint64Entry.key, TestConfig::kUint64Entry.defaultVal), + 6400); + ASSERT_EQ( + config->get( + TestConfig::kBoolEntry.key, TestConfig::kBoolEntry.defaultVal), + false); + ASSERT_EQ( + config->get( + TestConfig::kStringEntry.key, TestConfig::kStringEntry.defaultVal), + "not.default.string.value"); + ASSERT_EQ( + config->get( + TestConfig::kEnumEntry.key, + TestConfig::kEnumEntry.defaultVal, + TestConfig::kEnumEntry.toT), + TestEnum::ENUM_2); + ASSERT_TRUE(config->get(TestConfig::kInt32Entry.key).has_value()); + ASSERT_EQ(config->get(TestConfig::kInt32Entry.key).value(), -3200); + ASSERT_FALSE(config->get("wrong_int32_key").has_value()); + + // Testing value existence + ASSERT_TRUE(config->valueExists(TestConfig::kInt32Entry.key)); + ASSERT_FALSE(config->valueExists("non_existent_entry")); + + // Testing retrieval of raw configurations + ASSERT_EQ(expectedRawConfigs, config->rawConfigs()); + ASSERT_EQ(expectedRawConfigs, config->rawConfigsCopy()); +} + +TEST_F(ConfigTest, mutableConfig) { + // Create a mutable configuration with some initial values + std::unordered_map initialConfigs{ + {TestConfig::kInt32Entry.key, "-3200"}, + {TestConfig::kUint64Entry.key, "6400"}, + {TestConfig::kStringEntry.key, "initial.string.value"}, + {TestConfig::kBoolEntry.key, "false"}, + {TestConfig::kEnumEntry.key, "ENUM_2"}, + }; + + auto config = std::make_shared(std::move(initialConfigs), true); + + // Test setting new values + (*config) + .set(TestConfig::kInt32Entry, 123) + .set(TestConfig::kStringEntry, std::string("modified.string.value")) + .set(TestConfig::kBoolEntry.key, "true") + .set(TestConfig::kEnumEntry.key, "ENUM_1"); + + ASSERT_EQ(config->get(TestConfig::kInt32Entry), 123); + ASSERT_EQ(config->get(TestConfig::kStringEntry), "modified.string.value"); + ASSERT_EQ(config->get(TestConfig::kBoolEntry), true); + ASSERT_EQ(config->get(TestConfig::kEnumEntry), TestEnum::ENUM_1); + + // Test unsetting values + ASSERT_EQ(config->get(TestConfig::kUint64Entry), 6400); + config->unset(TestConfig::kUint64Entry); + ASSERT_EQ( + config->get(TestConfig::kUint64Entry), + TestConfig::kUint64Entry.defaultVal); + + // Test resetting the configuration + config->reset(); + auto rawConfigsCopy = config->rawConfigsCopy(); + ASSERT_TRUE(rawConfigsCopy.empty()); + ASSERT_FALSE(config->valueExists(TestConfig::kUint64Entry.key)); +} + +TEST_F(ConfigTest, capacityConversion) { + folly::Random::DefaultGenerator rng; + rng.seed(1); + + std::unordered_map> unitStrLookup{ + {CapacityUnit::BYTE, {"b", "B"}}, + {CapacityUnit::KILOBYTE, {"kb", "kB", "Kb", "KB"}}, + {CapacityUnit::MEGABYTE, {"mb", "mB", "Mb", "MB"}}, + {CapacityUnit::GIGABYTE, {"gb", "gB", "Gb", "GB"}}, + {CapacityUnit::TERABYTE, {"tb", "tB", "Tb", "TB"}}, + {CapacityUnit::PETABYTE, {"pb", "pB", "Pb", "PB"}}}; + + std::vector> units{ + {CapacityUnit::BYTE, 1}, + {CapacityUnit::KILOBYTE, 1024}, + {CapacityUnit::MEGABYTE, 1024 * 1024}, + {CapacityUnit::GIGABYTE, 1024 * 1024 * 1024}, + {CapacityUnit::TERABYTE, 1024ll * 1024 * 1024 * 1024}, + {CapacityUnit::PETABYTE, 1024ll * 1024 * 1024 * 1024 * 1024}}; + for (int32_t i = 0; i < units.size(); i++) { + for (int32_t j = 0; j < units.size(); j++) { + // We use this diffRatio to prevent float conversion overflow when + // converting 
from one unit to another. + uint64_t diffRatio = i < j ? units[j].second / units[i].second + : units[i].second / units[j].second; + uint64_t randNumber = folly::Random::rand64(rng); + uint64_t testNumber = i > j ? randNumber / diffRatio : randNumber; + const auto& unitStrs = unitStrLookup[units[i].first]; + for (int32_t k = 0; k < unitStrs.size(); k++) { + ASSERT_EQ( + toCapacity( + std::string(std::to_string(testNumber) + unitStrs[k]), + units[j].first), + (uint64_t)(testNumber * (units[i].second / units[j].second))); + } + } + } +} + +TEST_F(ConfigTest, durationConversion) { + folly::Random::DefaultGenerator rng; + rng.seed(1); + + std::vector> units{ + {"ns", 1}, + {"us", 1000}, + {"ms", 1000 * 1000}, + {"s", 1000ll * 1000 * 1000}, + {"m", 1000ll * 1000 * 1000 * 60}, + {"h", 1000ll * 1000 * 1000 * 60 * 60}, + {"d", 1000ll * 1000 * 1000 * 60 * 60 * 24}}; + for (uint32_t i = 0; i < units.size(); i++) { + auto testNumber = folly::Random::rand32(rng) % 10000; + auto duration = + toDuration(std::string(std::to_string(testNumber) + units[i].first)); + ASSERT_EQ( + testNumber * units[i].second, + std::chrono::duration_cast(duration).count()); + } +} +} // namespace facebook::velox::config diff --git a/velox/common/encode/Base64.cpp b/velox/common/encode/Base64.cpp index 85fd843b86a83..da4e9cdbfcfdd 100644 --- a/velox/common/encode/Base64.cpp +++ b/velox/common/encode/Base64.cpp @@ -20,14 +20,25 @@ #include #include +#include "velox/common/base/Exceptions.h" + namespace facebook::velox::encoding { +// Constants defining the size in bytes of binary and encoded blocks for Base64 +// encoding. +// Size of a binary block in bytes (3 bytes = 24 bits) +constexpr static int kBinaryBlockByteSize = 3; +// Size of an encoded block in bytes (4 bytes = 24 bits) +constexpr static int kEncodedBlockByteSize = 4; + +// Character sets for Base64 and Base64 URL encoding constexpr const Base64::Charset kBase64Charset = { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'}; + constexpr const Base64::Charset kBase64UrlCharset = { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', @@ -35,6 +46,7 @@ constexpr const Base64::Charset kBase64UrlCharset = { 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'}; +// Reverse lookup tables for decoding constexpr const Base64::ReverseIndex kBase64ReverseIndexTable = { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, @@ -54,6 +66,7 @@ constexpr const Base64::ReverseIndex kBase64ReverseIndexTable = { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}; + constexpr const Base64::ReverseIndex kBase64UrlReverseIndexTable = { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, @@ -74,13 +87,15 @@ constexpr const Base64::ReverseIndex kBase64UrlReverseIndexTable = { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}; +// Validate the character in 
charset with ReverseIndex table constexpr bool checkForwardIndex( uint8_t idx, const Base64::Charset& charset, - const Base64::ReverseIndex& table) { - return (table[static_cast(charset[idx])] == idx) && - (idx > 0 ? checkForwardIndex(idx - 1, charset, table) : true); + const Base64::ReverseIndex& reverseIndex) { + return (reverseIndex[static_cast(charset[idx])] == idx) && + (idx > 0 ? checkForwardIndex(idx - 1, charset, reverseIndex) : true); } + // Verify that for every entry in kBase64Charset, the corresponding entry // in kBase64ReverseIndexTable is correct. static_assert( @@ -89,6 +104,7 @@ static_assert( kBase64Charset, kBase64ReverseIndexTable), "kBase64Charset has incorrect entries"); + // Verify that for every entry in kBase64UrlCharset, the corresponding entry // in kBase64UrlReverseIndexTable is correct. static_assert( @@ -97,26 +113,28 @@ static_assert( kBase64UrlCharset, kBase64UrlReverseIndexTable), "kBase64UrlCharset has incorrect entries"); -// Similar to strchr(), but for null-terminated const strings. -// Another difference is that we do not consider "\0" to be present in the -// string. -// Returns true if "str" contains the character c. -constexpr bool constCharsetContains( + +// Searches for a character within a charset up to a certain index. +constexpr bool findCharacterInCharset( const Base64::Charset& charset, uint8_t idx, const char c) { return idx < charset.size() && - ((charset[idx] == c) || constCharsetContains(charset, idx + 1, c)); + ((charset[idx] == c) || findCharacterInCharset(charset, idx + 1, c)); } + +// Checks the consistency of a reverse index mapping for a given character +// set. constexpr bool checkReverseIndex( uint8_t idx, const Base64::Charset& charset, - const Base64::ReverseIndex& table) { - return (table[idx] == 255 - ? !constCharsetContains(charset, 0, static_cast(idx)) - : (charset[table[idx]] == idx)) && - (idx > 0 ? checkReverseIndex(idx - 1, charset, table) : true); + const Base64::ReverseIndex& reverseIndex) { + return (reverseIndex[idx] == 255 + ? !findCharacterInCharset(charset, 0, static_cast(idx)) + : (charset[reverseIndex[idx]] == idx)) && + (idx > 0 ? checkReverseIndex(idx - 1, charset, reverseIndex) : true); } + // Verify that for every entry in kBase64ReverseIndexTable, the corresponding // entry in kBase64Charset is correct. static_assert( @@ -125,10 +143,11 @@ static_assert( kBase64Charset, kBase64ReverseIndexTable), "kBase64ReverseIndexTable has incorrect entries."); + // Verify that for every entry in kBase64ReverseIndexTable, the corresponding // entry in kBase64Charset is correct. -// We can't run this check as the URL version has two duplicate entries so that -// the url decoder can handle url encodings and default encodings +// We can't run this check as the URL version has two duplicate entries so +// that the url decoder can handle url encodings and default encodings // static_assert( // checkReverseIndex( // sizeof(kBase64UrlReverseIndexTable) - 1, @@ -136,14 +155,15 @@ static_assert( // kBase64UrlReverseIndexTable), // "kBase64UrlReverseIndexTable has incorrect entries."); +// Implementation of Base64 encoding and decoding functions. 
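The checkForwardIndex/checkReverseIndex pair above leans on constexpr recursion so that the charset and its reverse lookup table are proven consistent at compile time. A minimal sketch of the same idiom on a hypothetical four-character alphabet (C++17; not the real Velox tables):

```
#include <array>
#include <cstdint>

// Hypothetical 4-character alphabet standing in for the 64-entry Base64 one.
constexpr std::array<char, 4> kCharset = {'A', 'B', 'C', 'D'};

// Build the reverse index; 255 marks bytes outside the alphabet.
constexpr std::array<uint8_t, 256> makeReverseIndex() {
  std::array<uint8_t, 256> table{};
  for (auto& entry : table) {
    entry = 255;
  }
  for (uint8_t i = 0; i < kCharset.size(); ++i) {
    table[static_cast<uint8_t>(kCharset[i])] = i;
  }
  return table;
}
constexpr auto kReverseIndex = makeReverseIndex();

// Same shape as checkForwardIndex(): recurse from the highest index down,
// requiring reverseIndex[charset[idx]] == idx for every charset entry.
constexpr bool checkForward(uint8_t idx) {
  return kReverseIndex[static_cast<uint8_t>(kCharset[idx])] == idx &&
      (idx > 0 ? checkForward(idx - 1) : true);
}

// Any mismatch is a compile error, so a broken table can never be linked in.
static_assert(checkForward(kCharset.size() - 1), "inconsistent tables");
```

This is why the commented-out static_assert for the URL table matters: that table intentionally maps two inputs to one value, so the round-trip property checked here cannot hold for it.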
template -/* static */ std::string -Base64::encodeImpl(const T& data, const Charset& charset, bool include_pad) { +/* static */ std::string Base64::encodeImpl( + const T& data, + const Base64::Charset& charset, + bool include_pad) { size_t outlen = calculateEncodedSize(data.size(), include_pad); - std::string out; out.resize(outlen); - encodeImpl(data, charset, include_pad, out.data()); return out; } @@ -176,7 +196,7 @@ void Base64::encodeUrl(const char* data, size_t len, char* output) { template /* static */ void Base64::encodeImpl( const T& data, - const Charset& charset, + const Base64::Charset& charset, bool include_pad, char* out) { auto len = data.size(); @@ -211,22 +231,24 @@ template *wp++ = charset[(curr >> 12) & 0x3f]; *wp++ = charset[(curr >> 6) & 0x3f]; if (include_pad) { - *wp = kBase64Pad; + *wp = kPadding; } } else { *wp++ = charset[(curr >> 12) & 0x3f]; if (include_pad) { - *wp++ = kBase64Pad; - *wp = kBase64Pad; + *wp++ = kPadding; + *wp = kPadding; } } } } +// static std::string Base64::encode(folly::StringPiece text) { return encodeImpl(text, kBase64Charset, true); } +// static std::string Base64::encode(const char* data, size_t len) { return encode(folly::StringPiece(data, len)); } @@ -277,31 +299,25 @@ class IOBufWrapper { } // namespace +// static std::string Base64::encode(const folly::IOBuf* data) { return encodeImpl(IOBufWrapper(data), kBase64Charset, true); } -void Base64::encodeAppend(folly::StringPiece text, std::string& out) { - size_t outlen = calculateEncodedSize(text.size(), true); - - size_t initialLen = out.size(); - out.resize(initialLen + outlen); - encodeImpl(text, kBase64Charset, true, out.data() + initialLen); -} - +// static std::string Base64::decode(folly::StringPiece encoded) { std::string output; Base64::decode(std::make_pair(encoded.data(), encoded.size()), output); return output; } +// static void Base64::decode( const std::pair& payload, std::string& output) { - size_t out_len = payload.second / 4 * 3; - output.resize(out_len, '\0'); - out_len = Base64::decode(payload.first, payload.second, &output[0], out_len); - output.resize(out_len); + size_t inputSize = payload.second; + output.resize(calculateDecodedSize(payload.first, inputSize)); + decode(payload.first, inputSize, output.data(), output.size()); } // static @@ -310,81 +326,80 @@ void Base64::decode(const char* data, size_t size, char* output) { Base64::decode(data, size, output, out_len); } -uint8_t Base64::Base64ReverseLookup( +// static +uint8_t Base64::base64ReverseLookup( char p, - const Base64::ReverseIndex& reverse_lookup) { - auto curr = reverse_lookup[(uint8_t)p]; + const Base64::ReverseIndex& reverseIndex) { + auto curr = reverseIndex[(uint8_t)p]; if (curr >= 0x40) { - throw Base64Exception( - "Base64::decode() - invalid input string: invalid characters"); + VELOX_USER_FAIL("decode() - invalid input string: invalid characters"); } - return curr; } +// static size_t Base64::decode(const char* src, size_t src_len, char* dst, size_t dst_len) { - return decodeImpl(src, src_len, dst, dst_len, kBase64ReverseIndexTable, true); + return decodeImpl(src, src_len, dst, dst_len, kBase64ReverseIndexTable); } // static -size_t -Base64::calculateDecodedSize(const char* data, size_t& size, bool withPadding) { +size_t Base64::calculateDecodedSize(const char* data, size_t& size) { if (size == 0) { return 0; } - auto needed = (size / 4) * 3; - if (withPadding) { - // If the pad characters are included then the source string must be a - // multiple of 4 and we can query the end of the string to 
see how much - // padding exists. - if (size % 4 != 0) { - throw Base64Exception( + // Check if the input data is padded + if (isPadded(data, size)) { + // If padded, ensure that the string length is a multiple of the encoded + // block size + if (size % kEncodedBlockByteSize != 0) { + VELOX_USER_FAIL( "Base64::decode() - invalid input string: " - "string length is not multiple of 4."); + "string length is not a multiple of 4."); } - auto padding = countPadding(data, size); + auto needed = (size * kBinaryBlockByteSize) / kEncodedBlockByteSize; + auto padding = numPadding(data, size); size -= padding; - return needed - padding; + + // Adjust the needed size by deducting the bytes corresponding to the + // padding from the calculated size. + return needed - + ((padding * kBinaryBlockByteSize) + (kEncodedBlockByteSize - 1)) / + kEncodedBlockByteSize; } + // If not padded, Calculate extra bytes, if any + auto extra = size % kEncodedBlockByteSize; + auto needed = (size / kEncodedBlockByteSize) * kBinaryBlockByteSize; - // If padding doesn't exist we need to calculate it from the size - if the - // size % 4 is 0 then we have an even multiple 3 byte chunks in the result - // if it is 2 then we need 1 more byte in the output. If it is 3 then we - // need 2 more bytes in the output. It should never be 1. - auto extra = size % 4; + // Adjust the needed size for extra bytes, if present if (extra) { if (extra == 1) { - throw Base64Exception( + VELOX_USER_FAIL( "Base64::decode() - invalid input string: " "string length cannot be 1 more than a multiple of 4."); } - return needed + extra - 1; + needed += (extra * kBinaryBlockByteSize) / kEncodedBlockByteSize; } - // Just because we don't need the pad, doesn't mean it is not there. The - // URL decoder should be able to handle the original encoding. - auto padding = countPadding(data, size); - size -= padding; - return needed - padding; + return needed; } +// static size_t Base64::decodeImpl( const char* src, size_t src_len, char* dst, size_t dst_len, - const Base64::ReverseIndex& reverse_lookup, - bool include_pad) { + const Base64::ReverseIndex& reverseIndex) { if (!src_len) { return 0; } - auto needed = calculateDecodedSize(src, src_len, include_pad); + auto needed = calculateDecodedSize(src, src_len); if (dst_len < needed) { - throw Base64Exception( + VELOX_USER_FAIL( "Base64::decode() - invalid output string: " "output string is too small."); } @@ -394,10 +409,10 @@ size_t Base64::decodeImpl( // Each character of the 4 encode 6 bits of the original, grab each with // the appropriate shifts to rebuild the original and then split that back // into the original 8 bit bytes. - uint32_t last = (Base64ReverseLookup(src[0], reverse_lookup) << 18) | - (Base64ReverseLookup(src[1], reverse_lookup) << 12) | - (Base64ReverseLookup(src[2], reverse_lookup) << 6) | - Base64ReverseLookup(src[3], reverse_lookup); + uint32_t last = (base64ReverseLookup(src[0], reverseIndex) << 18) | + (base64ReverseLookup(src[1], reverseIndex) << 12) | + (base64ReverseLookup(src[2], reverseIndex) << 6) | + base64ReverseLookup(src[3], reverseIndex); dst[0] = (last >> 16) & 0xff; dst[1] = (last >> 8) & 0xff; dst[2] = last & 0xff; @@ -406,14 +421,14 @@ size_t Base64::decodeImpl( // Handle the last 2-4 characters. This is similar to the above, but the // last 2 characters may or may not exist. 
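The padded and unpadded branches of calculateDecodedSize() above reduce to block arithmetic: every 4 encoded bytes carry 3 binary bytes, trailing '=' bytes are deducted with rounding, and an unpadded tail of 2 or 3 characters contributes 1 or 2 extra bytes. A standalone sketch of that arithmetic (an illustration, not the Velox implementation):

```
#include <cassert>
#include <cstddef>
#include <string>

constexpr std::size_t kBinaryBlock = 3; // decoded bytes per block
constexpr std::size_t kEncodedBlock = 4; // encoded bytes per block

std::size_t decodedSize(const std::string& in) {
  std::size_t size = in.size();
  if (size == 0) {
    return 0;
  }
  if (in.back() == '=') {
    // Padded input must be a whole number of 4-byte blocks.
    assert(size % kEncodedBlock == 0);
    const std::size_t padding = (size >= 2 && in[size - 2] == '=') ? 2 : 1;
    const std::size_t needed = size * kBinaryBlock / kEncodedBlock;
    // Deduct the decoded bytes the padding stood in for, rounding up.
    return needed -
        (padding * kBinaryBlock + kEncodedBlock - 1) / kEncodedBlock;
  }
  // Unpadded: the tail is 0, 2 or 3 characters (a tail of 1 is invalid).
  const std::size_t extra = size % kEncodedBlock;
  assert(extra != 1);
  return size / kEncodedBlock * kBinaryBlock +
      extra * kBinaryBlock / kEncodedBlock;
}

int main() {
  assert(decodedSize("SGVsbG8sIFdvcmxkIQ==") == 13); // "Hello, World!"
  assert(decodedSize("SGVsbG8sIFdvcmxkIQ") == 13); // same string, no padding
  return 0;
}
```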
DCHECK(src_len >= 2); - uint32_t last = (Base64ReverseLookup(src[0], reverse_lookup) << 18) | - (Base64ReverseLookup(src[1], reverse_lookup) << 12); + uint32_t last = (base64ReverseLookup(src[0], reverseIndex) << 18) | + (base64ReverseLookup(src[1], reverseIndex) << 12); dst[0] = (last >> 16) & 0xff; if (src_len > 2) { - last |= Base64ReverseLookup(src[2], reverse_lookup) << 6; + last |= base64ReverseLookup(src[2], reverseIndex) << 6; dst[1] = (last >> 8) & 0xff; if (src_len > 3) { - last |= Base64ReverseLookup(src[3], reverse_lookup); + last |= base64ReverseLookup(src[3], reverseIndex); dst[2] = last & 0xff; } } @@ -421,33 +436,38 @@ size_t Base64::decodeImpl( return needed; } +// static std::string Base64::encodeUrl(folly::StringPiece text) { return encodeImpl(text, kBase64UrlCharset, false); } +// static std::string Base64::encodeUrl(const char* data, size_t len) { return encodeUrl(folly::StringPiece(data, len)); } +// static std::string Base64::encodeUrl(const folly::IOBuf* data) { return encodeImpl(IOBufWrapper(data), kBase64UrlCharset, false); } +// static void Base64::decodeUrl( const char* src, size_t src_len, char* dst, - size_t dst_len, - bool hasPad) { - decodeImpl(src, src_len, dst, dst_len, kBase64UrlReverseIndexTable, hasPad); + size_t dst_len) { + decodeImpl(src, src_len, dst, dst_len, kBase64UrlReverseIndexTable); } +// static std::string Base64::decodeUrl(folly::StringPiece encoded) { std::string output; Base64::decodeUrl(std::make_pair(encoded.data(), encoded.size()), output); return output; } +// static void Base64::decodeUrl( const std::pair& payload, std::string& output) { @@ -458,8 +478,8 @@ void Base64::decodeUrl( payload.second, &output[0], out_len, - kBase64UrlReverseIndexTable, - false); + kBase64UrlReverseIndexTable); output.resize(out_len); } + } // namespace facebook::velox::encoding diff --git a/velox/common/encode/Base64.h b/velox/common/encode/Base64.h index 9888d97e67c54..13004175379a6 100644 --- a/velox/common/encode/Base64.h +++ b/velox/common/encode/Base64.h @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -22,26 +23,34 @@ #include #include +#include "velox/common/base/GTestMacros.h" + namespace facebook::velox::encoding { -class Base64Exception : public std::exception { +class Base64 { public: - explicit Base64Exception(const char* msg) : msg_(msg) {} - const char* what() const noexcept override { - return msg_; - } + static const size_t kCharsetSize = 64; + static const size_t kReverseIndexSize = 256; - protected: - const char* msg_; -}; + /// Character set used for encoding purposes. + /// Contains specific characters that form the encoding scheme. + using Charset = std::array; -class Base64 { - public: - using Charset = std::array; - using ReverseIndex = std::array; + /// Reverse lookup table for decoding purposes. + /// Maps each possible encoded character to its corresponding numeric value + /// within the encoding base. + using ReverseIndex = std::array; + + /// Padding character used in encoding. + static const char kPadding = '='; + /// Encodes the specified number of characters from the 'data'. static std::string encode(const char* data, size_t len); + + /// Encodes the specified text. static std::string encode(folly::StringPiece text); + + /// Encodes the specified IOBuf data. static std::string encode(const folly::IOBuf* text); /// Returns encoded size for the input of the specified size. @@ -52,15 +61,12 @@ class Base64 { /// returned by the calculateEncodedSize(). 
static void encode(const char* data, size_t size, char* output); - // Appends the encoded text to out. - static void encodeAppend(folly::StringPiece text, std::string& out); - + /// Decodes the specified encoded text. static std::string decode(folly::StringPiece encoded); - /// Returns decoded size for the specified input. Adjusts the 'size' to - /// subtract the length of the padding, if exists. - static size_t - calculateDecodedSize(const char* data, size_t& size, bool withPadding = true); + /// Returns the actual size of the decoded data. Will also remove the padding + /// length from the input data 'size'. + static size_t calculateDecodedSize(const char* data, size_t& size); /// Decodes the specified number of characters from the 'data' and writes the /// result to the 'output'. The output must have enough space, e.g. as @@ -69,47 +75,68 @@ class Base64 { static void decode( const std::pair& payload, - std::string& outp); + std::string& output); /// Encodes the specified number of characters from the 'data' and writes the - /// result to the 'output'. The output must have enough space, e.g. as - /// returned by the calculateEncodedSize(). + /// result to the 'output' using URL encoding. The output must have enough + /// space as returned by the calculateEncodedSize(). static void encodeUrl(const char* data, size_t size, char* output); - // compatible with www's Base64URL::encode/decode - // TODO rename encode_url/decode_url to encodeUrl/encodeUrl. + /// Encodes the specified number of characters from the 'data' using URL + /// encoding. static std::string encodeUrl(const char* data, size_t len); + + /// Encodes the specified IOBuf data using URL encoding. static std::string encodeUrl(const folly::IOBuf* data); + + /// Encodes the specified text using URL encoding. static std::string encodeUrl(folly::StringPiece text); + + /// Decodes the specified URL encoded payload and writes the result to the + /// 'output'. static void decodeUrl( const std::pair& payload, std::string& output); + + /// Decodes the specified URL encoded text. static std::string decodeUrl(folly::StringPiece text); + /// Decodes the specified number of characters from the 'src' and writes the + /// result to the 'dst'. static size_t decode(const char* src, size_t src_len, char* dst, size_t dst_len); - static void decodeUrl( - const char* src, - size_t src_len, - char* dst, - size_t dst_len, - bool pad); - - constexpr static char kBase64Pad = '='; + /// Decodes the specified number of characters from the 'src' using URL + /// encoding and writes the result to the 'dst'. + static void + decodeUrl(const char* src, size_t src_len, char* dst, size_t dst_len); private: - static inline size_t countPadding(const char* src, size_t len) { - DCHECK_GE(len, 2); - return src[len - 1] != kBase64Pad ? 0 : src[len - 2] != kBase64Pad ? 1 : 2; + /// Checks if there is padding in encoded data. + static inline bool isPadded(const char* data, size_t len) { + return (len > 0 && data[len - 1] == kPadding); } - static uint8_t Base64ReverseLookup(char p, const ReverseIndex& table); + /// Counts the number of padding characters in encoded data. + static inline size_t numPadding(const char* src, size_t len) { + size_t numPadding{0}; + while (len > 0 && src[len - 1] == kPadding) { + numPadding++; + len--; + } + return numPadding; + } + + /// Performs a reverse lookup in the reverse index to retrieve the original + /// index of a character in the base. 
+ static uint8_t base64ReverseLookup(char p, const ReverseIndex& reverseIndex); + /// Encodes the specified data using the provided charset. template static std::string encodeImpl(const T& data, const Charset& charset, bool include_pad); + /// Encodes the specified data using the provided charset. template static void encodeImpl( const T& data, @@ -117,13 +144,16 @@ class Base64 { bool include_pad, char* out); + /// Decodes the specified data using the provided reverse lookup table. static size_t decodeImpl( const char* src, size_t src_len, char* dst, size_t dst_len, - const ReverseIndex& table, - bool include_pad); + const ReverseIndex& table); + + VELOX_FRIEND_TEST(Base64Test, checksPadding); + VELOX_FRIEND_TEST(Base64Test, countsPaddingCorrectly); }; } // namespace facebook::velox::encoding diff --git a/velox/common/encode/CMakeLists.txt b/velox/common/encode/CMakeLists.txt index d9918d53b59c5..501c690c476bd 100644 --- a/velox/common/encode/CMakeLists.txt +++ b/velox/common/encode/CMakeLists.txt @@ -12,5 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_encode Base64.cpp) -target_link_libraries(velox_encode PUBLIC Folly::folly) +if(${VELOX_BUILD_TESTING}) + add_subdirectory(tests) +endif() + +velox_add_library(velox_encode Base64.cpp) +velox_link_libraries(velox_encode PUBLIC Folly::folly) diff --git a/velox/common/encode/tests/Base64Test.cpp b/velox/common/encode/tests/Base64Test.cpp new file mode 100644 index 0000000000000..9cbbbad471245 --- /dev/null +++ b/velox/common/encode/tests/Base64Test.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/common/encode/Base64.h" + +#include +#include "velox/common/base/Exceptions.h" +#include "velox/common/base/tests/GTestUtils.h" + +namespace facebook::velox::encoding { + +class Base64Test : public ::testing::Test {}; + +TEST_F(Base64Test, fromBase64) { + EXPECT_EQ( + "Hello, World!", + Base64::decode(folly::StringPiece("SGVsbG8sIFdvcmxkIQ=="))); + EXPECT_EQ( + "Base64 encoding is fun.", + Base64::decode(folly::StringPiece("QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4="))); + EXPECT_EQ( + "Simple text", Base64::decode(folly::StringPiece("U2ltcGxlIHRleHQ="))); + EXPECT_EQ( + "1234567890", Base64::decode(folly::StringPiece("MTIzNDU2Nzg5MA=="))); + + // Check encoded strings without padding + EXPECT_EQ( + "Hello, World!", + Base64::decode(folly::StringPiece("SGVsbG8sIFdvcmxkIQ"))); + EXPECT_EQ( + "Base64 encoding is fun.", + Base64::decode(folly::StringPiece("QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4"))); + EXPECT_EQ( + "Simple text", Base64::decode(folly::StringPiece("U2ltcGxlIHRleHQ"))); + EXPECT_EQ("1234567890", Base64::decode(folly::StringPiece("MTIzNDU2Nzg5MA"))); +} + +TEST_F(Base64Test, calculateDecodedSizeProperSize) { + size_t encoded_size{0}; + + encoded_size = 20; + EXPECT_EQ( + 13, Base64::calculateDecodedSize("SGVsbG8sIFdvcmxkIQ==", encoded_size)); + EXPECT_EQ(18, encoded_size); + + encoded_size = 18; + EXPECT_EQ( + 13, Base64::calculateDecodedSize("SGVsbG8sIFdvcmxkIQ", encoded_size)); + EXPECT_EQ(18, encoded_size); + + encoded_size = 21; + VELOX_ASSERT_THROW( + Base64::calculateDecodedSize("SGVsbG8sIFdvcmxkIQ==", encoded_size), + "Base64::decode() - invalid input string: string length cannot be 1 more than a multiple of 4."); + + encoded_size = 32; + EXPECT_EQ( + 23, + Base64::calculateDecodedSize( + "QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4=", encoded_size)); + EXPECT_EQ(31, encoded_size); + + encoded_size = 31; + EXPECT_EQ( + 23, + Base64::calculateDecodedSize( + "QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4", encoded_size)); + EXPECT_EQ(31, encoded_size); + + encoded_size = 16; + EXPECT_EQ(10, Base64::calculateDecodedSize("MTIzNDU2Nzg5MA==", encoded_size)); + EXPECT_EQ(14, encoded_size); + + encoded_size = 14; + EXPECT_EQ(10, Base64::calculateDecodedSize("MTIzNDU2Nzg5MA", encoded_size)); + EXPECT_EQ(14, encoded_size); +} + +TEST_F(Base64Test, checksPadding) { + EXPECT_TRUE(Base64::isPadded("ABC=", 4)); + EXPECT_FALSE(Base64::isPadded("ABC", 3)); +} + +TEST_F(Base64Test, countsPaddingCorrectly) { + EXPECT_EQ(0, Base64::numPadding("ABC", 3)); + EXPECT_EQ(1, Base64::numPadding("ABC=", 4)); + EXPECT_EQ(2, Base64::numPadding("AB==", 4)); +} +} // namespace facebook::velox::encoding diff --git a/velox/common/encode/tests/CMakeLists.txt b/velox/common/encode/tests/CMakeLists.txt new file mode 100644 index 0000000000000..90c9733ecf22e --- /dev/null +++ b/velox/common/encode/tests/CMakeLists.txt @@ -0,0 +1,20 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
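One subtlety the tests above exercise: calculateDecodedSize() takes `size` by reference and shrinks it by the padding length, so the adjusted value is what decode() should then be passed as the input length. A hedged usage sketch (assumes a build with velox_encode linked in):

```
#include <cstddef>
#include <iostream>
#include <string>

#include "velox/common/encode/Base64.h"

using facebook::velox::encoding::Base64;

int main() {
  std::string encoded = "SGVsbG8sIFdvcmxkIQ=="; // "Hello, World!"
  std::size_t size = encoded.size(); // 20
  const std::size_t decodedBytes =
      Base64::calculateDecodedSize(encoded.data(), size);
  // 'size' was shrunk to 18: the two '=' bytes no longer count as input.
  std::string out(decodedBytes, '\0');
  Base64::decode(encoded.data(), size, out.data(), out.size());
  std::cout << out << std::endl; // prints: Hello, World!
  return 0;
}
```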
+ +add_executable(velox_common_encode_test Base64Test.cpp) +add_test(velox_common_encode_test velox_common_encode_test) +target_link_libraries( + velox_common_encode_test + PUBLIC Folly::folly + PRIVATE velox_encode velox_exception GTest::gtest GTest::gtest_main) diff --git a/velox/common/file/CMakeLists.txt b/velox/common/file/CMakeLists.txt index fd139c168a12f..31d9f1ebe0ff9 100644 --- a/velox/common/file/CMakeLists.txt +++ b/velox/common/file/CMakeLists.txt @@ -14,15 +14,17 @@ # for generated headers include_directories(.) -add_library(velox_file File.cpp FileSystems.cpp Utils.cpp) -target_link_libraries( +velox_add_library( + velox_file + File.cpp + FileInputStream.cpp + FileSystems.cpp + Utils.cpp) +velox_link_libraries( velox_file PUBLIC velox_exception Folly::folly - PRIVATE velox_common_base fmt::fmt glog::glog) + PRIVATE velox_buffer velox_common_base fmt::fmt glog::glog) -if(${VELOX_BUILD_TESTING}) +if(${VELOX_BUILD_TESTING} OR ${VELOX_BUILD_TEST_UTILS}) add_subdirectory(tests) endif() -if(${VELOX_ENABLE_BENCHMARKS}) - add_subdirectory(benchmark) -endif() diff --git a/velox/common/file/File.cpp b/velox/common/file/File.cpp index 68b5c4656b333..46156c2f1aa77 100644 --- a/velox/common/file/File.cpp +++ b/velox/common/file/File.cpp @@ -22,8 +22,11 @@ #include #include -#include #include +#ifdef linux +#include +#endif // linux +#include namespace facebook::velox { @@ -33,6 +36,27 @@ namespace facebook::velox { return result; \ } +namespace { +FOLLY_ALWAYS_INLINE void checkNotClosed(bool closed) { + VELOX_CHECK(!closed, "file is closed"); +} + +template +T getAttribute( + const std::unordered_map& attributes, + const std::string_view& key, + const T& defaultValue) { + if (attributes.count(std::string(key)) > 0) { + try { + return folly::to(attributes.at(std::string(key))); + } catch (const std::exception& e) { + VELOX_FAIL("Failed while parsing File attributes: {}", e.what()); + } + } + return defaultValue; +} +} // namespace + std::string ReadFile::pread(uint64_t offset, uint64_t length) const { std::string buf; buf.resize(length); @@ -61,17 +85,21 @@ uint64_t ReadFile::preadv( return numRead; } -void ReadFile::preadv( +uint64_t ReadFile::preadv( folly::Range regions, folly::Range iobufs) const { VELOX_CHECK_EQ(regions.size(), iobufs.size()); + uint64_t length = 0; for (size_t i = 0; i < regions.size(); ++i) { const auto& region = regions[i]; auto& output = iobufs[i]; output = folly::IOBuf(folly::IOBuf::CREATE, region.length); pread(region.offset, region.length, output.writableData()); output.append(region.length); + length += region.length; } + + return length; } std::string_view @@ -90,28 +118,39 @@ void InMemoryWriteFile::append(std::string_view data) { file_->append(data); } +void InMemoryWriteFile::append(std::unique_ptr data) { + for (auto rangeIter = data->begin(); rangeIter != data->end(); ++rangeIter) { + file_->append( + reinterpret_cast(rangeIter->data()), rangeIter->size()); + } +} + uint64_t InMemoryWriteFile::size() const { return file_->size(); } LocalReadFile::LocalReadFile(std::string_view path) : path_(path) { fd_ = open(path_.c_str(), O_RDONLY); + if (fd_ < 0) { + if (errno == ENOENT) { + VELOX_FILE_NOT_FOUND_ERROR("No such file or directory: {}", path); + } else { + VELOX_FAIL( + "open failure in LocalReadFile constructor, {} {} {}.", + fd_, + path, + folly::errnoStr(errno)); + } + } + const off_t ret = lseek(fd_, 0, SEEK_END); VELOX_CHECK_GE( - fd_, - 0, - "open failure in LocalReadFile constructor, {} {} {}.", - fd_, - path, - folly::errnoStr(errno)); - const 
off_t rc = lseek(fd_, 0, SEEK_END); - VELOX_CHECK_GE( - rc, + ret, 0, "fseek failure in LocalReadFile constructor, {} {} {}.", - rc, + ret, path, folly::errnoStr(errno)); - size_ = rc; + size_ = ret; } LocalReadFile::LocalReadFile(int32_t fd) : fd_(fd) {} @@ -210,36 +249,62 @@ uint64_t LocalReadFile::memoryUsage() const { return sizeof(FILE); } +bool LocalWriteFile::Attributes::cowDisabled( + const std::unordered_map& attrs) { + return getAttribute(attrs, kNoCow, kDefaultNoCow); +} + LocalWriteFile::LocalWriteFile( std::string_view path, bool shouldCreateParentDirectories, - bool shouldThrowOnFileAlreadyExists) { - auto dir = fs::path(path).parent_path(); + bool shouldThrowOnFileAlreadyExists, + bool bufferWrite) + : path_(path) { + const auto dir = fs::path(path_).parent_path(); if (shouldCreateParentDirectories && !fs::exists(dir)) { VELOX_CHECK( common::generateFileDirectory(dir.c_str()), "Failed to generate file directory"); } - std::unique_ptr buf(new char[path.size() + 1]); - buf[path.size()] = 0; - memcpy(buf.get(), path.data(), path.size()); - { - if (shouldThrowOnFileAlreadyExists) { - FILE* exists = fopen(buf.get(), "rb"); - VELOX_CHECK( - !exists, - "Failure in LocalWriteFile: path '{}' already exists.", - path); - } + // File open flags: write-only, create the file if it doesn't exist. + int32_t flags = O_WRONLY | O_CREAT; + if (shouldThrowOnFileAlreadyExists) { + flags |= O_EXCL; } - auto file = fopen(buf.get(), "ab"); - VELOX_CHECK( - file, - "fopen failure in LocalWriteFile constructor, {} {}.", - path, +#ifdef linux + if (!bufferWrite) { + flags |= O_DIRECT; + } +#endif // linux + + // The file mode bits to be applied when a new file is created. By default + // user has read and write access to the file. + // NOTE: The mode argument must be supplied if O_CREAT or O_TMPFILE is + // specified in flags; if it is not supplied, some arbitrary bytes from the + // stack will be applied as the file mode. + const int32_t mode = S_IRUSR | S_IWUSR; + + std::unique_ptr buf(new char[path_.size() + 1]); + buf[path_.size()] = 0; + ::memcpy(buf.get(), path_.data(), path_.size()); + fd_ = open(buf.get(), flags, mode); + VELOX_CHECK_GE( + fd_, + 0, + "Cannot open or create {}. 
Error: {}", + path_, + folly::errnoStr(errno)); + + const off_t ret = lseek(fd_, 0, SEEK_END); + VELOX_CHECK_GE( + ret, + 0, + "fseek failure in LocalWriteFile constructor, {} {} {}.", + ret, + path_, folly::errnoStr(errno)); - file_ = file; + size_ = ret; } LocalWriteFile::~LocalWriteFile() { @@ -253,39 +318,125 @@ LocalWriteFile::~LocalWriteFile() { } void LocalWriteFile::append(std::string_view data) { - VELOX_CHECK(!closed_, "file is closed"); - const uint64_t bytes_written = fwrite(data.data(), 1, data.size(), file_); + checkNotClosed(closed_); + const uint64_t bytesWritten = ::write(fd_, data.data(), data.size()); VELOX_CHECK_EQ( - bytes_written, + bytesWritten, data.size(), - "fwrite failure in LocalWriteFile::append, {} vs {}.", - bytes_written, - data.size()); + "fwrite failure in LocalWriteFile::append, {} vs {}: {}", + bytesWritten, + data.size(), + folly::errnoStr(errno)); + size_ += bytesWritten; +} + +void LocalWriteFile::append(std::unique_ptr data) { + checkNotClosed(closed_); + uint64_t totalBytesWritten{0}; + for (auto rangeIter = data->begin(); rangeIter != data->end(); ++rangeIter) { + const auto bytesToWrite = rangeIter->size(); + const uint64_t bytesWritten = + ::write(fd_, rangeIter->data(), rangeIter->size()); + totalBytesWritten += bytesWritten; + if (bytesWritten != bytesToWrite) { + VELOX_FAIL( + "fwrite failure in LocalWriteFile::append, {} vs {}: {}", + bytesWritten, + bytesToWrite, + folly::errnoStr(errno)); + } + } + const auto totalBytesToWrite = data->computeChainDataLength(); + VELOX_CHECK_EQ( + totalBytesWritten, + totalBytesToWrite, + "Failure in LocalWriteFile::append, {} vs {}", + totalBytesWritten, + totalBytesToWrite); + size_ += totalBytesWritten; +} + +void LocalWriteFile::write( + const std::vector& iovecs, + int64_t offset, + int64_t length) { + checkNotClosed(closed_); + VELOX_CHECK_GE(offset, 0, "Offset cannot be negative."); + const auto bytesWritten = ::pwritev( + fd_, iovecs.data(), static_cast(iovecs.size()), offset); + VELOX_CHECK_EQ( + bytesWritten, + length, + "Failure in LocalWriteFile::write, {} vs {}", + bytesWritten, + length); + size_ = std::max(size_, offset + bytesWritten); +} + +void LocalWriteFile::truncate(int64_t newSize) { + checkNotClosed(closed_); + VELOX_CHECK_GE(newSize, 0, "New size cannot be negative."); + const auto ret = ::ftruncate(fd_, newSize); + VELOX_CHECK_EQ( + ret, + 0, + "ftruncate failed in LocalWriteFile::truncate: {}.", + folly::errnoStr(errno)); + size_ = newSize; } void LocalWriteFile::flush() { - VELOX_CHECK(!closed_, "file is closed"); - auto ret = fflush(file_); + checkNotClosed(closed_); + const auto ret = ::fsync(fd_); VELOX_CHECK_EQ( ret, 0, - "fflush failed in LocalWriteFile::flush: {}.", + "fsync failed in LocalWriteFile::flush: {}.", folly::errnoStr(errno)); } +void LocalWriteFile::setAttributes( + const std::unordered_map& attributes) { + checkNotClosed(closed_); + attributes_ = attributes; +#ifdef linux + if (Attributes::cowDisabled(attributes_)) { + int attr{0}; + auto ret = ioctl(fd_, FS_IOC_GETFLAGS, &attr); + VELOX_CHECK_EQ( + 0, + ret, + "ioctl(FS_IOC_GETFLAGS) failed: {}, {}", + ret, + folly::errnoStr(errno)); + attr |= FS_NOCOW_FL; + ret = ioctl(fd_, FS_IOC_SETFLAGS, &attr); + VELOX_CHECK_EQ( + 0, + ret, + "ioctl(FS_IOC_SETFLAGS, FS_NOCOW_FL) failed: {}, {}", + ret, + folly::errnoStr(errno)); + } +#endif // linux +} + +std::unordered_map LocalWriteFile::getAttributes() + const { + checkNotClosed(closed_); + return attributes_; +} + void LocalWriteFile::close() { if (!closed_) { - auto 
ret = fclose(file_); + const auto ret = ::close(fd_); VELOX_CHECK_EQ( ret, 0, - "fwrite failure in LocalWriteFile::close: {}.", + "close failed in LocalWriteFile::close: {}.", folly::errnoStr(errno)); closed_ = true; } } -uint64_t LocalWriteFile::size() const { - return ftell(file_); -} } // namespace facebook::velox diff --git a/velox/common/file/File.h b/velox/common/file/File.h index f851fcac9f329..294961a32c55e 100644 --- a/velox/common/file/File.h +++ b/velox/common/file/File.h @@ -26,6 +26,7 @@ #pragma once +#include #include #include #include @@ -50,8 +51,8 @@ class ReadFile { // buffer 'buf'. The bytes are returned as a string_view pointing to 'buf'. // // This method should be thread safe. - virtual std::string_view - pread(uint64_t offset, uint64_t length, void* FOLLY_NONNULL buf) const = 0; + virtual std::string_view pread(uint64_t offset, uint64_t length, void* buf) + const = 0; // Same as above, but returns owned data directly. // @@ -74,17 +75,19 @@ class ReadFile { // array must be pre-allocated by the caller, with the same size as `regions`, // but don't need to be initialized, since each iobuf will be copy-constructed // by the preadv. + // Returns the total number of bytes read, which might be different than the + // sum of all buffer sizes (for example, if coalescing was used). // // This method should be thread safe. - virtual void preadv( + virtual uint64_t preadv( folly::Range regions, folly::Range iobufs) const; - // Like preadv but may execute asynchronously and returns the read - // size or exception via SemiFuture. Use hasPreadvAsync() to check - // if the implementation is in fact asynchronous. - // - // This method should be thread safe. + /// Like preadv but may execute asynchronously and returns the read size or + /// exception via SemiFuture. Use hasPreadvAsync() to check if the + /// implementation is in fact asynchronous. + /// + /// This method should be thread safe. virtual folly::SemiFuture preadvAsync( uint64_t offset, const std::vector>& buffers) const { @@ -124,10 +127,8 @@ class ReadFile { virtual std::string getName() const = 0; - // - // Get the natural size for reads. - // @return the number of bytes that should be read at once - // + /// Gets the natural size for reads. Returns the number of bytes that should + /// be read at once. virtual uint64_t getNaturalReadSize() const = 0; protected: @@ -140,17 +141,55 @@ class WriteFile { public: virtual ~WriteFile() = default; - // Appends data to the end of the file. + /// Appends data to the end of the file. virtual void append(std::string_view data) = 0; - // Flushes any local buffers, i.e. ensures the backing medium received - // all data that has been appended. + /// Appends data to the end of the file. + virtual void append(std::unique_ptr /* data */) { + VELOX_NYI("IOBuf appending is not implemented"); + } + + /// Writes data at the given offset of the file. + /// + /// NOTE: this is only supported on local file system and used by SSD cache + /// for now. For filesystem like S3, it is not supported. + virtual void write( + const std::vector& /* iovecs */, + int64_t /* offset */, + int64_t /* length */ + ) { + VELOX_NYI("{} is not implemented", __FUNCTION__); + } + + /// Truncates file to a new size. + /// + /// NOTE: this is only supported on local file system and used by SSD cache + /// for now. For filesystem like S3, it is not supported. + virtual void truncate(int64_t /* newSize */) { + VELOX_NYI("{} is not implemented", __FUNCTION__); + } + + /// Flushes any write buffers, i.e. 
ensures the remote storage backend or + /// local storage medium received all the written data. virtual void flush() = 0; - // Close the file. Any cleanup (disk flush, etc.) will be done here. + /// Sets the file attributes, which are file implementation specific. + virtual void setAttributes( + const std::unordered_map& /* attributes */) { + VELOX_NYI("{} is not implemented", __FUNCTION__); + } + + /// Gets the file attributes, which are file implementation specific. + virtual std::unordered_map getAttributes() const { + VELOX_NYI("{} is not implemented", __FUNCTION__); + } + + /// Closes the file. Any cleanup (disk flush, etc.) will be done here. virtual void close() = 0; - // Current file size, i.e. the sum of all previous Appends. + /// Current file size, i.e. the sum of all previous Appends. No flush should + /// be needed to get the exact size written, and this should be able to be + /// called after the file close. virtual uint64_t size() const = 0; }; @@ -169,10 +208,8 @@ class InMemoryReadFile : public ReadFile { explicit InMemoryReadFile(std::string file) : ownedFile_(std::move(file)), file_(ownedFile_) {} - std::string_view pread( - uint64_t offset, - uint64_t length, - void* FOLLY_NONNULL buf) const override; + std::string_view pread(uint64_t offset, uint64_t length, void* buf) + const override; std::string pread(uint64_t offset, uint64_t length) const override; @@ -208,31 +245,33 @@ class InMemoryReadFile : public ReadFile { class InMemoryWriteFile final : public WriteFile { public: - explicit InMemoryWriteFile(std::string* FOLLY_NONNULL file) : file_(file) {} + explicit InMemoryWriteFile(std::string* file) : file_(file) {} void append(std::string_view data) final; + void append(std::unique_ptr data) final; void flush() final {} void close() final {} uint64_t size() const final; private: - std::string* FOLLY_NONNULL file_; + std::string* file_; }; -// Current implementation for the local version is quite simple (e.g. no -// internal arenaing), as local disk writes are expected to be cheap. Local -// files match against any filepath starting with '/'. - +/// Current implementation for the local version is quite simple (e.g. no +/// internal arenaing), as local disk writes are expected to be cheap. Local +/// files match against any filepath starting with '/'. class LocalReadFile final : public ReadFile { public: explicit LocalReadFile(std::string_view path); + /// TODO: deprecate this after creating local file all through velox fs + /// interface. explicit LocalReadFile(int32_t fd); ~LocalReadFile(); - std::string_view - pread(uint64_t offset, uint64_t length, void* FOLLY_NONNULL buf) const final; + std::string_view pread(uint64_t offset, uint64_t length, void* buf) + const final; uint64_t size() const final; @@ -258,8 +297,7 @@ class LocalReadFile final : public ReadFile { } private: - void preadInternal(uint64_t offset, uint64_t length, char* FOLLY_NONNULL pos) - const; + void preadInternal(uint64_t offset, uint64_t length, char* pos) const; std::string path_; int32_t fd_; @@ -268,22 +306,55 @@ class LocalReadFile final : public ReadFile { class LocalWriteFile final : public WriteFile { public: + struct Attributes { + // If set to true, the file will not be subject to copy-on-write updates. + // This flag has an effect only on filesystems that support copy-on-write + // semantics, such as Btrfs. 
+ static constexpr std::string_view kNoCow{"write-on-copy-disabled"}; + static constexpr bool kDefaultNoCow{false}; + + static bool cowDisabled( + const std::unordered_map& attrs); + }; + // An error is thrown is a file already exists at |path|, // unless flag shouldThrowOnFileAlreadyExists is false explicit LocalWriteFile( std::string_view path, bool shouldCreateParentDirectories = false, - bool shouldThrowOnFileAlreadyExists = true); + bool shouldThrowOnFileAlreadyExists = true, + bool bufferWrite = true); + ~LocalWriteFile(); void append(std::string_view data) final; + + void append(std::unique_ptr data) final; + + void write(const std::vector& iovecs, int64_t offset, int64_t length) + final; + + void truncate(int64_t newSize) final; + void flush() final; + + void setAttributes( + const std::unordered_map& attributes) final; + + std::unordered_map getAttributes() const final; + void close() final; - uint64_t size() const final; + + uint64_t size() const final { + return size_; + } private: - FILE* FOLLY_NONNULL file_; - mutable long size_; + // File descriptor. + int32_t fd_{-1}; + std::string path_; + uint64_t size_{0}; + std::unordered_map attributes_{}; bool closed_{false}; }; diff --git a/velox/common/file/FileInputStream.cpp b/velox/common/file/FileInputStream.cpp new file mode 100644 index 0000000000000..e680733c637b4 --- /dev/null +++ b/velox/common/file/FileInputStream.cpp @@ -0,0 +1,257 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/file/FileInputStream.h" + +namespace facebook::velox::common { + +FileInputStream::FileInputStream( + std::unique_ptr&& file, + uint64_t bufferSize, + memory::MemoryPool* pool) + : file_(std::move(file)), + fileSize_(file_->size()), + bufferSize_(std::min(fileSize_, bufferSize)), + pool_(pool), + readAheadEnabled_((bufferSize_ < fileSize_) && file_->hasPreadvAsync()) { + VELOX_CHECK_NOT_NULL(pool_); + VELOX_CHECK_GT(fileSize_, 0, "Empty FileInputStream"); + + buffers_.push_back(AlignedBuffer::allocate(bufferSize_, pool_)); + if (readAheadEnabled_) { + buffers_.push_back(AlignedBuffer::allocate(bufferSize_, pool_)); + } + readNextRange(); +} + +FileInputStream::~FileInputStream() { + if (!readAheadWait_.valid()) { + return; + } + try { + readAheadWait_.wait(); + } catch (const std::exception& ex) { + // ignore any prefetch error when query has failed. 
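The destructor above encodes a rule the new read-ahead path must follow: a pending preadvAsync() future writes into buffers the stream owns, so it has to be drained before destruction completes, and any failure is logged rather than thrown. A simplified restatement of that discipline:

```
#include <cstdint>

#include <folly/futures/Future.h>
#include <glog/logging.h>

// Simplified holder mirroring FileInputStream's destructor discipline.
class Prefetcher {
 public:
  ~Prefetcher() {
    if (!pending_.valid()) {
      return; // no read-ahead in flight, nothing to drain
    }
    try {
      // Block until the in-flight read lands: it writes into buffers this
      // object owns, so they must outlive the IO.
      pending_.wait();
    } catch (const std::exception& ex) {
      // Destructors must not throw; the query has already failed anyway.
      LOG(WARNING) << "read-ahead failed on destruction: " << ex.what();
    }
  }

 private:
  folly::SemiFuture<uint64_t> pending_{
      folly::SemiFuture<uint64_t>::makeEmpty()};
};
```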
+ LOG(WARNING) << "FileInputStream read-ahead failed on destruction " + << ex.what(); + } +} + +void FileInputStream::readNextRange() { + VELOX_CHECK(current_ == nullptr || current_->availableBytes() == 0); + ranges_.clear(); + current_ = nullptr; + + int32_t readBytes{0}; + uint64_t readTimeNs{0}; + { + NanosecondTimer timer{&readTimeNs}; + if (readAheadWait_.valid()) { + readBytes = std::move(readAheadWait_) + .via(&folly::QueuedImmediateExecutor::instance()) + .wait() + .value(); + VELOX_CHECK(!readAheadWait_.valid()); + VELOX_CHECK_LT( + 0, readBytes, "Read past end of FileInputStream {}", fileSize_); + advanceBuffer(); + } else { + readBytes = readSize(); + VELOX_CHECK_LT( + 0, readBytes, "Read past end of FileInputStream {}", fileSize_); + NanosecondTimer timer{&readTimeNs}; + file_->pread(fileOffset_, readBytes, buffer()->asMutable()); + } + } + + ranges_.resize(1); + ranges_[0] = {buffer()->asMutable(), readBytes, 0}; + current_ = ranges_.data(); + fileOffset_ += readBytes; + + updateStats(readBytes, readTimeNs); + + maybeIssueReadahead(); +} + +size_t FileInputStream::size() const { + return fileSize_; +} + +bool FileInputStream::atEnd() const { + return tellp() >= fileSize_; +} + +std::streampos FileInputStream::tellp() const { + if (current_ == nullptr) { + VELOX_CHECK_EQ(fileOffset_, fileSize_); + return fileOffset_; + } + return fileOffset_ - current_->availableBytes(); +} + +void FileInputStream::seekp(std::streampos position) { + static_assert(sizeof(std::streamsize) <= sizeof(int64_t)); + const int64_t seekPos = position; + const int64_t curPos = tellp(); + VELOX_CHECK_GE( + seekPos, curPos, "Backward seek is not supported by FileInputStream"); + + const int64_t toSkip = seekPos - curPos; + if (toSkip == 0) { + return; + } + doSeek(toSkip); +} + +void FileInputStream::skip(int32_t size) { + doSeek(size); +} + +void FileInputStream::doSeek(int64_t skipBytes) { + VELOX_CHECK_GE(skipBytes, 0, "Attempting to skip negative number of bytes"); + if (skipBytes == 0) { + return; + } + + VELOX_CHECK_LE( + skipBytes, + remainingSize(), + "Skip past the end of FileInputStream: {}", + fileSize_); + + for (;;) { + const int64_t skippedBytes = + std::min(current_->availableBytes(), skipBytes); + skipBytes -= skippedBytes; + current_->position += skippedBytes; + if (skipBytes == 0) { + return; + } + readNextRange(); + } +} + +size_t FileInputStream::remainingSize() const { + return fileSize_ - tellp(); +} + +uint8_t FileInputStream::readByte() { + VELOX_CHECK_GT( + remainingSize(), 0, "Read past the end of input file {}", fileSize_); + + if (current_->availableBytes() > 0) { + return current_->buffer[current_->position++]; + } + readNextRange(); + return readByte(); +} + +void FileInputStream::readBytes(uint8_t* bytes, int32_t size) { + VELOX_CHECK_GE(size, 0, "Attempting to read negative number of bytes"); + if (size == 0) { + return; + } + + VELOX_CHECK_LE( + size, remainingSize(), "Read past the end of input file {}", fileSize_); + + int32_t offset{0}; + for (;;) { + const int32_t readBytes = + std::min(current_->availableBytes(), size); + simd::memcpy( + bytes + offset, current_->buffer + current_->position, readBytes); + offset += readBytes; + size -= readBytes; + current_->position += readBytes; + if (size == 0) { + return; + } + readNextRange(); + } +} + +std::string_view FileInputStream::nextView(int32_t size) { + VELOX_CHECK_GE(size, 0, "Attempting to view negative number of bytes"); + if (remainingSize() == 0) { + return std::string_view(nullptr, 0); + } + + if 
(current_->availableBytes() == 0) { + readNextRange(); + } + + VELOX_CHECK_GT(current_->availableBytes(), 0); + const auto position = current_->position; + const auto viewSize = std::min(current_->availableBytes(), size); + current_->position += viewSize; + return std::string_view( + reinterpret_cast(current_->buffer) + position, viewSize); +} + +uint64_t FileInputStream::readSize() const { + return std::min(fileSize_ - fileOffset_, bufferSize_); +} + +void FileInputStream::maybeIssueReadahead() { + VELOX_CHECK(!readAheadWait_.valid()); + if (!readAheadEnabled_) { + return; + } + const auto size = readSize(); + if (size == 0) { + return; + } + std::vector> ranges; + ranges.emplace_back(nextBuffer()->asMutable(), size); + readAheadWait_ = file_->preadvAsync(fileOffset_, ranges); + VELOX_CHECK(readAheadWait_.valid()); +} + +void FileInputStream::updateStats(uint64_t readBytes, uint64_t readTimeNs) { + stats_.readBytes += readBytes; + stats_.readTimeNs += readTimeNs; + ++stats_.numReads; +} + +std::string FileInputStream::toString() const { + return fmt::format( + "file (offset {}/size {}) current (position {}/ size {})", + succinctBytes(fileOffset_), + succinctBytes(fileSize_), + current_ == nullptr ? "NULL" : succinctBytes(current_->position), + current_ == nullptr ? "NULL" : succinctBytes(current_->size)); +} + +FileInputStream::Stats FileInputStream::stats() const { + return stats_; +} + +bool FileInputStream::Stats::operator==( + const FileInputStream::Stats& other) const { + return std::tie(numReads, readBytes, readTimeNs) == + std::tie(other.numReads, other.readBytes, other.readTimeNs); +} + +std::string FileInputStream::Stats::toString() const { + return fmt::format( + "numReads: {}, readBytes: {}, readTimeNs: {}", + numReads, + succinctBytes(readBytes), + succinctMicros(readTimeNs)); +} +} // namespace facebook::velox::common diff --git a/velox/common/file/FileInputStream.h b/velox/common/file/FileInputStream.h new file mode 100644 index 0000000000000..6daf9f84e1098 --- /dev/null +++ b/velox/common/file/FileInputStream.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "velox/buffer/Buffer.h" +#include "velox/common/file/File.h" +#include "velox/common/memory/ByteStream.h" + +namespace facebook::velox::common { + +/// Readonly byte input stream backed by file. 
+class FileInputStream : public ByteInputStream { + public: + FileInputStream( + std::unique_ptr&& file, + uint64_t bufferSize, + memory::MemoryPool* pool); + + ~FileInputStream() override; + + FileInputStream(const FileInputStream&) = delete; + FileInputStream& operator=(const FileInputStream& other) = delete; + FileInputStream(FileInputStream&& other) noexcept = delete; + FileInputStream& operator=(FileInputStream&& other) noexcept = delete; + + size_t size() const override; + + bool atEnd() const override; + + std::streampos tellp() const override; + + void seekp(std::streampos pos) override; + + void skip(int32_t size) override; + + size_t remainingSize() const override; + + uint8_t readByte() override; + + void readBytes(uint8_t* bytes, int32_t size) override; + + std::string_view nextView(int32_t size) override; + + std::string toString() const override; + + /// Records the file read stats. + struct Stats { + uint32_t numReads{0}; + uint64_t readBytes{0}; + uint64_t readTimeNs{0}; + + bool operator==(const Stats& other) const; + + std::string toString() const; + }; + Stats stats() const; + + private: + void doSeek(int64_t skipBytes); + + // Invoked to read the next byte range from the file in a buffer. + void readNextRange(); + + // Issues readahead if underlying file system supports async mode read. + // + // TODO: we might consider to use AsyncSource to support read-ahead on + // filesystem which doesn't support async mode read. + void maybeIssueReadahead(); + + inline uint64_t readSize() const; + + inline uint32_t bufferIndex() const { + return bufferIndex_; + } + + inline uint32_t nextBufferIndex() const { + return (bufferIndex_ + 1) % buffers_.size(); + } + + // Advances buffer index to point to the next buffer for read. + inline void advanceBuffer() { + bufferIndex_ = nextBufferIndex(); + } + + inline Buffer* buffer() const { + return buffers_[bufferIndex()].get(); + } + + inline Buffer* nextBuffer() const { + return buffers_[nextBufferIndex()].get(); + } + + void updateStats(uint64_t readBytes, uint64_t readTimeNs); + + const std::unique_ptr file_; + const uint64_t fileSize_; + const uint64_t bufferSize_; + memory::MemoryPool* const pool_; + const bool readAheadEnabled_; + + // Offset of the next byte to read from file. + uint64_t fileOffset_ = 0; + + std::vector buffers_; + uint32_t bufferIndex_{0}; + // Sets to read-ahead future if valid. + folly::SemiFuture readAheadWait_{ + folly::SemiFuture::makeEmpty()}; + + Stats stats_; +}; +} // namespace facebook::velox::common diff --git a/velox/common/file/FileSystems.cpp b/velox/common/file/FileSystems.cpp index eac9c01337b74..641ac5c958c60 100644 --- a/velox/common/file/FileSystems.cpp +++ b/velox/common/file/FileSystems.cpp @@ -30,8 +30,9 @@ constexpr std::string_view kFileScheme("file:"); using RegisteredFileSystems = std::vector, - std::function(std::shared_ptr, std::string_view)>>>; + std::function( + std::shared_ptr, + std::string_view)>>>; RegisteredFileSystems& registeredFileSystems() { // Meyers singleton. 
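FileInputStream above alternates between two equally sized buffers: the consumer drains buffers_[bufferIndex_] while preadvAsync() fills the other, and advanceBuffer() swaps the roles. A toy, synchronous sketch of just the rotation (for illustration only):

```
#include <array>
#include <cstdint>
#include <cstdio>
#include <iostream>

// Toy stand-in for the buffers_/bufferIndex_ rotation in FileInputStream.
struct DoubleBuffer {
  std::array<std::array<char, 8>, 2> buffers{};
  uint32_t index{0};

  char* current() { return buffers[index].data(); }
  char* next() { return buffers[(index + 1) % buffers.size()].data(); }
  void advance() {
    index = static_cast<uint32_t>((index + 1) % buffers.size());
  }
};

int main() {
  DoubleBuffer db;
  std::snprintf(db.current(), 8, "range0"); // synchronous first read
  std::snprintf(db.next(), 8, "range1"); // read-ahead into the spare buffer
  std::cout << db.current() << "\n"; // consume "range0"
  db.advance(); // the prefetched buffer becomes current
  std::cout << db.current() << "\n"; // consume "range1"
  return 0;
}
```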
@@ -44,14 +45,14 @@ RegisteredFileSystems& registeredFileSystems() { void registerFileSystem( std::function schemeMatcher, std::function( - std::shared_ptr, + std::shared_ptr, std::string_view)> fileSystemGenerator) { registeredFileSystems().emplace_back(schemeMatcher, fileSystemGenerator); } std::shared_ptr getFileSystem( std::string_view filePath, - std::shared_ptr properties) { + std::shared_ptr properties) { const auto& filesystems = registeredFileSystems(); for (const auto& p : filesystems) { if (p.first(filePath)) { @@ -61,6 +62,16 @@ std::shared_ptr getFileSystem( VELOX_FAIL("No registered file system matched with file path '{}'", filePath); } +bool isPathSupportedByRegisteredFileSystems(const std::string_view& filePath) { + const auto& filesystems = registeredFileSystems(); + for (const auto& p : filesystems) { + if (p.first(filePath)) { + return true; + } + } + return false; +} + namespace { folly::once_flag localFSInstantiationFlag; @@ -68,7 +79,7 @@ folly::once_flag localFSInstantiationFlag; // Implement Local FileSystem. class LocalFileSystem : public FileSystem { public: - explicit LocalFileSystem(std::shared_ptr config) + explicit LocalFileSystem(std::shared_ptr config) : FileSystem(config) {} ~LocalFileSystem() override {} @@ -77,7 +88,7 @@ class LocalFileSystem : public FileSystem { return "Local FS"; } - inline std::string_view extractPath(std::string_view path) { + inline std::string_view extractPath(std::string_view path) override { if (path.find(kFileScheme) == 0) { return path.substr(kFileScheme.length()); } @@ -92,8 +103,12 @@ class LocalFileSystem : public FileSystem { std::unique_ptr openFileForWrite( std::string_view path, - const FileOptions& /*unused*/) override { - return std::make_unique(extractPath(path)); + const FileOptions& options) override { + return std::make_unique( + extractPath(path), + options.shouldCreateParentDirectories, + options.shouldThrowOnFileAlreadyExists, + options.bufferWrite); } void remove(std::string_view path) override { @@ -155,8 +170,8 @@ class LocalFileSystem : public FileSystem { 0, ec.value(), "Mkdir {} failed: {}, message: {}", - path, - ec, + std::string(path), + ec.value(), ec.message()); VLOG(1) << "LocalFileSystem::mkdir " << path; } @@ -168,8 +183,8 @@ class LocalFileSystem : public FileSystem { 0, ec.value(), "Rmdir {} failed: {}, message: {}", - path, - ec, + std::string(path), + ec.value(), ec.message()); VLOG(1) << "LocalFileSystem::rmdir " << path; } @@ -183,9 +198,9 @@ class LocalFileSystem : public FileSystem { } static std::function(std::shared_ptr, std::string_view)> + FileSystem>(std::shared_ptr, std::string_view)> fileSystemGenerator() { - return [](std::shared_ptr properties, + return [](std::shared_ptr properties, std::string_view filePath) { // One instance of Local FileSystem is sufficient. // Initialize on first access and reuse after that. diff --git a/velox/common/file/FileSystems.h b/velox/common/file/FileSystems.h index 07b8400fba7b0..8fccb5bf6d146 100644 --- a/velox/common/file/FileSystems.h +++ b/velox/common/file/FileSystems.h @@ -23,7 +23,9 @@ #include namespace facebook::velox { -class Config; +namespace config { +class ConfigBase; +} class ReadFile; class WriteFile; } // namespace facebook::velox @@ -35,20 +37,51 @@ namespace facebook::velox::filesystems { /// MemoryPool to allocate buffers needed to read/write files on FileSystems /// such as S3. struct FileOptions { + /// A free form option in 'values' that is provided for file creation. 
The + /// form should be defined by specific implementations of file system. e.g. + /// inside this property there could be things like block size, encoding, and + /// etc. + static constexpr folly::StringPiece kFileCreateConfig{"file-create-config"}; + std::unordered_map values; memory::MemoryPool* pool{nullptr}; + /// If specified then can be trusted to be the file size. + std::optional fileSize; + + /// Whether to create parent directories if they don't exist. + /// + /// NOTE: this only applies for write open file. + bool shouldCreateParentDirectories{false}; + + /// Whether to throw an error if a file already exists. + /// + /// NOTE: this only applies for write open file. + bool shouldThrowOnFileAlreadyExists{true}; + + /// Whether to buffer the write data in file system client or not. For local + /// filesystem on Unix-like operating system, this corresponds to the direct + /// IO mode if set. + /// + /// NOTE: this only applies for write open file. + bool bufferWrite{true}; }; /// An abstract FileSystem class FileSystem { public: - FileSystem(std::shared_ptr config) + FileSystem(std::shared_ptr config) : config_(std::move(config)) {} virtual ~FileSystem() = default; /// Returns the name of the File System virtual std::string name() const = 0; + /// Returns the file path without the fs scheme prefix such as "local:" prefix + /// for local file system. + virtual std::string_view extractPath(std::string_view path) { + VELOX_NYI(); + } + /// Returns a ReadFile handle for a given file path virtual std::unique_ptr openFileForRead( std::string_view path, @@ -87,12 +120,16 @@ class FileSystem { virtual void rmdir(std::string_view path) = 0; protected: - std::shared_ptr config_; + std::shared_ptr config_; }; std::shared_ptr getFileSystem( std::string_view filename, - std::shared_ptr config); + std::shared_ptr config); + +/// Returns true if filePath is supported by any registered file system, +/// otherwise false. +bool isPathSupportedByRegisteredFileSystems(const std::string_view& filePath); /// FileSystems must be registered explicitly. /// The registration function takes two parameters: @@ -102,7 +139,7 @@ std::shared_ptr getFileSystem( void registerFileSystem( std::function schemeMatcher, std::function( - std::shared_ptr, + std::shared_ptr, std::string_view)> fileSystemGenerator); /// Register the local filesystem. diff --git a/velox/common/file/Region.h b/velox/common/file/Region.h index c0617ec0437a6..2b396b6789b13 100644 --- a/velox/common/file/Region.h +++ b/velox/common/file/Region.h @@ -20,11 +20,11 @@ namespace facebook::velox::common { -// define a disk region to read +/// Defines a disk region to read. struct Region { uint64_t offset; uint64_t length; - // Optional label used by lower layers for cache warm up + /// Optional label used by lower layers for cache warm up. 
std::string_view label; Region(uint64_t offset = 0, uint64_t length = 0, std::string_view label = {}) diff --git a/velox/common/file/Utils.cpp b/velox/common/file/Utils.cpp index 4c5de66f459b4..bd9bfea9dcb8d 100644 --- a/velox/common/file/Utils.cpp +++ b/velox/common/file/Utils.cpp @@ -28,7 +28,10 @@ bool CoalesceIfDistanceLE::operator()( VELOX_CHECK_LE(beginGap, endGap, "Regions to combine can't overlap."); const uint64_t gap = endGap - beginGap; - return gap <= maxCoalescingDistance_; + const bool shouldCoalesce = gap <= maxCoalescingDistance_; + if (coalescedBytes_ && shouldCoalesce) { + *coalescedBytes_ += gap; + } + return shouldCoalesce; } - } // namespace facebook::velox::file::utils diff --git a/velox/common/file/Utils.h b/velox/common/file/Utils.h index 9b497fac511da..b19468fbe1fe2 100644 --- a/velox/common/file/Utils.h +++ b/velox/common/file/Utils.h @@ -106,8 +106,11 @@ class CoalesceRegions { class CoalesceIfDistanceLE { public: - explicit CoalesceIfDistanceLE(uint64_t maxCoalescingDistance) - : maxCoalescingDistance_(maxCoalescingDistance) {} + explicit CoalesceIfDistanceLE( + uint64_t maxCoalescingDistance, + uint64_t* FOLLY_NULLABLE coalescedBytes = nullptr) + : maxCoalescingDistance_{maxCoalescingDistance}, + coalescedBytes_{coalescedBytes} {} bool operator()( const velox::common::Region& a, @@ -115,6 +118,7 @@ class CoalesceIfDistanceLE { private: uint64_t maxCoalescingDistance_; + uint64_t* coalescedBytes_; }; template @@ -161,5 +165,4 @@ class ReadToIOBufs { OutputIter output_; Reader reader_; }; - } // namespace facebook::velox::file::utils diff --git a/velox/common/file/benchmark/CMakeLists.txt b/velox/common/file/benchmark/CMakeLists.txt deleted file mode 100644 index 8861ce5d9adc3..0000000000000 --- a/velox/common/file/benchmark/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -add_library(velox_read_benchmark_lib ReadBenchmark.cpp) - -target_link_libraries(velox_read_benchmark_lib - PUBLIC velox_file velox_time Folly::folly gflags::gflags) - -add_executable(velox_read_benchmark ReadBenchmarkMain.cpp) - -target_link_libraries(velox_read_benchmark PRIVATE velox_read_benchmark_lib) diff --git a/velox/common/file/benchmark/ReadBenchmark.cpp b/velox/common/file/benchmark/ReadBenchmark.cpp deleted file mode 100644 index ba1eaa95f355e..0000000000000 --- a/velox/common/file/benchmark/ReadBenchmark.cpp +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
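On the `CoalesceIfDistanceLE` change above: the new optional `coalescedBytes` out-parameter accumulates the gap bytes swallowed whenever two regions are close enough to be merged into one read. A small sketch of the accounting, with illustrative regions:

```
#include "velox/common/file/Utils.h"

using facebook::velox::common::Region;
using facebook::velox::file::utils::CoalesceIfDistanceLE;

void countCoalescedGapBytes() {
  uint64_t coalescedBytes = 0;
  CoalesceIfDistanceLE shouldCoalesce(
      /*maxCoalescingDistance=*/10, &coalescedBytes);

  // Regions [0, 10) and [19, 24): the 9-byte gap is within the limit, so
  // the regions coalesce and the gap is added to the counter.
  bool merged = shouldCoalesce(Region{0, 10}, Region{19, 5});
  // merged == true, coalescedBytes == 9.
}
```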
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "velox/common/file/benchmark/ReadBenchmark.h" - -DEFINE_string(path, "", "Path of test file"); -DEFINE_int64( - file_size_gb, - 0, - "Limits the test to the first --file_size_gb " - "of --path. 0 means use the whole file"); -DEFINE_int32(num_threads, 16, "Test paralelism"); -DEFINE_int32(seed, 0, "Random seed, 0 means no seed"); -DEFINE_bool(odirect, false, "Use O_DIRECT"); - -DEFINE_int32( - bytes, - 0, - "If 0, runs through a set of predefined read patterns. " - "If non-0, this is the size of a single read. The reads are " - "made in --num_in_run consecutive batchhes with --gap bytes between each read"); -DEFINE_int32(gap, 0, "Gap between consecutive reads if --bytes is non-0"); -DEFINE_int32( - num_in_run, - 10, - "Number of consecutive reads of --bytes separated by --gap bytes"); -DEFINE_int32( - measurement_size, - 100 << 20, - "Total reads per thread when throughput for a --bytes/--gap/--/gap/" - "--num_in_run combination"); - -namespace { -static bool notEmpty(const char* /*flagName*/, const std::string& value) { - return !value.empty(); -} -} // namespace - -DEFINE_validator(path, ¬Empty); - -namespace facebook::velox { - -void ReadBenchmark::run() { - if (FLAGS_bytes) { - modes(FLAGS_bytes, FLAGS_gap, FLAGS_num_in_run); - return; - } - modes(1100, 0, 10); - modes(1100, 1200, 10); - modes(16 * 1024, 0, 10); - modes(16 * 1024, 10000, 10); - modes(1000000, 0, 8); - modes(1000000, 100000, 8); -} -} // namespace facebook::velox diff --git a/velox/common/file/tests/CMakeLists.txt b/velox/common/file/tests/CMakeLists.txt index 446ef6a859e18..fb9ef6f735215 100644 --- a/velox/common/file/tests/CMakeLists.txt +++ b/velox/common/file/tests/CMakeLists.txt @@ -12,11 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_file_test_utils TestUtils.cpp) -target_link_libraries(velox_file_test_utils PUBLIC velox_file) +add_library(velox_file_test_utils TestUtils.cpp FaultyFile.cpp + FaultyFileSystem.cpp) -add_executable(velox_file_test FileTest.cpp UtilsTest.cpp) +target_link_libraries( + velox_file_test_utils + PUBLIC velox_file) + +add_executable(velox_file_test FileTest.cpp FileInputStreamTest.cpp + UtilsTest.cpp) add_test(velox_file_test velox_file_test) target_link_libraries( - velox_file_test PRIVATE velox_file velox_file_test_utils velox_temp_path - gmock gtest gtest_main) + velox_file_test + PRIVATE + velox_buffer + velox_file + velox_file_test_utils + velox_temp_path + GTest::gmock + GTest::gtest + GTest::gtest_main) diff --git a/velox/common/file/tests/FaultyFile.cpp b/velox/common/file/tests/FaultyFile.cpp new file mode 100644 index 0000000000000..2593fa1fad7ba --- /dev/null +++ b/velox/common/file/tests/FaultyFile.cpp @@ -0,0 +1,143 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/common/file/tests/FaultyFile.h" + +namespace facebook::velox::tests::utils { + +std::string FaultFileOperation::typeString(Type type) { + switch (type) { + case Type::kReadv: + return "READV"; + case Type::kRead: + return "READ"; + default: + VELOX_UNSUPPORTED( + "Unknown file operation type: {}", static_cast(type)); + break; + } +} + +FaultyReadFile::FaultyReadFile( + const std::string& path, + std::shared_ptr delegatedFile, + FileFaultInjectionHook injectionHook, + folly::Executor* executor) + : path_(path), + delegatedFile_(std::move(delegatedFile)), + injectionHook_(std::move(injectionHook)), + executor_(executor) { + VELOX_CHECK_NOT_NULL(delegatedFile_); +} + +std::string_view +FaultyReadFile::pread(uint64_t offset, uint64_t length, void* buf) const { + if (injectionHook_ != nullptr) { + FaultFileReadOperation op(path_, offset, length, buf); + injectionHook_(&op); + if (!op.delegate) { + return std::string_view(static_cast(op.buf), op.length); + } + } + return delegatedFile_->pread(offset, length, buf); +} + +uint64_t FaultyReadFile::preadv( + uint64_t offset, + const std::vector>& buffers) const { + if (injectionHook_ != nullptr) { + FaultFileReadvOperation op(path_, offset, buffers); + injectionHook_(&op); + if (!op.delegate) { + return op.readBytes; + } + } + return delegatedFile_->preadv(offset, buffers); +} + +folly::SemiFuture FaultyReadFile::preadvAsync( + uint64_t offset, + const std::vector>& buffers) const { + // TODO: add fault injection for async read later. + if (delegatedFile_->hasPreadvAsync() || executor_ == nullptr) { + return delegatedFile_->preadvAsync(offset, buffers); + } + auto promise = std::make_unique>(); + folly::SemiFuture future = promise->getSemiFuture(); + executor_->add([this, + _promise = std::move(promise), + _offset = offset, + _buffers = buffers]() { + auto delegateFuture = delegatedFile_->preadvAsync(_offset, _buffers); + _promise->setValue(delegateFuture.wait().value()); + }); + return future; +} + +FaultyWriteFile::FaultyWriteFile( + const std::string& path, + std::shared_ptr delegatedFile, + FileFaultInjectionHook injectionHook) + : path_(path), + delegatedFile_(std::move(delegatedFile)), + injectionHook_(std::move(injectionHook)) { + VELOX_CHECK_NOT_NULL(delegatedFile_); +} + +void FaultyWriteFile::append(std::string_view data) { + if (injectionHook_ != nullptr) { + FaultFileWriteOperation op(path_, data); + injectionHook_(&op); + if (!op.delegate) { + return; + } + } + delegatedFile_->append(data); +} + +void FaultyWriteFile::append(std::unique_ptr data) { + delegatedFile_->append(std::move(data)); +} + +void FaultyWriteFile::write( + const std::vector& iovecs, + int64_t offset, + int64_t length) { + delegatedFile_->write(iovecs, offset, length); +} + +void FaultyWriteFile::truncate(int64_t newSize) { + delegatedFile_->truncate(newSize); +} + +void FaultyWriteFile::flush() { + delegatedFile_->flush(); +} + +void FaultyWriteFile::setAttributes( + const std::unordered_map& attributes) { + delegatedFile_->setAttributes(attributes); +} + +std::unordered_map FaultyWriteFile::getAttributes() + const { + return delegatedFile_->getAttributes(); +} + +void FaultyWriteFile::close() { + delegatedFile_->close(); +} +} // namespace facebook::velox::tests::utils diff --git a/velox/common/file/tests/FaultyFile.h b/velox/common/file/tests/FaultyFile.h new file mode 100644 index 0000000000000..968d98da3cf9d --- /dev/null +++ b/velox/common/file/tests/FaultyFile.h @@ -0,0 +1,194 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "velox/common/file/File.h"
+
+namespace facebook::velox::tests::utils {
+
+/// Defines the per-file operation fault injection.
+struct FaultFileOperation {
+  enum class Type {
+    /// Injects faults for file read operations.
+    kRead,
+    kReadv,
+    kWrite,
+    /// TODO: add support for fault injection of the other file operation
+    /// types.
+  };
+  static std::string typeString(Type type);
+
+  const Type type;
+
+  /// The delegated file path.
+  const std::string path;
+
+  /// Indicates whether to forward this operation to the delegated file. If
+  /// not, the file fault injection hook must have processed the request; for
+  /// instance, if this is a file read injection, the hook must have filled in
+  /// the fake read data for data corruption tests.
+  bool delegate{true};
+
+  FaultFileOperation(Type _type, const std::string& _path)
+      : type(_type), path(_path) {}
+};
+
+FOLLY_ALWAYS_INLINE std::ostream& operator<<(
+    std::ostream& o,
+    const FaultFileOperation::Type& type) {
+  return o << FaultFileOperation::typeString(type);
+}
+
+/// Fault injection parameters for the file read API.
+struct FaultFileReadOperation : FaultFileOperation {
+  const uint64_t offset;
+  const uint64_t length;
+  void* const buf;
+
+  FaultFileReadOperation(
+      const std::string& _path,
+      uint64_t _offset,
+      uint64_t _length,
+      void* _buf)
+      : FaultFileOperation(FaultFileOperation::Type::kRead, _path),
+        offset(_offset),
+        length(_length),
+        buf(_buf) {}
+};
+
+/// Fault injection parameters for the file readv API.
+struct FaultFileReadvOperation : FaultFileOperation {
+  const uint64_t offset;
+  const std::vector<folly::Range<char*>>& buffers;
+  uint64_t readBytes{0};
+
+  FaultFileReadvOperation(
+      const std::string& _path,
+      uint64_t _offset,
+      const std::vector<folly::Range<char*>>& _buffers)
+      : FaultFileOperation(FaultFileOperation::Type::kReadv, _path),
+        offset(_offset),
+        buffers(_buffers) {}
+};
+
+/// Fault injection parameters for the file write API.
+struct FaultFileWriteOperation : FaultFileOperation {
+  std::string_view* data;
+
+  FaultFileWriteOperation(
+      const std::string& _path,
+      const std::string_view& _data)
+      : FaultFileOperation(FaultFileOperation::Type::kWrite, _path),
+        data(const_cast<std::string_view*>(&_data)) {}
+};
+
+/// The fault injection hook on the file operation path.
+using FileFaultInjectionHook = std::function; + +class FaultyReadFile : public ReadFile { + public: + FaultyReadFile( + const std::string& path, + std::shared_ptr delegatedFile, + FileFaultInjectionHook injectionHook, + folly::Executor* executor); + + ~FaultyReadFile() override{}; + + uint64_t size() const override { + return delegatedFile_->size(); + } + + std::string_view pread(uint64_t offset, uint64_t length, void* buf) + const override; + + uint64_t preadv( + uint64_t offset, + const std::vector>& buffers) const override; + + uint64_t memoryUsage() const override { + return delegatedFile_->memoryUsage(); + } + + bool shouldCoalesce() const override { + return delegatedFile_->shouldCoalesce(); + } + + std::string getName() const override { + return delegatedFile_->getName(); + } + + uint64_t getNaturalReadSize() const override { + return delegatedFile_->getNaturalReadSize(); + } + + bool hasPreadvAsync() const override { + if (executor_ != nullptr) { + return true; + } + return delegatedFile_->hasPreadvAsync(); + } + + folly::SemiFuture preadvAsync( + uint64_t offset, + const std::vector>& buffers) const override; + + private: + const std::string path_; + const std::shared_ptr delegatedFile_; + const FileFaultInjectionHook injectionHook_; + folly::Executor* const executor_; +}; + +class FaultyWriteFile : public WriteFile { + public: + FaultyWriteFile( + const std::string& path, + std::shared_ptr delegatedFile, + FileFaultInjectionHook injectionHook); + + ~FaultyWriteFile() override{}; + + void append(std::string_view data) override; + + void append(std::unique_ptr data) override; + + void write(const std::vector& iovecs, int64_t offset, int64_t length) + override; + + void truncate(int64_t newSize) override; + + void flush() override; + + void setAttributes( + const std::unordered_map& attributes) override; + + std::unordered_map getAttributes() const override; + + void close() override; + + uint64_t size() const override { + return delegatedFile_->size(); + } + + private: + const std::string path_; + const std::shared_ptr delegatedFile_; + const FileFaultInjectionHook injectionHook_; +}; + +} // namespace facebook::velox::tests::utils diff --git a/velox/common/file/tests/FaultyFileSystem.cpp b/velox/common/file/tests/FaultyFileSystem.cpp new file mode 100644 index 0000000000000..39b4f6b09e158 --- /dev/null +++ b/velox/common/file/tests/FaultyFileSystem.cpp @@ -0,0 +1,187 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/file/tests/FaultyFileSystem.h" +#include + +#include + +namespace facebook::velox::tests::utils { +namespace { +// Constructs the faulty file path based on the delegated read file 'path'. It +// pre-appends the faulty file system scheme. +inline std::string faultyPath(const std::string& path) { + return fmt::format("{}{}", FaultyFileSystem::scheme(), path); +} + +std::function schemeMatcher() { + // Note: presto behavior is to prefix local paths with 'file:'. 
+ // Check for that prefix and prune to absolute regular paths as needed. + return [](std::string_view filePath) { + return filePath.find(FaultyFileSystem::scheme()) == 0; + }; +} + +folly::once_flag faultFilesystemInitOnceFlag; + +std::function(std::shared_ptr, std::string_view)> +fileSystemGenerator() { + return [](std::shared_ptr properties, + std::string_view /*unused*/) { + // One instance of faulty FileSystem is sufficient. Initializes on first + // access and reuse after that. + static std::shared_ptr lfs; + folly::call_once(faultFilesystemInitOnceFlag, [&properties]() { + lfs = std::make_shared(std::move(properties)); + }); + return lfs; + }; +} +} // namespace + +std::unique_ptr FaultyFileSystem::openFileForRead( + std::string_view path, + const FileOptions& options) { + const std::string delegatedPath = std::string(extractPath(path)); + auto delegatedFile = getFileSystem(delegatedPath, config_) + ->openFileForRead(delegatedPath, options); + return std::make_unique( + std::string(path), + std::move(delegatedFile), + [&](FaultFileOperation* op) { maybeInjectFileFault(op); }, + executor_); +} + +std::unique_ptr FaultyFileSystem::openFileForWrite( + std::string_view path, + const FileOptions& options) { + const std::string delegatedPath = std::string(extractPath(path)); + auto delegatedFile = getFileSystem(delegatedPath, config_) + ->openFileForWrite(delegatedPath, options); + return std::make_unique( + std::string(path), std::move(delegatedFile), [&](FaultFileOperation* op) { + maybeInjectFileFault(op); + }); +} + +void FaultyFileSystem::remove(std::string_view path) { + const std::string delegatedPath = std::string(extractPath(path)); + getFileSystem(delegatedPath, config_)->remove(delegatedPath); +} + +void FaultyFileSystem::rename( + std::string_view oldPath, + std::string_view newPath, + bool overwrite) { + const auto delegatedOldPath = extractPath(oldPath); + const auto delegatedNewPath = extractPath(newPath); + getFileSystem(delegatedOldPath, config_) + ->rename(delegatedOldPath, delegatedNewPath, overwrite); +} + +bool FaultyFileSystem::exists(std::string_view path) { + const auto delegatedPath = extractPath(path); + return getFileSystem(delegatedPath, config_)->exists(delegatedPath); +} + +std::vector FaultyFileSystem::list(std::string_view path) { + const auto delegatedDirPath = extractPath(path); + const auto delegatedFiles = + getFileSystem(delegatedDirPath, config_)->list(delegatedDirPath); + // NOTE: we shall return the faulty file paths instead of the delegated file + // paths for list result. 
+  std::vector<std::string> files;
+  files.reserve(delegatedFiles.size());
+  for (const auto& delegatedFile : delegatedFiles) {
+    files.push_back(faultyPath(delegatedFile));
+  }
+  return files;
+}
+
+void FaultyFileSystem::mkdir(std::string_view path) {
+  const auto delegatedDirPath = extractPath(path);
+  getFileSystem(delegatedDirPath, config_)->mkdir(delegatedDirPath);
+}
+
+void FaultyFileSystem::rmdir(std::string_view path) {
+  const auto delegatedDirPath = extractPath(path);
+  getFileSystem(delegatedDirPath, config_)->rmdir(delegatedDirPath);
+}
+
+void FaultyFileSystem::setFileInjectionHook(
+    FileFaultInjectionHook injectionHook) {
+  std::lock_guard<std::mutex> l(mu_);
+  fileInjections_ = FileInjections(std::move(injectionHook));
+}
+
+void FaultyFileSystem::setFileInjectionError(
+    std::exception_ptr error,
+    std::unordered_set<FaultFileOperation::Type> opTypes) {
+  std::lock_guard<std::mutex> l(mu_);
+  fileInjections_ = FileInjections(std::move(error), std::move(opTypes));
+}
+
+void FaultyFileSystem::setFileInjectionDelay(
+    uint64_t delayUs,
+    std::unordered_set<FaultFileOperation::Type> opTypes) {
+  std::lock_guard<std::mutex> l(mu_);
+  fileInjections_ = FileInjections(delayUs, std::move(opTypes));
+}
+
+void FaultyFileSystem::clearFileFaultInjections() {
+  std::lock_guard<std::mutex> l(mu_);
+  fileInjections_.reset();
+}
+
+void FaultyFileSystem::maybeInjectFileFault(FaultFileOperation* op) {
+  FileInjections injections;
+  {
+    std::lock_guard<std::mutex> l(mu_);
+    if (!fileInjections_.has_value()) {
+      return;
+    }
+    injections = fileInjections_.value();
+  }
+
+  if (injections.fileInjectionHook != nullptr) {
+    injections.fileInjectionHook(op);
+    return;
+  }
+
+  if (!injections.opTypes.empty() && injections.opTypes.count(op->type) == 0) {
+    return;
+  }
+
+  if (injections.fileException != nullptr) {
+    std::rethrow_exception(injections.fileException);
+  }
+
+  if (injections.fileDelayUs != 0) {
+    std::this_thread::sleep_for(
+        std::chrono::microseconds(injections.fileDelayUs));
+  }
+}
+
+void registerFaultyFileSystem() {
+  registerFileSystem(schemeMatcher(), fileSystemGenerator());
+}
+
+std::shared_ptr<FaultyFileSystem> faultyFileSystem() {
+  return std::dynamic_pointer_cast<FaultyFileSystem>(
+      getFileSystem(FaultyFileSystem::scheme(), {}));
+}
+} // namespace facebook::velox::tests::utils
diff --git a/velox/common/file/tests/FaultyFileSystem.h b/velox/common/file/tests/FaultyFileSystem.h
new file mode 100644
index 0000000000000..b55266d41b9a3
--- /dev/null
+++ b/velox/common/file/tests/FaultyFileSystem.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "velox/common/file/FileSystems.h"
+
+#include
+#include
+#include
+#include "velox/common/file/tests/FaultyFile.h"
+
+namespace facebook::velox::tests::utils {
+
+using namespace filesystems;
+
+/// Implements a faulty filesystem for IO fault injection in unit tests. It is
+/// a wrapper on top of a real file system, and by default it delegates the
+/// file operation to the real file system underneath.
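Before the class definition below, a sketch of how a test might drive this injection API; it mirrors the FaultyFsTest cases added later in this patch, and the path, delay value, and registration order are illustrative:

```
#include "velox/common/file/tests/FaultyFileSystem.h"

#include <cstring>

using namespace facebook::velox;
using namespace facebook::velox::tests::utils;

void injectReadFaults() {
  // The faulty fs delegates to a real fs, so both must be registered.
  filesystems::registerLocalFileSystem();
  registerFaultyFileSystem();
  auto fs = faultyFileSystem();

  // Delay every pread() by 1ms (only one injection is active at a time).
  fs->setFileInjectionDelay(1'000, {FaultFileOperation::Type::kRead});

  // Or serve zeroed bytes instead of delegating the read at all.
  fs->setFileInjectionHook([](FaultFileOperation* op) {
    if (op->type != FaultFileOperation::Type::kRead) {
      return;
    }
    auto* readOp = static_cast<FaultFileReadOperation*>(op);
    ::memset(readOp->buf, 0, readOp->length);
    readOp->delegate = false; // The hook has satisfied the read.
  });

  // "faulty:" paths route through the injection machinery; the file is
  // assumed to exist on the delegated local filesystem.
  auto file = fs->openFileForRead("faulty:/tmp/data", {});
  fs->clearFileFaultInjections();
}
```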
+class FaultyFileSystem : public FileSystem {
+ public:
+  explicit FaultyFileSystem(std::shared_ptr<const config::ConfigBase> config)
+      : FileSystem(std::move(config)) {}
+
+  ~FaultyFileSystem() override {}
+
+  static inline std::string scheme() {
+    return "faulty:";
+  }
+
+  std::string name() const override {
+    return "Faulty FS";
+  }
+
+  // Extracts the delegated real file path by removing the faulty file system
+  // scheme prefix.
+  inline std::string_view extractPath(std::string_view path) override {
+    VELOX_CHECK_EQ(path.find(scheme()), 0, "");
+    const auto filePath = path.substr(scheme().length());
+    return getFileSystem(filePath, config_)->extractPath(filePath);
+  }
+
+  std::unique_ptr<ReadFile> openFileForRead(
+      std::string_view path,
+      const FileOptions& options) override;
+
+  std::unique_ptr<WriteFile> openFileForWrite(
+      std::string_view path,
+      const FileOptions& options) override;
+
+  void remove(std::string_view path) override;
+
+  void rename(
+      std::string_view oldPath,
+      std::string_view newPath,
+      bool overwrite) override;
+
+  bool exists(std::string_view path) override;
+
+  std::vector<std::string> list(std::string_view path) override;
+
+  void mkdir(std::string_view path) override;
+
+  void rmdir(std::string_view path) override;
+
+  /// Sets the executor for async read execution.
+  void setExecutor(folly::Executor* executor) {
+    std::lock_guard<std::mutex> l(mu_);
+    executor_ = executor;
+  }
+
+  /// Sets up the hook for file fault injection.
+  void setFileInjectionHook(FileFaultInjectionHook hook);
+
+  /// Sets up injection of 'error' for a particular set of file operation
+  /// types. If 'opTypes' is empty, the error is injected for all file
+  /// operation types.
+  void setFileInjectionError(
+      std::exception_ptr error,
+      std::unordered_set<FaultFileOperation::Type> opTypes = {});
+
+  /// Sets up injection of a delay for a particular set of file operation
+  /// types. If 'opTypes' is empty, the delay is injected for all file
+  /// operation types.
+  void setFileInjectionDelay(
+      uint64_t delayUs,
+      std::unordered_set<FaultFileOperation::Type> opTypes = {});
+
+  /// Clears the file fault injections.
+  void clearFileFaultInjections();
+
+ private:
+  // Defines the file injection setup; only one type of injection can be set
+  // at a time.
+  struct FileInjections {
+    FileFaultInjectionHook fileInjectionHook{nullptr};
+
+    std::exception_ptr fileException{nullptr};
+
+    uint64_t fileDelayUs{0};
+
+    std::unordered_set<FaultFileOperation::Type> opTypes{};
+
+    FileInjections() = default;
+
+    explicit FileInjections(FileFaultInjectionHook _fileInjectionHook)
+        : fileInjectionHook(std::move(_fileInjectionHook)) {}
+
+    FileInjections(
+        uint64_t _fileDelayUs,
+        std::unordered_set<FaultFileOperation::Type> _opTypes)
+        : fileDelayUs(_fileDelayUs), opTypes(std::move(_opTypes)) {}
+
+    FileInjections(
+        std::exception_ptr _fileException,
+        std::unordered_set<FaultFileOperation::Type> _opTypes)
+        : fileException(std::move(_fileException)),
+          opTypes(std::move(_opTypes)) {}
+  };
+
+  // Invoked to inject a file fault into 'op' if configured.
+  void maybeInjectFileFault(FaultFileOperation* op);
+
+  mutable std::mutex mu_;
+  std::optional<FileInjections> fileInjections_;
+  folly::Executor* executor_{nullptr};
+};
+
+/// Registers the faulty filesystem.
+void registerFaultyFileSystem();
+
+/// Gets the faulty filesystem instance.
+std::shared_ptr<FaultyFileSystem> faultyFileSystem();
+} // namespace facebook::velox::tests::utils
diff --git a/velox/common/file/tests/FileInputStreamTest.cpp b/velox/common/file/tests/FileInputStreamTest.cpp
new file mode 100644
index 0000000000000..18ab5733900e0
--- /dev/null
+++ b/velox/common/file/tests/FileInputStreamTest.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "velox/common/memory/ByteStream.h" + +#include "velox/common/base/BitUtil.h" +#include "velox/common/file/FileInputStream.h" +#include "velox/common/file/FileSystems.h" +#include "velox/common/memory/MmapAllocator.h" +#include "velox/exec/tests/utils/TempDirectoryPath.h" + +#include +#include + +using namespace facebook::velox; +using namespace facebook::velox::memory; + +class FileInputStreamTest : public testing::Test { + protected: + static void SetUpTestCase() { + filesystems::registerLocalFileSystem(); + } + + void SetUp() override { + constexpr uint64_t kMaxMappedMemory = 64 << 20; + MemoryManagerOptions options; + options.useMmapAllocator = true; + options.allocatorCapacity = kMaxMappedMemory; + options.arbitratorCapacity = kMaxMappedMemory; + options.arbitratorReservedCapacity = 0; + memoryManager_ = std::make_unique(options); + mmapAllocator_ = static_cast(memoryManager_->allocator()); + pool_ = memoryManager_->addLeafPool("ByteStreamTest"); + rng_.seed(124); + tempDirPath_ = exec::test::TempDirectoryPath::create(); + fs_ = filesystems::getFileSystem(tempDirPath_->getPath(), nullptr); + } + + void TearDown() override {} + + std::unique_ptr createStream( + uint64_t streamSize, + uint32_t bufferSize = 1024) { + const auto filePath = + fmt::format("{}/{}", tempDirPath_->getPath(), fileId_++); + auto writeFile = fs_->openFileForWrite(filePath); + std::uint8_t buffer[streamSize]; + for (int i = 0; i < streamSize; ++i) { + buffer[i] = i % 256; + } + writeFile->append( + std::string_view(reinterpret_cast(buffer), streamSize)); + writeFile->close(); + return std::make_unique( + fs_->openFileForRead(filePath), bufferSize, pool_.get()); + } + + folly::Random::DefaultGenerator rng_; + std::unique_ptr memoryManager_; + MmapAllocator* mmapAllocator_; + std::shared_ptr pool_; + std::atomic_uint64_t fileId_{0}; + std::shared_ptr tempDirPath_; + std::shared_ptr fs_; +}; + +TEST_F(FileInputStreamTest, stats) { + struct { + size_t streamSize; + size_t bufferSize; + + std::string debugString() const { + return fmt::format( + "streamSize {}, bufferSize {}", streamSize, bufferSize); + } + } testSettings[] = { + {4096, 1024}, {4096, 4096}, {4096, 8192}, {4096, 4096 + 1024}}; + + for (const auto& testData : testSettings) { + SCOPED_TRACE(testData.debugString()); + + auto byteStream = createStream(testData.streamSize, testData.bufferSize); + ASSERT_EQ(byteStream->stats().numReads, 1); + ASSERT_EQ( + byteStream->stats().readBytes, + std::min(testData.streamSize, testData.bufferSize)); + ASSERT_GT(byteStream->stats().readTimeNs, 0); + uint8_t buffer[testData.streamSize / 8]; + for (int offset = 0; offset < testData.streamSize;) { + byteStream->readBytes(buffer, testData.streamSize / 8); + for (int i = 0; i < testData.streamSize / 8; ++i, ++offset) { + ASSERT_EQ(buffer[i], offset % 256); + } + } + ASSERT_TRUE(byteStream->atEnd()); + ASSERT_EQ( + byteStream->stats().numReads, + bits::roundUp(testData.streamSize, testData.bufferSize) / + 
testData.bufferSize); + ASSERT_EQ(byteStream->stats().readBytes, testData.streamSize); + ASSERT_GT(byteStream->stats().readTimeNs, 0); + } +} diff --git a/velox/common/file/tests/FileTest.cpp b/velox/common/file/tests/FileTest.cpp index 8ac5f753d2fa8..4534309adc6a7 100644 --- a/velox/common/file/tests/FileTest.cpp +++ b/velox/common/file/tests/FileTest.cpp @@ -16,8 +16,10 @@ #include +#include "velox/common/base/tests/GTestUtils.h" #include "velox/common/file/File.h" #include "velox/common/file/FileSystems.h" +#include "velox/common/file/tests/FaultyFileSystem.h" #include "velox/exec/tests/utils/TempDirectoryPath.h" #include "velox/exec/tests/utils/TempFilePath.h" @@ -25,14 +27,42 @@ using namespace facebook::velox; using facebook::velox::common::Region; +using namespace facebook::velox::tests::utils; constexpr int kOneMB = 1 << 20; -void writeData(WriteFile* writeFile) { - writeFile->append("aaaaa"); - writeFile->append("bbbbb"); - writeFile->append(std::string(kOneMB, 'c')); - writeFile->append("ddddd"); +void writeData(WriteFile* writeFile, bool useIOBuf = false) { + if (useIOBuf) { + std::unique_ptr buf = folly::IOBuf::copyBuffer("aaaaa"); + buf->appendToChain(folly::IOBuf::copyBuffer("bbbbb")); + buf->appendToChain(folly::IOBuf::copyBuffer(std::string(kOneMB, 'c'))); + buf->appendToChain(folly::IOBuf::copyBuffer("ddddd")); + writeFile->append(std::move(buf)); + ASSERT_EQ(writeFile->size(), 15 + kOneMB); + } else { + writeFile->append("aaaaa"); + writeFile->append("bbbbb"); + writeFile->append(std::string(kOneMB, 'c')); + writeFile->append("ddddd"); + ASSERT_EQ(writeFile->size(), 15 + kOneMB); + } +} + +void writeDataWithOffset(WriteFile* writeFile) { + ASSERT_EQ(writeFile->size(), 0); + writeFile->truncate(15 + kOneMB); + std::vector iovecs; + std::string s1 = "aaaaa"; + std::string s2 = "bbbbb"; + std::string s3 = std::string(kOneMB, 'c'); + std::string s4 = "ddddd"; + iovecs.push_back({s3.data(), s3.length()}); + iovecs.push_back({s4.data(), s4.length()}); + writeFile->write(iovecs, 10, 5 + kOneMB); + iovecs.clear(); + iovecs.push_back({s1.data(), s1.length()}); + iovecs.push_back({s2.data(), s2.length()}); + writeFile->write(iovecs, 0, 10); ASSERT_EQ(writeFile->size(), 15 + kOneMB); } @@ -68,7 +98,8 @@ void readData(ReadFile* readFile, bool checkFileSize = true) { folly::Range(middle, sizeof(middle)), folly::Range( nullptr, - (char*)(uint64_t)(15 + kOneMB - 500000 - sizeof(head) - sizeof(middle) - sizeof(tail))), + (char*)(uint64_t)(15 + kOneMB - 500000 - sizeof(head) - + sizeof(middle) - sizeof(tail))), folly::Range(tail, sizeof(tail))}; ASSERT_EQ(15 + kOneMB, readFile->preadv(0, buffers)); ASSERT_EQ(std::string_view(head, sizeof(head)), "aaaaabbbbbcc"); @@ -78,13 +109,15 @@ void readData(ReadFile* readFile, bool checkFileSize = true) { // We could templated this test, but that's kinda overkill for how simple it is. 
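The `readData()` helper above leans on a `preadv()` convention that is easy to miss: a `folly::Range` whose data pointer is null asks the reader to skip that many bytes rather than copy them, which is what the `(char*)(uint64_t)(...)` cast is building. A standalone sketch of the pattern; the sizes and offsets are illustrative:

```
#include "velox/common/file/File.h"

#include <folly/Range.h>

#include <vector>

using namespace facebook::velox;

// Reads 4 bytes at offset 0 and 4 bytes at offset 100 with a single
// preadv() call, skipping the 96 bytes in between.
void scatterRead(ReadFile& file) {
  char head[4];
  char tail[4];
  std::vector<folly::Range<char*>> buffers = {
      folly::Range<char*>(head, sizeof(head)),
      // A null data pointer means "skip this many bytes".
      folly::Range<char*>(nullptr, 96),
      folly::Range<char*>(tail, sizeof(tail))};
  file.preadv(0, buffers);
}
```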
TEST(InMemoryFile, writeAndRead) { - std::string buf; - { - InMemoryWriteFile writeFile(&buf); - writeData(&writeFile); + for (bool useIOBuf : {true, false}) { + std::string buf; + { + InMemoryWriteFile writeFile(&buf); + writeData(&writeFile, useIOBuf); + } + InMemoryReadFile readFile(buf); + readData(&readFile); } - InMemoryReadFile readFile(buf); - readData(&readFile); } TEST(InMemoryFile, preadv) { @@ -114,41 +147,70 @@ TEST(InMemoryFile, preadv) { EXPECT_EQ(expected, values); } -TEST(LocalFile, writeAndRead) { - auto tempFile = ::exec::test::TempFilePath::create(); - const auto& filename = tempFile->path.c_str(); - remove(filename); - { - LocalWriteFile writeFile(filename); - writeData(&writeFile); +class LocalFileTest : public ::testing::TestWithParam { + protected: + LocalFileTest() : useFaultyFs_(GetParam()) {} + + static void SetUpTestCase() { + filesystems::registerLocalFileSystem(); + tests::utils::registerFaultyFileSystem(); + } + + const bool useFaultyFs_; +}; + +TEST_P(LocalFileTest, writeAndRead) { + struct { + bool useIOBuf; + bool withOffset; + + std::string debugString() const { + return fmt::format("useIOBuf {}, withOffset {}", useIOBuf, withOffset); + } + } testSettings[] = {{false, false}, {true, false}, {false, true}}; + for (auto testData : testSettings) { + SCOPED_TRACE(testData.debugString()); + + auto tempFile = exec::test::TempFilePath::create(useFaultyFs_); + const auto& filename = tempFile->getPath(); + auto fs = filesystems::getFileSystem(filename, {}); + fs->remove(filename); + { + auto writeFile = fs->openFileForWrite(filename); + if (testData.withOffset) { + writeDataWithOffset(writeFile.get()); + } else { + writeData(writeFile.get(), testData.useIOBuf); + } + writeFile->close(); + ASSERT_EQ(writeFile->size(), 15 + kOneMB); + } + auto readFile = fs->openFileForRead(filename); + readData(readFile.get()); } - LocalReadFile readFile(filename); - readData(&readFile); } -TEST(LocalFile, viaRegistry) { - filesystems::registerLocalFileSystem(); - auto tempFile = ::exec::test::TempFilePath::create(); - const auto& filename = tempFile->path.c_str(); - remove(filename); - auto lfs = filesystems::getFileSystem(filename, nullptr); +TEST_P(LocalFileTest, viaRegistry) { + auto tempFile = exec::test::TempFilePath::create(useFaultyFs_); + const auto& filename = tempFile->getPath(); + auto fs = filesystems::getFileSystem(filename, {}); + fs->remove(filename); { - auto writeFile = lfs->openFileForWrite(filename); + auto writeFile = fs->openFileForWrite(filename); writeFile->append("snarf"); } - auto readFile = lfs->openFileForRead(filename); + auto readFile = fs->openFileForRead(filename); ASSERT_EQ(readFile->size(), 5); char buffer1[5]; ASSERT_EQ(readFile->pread(0, 5, &buffer1), "snarf"); - lfs->remove(filename); + fs->remove(filename); } -TEST(LocalFile, rename) { - filesystems::registerLocalFileSystem(); - auto tempFolder = ::exec::test::TempDirectoryPath::create(); - auto a = fmt::format("{}/a", tempFolder->path); - auto b = fmt::format("{}/b", tempFolder->path); - auto newA = fmt::format("{}/newA", tempFolder->path); +TEST_P(LocalFileTest, rename) { + const auto tempFolder = ::exec::test::TempDirectoryPath::create(useFaultyFs_); + const auto a = fmt::format("{}/a", tempFolder->getPath()); + const auto b = fmt::format("{}/b", tempFolder->getPath()); + const auto newA = fmt::format("{}/newA", tempFolder->getPath()); const std::string data("aaaaa"); auto localFs = filesystems::getFileSystem(a, nullptr); { @@ -160,7 +222,7 @@ TEST(LocalFile, rename) { 
ASSERT_TRUE(localFs->exists(a)); ASSERT_TRUE(localFs->exists(b)); ASSERT_FALSE(localFs->exists(newA)); - EXPECT_THROW(localFs->rename(a, b), VeloxUserError); + VELOX_ASSERT_USER_THROW(localFs->rename(a, b), ""); localFs->rename(a, newA); ASSERT_FALSE(localFs->exists(a)); ASSERT_TRUE(localFs->exists(b)); @@ -171,11 +233,10 @@ TEST(LocalFile, rename) { ASSERT_EQ(readFile->pread(0, 5, &buffer), data); } -TEST(LocalFile, exists) { - filesystems::registerLocalFileSystem(); - auto tempFolder = ::exec::test::TempDirectoryPath::create(); - auto a = fmt::format("{}/a", tempFolder->path); - auto b = fmt::format("{}/b", tempFolder->path); +TEST_P(LocalFileTest, exists) { + auto tempFolder = ::exec::test::TempDirectoryPath::create(useFaultyFs_); + auto a = fmt::format("{}/a", tempFolder->getPath()); + auto b = fmt::format("{}/b", tempFolder->getPath()); auto localFs = filesystems::getFileSystem(a, nullptr); { auto writeFile = localFs->openFileForWrite(a); @@ -191,47 +252,51 @@ TEST(LocalFile, exists) { ASSERT_FALSE(localFs->exists(b)); } -TEST(LocalFile, list) { - filesystems::registerLocalFileSystem(); - auto tempFolder = ::exec::test::TempDirectoryPath::create(); - auto a = fmt::format("{}/1", tempFolder->path); - auto b = fmt::format("{}/2", tempFolder->path); +TEST_P(LocalFileTest, list) { + const auto tempFolder = ::exec::test::TempDirectoryPath::create(useFaultyFs_); + const auto a = fmt::format("{}/1", tempFolder->getPath()); + const auto b = fmt::format("{}/2", tempFolder->getPath()); auto localFs = filesystems::getFileSystem(a, nullptr); { auto writeFile = localFs->openFileForWrite(a); writeFile = localFs->openFileForWrite(b); } - auto files = localFs->list(std::string_view(tempFolder->path)); + auto files = localFs->list(std::string_view(tempFolder->getPath())); std::sort(files.begin(), files.end()); ASSERT_EQ(files, std::vector({a, b})); localFs->remove(a); ASSERT_EQ( - localFs->list(std::string_view(tempFolder->path)), + localFs->list(std::string_view(tempFolder->getPath())), std::vector({b})); localFs->remove(b); - ASSERT_TRUE(localFs->list(std::string_view(tempFolder->path)).empty()); + ASSERT_TRUE(localFs->list(std::string_view(tempFolder->getPath())).empty()); } -TEST(LocalFile, readFileDestructor) { - auto tempFile = ::exec::test::TempFilePath::create(); - const auto& filename = tempFile->path.c_str(); - remove(filename); +TEST_P(LocalFileTest, readFileDestructor) { + if (useFaultyFs_) { + return; + } + auto tempFile = exec::test::TempFilePath::create(useFaultyFs_); + const auto& filename = tempFile->getPath(); + auto fs = filesystems::getFileSystem(filename, {}); + fs->remove(filename); { - LocalWriteFile writeFile(filename); - writeData(&writeFile); + auto writeFile = fs->openFileForWrite(filename); + writeData(writeFile.get()); } { - LocalReadFile readFile(filename); - readData(&readFile); + auto readFile = fs->openFileForRead(filename); + readData(readFile.get()); } int32_t readFd; { - std::unique_ptr buf(new char[tempFile->path.size() + 1]); - buf[tempFile->path.size()] = 0; - memcpy(buf.get(), tempFile->path.data(), tempFile->path.size()); - readFd = open(buf.get(), O_RDONLY); + std::unique_ptr buf(new char[tempFile->getPath().size() + 1]); + buf[tempFile->getPath().size()] = 0; + ::memcpy( + buf.get(), tempFile->getPath().c_str(), tempFile->getPath().size()); + readFd = ::open(buf.get(), O_RDONLY); } { LocalReadFile readFile(readFd); @@ -244,11 +309,10 @@ TEST(LocalFile, readFileDestructor) { } } -TEST(LocalFile, mkdir) { - filesystems::registerLocalFileSystem(); - auto 
tempFolder = ::exec::test::TempDirectoryPath::create(); +TEST_P(LocalFileTest, mkdir) { + auto tempFolder = exec::test::TempDirectoryPath::create(useFaultyFs_); - std::string path = tempFolder->path; + std::string path = tempFolder->getPath(); auto localFs = filesystems::getFileSystem(path, nullptr); // Create 3 levels of directories and ensure they exist. @@ -270,11 +334,10 @@ TEST(LocalFile, mkdir) { EXPECT_TRUE(localFs->exists(path)); } -TEST(LocalFile, rmdir) { - filesystems::registerLocalFileSystem(); - auto tempFolder = ::exec::test::TempDirectoryPath::create(); +TEST_P(LocalFileTest, rmdir) { + auto tempFolder = exec::test::TempDirectoryPath::create(useFaultyFs_); - std::string path = tempFolder->path; + std::string path = tempFolder->getPath(); auto localFs = filesystems::getFileSystem(path, nullptr); // Create 3 levels of directories and ensure they exist. @@ -293,13 +356,401 @@ TEST(LocalFile, rmdir) { EXPECT_TRUE(localFs->exists(path)); // Now delete the whole temp folder and ensure it is gone. - EXPECT_NO_THROW(localFs->rmdir(tempFolder->path)); - EXPECT_FALSE(localFs->exists(tempFolder->path)); + EXPECT_NO_THROW(localFs->rmdir(tempFolder->getPath())); + EXPECT_FALSE(localFs->exists(tempFolder->getPath())); // Delete a non-existing directory. path += "/does_not_exist/subdir"; EXPECT_FALSE(localFs->exists(path)); // The function does not throw, but will return zero files and folders // deleted, which is not an error. - EXPECT_NO_THROW(localFs->rmdir(tempFolder->path)); + EXPECT_NO_THROW(localFs->rmdir(tempFolder->getPath())); +} + +TEST_P(LocalFileTest, fileNotFound) { + auto tempFolder = exec::test::TempDirectoryPath::create(useFaultyFs_); + auto path = fmt::format("{}/file", tempFolder->getPath()); + auto localFs = filesystems::getFileSystem(path, nullptr); + VELOX_ASSERT_RUNTIME_THROW_CODE( + localFs->openFileForRead(path), + error_code::kFileNotFound, + "No such file or directory"); +} + +TEST_P(LocalFileTest, attributes) { + auto tempFile = exec::test::TempFilePath::create(useFaultyFs_); + const auto& filename = tempFile->getPath(); + auto fs = filesystems::getFileSystem(filename, {}); + fs->remove(filename); + auto writeFile = fs->openFileForWrite(filename); + ASSERT_FALSE( + LocalWriteFile::Attributes::cowDisabled(writeFile->getAttributes())); + try { + writeFile->setAttributes( + {{std::string(LocalWriteFile::Attributes::kNoCow), "true"}}); + } catch (const std::exception& /*e*/) { + // Flags like FS_IOC_SETFLAGS might not be supported for certain + // file systems (e.g., EXT4, XFS). 
+ } + ASSERT_TRUE( + LocalWriteFile::Attributes::cowDisabled(writeFile->getAttributes())); + writeFile->close(); +} + +INSTANTIATE_TEST_SUITE_P( + LocalFileTestSuite, + LocalFileTest, + ::testing::Values(false, true)); + +class FaultyFsTest : public ::testing::Test { + protected: + FaultyFsTest() {} + + static void SetUpTestCase() { + filesystems::registerLocalFileSystem(); + tests::utils::registerFaultyFileSystem(); + } + + void SetUp() { + dir_ = exec::test::TempDirectoryPath::create(true); + fs_ = std::dynamic_pointer_cast( + filesystems::getFileSystem(dir_->getPath(), {})); + VELOX_CHECK_NOT_NULL(fs_); + readFilePath_ = fmt::format("{}/faultyTestReadFile", dir_->getPath()); + writeFilePath_ = fmt::format("{}/faultyTestWriteFile", dir_->getPath()); + const int bufSize = 1024; + buffer_.resize(bufSize); + for (int i = 0; i < bufSize; ++i) { + buffer_[i] = i % 256; + } + { + auto writeFile = fs_->openFileForWrite(readFilePath_, {}); + writeData(writeFile.get()); + } + auto readFile = fs_->openFileForRead(readFilePath_, {}); + readData(readFile.get(), true); + try { + VELOX_FAIL("InjectedFaultFileError"); + } catch (VeloxRuntimeError&) { + fileError_ = std::current_exception(); + } + } + + void TearDown() { + fs_->clearFileFaultInjections(); + } + + void writeData(WriteFile* file) { + file->append(std::string_view(buffer_)); + file->flush(); + } + + void readData(ReadFile* file, bool useReadv = false) { + char readBuf[buffer_.size()]; + if (!useReadv) { + file->pread(0, buffer_.size(), readBuf); + } else { + std::vector> buffers; + buffers.push_back(folly::Range(readBuf, buffer_.size())); + file->preadv(0, buffers); + } + for (int i = 0; i < buffer_.size(); ++i) { + if (buffer_[i] != readBuf[i]) { + VELOX_FAIL("Data Mismatch"); + } + } + } + + std::shared_ptr dir_; + std::string readFilePath_; + std::string writeFilePath_; + std::shared_ptr fs_; + std::string buffer_; + std::exception_ptr fileError_; +}; + +TEST_F(FaultyFsTest, schemCheck) { + ASSERT_TRUE( + filesystems::isPathSupportedByRegisteredFileSystems("faulty:/test")); + ASSERT_FALSE( + filesystems::isPathSupportedByRegisteredFileSystems("other:/test")); +} + +TEST_F(FaultyFsTest, fileReadErrorInjection) { + // Set read error. + fs_->setFileInjectionError(fileError_, {FaultFileOperation::Type::kRead}); + { + auto readFile = fs_->openFileForRead(readFilePath_, {}); + VELOX_ASSERT_THROW( + readData(readFile.get(), false), "InjectedFaultFileError"); + } + { + auto readFile = fs_->openFileForRead(readFilePath_, {}); + // We only inject error for pread API so preadv should be fine. + readData(readFile.get(), true); + } + + // Set readv error + fs_->setFileInjectionError(fileError_, {FaultFileOperation::Type::kReadv}); + { + auto readFile = fs_->openFileForRead(readFilePath_, {}); + VELOX_ASSERT_THROW( + readData(readFile.get(), true), "InjectedFaultFileError"); + } + { + auto readFile = fs_->openFileForRead(readFilePath_, {}); + // We only inject error for preadv API so pread should be fine. + readData(readFile.get(), false); + } + + // Set error for all kinds of operations. + fs_->setFileInjectionError(fileError_); + auto readFile = fs_->openFileForRead(readFilePath_, {}); + VELOX_ASSERT_THROW(readData(readFile.get(), true), "InjectedFaultFileError"); + VELOX_ASSERT_THROW(readData(readFile.get(), false), "InjectedFaultFileError"); + fs_->remove(readFilePath_); +} + +TEST_F(FaultyFsTest, fileReadDelayInjection) { + // Set 2 seconds delay. 
+  const uint64_t injectDelay{2'000'000};
+  fs_->setFileInjectionDelay(injectDelay, {FaultFileOperation::Type::kRead});
+  {
+    auto readFile = fs_->openFileForRead(readFilePath_, {});
+    uint64_t readDurationUs{0};
+    {
+      MicrosecondTimer readTimer(&readDurationUs);
+      readData(readFile.get(), false);
+    }
+    ASSERT_GE(readDurationUs, injectDelay);
+  }
+  {
+    auto readFile = fs_->openFileForRead(readFilePath_, {});
+    // We only inject delay for the pread API so preadv should be fine.
+    uint64_t readDurationUs{0};
+    {
+      MicrosecondTimer readTimer(&readDurationUs);
+      readData(readFile.get(), true);
+    }
+    ASSERT_LT(readDurationUs, injectDelay);
+  }
+
+  // Set readv delay.
+  fs_->setFileInjectionDelay(injectDelay, {FaultFileOperation::Type::kReadv});
+  {
+    auto readFile = fs_->openFileForRead(readFilePath_, {});
+    uint64_t readDurationUs{0};
+    {
+      MicrosecondTimer readTimer(&readDurationUs);
+      readData(readFile.get(), true);
+    }
+    ASSERT_GE(readDurationUs, injectDelay);
+  }
+  {
+    auto readFile = fs_->openFileForRead(readFilePath_, {});
+    // We only inject delay for the preadv API so pread should be fine.
+    uint64_t readDurationUs{0};
+    {
+      MicrosecondTimer readTimer(&readDurationUs);
+      readData(readFile.get(), false);
+    }
+    ASSERT_LT(readDurationUs, injectDelay);
+  }
+
+  // Set delay for all kinds of operations.
+  fs_->setFileInjectionDelay(injectDelay);
+  {
+    auto readFile = fs_->openFileForRead(readFilePath_, {});
+    // The delay now applies to pread as well.
+    uint64_t readDurationUs{0};
+    {
+      MicrosecondTimer readTimer(&readDurationUs);
+      readData(readFile.get(), false);
+    }
+    ASSERT_GE(readDurationUs, injectDelay);
+  }
+  {
+    auto readFile = fs_->openFileForRead(readFilePath_, {});
+    // The delay applies to preadv as well.
+    uint64_t readDurationUs{0};
+    {
+      MicrosecondTimer readTimer(&readDurationUs);
+      readData(readFile.get(), true);
+    }
+    ASSERT_GE(readDurationUs, injectDelay);
+  }
+}
+
+TEST_F(FaultyFsTest, fileReadFaultHookInjection) {
+  const std::string path1 = fmt::format("{}/hookFile1", dir_->getPath());
+  {
+    auto writeFile = fs_->openFileForWrite(path1, {});
+    writeData(writeFile.get());
+    auto readFile = fs_->openFileForRead(path1, {});
+    readData(readFile.get());
+  }
+  const std::string path2 = fmt::format("{}/hookFile2", dir_->getPath());
+  {
+    auto writeFile = fs_->openFileForWrite(path2, {});
+    writeData(writeFile.get());
+    auto readFile = fs_->openFileForRead(path2, {});
+    readData(readFile.get());
+  }
+  // Set read error.
+  fs_->setFileInjectionHook([&](FaultFileOperation* op) {
+    // Only inject error for readv.
+    if (op->type != FaultFileOperation::Type::kReadv) {
+      return;
+    }
+    // Only inject error for path2.
+    if (op->path != path2) {
+      return;
+    }
+    VELOX_FAIL("inject hook read failure");
+  });
+  {
+    auto readFile = fs_->openFileForRead(path1, {});
+    readData(readFile.get(), false);
+    readData(readFile.get(), true);
+  }
+  {
+    auto readFile = fs_->openFileForRead(path2, {});
+    // Verify it only throws for readv.
+    readData(readFile.get(), false);
+    VELOX_ASSERT_THROW(
+        readData(readFile.get(), true), "inject hook read failure");
+  }
+
+  // Set to return fake data.
+  fs_->setFileInjectionHook([&](FaultFileOperation* op) {
+    // Only inject for path1.
+    if (op->path != path1) {
+      return;
+    }
+    // Only inject for read.
+ if (op->type != FaultFileOperation::Type::kRead) { + return; + } + auto* readOp = static_cast(op); + char* readBuf = static_cast(readOp->buf); + for (int i = 0; i < readOp->length; ++i) { + readBuf[i] = 0; + } + readOp->delegate = false; + }); + + { + auto readFile = fs_->openFileForRead(path2, {}); + readData(readFile.get(), false); + readData(readFile.get(), true); + } + { + auto readFile = fs_->openFileForRead(path1, {}); + // Verify only throw for read. + readData(readFile.get(), true); + VELOX_ASSERT_THROW(readData(readFile.get(), false), "Data Mismatch"); + } +} + +TEST_F(FaultyFsTest, fileWriteErrorInjection) { + // Set write error. + fs_->setFileInjectionError(fileError_, {FaultFileOperation::Type::kWrite}); + { + auto writeFile = fs_->openFileForWrite(writeFilePath_, {}); + VELOX_ASSERT_THROW(writeFile->append("hello"), "InjectedFaultFileError"); + fs_->remove(writeFilePath_); + } + // Set error for all kinds of operations. + fs_->setFileInjectionError(fileError_); + { + auto writeFile = fs_->openFileForWrite(writeFilePath_, {}); + VELOX_ASSERT_THROW(writeFile->append("hello"), "InjectedFaultFileError"); + fs_->remove(writeFilePath_); + } +} + +TEST_F(FaultyFsTest, fileWriteDelayInjection) { + // Set 2 seconds delay. + const uint64_t injectDelay{2'000'000}; + fs_->setFileInjectionDelay(injectDelay, {FaultFileOperation::Type::kWrite}); + { + auto writeFile = fs_->openFileForWrite(writeFilePath_, {}); + uint64_t readDurationUs{0}; + { + MicrosecondTimer readTimer(&readDurationUs); + writeFile->append("hello"); + } + ASSERT_GE(readDurationUs, injectDelay); + fs_->remove(writeFilePath_); + } +} + +TEST_F(FaultyFsTest, fileWriteFaultHookInjection) { + const std::string path1 = fmt::format("{}/hookFile1", dir_->getPath()); + const std::string path2 = fmt::format("{}/hookFile2", dir_->getPath()); + // Set to write fake data. + fs_->setFileInjectionHook([&](FaultFileOperation* op) { + // Only inject for write. + if (op->type != FaultFileOperation::Type::kWrite) { + return; + } + // Only inject for path2. + if (op->path != path2) { + return; + } + auto* writeOp = static_cast(op); + *writeOp->data = "Error data"; + }); + { + auto writeFile = fs_->openFileForWrite(path1, {}); + writeFile->append("hello"); + writeFile->close(); + auto readFile = fs_->openFileForRead(path1, {}); + char buffer[5]; + ASSERT_EQ(readFile->size(), 5); + ASSERT_EQ(readFile->pread(0, 5, &buffer), "hello"); + fs_->remove(path1); + } + { + auto writeFile = fs_->openFileForWrite(path2, {}); + writeFile->append("hello"); + writeFile->close(); + auto readFile = fs_->openFileForRead(path2, {}); + char buffer[10]; + ASSERT_EQ(readFile->size(), 10); + ASSERT_EQ(readFile->pread(0, 10, &buffer), "Error data"); + fs_->remove(path2); + } + + // Set to not delegate. + fs_->setFileInjectionHook([&](FaultFileOperation* op) { + // Only inject for write. + if (op->type != FaultFileOperation::Type::kWrite) { + return; + } + // Only inject for path2. 
+ if (op->path != path2) { + return; + } + auto* writeOp = static_cast(op); + writeOp->delegate = false; + }); + { + auto writeFile = fs_->openFileForWrite(path1, {}); + writeFile->append("hello"); + writeFile->close(); + auto readFile = fs_->openFileForRead(path1, {}); + char buffer[5]; + ASSERT_EQ(readFile->size(), 5); + ASSERT_EQ(readFile->pread(0, 5, &buffer), "hello"); + fs_->remove(path1); + } + { + auto writeFile = fs_->openFileForWrite(path2, {}); + writeFile->append("hello"); + writeFile->close(); + auto readFile = fs_->openFileForRead(path2, {}); + ASSERT_EQ(readFile->size(), 0); + fs_->remove(path2); + } } diff --git a/velox/common/file/tests/UtilsTest.cpp b/velox/common/file/tests/UtilsTest.cpp index 58d9d76f796c5..a2b2d6ee5a04e 100644 --- a/velox/common/file/tests/UtilsTest.cpp +++ b/velox/common/file/tests/UtilsTest.cpp @@ -70,8 +70,13 @@ coalescedIndices(Iter begin, Iter end, ShouldCoalesce& shouldCoalesce) { bool willCoalesceIfDistanceLE( uint64_t distance, const Region& regionA, - const Region& regionB) { - return CoalesceIfDistanceLE(distance)(regionA, regionB); + const Region& regionB, + uint64_t expectedCoalescedBytes) { + uint64_t coalescedBytes = 0; + const bool willCoalesce = + CoalesceIfDistanceLE(distance, &coalescedBytes)(regionA, regionB); + EXPECT_EQ(coalescedBytes, expectedCoalescedBytes); + return willCoalesce; } auto getReader( @@ -197,53 +202,68 @@ TEST(CoalesceSegmentsTest, MergeEven) { } TEST(CoalesceIfDistanceLETest, MultipleCases) { - EXPECT_TRUE(willCoalesceIfDistanceLE(0, {0, 1}, {1, 1})); - EXPECT_FALSE(willCoalesceIfDistanceLE(0, {0, 1}, {2, 1})); + EXPECT_TRUE(willCoalesceIfDistanceLE(0, {0, 1}, {1, 1}, 0)); + EXPECT_FALSE(willCoalesceIfDistanceLE(0, {0, 1}, {2, 1}, 0)); - EXPECT_TRUE(willCoalesceIfDistanceLE(1, {0, 1}, {2, 1})); + EXPECT_TRUE(willCoalesceIfDistanceLE(1, {0, 1}, {2, 1}, 1)); - EXPECT_TRUE(willCoalesceIfDistanceLE(10, {0, 1}, {1, 1})); - EXPECT_TRUE(willCoalesceIfDistanceLE(10, {10, 1}, {11, 1})); - EXPECT_TRUE(willCoalesceIfDistanceLE(10, {0, 10}, {19, 5})); - EXPECT_TRUE(willCoalesceIfDistanceLE(10, {0, 10}, {20, 5})); - EXPECT_FALSE(willCoalesceIfDistanceLE(10, {0, 10}, {21, 5})); + EXPECT_TRUE(willCoalesceIfDistanceLE(10, {0, 1}, {1, 1}, 0)); + EXPECT_TRUE(willCoalesceIfDistanceLE(10, {10, 1}, {11, 1}, 0)); + EXPECT_TRUE(willCoalesceIfDistanceLE(10, {0, 10}, {19, 5}, 9)); + EXPECT_TRUE(willCoalesceIfDistanceLE(10, {0, 10}, {20, 5}, 10)); + EXPECT_FALSE(willCoalesceIfDistanceLE(10, {0, 10}, {21, 5}, 0)); - EXPECT_TRUE(willCoalesceIfDistanceLE(0, {0, 0}, {0, 1})); + EXPECT_TRUE(willCoalesceIfDistanceLE(0, {0, 0}, {0, 1}, 0)); +} + +TEST(CoalesceIfDistanceLETest, MultipleSegments) { + uint64_t coalescedBytes = 0; + auto willCoalesce = CoalesceIfDistanceLE(10, &coalescedBytes); + EXPECT_TRUE(willCoalesce({0, 1}, {1, 1})); // 0 + EXPECT_TRUE(willCoalesce({10, 1}, {11, 1})); // 0 + EXPECT_TRUE(willCoalesce({0, 10}, {19, 5})); // 9 + EXPECT_TRUE(willCoalesce({0, 10}, {20, 5})); // 10 + EXPECT_FALSE(willCoalesce({0, 10}, {21, 5})); // 0 + EXPECT_EQ(coalescedBytes, 19); +} + +TEST(CoalesceIfDistanceLETest, SupportsNullArgument) { + EXPECT_NO_THROW(CoalesceIfDistanceLE(10, nullptr)({0, 10}, {20, 5})); // 10 } TEST(CoalesceIfDistanceLETest, SegmentsMustBeSorted) { EXPECT_THROW( - willCoalesceIfDistanceLE(0, {1, 1}, {0, 1}), + willCoalesceIfDistanceLE(0, {1, 1}, {0, 1}, 0), ::facebook::velox::VeloxRuntimeError); EXPECT_THROW( - willCoalesceIfDistanceLE(10, {1, 1}, {0, 1}), + willCoalesceIfDistanceLE(10, {1, 1}, {0, 1}, 0), 
::facebook::velox::VeloxRuntimeError); EXPECT_THROW( - willCoalesceIfDistanceLE(0, {1000, 1}, {2, 1}), + willCoalesceIfDistanceLE(0, {1000, 1}, {2, 1}, 0), ::facebook::velox::VeloxRuntimeError); EXPECT_THROW( - willCoalesceIfDistanceLE(10, {1000, 1}, {2, 1}), + willCoalesceIfDistanceLE(10, {1000, 1}, {2, 1}, 0), ::facebook::velox::VeloxRuntimeError); } TEST(CoalesceIfDistanceLETest, SegmentsCantOverlap) { EXPECT_THROW( - willCoalesceIfDistanceLE(0, {0, 1}, {0, 1}), + willCoalesceIfDistanceLE(0, {0, 1}, {0, 1}, 0), ::facebook::velox::VeloxRuntimeError); EXPECT_THROW( - willCoalesceIfDistanceLE(10, {0, 1}, {0, 1}), + willCoalesceIfDistanceLE(10, {0, 1}, {0, 1}, 0), ::facebook::velox::VeloxRuntimeError); EXPECT_THROW( - willCoalesceIfDistanceLE(0, {0, 2}, {1, 1}), + willCoalesceIfDistanceLE(0, {0, 2}, {1, 1}, 0), ::facebook::velox::VeloxRuntimeError); EXPECT_THROW( - willCoalesceIfDistanceLE(10, {0, 2}, {1, 1}), + willCoalesceIfDistanceLE(10, {0, 2}, {1, 1}, 0), ::facebook::velox::VeloxRuntimeError); EXPECT_THROW( - willCoalesceIfDistanceLE(0, {0, 2}, {1, 2}), + willCoalesceIfDistanceLE(0, {0, 2}, {1, 2}, 0), ::facebook::velox::VeloxRuntimeError); EXPECT_THROW( - willCoalesceIfDistanceLE(10, {0, 2}, {1, 2}), + willCoalesceIfDistanceLE(10, {0, 2}, {1, 2}, 0), ::facebook::velox::VeloxRuntimeError); } diff --git a/velox/common/hyperloglog/CMakeLists.txt b/velox/common/hyperloglog/CMakeLists.txt index 3b4cb32efcd34..27c07830b8644 100644 --- a/velox/common/hyperloglog/CMakeLists.txt +++ b/velox/common/hyperloglog/CMakeLists.txt @@ -11,14 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -if(${VELOX_BUILD_TESTING}) - add_subdirectory(tests) -endif() +velox_add_library(velox_common_hyperloglog BiasCorrection.cpp DenseHll.cpp + SparseHll.cpp) -add_library(velox_common_hyperloglog BiasCorrection.cpp DenseHll.cpp - SparseHll.cpp) - -target_link_libraries( +velox_link_libraries( velox_common_hyperloglog PUBLIC velox_memory PRIVATE velox_exception) + +if(${VELOX_BUILD_TESTING}) + add_subdirectory(tests) +endif() + +if(${VELOX_ENABLE_BENCHMARKS}) + add_subdirectory(benchmarks) +endif() diff --git a/velox/common/hyperloglog/DenseHll.cpp b/velox/common/hyperloglog/DenseHll.cpp index 09b17af838ca2..d383a7429776f 100644 --- a/velox/common/hyperloglog/DenseHll.cpp +++ b/velox/common/hyperloglog/DenseHll.cpp @@ -532,11 +532,11 @@ void DenseHll::mergeWith(const DenseHll& other) { "Cannot merge HLLs with different number of buckets"); mergeWith( - other.baseline_, - other.deltas_.data(), - other.overflows_, - other.overflowBuckets_.data(), - other.overflowValues_.data()); + {other.baseline_, + other.deltas_.data(), + other.overflows_, + other.overflowBuckets_.data(), + other.overflowValues_.data()}); } void DenseHll::mergeWith(const char* serialized) { @@ -558,16 +558,200 @@ void DenseHll::mergeWith(const char* serialized) { auto overflows = stream.read(); auto overflowBuckets = overflows ? stream.read(overflows) : nullptr; auto overflowValues = overflows ? 
stream.read(overflows) : nullptr; - mergeWith(baseline, deltas, overflows, overflowBuckets, overflowValues); + mergeWith({baseline, deltas, overflows, overflowBuckets, overflowValues}); } -void DenseHll::mergeWith( - int8_t otherBaseline, - const int8_t* otherDeltas, - int16_t otherOverflows, - const uint16_t* otherOverflowBuckets, - const int8_t* otherOverflowValues) { - int8_t newBaseline = std::max(baseline_, otherBaseline); +std::pair DenseHll::computeNewValue( + int8_t delta, + int8_t otherDelta, + int32_t bucket, + const HllView& other) { + int8_t value1 = baseline_ + delta; + int8_t value2 = other.baseline + otherDelta; + + int16_t overflowEntry = -1; + if (delta == kMaxDelta) { + overflowEntry = findOverflowEntry(bucket); + if (overflowEntry != -1) { + value1 += overflowValues_[overflowEntry]; + } + } + + if (otherDelta == kMaxDelta) { + value2 += getOverflowImpl( + bucket, other.overflows, other.overflowBuckets, other.overflowValues); + } + + return {std::max(value1, value2), overflowEntry}; +} + +void DenseHll::mergeWith(const HllView& other) { + // Number of 'delta' bytes that fit in a single SIMD batch. Each 'delta' byte + // stores 2 4-bit deltas. + constexpr auto batchSize = xsimd::batch::size; + + // If deltas_.size() is not a multiple of batchSize, we need to use scalar + // code to process the 'tail'. deltas_.size() is a power of 2. batchSize is + // also a power of 2. Hence, the only case where deltas_.size() is not a + // multiple of batchSize is when deltas_.size() is less than batchSize. In + // this case we can't use SIMD path at all. Therefore, there are only 2 + // possibilities: all data can be processed using SIMD or none. + + const int8_t newBaseline = std::max(baseline_, other.baseline); + if (deltas_.size() >= batchSize) { + baselineCount_ = mergeWithSimd(other, newBaseline); + } else { + baselineCount_ = mergeWithScalar(other, newBaseline); + } + + baseline_ = newBaseline; + + // If all baseline values in one of the HLLs lost to the values + // in the other HLL, we need to adjust the final baseline. + adjustBaselineIfNeeded(); +} + +int32_t DenseHll::mergeWithSimd(const HllView& other, int8_t newBaseline) { + const auto batchSize = xsimd::batch::size; + + const auto bucketMaskBatch = xsimd::broadcast(kBucketMask); + const auto maxDeltaBatch = xsimd::broadcast(kMaxDelta); + const auto baselineBatch = xsimd::broadcast(baseline_); + const auto otherBaselineBatch = xsimd::broadcast(other.baseline); + const auto newBaselineBatch = xsimd::broadcast(newBaseline); + const auto zeroBatch = xsimd::broadcast((int8_t)0); + + // SIMD doesn't support 4-bit integers. The smallest integer is 8-bit. + // We are going to use 2 SIMD registers to process a batch of values. + // One register will store values with odd indices (0, 2, 4...). The other + // register will store values with even indices (1, 3, 5...). + + // Load deltas with even indices into SIMD register. + auto loadEven = [&](const int8_t* deltas) { + auto batch = xsimd::load_unaligned(deltas); + batch = xsimd::kernel::bitwise_rshift(batch, 4, xsimd::default_arch{}); + return xsimd::bitwise_and(batch, bucketMaskBatch); + }; + + // Load deltas with odd indices into SIMD register. + auto loadOdd = [&](const int8_t* deltas) { + auto batch = xsimd::load_unaligned(deltas); + return xsimd::bitwise_and(batch, bucketMaskBatch); + }; + + // Count number of zeros in a SIMD register. 
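To make the even/odd lane split concrete before the lambdas below, here is the same unpacking written as scalar arithmetic on one packed byte (illustrative; kBucketMask is assumed to be 0x0F given its use as a 4-bit mask here):

```
// One deltas_ byte holds the deltas of two buckets: bucket 2*i in the high
// nibble and bucket 2*i+1 in the low nibble.
uint8_t slot = 0x93;
int8_t evenDelta = (slot >> 4) & 0x0F; // 9, what loadEven extracts per lane
int8_t oddDelta = slot & 0x0F;         // 3, what loadOdd extracts per lane
```

The countZeros lambda defined next counts how many unpacked lanes are zero, that is, how many buckets still sit at the baseline.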
+ auto countZeros = [&](const xsimd::batch& batch) { + auto zerosBitmask = xsimd::eq(batch, zeroBatch).mask(); + return bits::countBits(&zerosBitmask, 0, batchSize); + }; + + // Given two SIMD registers of deltas, converts deltas to values by adding + // baselines and returns their max along with a bitmask that has bits set for + // entries that may have an overflow. + auto processBatch = [&](xsimd::batch& batch, + xsimd::batch& otherBatch) { + auto overflows = xsimd::eq(batch, maxDeltaBatch).mask(); + batch += baselineBatch; + + overflows |= xsimd::eq(otherBatch, maxDeltaBatch).mask(); + otherBatch += otherBaselineBatch; + + // Compute max. + auto maxBatch = xsimd::max(batch, otherBatch); + maxBatch -= newBaselineBatch; + + return std::pair{maxBatch, overflows}; + }; + + auto processOverflow = [&](int8_t delta1, int8_t delta2, int bucket) { + auto [newValue, overflowEntry] = + computeNewValue(delta1, delta2, bucket, other); + + int8_t newDelta = newValue - newBaseline; + + return updateOverflow(bucket, overflowEntry, newDelta); + }; + + int32_t baselineCount = 0; + for (int i = 0; i < deltas_.size(); i += batchSize) { + // Process values in even indices first. + auto evenBatch = loadEven(deltas_.data() + i); + auto otherEvenBatch = loadEven(other.deltas + i); + + auto [evenMaxBatch, evenOverflows] = + processBatch(evenBatch, otherEvenBatch); + + baselineCount += countZeros(evenMaxBatch); + + // Process values in odd indices. + auto oddBatch = loadOdd(deltas_.data() + i); + auto otherOddBatch = loadOdd(other.deltas + i); + + auto [oddMaxBatch, oddOverflows] = processBatch(oddBatch, otherOddBatch); + + baselineCount += countZeros(oddMaxBatch); + + // Combine even and odd batches. Shift even batch left by 4 bits, then OR + // with odd batch. + auto combinedBatch = + xsimd::kernel::bitwise_lshift(evenMaxBatch, 4, xsimd::default_arch{}); + combinedBatch = xsimd::bitwise_or(combinedBatch, oddMaxBatch); + + xsimd::store_unaligned(deltas_.data() + i, combinedBatch); + + // Process overflows. + if (evenOverflows != 0) { + // deltas_ has been updated and can no longer be used to process overflow + // entries. evenBatch and otherEvenBatch contain original deltas + + // baseline. + int8_t temp[batchSize], otherTemp[batchSize]; + xsimd::store_unaligned(temp, evenBatch); + xsimd::store_unaligned(otherTemp, otherEvenBatch); + + bits::forEachSetBit(&evenOverflows, 0, batchSize, [&](auto index) { + const auto deltaIndex = i + index; + const auto bucket = deltaIndex * 2; + int8_t newDelta = processOverflow( + temp[index] - baseline_, otherTemp[index] - other.baseline, bucket); + + if (newDelta == 0) { + baselineCount++; + } + + // Store newDelta in deltas_[deltaIndex]. + auto slot1 = deltas_[deltaIndex]; + deltas_[deltaIndex] = (newDelta << 4) | (slot1 & kBucketMask); + }); + } + + if (oddOverflows != 0) { + // deltas_ has been updated and can no longer be used to process overflow + // entries. oddBatch and otherOddBatch contain original deltas + baseline. + int8_t temp[batchSize], otherTemp[batchSize]; + xsimd::store_unaligned(temp, oddBatch); + xsimd::store_unaligned(otherTemp, otherOddBatch); + + bits::forEachSetBit(&oddOverflows, 0, batchSize, [&](auto index) { + const auto deltaIndex = i + index; + const auto bucket = deltaIndex * 2 + 1; + int8_t newDelta = processOverflow( + temp[index] - baseline_, otherTemp[index] - other.baseline, bucket); + + if (newDelta == 0) { + baselineCount++; + } + + // Store newDelta. 
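The nibble write-back on the next lines mirrors the even case earlier in the loop; a worked example, again assuming kBucketMask == 0x0F:

```
// Odd-bucket write-back: keep the high (even) nibble, replace the low nibble.
uint8_t slot = 0x93;     // even delta 9, odd delta 3
uint8_t newDelta = 0x05;
uint8_t updated = (((slot >> 4) & 0x0F) << 4) | newDelta; // 0x95
```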
+ auto slot1 = deltas_[deltaIndex]; + deltas_[deltaIndex] = (((slot1 >> 4) & kBucketMask) << 4) | newDelta; + }); + } + } + + return baselineCount; +} + +int32_t DenseHll::mergeWithScalar(const HllView& other, int8_t newBaseline) { int32_t baselineCount = 0; int bucket = 0; @@ -575,29 +759,15 @@ void DenseHll::mergeWith( int newSlot = 0; int8_t slot1 = deltas_[i]; - int8_t slot2 = otherDeltas[i]; + int8_t slot2 = other.deltas[i]; for (int shift = 4; shift >= 0; shift -= 4) { int8_t delta1 = (slot1 >> shift) & kBucketMask; int8_t delta2 = (slot2 >> shift) & kBucketMask; - int8_t value1 = baseline_ + delta1; - int8_t value2 = otherBaseline + delta2; + auto [newValue, overflowEntry] = + computeNewValue(delta1, delta2, bucket, other); - int16_t overflowEntry = -1; - if (delta1 == kMaxDelta) { - overflowEntry = findOverflowEntry(bucket); - if (overflowEntry != -1) { - value1 += overflowValues_[overflowEntry]; - } - } - - if (delta2 == kMaxDelta) { - value2 += getOverflowImpl( - bucket, otherOverflows, otherOverflowBuckets, otherOverflowValues); - } - - int8_t newValue = std::max(value1, value2); int8_t newDelta = newValue - newBaseline; if (newDelta == 0) { @@ -614,12 +784,7 @@ void DenseHll::mergeWith( deltas_[i] = newSlot; } - baseline_ = newBaseline; - baselineCount_ = baselineCount; - - // All baseline values in one of the HLLs lost to the values - // in the other HLL, so we need to adjust the final baseline. - adjustBaselineIfNeeded(); + return baselineCount; } int8_t diff --git a/velox/common/hyperloglog/DenseHll.h b/velox/common/hyperloglog/DenseHll.h index 3b3c50b9ae5ef..b6b5f03f8cdf2 100644 --- a/velox/common/hyperloglog/DenseHll.h +++ b/velox/common/hyperloglog/DenseHll.h @@ -109,12 +109,34 @@ class DenseHll { void removeOverflow(int overflowEntry); - void mergeWith( - int8_t otherBaseline, - const int8_t* otherDeltas, - int16_t otherOverflows, - const uint16_t* otherOverflowBuckets, - const int8_t* otherOverflowValues); + struct HllView { + int8_t baseline; + const int8_t* deltas; + int16_t overflows; + const uint16_t* overflowBuckets; + const int8_t* overflowValues; + }; + + void mergeWith(const HllView& other); + + // Merges 'other' HLL into this one using scalar (non-SIMD) code. + // @return Number of buckets with values equal to 'newBaseline'. + int32_t mergeWithScalar(const HllView& other, int8_t newBaseline); + + // Merges 'other' HLL into this one using SIMD. + // @return Number of buckets with values equal to 'newBaseline'. + int32_t mergeWithSimd(const HllView& other, int8_t newBaseline); + + // Given two deltas and a bucket, converts deltas into values and returns + // their max. + // @return Max of two values and an index into overflowBuckets_ and + // overflowValues_ if 'delta' has an overflow. If 'delta' doesn't have an + // overflow, the second value is -1. + std::pair computeNewValue( + int8_t delta, + int8_t otherDelta, + int32_t bucket, + const HllView& other); /// Number of first bits of the hash to calculate buckets from. 
  int8_t indexBitLength_;
diff --git a/velox/common/hyperloglog/HllUtils.h b/velox/common/hyperloglog/HllUtils.h
index 6ad82de773290..0c32bdb5204ba 100644
--- a/velox/common/hyperloglog/HllUtils.h
+++ b/velox/common/hyperloglog/HllUtils.h
@@ -23,7 +23,8 @@ namespace facebook::velox::common::hll {
 constexpr double kLowestMaxStandardError = 0.0040625;
 constexpr double kHighestMaxStandardError = 0.26000;
-constexpr double kDefaultStandardError = 0.023;
+constexpr double kDefaultApproxDistinctStandardError = 0.023;
+constexpr double kDefaultApproxSetStandardError = 0.01625;
 
 const int8_t kPrestoSparseV2 = 2;
 const int8_t kPrestoDenseV2 = 3;
diff --git a/velox/common/hyperloglog/benchmarks/CMakeLists.txt b/velox/common/hyperloglog/benchmarks/CMakeLists.txt
new file mode 100644
index 0000000000000..2b86b9cd66ee8
--- /dev/null
+++ b/velox/common/hyperloglog/benchmarks/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+add_executable(velox_common_hyperloglog_dense_hll_bm DenseHll.cpp)
+
+target_link_libraries(
+  velox_common_hyperloglog_dense_hll_bm velox_common_hyperloglog
+  ${FOLLY_BENCHMARK})
diff --git a/velox/common/hyperloglog/benchmarks/DenseHll.cpp b/velox/common/hyperloglog/benchmarks/DenseHll.cpp
new file mode 100644
index 0000000000000..0503c4dbe74a0
--- /dev/null
+++ b/velox/common/hyperloglog/benchmarks/DenseHll.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/common/hyperloglog/DenseHll.h"
+#include <folly/Benchmark.h>
+#include <folly/init/Init.h>
+#include "velox/common/memory/HashStringAllocator.h"
+
+#define XXH_INLINE_ALL
+#include <xxhash.h>
+
+using namespace facebook::velox;
+
+namespace {
+
+template <typename T>
+uint64_t hashOne(T value) {
+  return XXH64(&value, sizeof(value), 0);
+}
+
+// A benchmark for DenseHll::mergeWith(serialized) API.
+//
+// Measures the time it takes to merge 2 serialized digests using different
+// values for hash bits. Larger values of hash bits correspond to larger
+// digests that are more accurate, but slower to merge. The default number of
+// hash bits is 11, while in practice 16 is common.
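Rough digest sizes for the hash-bit choices benchmarked here, as back-of-envelope arithmetic (illustrative only; the serialized form also carries baseline and overflow metadata on top of the packed deltas):

```
// 2^hashBits buckets at 4 bits per bucket delta.
static_assert((1 << 11) / 2 == 1024);  // 11 bits ->  2048 buckets, ~1 KB
static_assert((1 << 16) / 2 == 32768); // 16 bits -> 65536 buckets, ~32 KB
```

The fixture below pre-builds two serialized digests per hash-bit value so the timed region covers the merge alone.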
+class DenseHllBenchmark { + public: + explicit DenseHllBenchmark(memory::MemoryPool* pool) : pool_(pool) { + for (auto hashBits : {11, 12, 16}) { + serializedHlls_[hashBits].push_back(makeSerializedHll(hashBits, 1)); + serializedHlls_[hashBits].push_back(makeSerializedHll(hashBits, 2)); + } + } + + void run(int hashBits) { + folly::BenchmarkSuspender suspender; + + HashStringAllocator allocator(pool_); + common::hll::DenseHll hll(hashBits, &allocator); + + suspender.dismiss(); + + for (const auto& serialized : serializedHlls_.at(hashBits)) { + hll.mergeWith(serialized.data()); + } + } + + private: + std::string makeSerializedHll(int hashBits, int32_t step) { + HashStringAllocator allocator(pool_); + common::hll::DenseHll hll(hashBits, &allocator); + for (int32_t i = 0; i < 1'000'000; ++i) { + auto hash = hashOne(i * step); + hll.insertHash(hash); + } + return serialize(hll); + } + + static std::string serialize(common::hll::DenseHll& denseHll) { + auto size = denseHll.serializedSize(); + std::string serialized; + serialized.resize(size); + denseHll.serialize(serialized.data()); + return serialized; + } + + memory::MemoryPool* pool_; + + // List of serialized HLLs to use for merging, keyed by the number of hash + // bits. + std::unordered_map> serializedHlls_; +}; + +} // namespace + +std::unique_ptr benchmark; + +BENCHMARK(mergeSerialized11) { + benchmark->run(11); +} + +BENCHMARK(mergeSerialized12) { + benchmark->run(12); +} + +BENCHMARK(mergeSerialized16) { + benchmark->run(16); +} + +int main(int argc, char** argv) { + folly::Init init(&argc, &argv); + + memory::MemoryManager::initialize({}); + auto rootPool = memory::memoryManager()->addRootPool(); + auto pool = rootPool->addLeafChild("bm"); + benchmark = std::make_unique(pool.get()); + + folly::runBenchmarks(); + return 0; +} diff --git a/velox/common/hyperloglog/tests/CMakeLists.txt b/velox/common/hyperloglog/tests/CMakeLists.txt index b500a958cbe6e..c05a810796fee 100644 --- a/velox/common/hyperloglog/tests/CMakeLists.txt +++ b/velox/common/hyperloglog/tests/CMakeLists.txt @@ -18,5 +18,5 @@ add_test(NAME velox_common_hyperloglog_test COMMAND velox_common_hyperloglog_test) target_link_libraries( - velox_common_hyperloglog_test PRIVATE velox_common_hyperloglog velox_encode - gtest gtest_main) + velox_common_hyperloglog_test + PRIVATE velox_common_hyperloglog velox_encode GTest::gtest GTest::gtest_main) diff --git a/velox/common/hyperloglog/tests/DenseHllTest.cpp b/velox/common/hyperloglog/tests/DenseHllTest.cpp index 66af1fd635eed..1484eec3c8b92 100644 --- a/velox/common/hyperloglog/tests/DenseHllTest.cpp +++ b/velox/common/hyperloglog/tests/DenseHllTest.cpp @@ -36,6 +36,10 @@ uint64_t hashOne(T value) { class DenseHllTest : public ::testing::TestWithParam { protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + DenseHll roundTrip(DenseHll& hll) { auto size = hll.serializedSize(); std::string serialized; @@ -100,7 +104,8 @@ class DenseHllTest : public ::testing::TestWithParam { expected.cardinality()); } - std::shared_ptr pool_{memory::addDefaultLeafMemoryPool()}; + std::shared_ptr pool_{ + memory::memoryManager()->addLeafPool()}; HashStringAllocator allocator_{pool_.get()}; }; diff --git a/velox/common/hyperloglog/tests/SparseHllTest.cpp b/velox/common/hyperloglog/tests/SparseHllTest.cpp index d2f37ca1a6aac..afc06808fbfe0 100644 --- a/velox/common/hyperloglog/tests/SparseHllTest.cpp +++ b/velox/common/hyperloglog/tests/SparseHllTest.cpp @@ -30,6 +30,10 @@ uint64_t hashOne(T value) { class 
SparseHllTest : public ::testing::Test { protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + template void testMergeWith(const std::vector& left, const std::vector& right) { testMergeWith(left, right, false); @@ -98,7 +102,8 @@ class SparseHllTest : public ::testing::Test { return serialized; } - std::shared_ptr pool_{memory::addDefaultLeafMemoryPool()}; + std::shared_ptr pool_{ + memory::memoryManager()->addLeafPool()}; HashStringAllocator allocator_{pool_.get()}; }; @@ -170,6 +175,10 @@ TEST_F(SparseHllTest, mergeWith) { class SparseHllToDenseTest : public ::testing::TestWithParam { protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + std::string serialize(DenseHll& denseHll) { auto size = denseHll.serializedSize(); std::string serialized; @@ -178,7 +187,8 @@ class SparseHllToDenseTest : public ::testing::TestWithParam { return serialized; } - std::shared_ptr pool_{memory::addDefaultLeafMemoryPool()}; + std::shared_ptr pool_{ + memory::memoryManager()->addLeafPool()}; HashStringAllocator allocator_{pool_.get()}; }; diff --git a/velox/common/io/CMakeLists.txt b/velox/common/io/CMakeLists.txt index 52619f0d5ce1e..3498214b4fdb4 100644 --- a/velox/common/io/CMakeLists.txt +++ b/velox/common/io/CMakeLists.txt @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_common_io IoStatistics.cpp) +velox_add_library(velox_common_io IoStatistics.cpp) -target_link_libraries(velox_common_io Folly::folly glog::glog) +velox_link_libraries(velox_common_io Folly::folly glog::glog) diff --git a/velox/common/io/IoStatistics.cpp b/velox/common/io/IoStatistics.cpp index 562beacb773d1..7dfddc6dc4831 100644 --- a/velox/common/io/IoStatistics.cpp +++ b/velox/common/io/IoStatistics.cpp @@ -46,6 +46,10 @@ uint64_t IoStatistics::totalScanTime() const { return totalScanTime_.load(std::memory_order_relaxed); } +uint64_t IoStatistics::writeIOTimeUs() const { + return writeIOTimeUs_.load(std::memory_order_relaxed); +} + uint64_t IoStatistics::incRawBytesRead(int64_t v) { return rawBytesRead_.fetch_add(v, std::memory_order_relaxed); } @@ -70,22 +74,32 @@ uint64_t IoStatistics::incTotalScanTime(int64_t v) { return totalScanTime_.fetch_add(v, std::memory_order_relaxed); } +uint64_t IoStatistics::incWriteIOTimeUs(int64_t v) { + return writeIOTimeUs_.fetch_add(v, std::memory_order_relaxed); +} + void IoStatistics::incOperationCounters( const std::string& operation, const uint64_t resourceThrottleCount, const uint64_t localThrottleCount, + const uint64_t networkThrottleCount, const uint64_t globalThrottleCount, const uint64_t retryCount, const uint64_t latencyInMs, - const uint64_t delayInjectedInSecs) { + const uint64_t delayInjectedInSecs, + const uint64_t fullThrottleCount, + const uint64_t partialThrottleCount) { std::lock_guard lock{operationStatsMutex_}; operationStats_[operation].localThrottleCount += localThrottleCount; operationStats_[operation].resourceThrottleCount += resourceThrottleCount; + operationStats_[operation].networkThrottleCount += networkThrottleCount; operationStats_[operation].globalThrottleCount += globalThrottleCount; operationStats_[operation].retryCount += retryCount; operationStats_[operation].latencyInMs += latencyInMs; operationStats_[operation].requestCount++; operationStats_[operation].delayInjectedInSecs += delayInjectedInSecs; + operationStats_[operation].fullThrottleCount += fullThrottleCount; + 
operationStats_[operation].partialThrottleCount += partialThrottleCount; } std::unordered_map @@ -114,22 +128,28 @@ void IoStatistics::merge(const IoStatistics& other) { void OperationCounters::merge(const OperationCounters& other) { resourceThrottleCount += other.resourceThrottleCount; localThrottleCount += other.localThrottleCount; + networkThrottleCount += other.networkThrottleCount; globalThrottleCount += other.globalThrottleCount; retryCount += other.retryCount; latencyInMs += other.latencyInMs; requestCount += other.requestCount; delayInjectedInSecs += other.delayInjectedInSecs; + fullThrottleCount += other.fullThrottleCount; + partialThrottleCount += other.partialThrottleCount; } folly::dynamic serialize(const OperationCounters& counters) { folly::dynamic json = folly::dynamic::object; json["latencyInMs"] = counters.latencyInMs; json["localThrottleCount"] = counters.localThrottleCount; + json["networkThrottleCount"] = counters.networkThrottleCount; json["resourceThrottleCount"] = counters.resourceThrottleCount; json["globalThrottleCount"] = counters.globalThrottleCount; json["retryCount"] = counters.retryCount; json["requestCount"] = counters.requestCount; json["delayInjectedInSecs"] = counters.delayInjectedInSecs; + json["fullThrottleCount"] = counters.fullThrottleCount; + json["partialThrottleCount"] = counters.partialThrottleCount; return json; } diff --git a/velox/common/io/IoStatistics.h b/velox/common/io/IoStatistics.h index e1d72afc9cfee..2111a8877b475 100644 --- a/velox/common/io/IoStatistics.h +++ b/velox/common/io/IoStatistics.h @@ -29,7 +29,10 @@ namespace facebook::velox::io { struct OperationCounters { uint64_t resourceThrottleCount{0}; uint64_t localThrottleCount{0}; + uint64_t networkThrottleCount{0}; uint64_t globalThrottleCount{0}; + uint64_t fullThrottleCount{0}; + uint64_t partialThrottleCount{0}; uint64_t retryCount{0}; uint64_t latencyInMs{0}; uint64_t requestCount{0}; @@ -48,19 +51,42 @@ class IoCounter { return sum_; } + uint64_t min() const { + return min_; + } + + uint64_t max() const { + return max_; + } + void increment(uint64_t amount) { ++count_; sum_ += amount; + casLoop(min_, amount, std::greater()); + casLoop(max_, amount, std::less()); } void merge(const IoCounter& other) { sum_ += other.sum_; count_ += other.count_; + casLoop(min_, other.min_, std::greater()); + casLoop(max_, other.max_, std::less()); } private: + template + static void + casLoop(std::atomic& value, uint64_t newValue, Compare compare) { + uint64_t old = value; + while (compare(old, newValue) && + !value.compare_exchange_weak(old, newValue)) { + } + } + std::atomic count_{0}; std::atomic sum_{0}; + std::atomic min_{std::numeric_limits::max()}; + std::atomic max_{0}; }; class IoStatistics { @@ -71,6 +97,7 @@ class IoStatistics { uint64_t inputBatchSize() const; uint64_t outputBatchSize() const; uint64_t totalScanTime() const; + uint64_t writeIOTimeUs() const; uint64_t incRawBytesRead(int64_t); uint64_t incRawOverreadBytes(int64_t); @@ -78,6 +105,7 @@ class IoStatistics { uint64_t incInputBatchSize(int64_t); uint64_t incOutputBatchSize(int64_t); uint64_t incTotalScanTime(int64_t); + uint64_t incWriteIOTimeUs(int64_t); IoCounter& prefetch() { return prefetch_; @@ -103,10 +131,13 @@ class IoStatistics { const std::string& operation, const uint64_t resourceThrottleCount, const uint64_t localThrottleCount, + const uint64_t networkThrottleCount, const uint64_t globalThrottleCount, const uint64_t retryCount, const uint64_t latencyInMs, - const uint64_t delayInjectedInSecs); + const uint64_t 
delayInjectedInSecs, + const uint64_t fullThrottleCount = 0, + const uint64_t partialThrottleCount = 0); std::unordered_map operationStats() const; @@ -121,6 +152,7 @@ class IoStatistics { std::atomic outputBatchSize_{0}; std::atomic rawOverreadBytes_{0}; std::atomic totalScanTime_{0}; + std::atomic writeIOTimeUs_{0}; // Planned read from storage or SSD. IoCounter prefetch_; @@ -135,8 +167,8 @@ class IoStatistics { // reads. IoCounter ssdRead_; - // Time spent by a query processing thread waiting for synchronously - // issued IO or for an in-progress read-ahead to finish. + // Time spent by a query processing thread waiting for synchronously issued IO + // or for an in-progress read-ahead to finish. IoCounter queryThreadIoLatency_; std::unordered_map operationStats_; diff --git a/velox/common/io/Options.h b/velox/common/io/Options.h index d10709e99bd97..93f89fe292f36 100644 --- a/velox/common/io/Options.h +++ b/velox/common/io/Options.h @@ -57,15 +57,6 @@ enum class PrefetchMode { }; class ReaderOptions { - protected: - velox::memory::MemoryPool* memoryPool; - uint64_t autoPreloadLength; - PrefetchMode prefetchMode; - int32_t loadQuantum_{kDefaultLoadQuantum}; - int32_t maxCoalesceDistance_{kDefaultCoalesceDistance}; - int64_t maxCoalesceBytes_{kDefaultCoalesceBytes}; - int32_t prefetchRowGroups_{kDefaultPrefetchRowGroups}; - public: static constexpr int32_t kDefaultLoadQuantum = 8 << 20; // 8MB static constexpr int32_t kDefaultCoalesceDistance = 512 << 10; // 512K @@ -73,91 +64,63 @@ class ReaderOptions { static constexpr int32_t kDefaultPrefetchRowGroups = 1; explicit ReaderOptions(velox::memory::MemoryPool* pool) - : memoryPool(pool), - autoPreloadLength(DEFAULT_AUTO_PRELOAD_SIZE), - prefetchMode(PrefetchMode::PREFETCH) {} - - ReaderOptions& operator=(const ReaderOptions& other) { - memoryPool = other.memoryPool; - autoPreloadLength = other.autoPreloadLength; - prefetchMode = other.prefetchMode; - maxCoalesceDistance_ = other.maxCoalesceDistance_; - maxCoalesceBytes_ = other.maxCoalesceBytes_; - prefetchRowGroups_ = other.prefetchRowGroups_; - return *this; - } + : memoryPool_(pool), + autoPreloadLength_(DEFAULT_AUTO_PRELOAD_SIZE), + prefetchMode_(PrefetchMode::PREFETCH) {} - ReaderOptions(const ReaderOptions& other) { - *this = other; - } - - /** - * Set the memory allocator. - */ + /// Sets the memory pool for allocation. ReaderOptions& setMemoryPool(velox::memory::MemoryPool& pool) { - memoryPool = &pool; + memoryPool_ = &pool; return *this; } - /** - * Modify the autoPreloadLength - */ + /// Modifies the autoPreloadLength ReaderOptions& setAutoPreloadLength(uint64_t len) { - autoPreloadLength = len; + autoPreloadLength_ = len; return *this; } - /** - * Modify the prefetch mode. - */ + /// Modifies the prefetch mode. ReaderOptions& setPrefetchMode(PrefetchMode mode) { - prefetchMode = mode; + prefetchMode_ = mode; return *this; } - /** - * Modify the load quantum. - */ + /// Modifies the load quantum. ReaderOptions& setLoadQuantum(int32_t quantum) { loadQuantum_ = quantum; return *this; } - /** - * Modify the maximum load coalesce distance. - */ + + /// Modifies the maximum load coalesce distance. ReaderOptions& setMaxCoalesceDistance(int32_t distance) { maxCoalesceDistance_ = distance; return *this; } - /** - * Modify the maximum load coalesce bytes. - */ + + /// Modifies the maximum load coalesce bytes. ReaderOptions& setMaxCoalesceBytes(int64_t bytes) { maxCoalesceBytes_ = bytes; return *this; } - /** - * Modify the number of row groups to prefetch. 
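Stepping back to the IoCounter change above: the new min_/max_ members are maintained with a lock-free compare-and-swap loop. The same pattern restated standalone, with the template arguments spelled out explicitly (a sketch, not the Velox source):

```
#include <atomic>
#include <cstdint>
#include <functional>

// Store 'newValue' only while it improves on the current extreme; 'compare'
// returns true when the stored value should be replaced.
template <typename Compare>
void casLoop(std::atomic<uint64_t>& value, uint64_t newValue, Compare compare) {
  uint64_t old = value;
  // compare_exchange_weak reloads 'old' on failure, so the loop exits once
  // another thread has installed a value at least as extreme as 'newValue'.
  while (compare(old, newValue) && !value.compare_exchange_weak(old, newValue)) {
  }
}

// Mirrors IoCounter::increment():
//   casLoop(min_, amount, std::greater<uint64_t>()); // replace if old > amount
//   casLoop(max_, amount, std::less<uint64_t>());    // replace if old < amount
```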
- */ + /// Modifies the number of row groups to prefetch. ReaderOptions& setPrefetchRowGroups(int32_t numPrefetch) { prefetchRowGroups_ = numPrefetch; return *this; } - /** - * Get the memory allocator. - */ - velox::memory::MemoryPool& getMemoryPool() const { - return *memoryPool; + /// Gets the memory allocator. + velox::memory::MemoryPool& memoryPool() const { + return *memoryPool_; } - uint64_t getAutoPreloadLength() const { - return autoPreloadLength; + uint64_t autoPreloadLength() const { + return autoPreloadLength_; } - PrefetchMode getPrefetchMode() const { - return prefetchMode; + PrefetchMode prefetchMode() const { + return prefetchMode_; } int32_t loadQuantum() const { @@ -175,5 +138,23 @@ class ReaderOptions { int64_t prefetchRowGroups() const { return prefetchRowGroups_; } + + bool noCacheRetention() const { + return noCacheRetention_; + } + + void setNoCacheRetention(bool noCacheRetention) { + noCacheRetention_ = noCacheRetention; + } + + protected: + velox::memory::MemoryPool* memoryPool_; + uint64_t autoPreloadLength_; + PrefetchMode prefetchMode_; + int32_t loadQuantum_{kDefaultLoadQuantum}; + int32_t maxCoalesceDistance_{kDefaultCoalesceDistance}; + int64_t maxCoalesceBytes_{kDefaultCoalesceBytes}; + int32_t prefetchRowGroups_{kDefaultPrefetchRowGroups}; + bool noCacheRetention_{false}; }; } // namespace facebook::velox::io diff --git a/velox/common/memory/Allocation.cpp b/velox/common/memory/Allocation.cpp index f24ef76a359b9..884af7c82cf5d 100644 --- a/velox/common/memory/Allocation.cpp +++ b/velox/common/memory/Allocation.cpp @@ -30,13 +30,20 @@ Allocation::~Allocation() { } } -void Allocation::append(uint8_t* address, int32_t numPages) { - numPages_ += numPages; +void Allocation::append(uint8_t* address, MachinePageCount numPages) { VELOX_CHECK( runs_.empty() || address != runs_.back().data(), "Appending a duplicate address into a PageRun"); + if (FOLLY_UNLIKELY(numPages > Allocation::PageRun::kMaxPagesInRun)) { + VELOX_MEM_ALLOC_ERROR(fmt::format( + "The number of pages to append {} exceeds the PageRun limit {}", + numPages, + Allocation::PageRun::kMaxPagesInRun)); + } + numPages_ += numPages; runs_.emplace_back(address, numPages); } + void Allocation::appendMove(Allocation& other) { for (auto& run : other.runs_) { numPages_ += run.numPages(); @@ -50,7 +57,7 @@ void Allocation::findRun(uint64_t offset, int32_t* index, int32_t* offsetInRun) const { uint64_t skipped = 0; for (int32_t i = 0; i < runs_.size(); ++i) { - uint64_t size = runs_[i].numPages() * AllocationTraits::kPageSize; + uint64_t size = AllocationTraits::pageBytes(runs_[i].numPages()); if (offset - skipped < size) { *index = i; *offsetInRun = static_cast(offset - skipped); diff --git a/velox/common/memory/Allocation.h b/velox/common/memory/Allocation.h index 6378378ab0a59..250db476d7866 100644 --- a/velox/common/memory/Allocation.h +++ b/velox/common/memory/Allocation.h @@ -37,6 +37,9 @@ struct AllocationTraits { /// Size of huge page as intended with MADV_HUGEPAGE. static constexpr uint64_t kHugePageSize = 2 << 20; // 2MB + static_assert(kHugePageSize >= kPageSize); + static_assert(kHugePageSize % kPageSize == 0); + /// Returns the bytes of the given number pages. FOLLY_ALWAYS_INLINE static uint64_t pageBytes(MachinePageCount numPages) { return numPages * kPageSize; @@ -47,14 +50,12 @@ struct AllocationTraits { } /// Returns the round up page bytes. 
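For scale, assuming the 4 KB machine page size used elsewhere in Velox (an assumption here; only the 2 MB kHugePageSize is visible in this hunk), the page helpers around this point work out as follows:

```
// Illustrative arithmetic only, assuming kPageSize == 4096.
static_assert((2 << 20) % 4096 == 0);
static_assert((2 << 20) / 4096 == 512); // numPagesInHugePage(), now constexpr
// roundUpPageBytes(5000) -> bits::roundUp(5000, 4096) == 8192 (two pages)
```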
- FOLLY_ALWAYS_INLINE static MachinePageCount roundUpPageBytes(uint64_t bytes) { + FOLLY_ALWAYS_INLINE static uint64_t roundUpPageBytes(uint64_t bytes) { return bits::roundUp(bytes, kPageSize); } /// The number of pages in a huge page. - FOLLY_ALWAYS_INLINE static MachinePageCount numPagesInHugePage() { - VELOX_DCHECK_GE(kHugePageSize, kPageSize); - VELOX_DCHECK_EQ(kHugePageSize % kPageSize, 0); + static constexpr MachinePageCount numPagesInHugePage() { return kHugePageSize / kPageSize; } }; @@ -171,7 +172,7 @@ class Allocation { VELOX_CHECK(numPages_ != 0 || pool_ == nullptr); } - void append(uint8_t* address, int32_t numPages); + void append(uint8_t* address, MachinePageCount numPages); void clear() { runs_.clear(); @@ -193,6 +194,7 @@ class Allocation { VELOX_FRIEND_TEST(MemoryAllocatorTest, allocationClass2); VELOX_FRIEND_TEST(AllocationTest, append); VELOX_FRIEND_TEST(AllocationTest, appendMove); + VELOX_FRIEND_TEST(AllocationTest, maxPageRunLimit); }; /// Represents a run of contiguous pages that do not belong to any size class. @@ -248,6 +250,7 @@ class ContiguousAllocation { VELOX_CHECK_NULL(pool_); pool_ = pool; } + MemoryPool* pool() const { return pool_; } @@ -263,7 +266,7 @@ class ContiguousAllocation { // Adjusts 'size' towards 'maxSize' by 'increment' pages. Rounds // 'increment' to huge pages, since this is the unit of growth of - // RSS for large contiguous runs. Increases the reservation in in + // RSS for large contiguous runs. Increases the reservation in // 'pool_' and its allocator. May fail by cap exceeded. If failing, // the size is not changed. 'size_' cannot exceed 'maxSize_'. void grow(MachinePageCount increment); diff --git a/velox/common/memory/AllocationPool.cpp b/velox/common/memory/AllocationPool.cpp index dcb709aacb916..f3f0c50f0221b 100644 --- a/velox/common/memory/AllocationPool.cpp +++ b/velox/common/memory/AllocationPool.cpp @@ -52,10 +52,7 @@ char* AllocationPool::allocateFixed(uint64_t bytes, int32_t alignment) { VELOX_CHECK_GT(bytes, 0, "Cannot allocate zero bytes"); if (freeAddressableBytes() >= bytes && alignment == 1) { auto* result = startOfRun_ + currentOffset_; - currentOffset_ += bytes; - if (currentOffset_ > endOfReservedRun()) { - growLastAllocation(); - } + maybeGrowLastAllocation(bytes); return result; } VELOX_CHECK_EQ( @@ -75,19 +72,21 @@ char* AllocationPool::allocateFixed(uint64_t bytes, int32_t alignment) { VELOX_CHECK_LE(bytes + currentOffset_, bytesInRun_); auto* result = startOfRun_ + currentOffset_; VELOX_CHECK_EQ(reinterpret_cast(result) % alignment, 0); - currentOffset_ += bytes; - if (currentOffset_ > endOfReservedRun()) { - growLastAllocation(); - } + maybeGrowLastAllocation(bytes); return result; } -void AllocationPool::growLastAllocation() { - VELOX_CHECK_GT(bytesInRun_, AllocationTraits::kHugePageSize); - const auto bytesToReserve = bits::roundUp( - currentOffset_ - endOfReservedRun(), AllocationTraits::kHugePageSize); - largeAllocations_.back().grow(AllocationTraits::numPages(bytesToReserve)); - usedBytes_ += bytesToReserve; +void AllocationPool::maybeGrowLastAllocation(uint64_t bytesRequested) { + const auto updateOffset = currentOffset_ + bytesRequested; + if (updateOffset > endOfReservedRun()) { + VELOX_CHECK_GT(bytesInRun_, AllocationTraits::kHugePageSize); + const auto bytesToReserve = bits::roundUp( + updateOffset - endOfReservedRun(), AllocationTraits::kHugePageSize); + largeAllocations_.back().grow(AllocationTraits::numPages(bytesToReserve)); + usedBytes_ += bytesToReserve; + } + // Only update currentOffset_ once it 
points to valid data.
+  currentOffset_ = updateOffset;
+}
 
 void AllocationPool::newRunImpl(MachinePageCount numPages) {
diff --git a/velox/common/memory/AllocationPool.h b/velox/common/memory/AllocationPool.h
index 7a5bfd960642a..1449be6b9af9c 100644
--- a/velox/common/memory/AllocationPool.h
+++ b/velox/common/memory/AllocationPool.h
@@ -18,13 +18,12 @@
 #include "velox/common/memory/Memory.h"
 
 namespace facebook::velox::memory {
-// A set of Allocations holding the fixed width payload
-// rows. The Runs are filled to the end except for the last one. This
-// is used for iterating over the payload for rehashing, returning
-// results etc. This is used via HashStringAllocator for variable length
-// allocation for backing ByteStreams for complex objects. In that case, there
-// is a current run that is appended to and when this is exhausted a new run is
-// started.
+/// A set of Allocations holding the fixed width payload rows. The Runs are
+/// filled to the end except for the last one. This is used for iterating over
+/// the payload for rehashing, returning results etc. This is used via
+/// HashStringAllocator for variable length allocation for backing ByteStreams
+/// for complex objects. In that case, there is a current run that is appended
+/// to and when this is exhausted a new run is started.
 class AllocationPool {
  public:
  static constexpr int32_t kMinPages = 16;
@@ -90,7 +89,7 @@ class AllocationPool {
     return pool_;
   }
 
-  /// Returns true if 'ptr' is inside the range alocations are made from.
+  /// Returns true if 'ptr' is inside the range allocations are made from.
   bool isInCurrentRange(void* ptr) const {
     return reinterpret_cast<char*>(ptr) >= startOfRun_ &&
         reinterpret_cast<char*>(ptr) < startOfRun_ + bytesInRun_;
@@ -131,9 +130,10 @@ class AllocationPool {
     return bytesInRun_ - currentOffset_;
   }
 
-  // Increses the reservation in 'pool_' when 'currentOffset_' goes past
-  // current end of last large allocation.
-  void growLastAllocation();
+  // Increases the reservation in 'pool_' if 'bytesRequested' moves
+  // 'currentOffset_' past the current end of the last large allocation,
+  // otherwise simply updates 'currentOffset_'.
+  void maybeGrowLastAllocation(uint64_t bytesRequested);
 
   void newRunImpl(memory::MachinePageCount numPages);
 
@@ -141,7 +141,7 @@
   std::vector<Allocation> allocations_;
   std::vector<ContiguousAllocation> largeAllocations_;
 
-  // Points to the start of the run from which allocations are being nade.
+  // Points to the start of the run from which allocations are being made.
   char* startOfRun_{nullptr};
 
   // Total addressable bytes from 'startOfRun_'. Not all are necessarily
diff --git a/velox/common/memory/ByteStream.cpp b/velox/common/memory/ByteStream.cpp
index d4e456f92e146..e7802f477823a 100644
--- a/velox/common/memory/ByteStream.cpp
+++ b/velox/common/memory/ByteStream.cpp
@@ -18,119 +18,174 @@ namespace facebook::velox {
 
+uint32_t ByteRange::availableBytes() const {
+  return std::max(0, size - position);
+}
+
 std::string ByteRange::toString() const {
   return fmt::format("[{} starting at {}]", succinctBytes(size), position);
 }
 
-size_t ByteStream::size() const {
-  if (ranges_.empty()) {
-    return 0;
+std::string BufferInputStream::toString() const {
+  std::stringstream oss;
+  oss << ranges_.size() << " ranges (position/size) [";
+  for (const auto& range : ranges_) {
+    oss << "(" << range.position << "/" << range.size
+        << (&range == current_ ? " current" : "") << ")";
+    if (&range != &ranges_.back()) {
+      oss << ",";
+    }
+  }
+  oss << "]";
+  return oss.str();
+}
+
+bool BufferInputStream::atEnd() const {
+  if (current_ == nullptr) {
+    return false;
   }
+  if (current_->position < current_->size) {
+    return false;
+  }
+
+  VELOX_CHECK(current_ >= ranges_.data() && current_ <= &ranges_.back());
+  return current_ == &ranges_.back();
+}
+
+size_t BufferInputStream::size() const {
   size_t total = 0;
-  for (auto i = 0; i < ranges_.size() - 1; ++i) {
-    total += ranges_[i].size;
+  for (const auto& range : ranges_) {
+    total += range.size;
   }
-  return total + std::max(ranges_.back().position, lastRangeEnd_);
+  return total;
 }
 
-size_t ByteStream::remainingSize() const {
+size_t BufferInputStream::remainingSize() const {
   if (ranges_.empty()) {
     return 0;
   }
-  const auto* lastRange = &ranges_[ranges_.size() - 1];
-  auto cur = current_;
-  size_t total{0};
-  if (cur == lastRange) {
-    total += (std::max(cur->position, lastRangeEnd_) - cur->position);
-  } else {
-    total += cur->size - cur->position;
-  }
-
+  const auto* lastRange = &ranges_.back();
+  auto* cur = current_;
+  size_t remainingBytes = cur->availableBytes();
   while (++cur <= lastRange) {
-    total += (cur == lastRange) ? lastRangeEnd_ : cur->size;
+    remainingBytes += cur->size;
   }
-  return total;
+  return remainingBytes;
 }
 
-bool ByteStream::atEnd() const {
-  if (!current_) {
-    return false;
+std::streampos BufferInputStream::tellp() const {
+  if (ranges_.empty()) {
+    return 0;
   }
-  if (current_->position < current_->size) {
-    return false;
+  assert(current_);
+  int64_t size = 0;
+  for (auto& range : ranges_) {
+    if (&range == current_) {
+      return current_->position + size;
+    }
+    size += range.size;
   }
-
-  VELOX_CHECK(current_ >= ranges_.data() && current_ <= &ranges_.back());
-  return current_ == &ranges_.back();
+  VELOX_FAIL("BufferInputStream 'current_' is not in 'ranges_'.");
 }
 
-void ByteStream::next(bool throwIfPastEnd) {
-  VELOX_CHECK(current_ >= &ranges_[0]);
-  size_t position = current_ - &ranges_[0];
-  VELOX_CHECK_LT(position, ranges_.size());
-  if (position == ranges_.size() - 1) {
-    if (throwIfPastEnd) {
-      VELOX_FAIL("Reading past end of ByteStream");
-    }
+void BufferInputStream::seekp(std::streampos position) {
+  if (ranges_.empty() && position == 0) {
     return;
   }
+  int64_t toSkip = position;
+  for (auto& range : ranges_) {
+    if (toSkip <= range.size) {
+      current_ = &range;
+      current_->position = toSkip;
+      return;
+    }
+    toSkip -= range.size;
+  }
+  static_assert(sizeof(std::streamsize) <= sizeof(long long));
+  VELOX_FAIL(
+      "Seeking past end of BufferInputStream: {}",
+      static_cast<long long>(position));
+}
+
+void BufferInputStream::nextRange() {
+  VELOX_CHECK(current_ >= &ranges_[0]);
+  const size_t rangeIndex = current_ - &ranges_[0];
+  VELOX_CHECK_LT(
+      rangeIndex + 1, ranges_.size(), "Reading past end of BufferInputStream");
   ++current_;
   current_->position = 0;
 }
 
-uint8_t ByteStream::readByte() {
+uint8_t BufferInputStream::readByte() {
   if (current_->position < current_->size) {
     return current_->buffer[current_->position++];
   }
-  next();
+  nextRange();
   return readByte();
 }
 
-void ByteStream::readBytes(uint8_t* bytes, int32_t size) {
+void BufferInputStream::readBytes(uint8_t* bytes, int32_t size) {
+  VELOX_CHECK_GE(size, 0, "Attempting to read negative number of bytes");
   int32_t offset = 0;
   for (;;) {
-    int32_t available = current_->size - current_->position;
-    int32_t numUsed = std::min(available, size);
-    memcpy(bytes + offset, current_->buffer + current_->position, numUsed);
-    offset += numUsed;
-    size
-= numUsed; - current_->position += numUsed; - if (!size) { + const int32_t availableBytes = current_->size - current_->position; + const int32_t readBytes = std::min(availableBytes, size); + simd::memcpy( + bytes + offset, current_->buffer + current_->position, readBytes); + offset += readBytes; + size -= readBytes; + current_->position += readBytes; + if (size == 0) { return; } - next(); + nextRange(); } } -std::string_view ByteStream::nextView(int32_t size) { +std::string_view BufferInputStream::nextView(int32_t size) { + VELOX_CHECK_GE(size, 0, "Attempting to view negative number of bytes"); if (current_->position == current_->size) { if (current_ == &ranges_.back()) { return std::string_view(nullptr, 0); } - next(); + nextRange(); } - VELOX_CHECK(current_->size); - auto position = current_->position; - auto viewSize = std::min(current_->size - current_->position, size); + VELOX_CHECK_GT(current_->size, 0); + const auto position = current_->position; + const auto viewSize = std::min(current_->size - current_->position, size); current_->position += viewSize; return std::string_view( reinterpret_cast(current_->buffer) + position, viewSize); } -void ByteStream::skip(int32_t size) { +void BufferInputStream::skip(int32_t size) { + VELOX_CHECK_GE(size, 0, "Attempting to skip negative number of bytes"); for (;;) { - int32_t available = current_->size - current_->position; - int32_t numUsed = std::min(available, size); - size -= numUsed; - current_->position += numUsed; - if (!size) { + const int32_t numSkipped = + std::min(current_->availableBytes(), size); + size -= numSkipped; + current_->position += numSkipped; + if (size == 0) { return; } - next(); + nextRange(); + } +} + +size_t ByteOutputStream::size() const { + if (ranges_.empty()) { + return 0; } + size_t total = 0; + for (auto i = 0; i < ranges_.size() - 1; ++i) { + total += ranges_[i].size; + } + return total + std::max(ranges_.back().position, lastRangeEnd_); } -void ByteStream::appendBool(bool value, int32_t count) { +void ByteOutputStream::appendBool(bool value, int32_t count) { + VELOX_DCHECK(isBits_); + if (count == 1 && current_->size > current_->position) { bits::setBit( reinterpret_cast(current_->buffer), @@ -139,10 +194,10 @@ void ByteStream::appendBool(bool value, int32_t count) { ++current_->position; return; } - int32_t offset = 0; - VELOX_DCHECK(isBits_); + + int32_t offset{0}; for (;;) { - int32_t bitsFit = + const int32_t bitsFit = std::min(count - offset, current_->size - current_->position); bits::fillBits( reinterpret_cast(current_->buffer), @@ -158,13 +213,44 @@ void ByteStream::appendBool(bool value, int32_t count) { } } -void ByteStream::appendStringPiece(folly::StringPiece value) { +void ByteOutputStream::appendBits( + const uint64_t* bits, + int32_t begin, + int32_t end) { + VELOX_DCHECK(isBits_); + + const int32_t count = end - begin; + int32_t offset = 0; + for (;;) { + const int32_t bitsFit = + std::min(count - offset, current_->size - current_->position); + bits::copyBits( + bits, + begin + offset, + reinterpret_cast(current_->buffer), + current_->position, + bitsFit); + + current_->position += bitsFit; + offset += bitsFit; + if (offset == count) { + return; + } + extend(bits::nbytes(count - offset)); + } +} + +void ByteOutputStream::appendStringView(StringView value) { + appendStringView((std::string_view)value); +} + +void ByteOutputStream::appendStringView(std::string_view value) { const int32_t bytes = value.size(); int32_t offset = 0; for (;;) { const int32_t bytesFit = std::min(bytes - offset, 
current_->size - current_->position); - ::memcpy( + simd::memcpy( current_->buffer + current_->position, value.data() + offset, bytesFit); current_->position += bytesFit; offset += bytesFit; @@ -175,7 +261,7 @@ void ByteStream::appendStringPiece(folly::StringPiece value) { } } -std::streampos ByteStream::tellp() const { +std::streampos ByteOutputStream::tellp() const { if (ranges_.empty()) { return 0; } @@ -187,10 +273,10 @@ std::streampos ByteStream::tellp() const { } size += range.size; } - VELOX_FAIL("ByteStream 'current_' is not in 'ranges_'."); + VELOX_FAIL("ByteOutputStream 'current_' is not in 'ranges_'."); } -void ByteStream::seekp(std::streampos position) { +void ByteOutputStream::seekp(std::streampos position) { int64_t toSkip = position; // Record how much was written pre-seek. updateEnd(); @@ -205,10 +291,13 @@ void ByteStream::seekp(std::streampos position) { } toSkip -= range.size; } - VELOX_FAIL("Seeking past end of ByteStream: {}", position); + static_assert(sizeof(std::streamsize) <= sizeof(long long)); + VELOX_FAIL( + "Seeking past end of ByteOutputStream: {}", + static_cast(position)); } -void ByteStream::flush(OutputStream* out) { +void ByteOutputStream::flush(OutputStream* out) { updateEnd(); for (int32_t i = 0; i < ranges_.size(); ++i) { int32_t count = i == ranges_.size() - 1 ? lastRangeEnd_ : ranges_[i].size; @@ -223,17 +312,17 @@ void ByteStream::flush(OutputStream* out) { } } -char* ByteStream::writePosition() { +char* ByteOutputStream::writePosition() { if (ranges_.empty()) { return nullptr; } return reinterpret_cast(current_->buffer) + current_->position; } -void ByteStream::extend(int32_t bytes) { +void ByteOutputStream::extend(int32_t bytes) { if (current_ && current_->position != current_->size) { - LOG(FATAL) << "Extend ByteStream before range full: " << current_->position - << " vs. " << current_->size; + LOG(FATAL) << "Extend ByteOutputStream before range full: " + << current_->position << " vs. " << current_->size; } // Check if rewriting existing content. If so, move to next range and start at @@ -243,10 +332,14 @@ void ByteStream::extend(int32_t bytes) { current_->position = 0; return; } + ranges_.emplace_back(); current_ = &ranges_.back(); lastRangeEnd_ = 0; - arena_->newRange(newRangeSize(bytes), current_); + arena_->newRange( + newRangeSize(bytes), + ranges_.size() == 1 ? nullptr : &ranges_[ranges_.size() - 2], + current_); allocatedBytes_ += current_->size; VELOX_CHECK_GT(allocatedBytes_, 0); if (isBits_) { @@ -255,7 +348,7 @@ void ByteStream::extend(int32_t bytes) { } } -int32_t ByteStream::newRangeSize(int32_t bytes) const { +int32_t ByteOutputStream::newRangeSize(int32_t bytes) const { const int32_t newSize = allocatedBytes_ + bytes; if (newSize < 128) { return 128; @@ -269,10 +362,33 @@ int32_t ByteStream::newRangeSize(int32_t bytes) const { return bits::roundUp(bytes, memory::AllocationTraits::kPageSize); } -std::string ByteStream::toString() const { +void ByteOutputStream::ensureSpace(int32_t bytes) { + const auto available = current_->size - current_->position; + int64_t toExtend = bytes - available; + const auto originalRangeIdx = current_ - ranges_.data(); + const auto originalPosition = current_->position; + while (toExtend > 0) { + current_->position = current_->size; + extend(toExtend); + toExtend -= current_->size; + } + // Restore original position. 
+ current_ = &ranges_[originalRangeIdx]; + current_->position = originalPosition; +} + +std::unique_ptr ByteOutputStream::inputStream() const { + VELOX_CHECK(!ranges_.empty()); + updateEnd(); + auto rangeCopy = ranges_; + rangeCopy.back().size = lastRangeEnd_; + return std::make_unique(std::move(rangeCopy)); +} + +std::string ByteOutputStream::toString() const { std::stringstream oss; - oss << "ByteStream[lastRangeEnd " << lastRangeEnd_ << ", " << ranges_.size() - << " ranges (position/size) ["; + oss << "ByteOutputStream[lastRangeEnd " << lastRangeEnd_ << ", " + << ranges_.size() << " ranges (position/size) ["; for (const auto& range : ranges_) { oss << "(" << range.position << "/" << range.size << (&range == current_ ? " current" : "") << ")"; diff --git a/velox/common/memory/ByteStream.h b/velox/common/memory/ByteStream.h index d47e7c74d8a13..040aa23bccb0e 100644 --- a/velox/common/memory/ByteStream.h +++ b/velox/common/memory/ByteStream.h @@ -15,10 +15,13 @@ */ #pragma once +#include +#include "velox/common/base/Scratch.h" #include "velox/common/memory/StreamArena.h" #include "velox/type/Type.h" #include +#include namespace facebook::velox { @@ -32,6 +35,9 @@ struct ByteRange { /// Index of next byte/bit to be read/written in 'buffer'. int32_t position; + /// Returns the available bytes left in this range. + uint32_t availableBytes() const; + std::string toString() const; }; @@ -88,47 +94,175 @@ class OStreamOutputStream : public OutputStream { std::ostream* out_; }; +/// Read-only byte input stream interface. +class ByteInputStream { + public: + virtual ~ByteInputStream() = default; + + /// Returns total number of bytes available in the stream. + virtual size_t size() const = 0; + + /// Returns true if all input has been read. + virtual bool atEnd() const = 0; + + /// Returns current position (number of bytes from the start) in the stream. + virtual std::streampos tellp() const = 0; + + /// Moves current position to specified one. + virtual void seekp(std::streampos pos) = 0; + + /// Returns the remaining size left from current reading position. + virtual size_t remainingSize() const = 0; + + virtual uint8_t readByte() = 0; + + virtual void readBytes(uint8_t* bytes, int32_t size) = 0; + + template + T read() { + if (current_->position + sizeof(T) <= current_->size) { + current_->position += sizeof(T); + return *reinterpret_cast( + current_->buffer + current_->position - sizeof(T)); + } + // The number straddles two buffers. We read byte by byte and make a + // little-endian uint64_t. The bytes can be cast to any integer or floating + // point type since the wire format has the machine byte order. + static_assert(sizeof(T) <= sizeof(uint64_t)); + uint64_t value = 0; + for (int32_t i = 0; i < sizeof(T); ++i) { + value |= static_cast(readByte()) << (i * 8); + } + return *reinterpret_cast(&value); + } + + template + void readBytes(Char* data, int32_t size) { + readBytes(reinterpret_cast(data), size); + } + + /// Returns a view over the read buffer for up to 'size' next bytes. The size + /// of the value may be less if the current byte range ends within 'size' + /// bytes from the current position. The size will be 0 if at end. + virtual std::string_view nextView(int32_t size) = 0; + + virtual void skip(int32_t size) = 0; + + virtual std::string toString() const = 0; + + protected: + // Points to the current buffered byte range. + ByteRange* current_{nullptr}; + std::vector ranges_; +}; + +/// Read-only input stream backed by a set of buffers. 
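A hypothetical end-to-end use of the split read/write API declared here; the arena and pool variables are assumed to exist. ByteOutputStream writes into arena-backed ranges, and inputStream() vends a BufferInputStream over the same bytes:

```
StreamArena arena(pool); // 'pool' is an existing memory::MemoryPool*
ByteOutputStream out(&arena);
out.startWrite(64);
out.appendOne<int64_t>(42);
out.appendStringView(std::string_view("abc"));

auto in = out.inputStream(); // std::unique_ptr<ByteInputStream>
int64_t n = in->read<int64_t>(); // 42
char buf[3];
in->readBytes(buf, 3); // "abc"
```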
+class BufferInputStream : public ByteInputStream { + public: + explicit BufferInputStream(std::vector ranges) { + VELOX_CHECK(!ranges.empty(), "Empty BufferInputStream"); + ranges_ = std::move(ranges); + current_ = &ranges_[0]; + } + + BufferInputStream(const BufferInputStream&) = delete; + BufferInputStream& operator=(const BufferInputStream& other) = delete; + BufferInputStream(BufferInputStream&& other) noexcept = delete; + BufferInputStream& operator=(BufferInputStream&& other) noexcept = delete; + + size_t size() const override; + + bool atEnd() const override; + + std::streampos tellp() const override; + + void seekp(std::streampos pos) override; + + size_t remainingSize() const override; + + uint8_t readByte() override; + + void readBytes(uint8_t* bytes, int32_t size) override; + + std::string_view nextView(int32_t size) override; + + void skip(int32_t size) override; + + std::string toString() const override; + + private: + // Sets 'current_' to the next range of input. The input is consecutive + // ByteRanges in 'ranges_' for the base class but any view over external + // buffers can be made by specialization. + void nextRange(); + + const std::vector& ranges() const { + return ranges_; + } +}; + +template <> +inline Timestamp ByteInputStream::read() { + Timestamp value; + readBytes(reinterpret_cast(&value), sizeof(value)); + return value; +} + +template <> +inline int128_t ByteInputStream::read() { + int128_t value; + readBytes(reinterpret_cast(&value), sizeof(value)); + return value; +} + /// Stream over a chain of ByteRanges. Provides read, write and /// comparison for equality between stream contents and memory. Used /// for streams in repartitioning or for complex variable length data /// in hash tables. The stream is seekable and supports overwriting of /// previous content, for example, writing a message body and then /// seeking back to start to write a length header. -class ByteStream { +class ByteOutputStream { public: - /// For input. - ByteStream() : isBits_(false), isReverseBitOrder_(false) {} - virtual ~ByteStream() = default; - /// For output. - ByteStream( + ByteOutputStream( StreamArena* arena, bool isBits = false, bool isReverseBitOrder = false) : arena_(arena), isBits_(isBits), isReverseBitOrder_(isReverseBitOrder) {} - ByteStream(const ByteStream& other) = delete; + ByteOutputStream(const ByteOutputStream& other) = delete; - void operator=(const ByteStream& other) = delete; + void operator=(const ByteOutputStream& other) = delete; - void resetInput(std::vector&& ranges) { - ranges_ = std::move(ranges); - current_ = &ranges_[0]; - lastRangeEnd_ = ranges_.back().size; - } + // Forcing a move constructor to be able to return ByteOutputStream objects + // from a function. + ByteOutputStream(ByteOutputStream&&) = default; - void setRange(ByteRange range) { + /// Sets 'this' to range over 'range'. If this is for purposes of writing, + /// lastWrittenPosition specifies the end of any pre-existing content in + /// 'range'. + void setRange(ByteRange range, int32_t lastWrittenPosition) { ranges_.resize(1); ranges_[0] = range; current_ = ranges_.data(); - lastRangeEnd_ = ranges_[0].size; + VELOX_CHECK_GE(ranges_.back().size, lastWrittenPosition); + lastRangeEnd_ = lastWrittenPosition; } const std::vector& ranges() const { return ranges_; } + /// Prepares 'this' for writing. Can be called several times, + /// e.g. PrestoSerializer resets these. The memory formerly backing + /// 'ranges_' is not owned and the caller needs to recycle or free + /// this independently. 
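The length-header pattern that the class comment above alludes to, as a hedged snippet ('out' is a ByteOutputStream that has already called startWrite):

```
auto headerPos = out.tellp();
out.appendOne<int32_t>(0); // placeholder for the length
auto bodyStart = out.tellp();
// ... append the message body ...
auto bodyEnd = out.tellp();
out.seekp(headerPos);
out.appendOne<int32_t>(static_cast<int32_t>(bodyEnd - bodyStart));
out.seekp(bodyEnd); // resume appending at the end
```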
void startWrite(int32_t initialSize) { + ranges_.clear(); + isReversed_ = false; + allocatedBytes_ = 0; + current_ = nullptr; + lastRangeEnd_ = 0; extend(initialSize); } @@ -141,73 +275,27 @@ class ByteStream { void seekp(std::streampos position); - /// Returns the size written into ranges_. This is the sum of the - /// capacities of non-last ranges + the greatest write position of - /// the last range. + /// Returns the size written into ranges_. This is the sum of the capacities + /// of non-last ranges + the greatest write position of the last range. size_t size() const; - /// Returns the remaining size left from current reading position. - size_t remainingSize() const; - - /// For input. Returns true if all input has been read. - bool atEnd() const; - - int32_t lastRangeEnd() { + int32_t lastRangeEnd() const { updateEnd(); return lastRangeEnd_; } - /// Sets 'current_' to point to the next range of input. // The - /// input is consecutive ByteRanges in 'ranges_' for the base class - /// but any view over external buffers can be made by specialization. - virtual void next(bool throwIfPastEnd = true); - - uint8_t readByte(); - - void readBytes(uint8_t* bytes, int32_t size); - - template - T read() { - if (current_->position + sizeof(T) <= current_->size) { - current_->position += sizeof(T); - return *reinterpret_cast( - current_->buffer + current_->position - sizeof(T)); - } - // The number straddles two buffers. We read byte by byte and make - // a little-endian uint64_t. The bytes can be cast to any integer - // or floating point type since the wire format has the machine byte order. - static_assert(sizeof(T) <= sizeof(uint64_t)); - uint64_t value = 0; - for (int32_t i = 0; i < sizeof(T); ++i) { - value |= static_cast(readByte()) << (i * 8); - } - return *reinterpret_cast(&value); - } - - template - void readBytes(Char* data, int32_t size) { - readBytes(reinterpret_cast(data), size); - } - - /// Returns a view over the read buffer for up to 'size' next - /// bytes. The size of the value may be less if the current byte - /// range ends within 'size' bytes from the current position. The - /// size will be 0 if at end. - std::string_view nextView(int32_t size); - - void skip(int32_t size); - template void append(folly::Range values) { if (current_->position + sizeof(T) * values.size() > current_->size) { - appendStringPiece(folly::StringPiece( + appendStringView(std::string_view( reinterpret_cast(&values[0]), values.size() * sizeof(T))); return; } - auto target = reinterpret_cast(current_->buffer + current_->position); - auto end = target + values.size(); - auto valuePtr = &values[0]; + + auto* target = reinterpret_cast(current_->buffer + current_->position); + const auto* end = target + values.size(); + auto* valuePtr = &values[0]; while (target != end) { *target = *valuePtr; ++target; @@ -218,7 +306,33 @@ class ByteStream { void appendBool(bool value, int32_t count); - void appendStringPiece(folly::StringPiece value); + // A fast path for appending bits into pre-cleared buffers after first extend. + inline void + appendBitsFresh(const uint64_t* bits, int32_t begin, int32_t end) { + const auto position = current_->position; + if (begin == 0 && end <= 56) { + const auto available = current_->size - position; + // There must be 8 bytes writable. If available is 56, there are 7, so >. 
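A worked instance of the bounds check that follows, using position = 13:

```
// 13 >> 3 == 1 (byte index), 13 & 7 == 5 (bit offset within that byte).
// With end <= 56 and offset <= 7, bits[0] << offset spans at most 63 bits,
// so the single uint64_t loaded at 'buffer' suffices; lowMask(offset)
// preserves the 5 bits already written in that word.
```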
+ if (available > 56) { + const auto offset = position & 7; + uint64_t* buffer = + reinterpret_cast<uint64_t*>(current_->buffer + (position >> 3)); + const auto mask = bits::lowMask(offset); + *buffer = (*buffer & mask) | (bits[0] << offset); + current_->position += end; + return; + } + } + appendBits(bits, begin, end); + } + + // Writes 'bits' from bit positions begin..end to the current position of + // 'this'. Extends 'this' if writing past end. + void appendBits(const uint64_t* bits, int32_t begin, int32_t end); + + void appendStringView(StringView value); + + void appendStringView(std::string_view value); template <typename T> void appendOne(const T& value) { @@ -236,14 +350,45 @@ class ByteStream { return allocatedBytes_; } + /// Returns a ByteInputStream to range over the current content of 'this'. The + /// result is valid as long as 'this' is live and not changed. + std::unique_ptr<ByteInputStream> inputStream() const; + std::string toString() const; private: + // Returns a range of 'size' items of T. If there is no contiguous space in + // 'this', uses 'scratch' to make a temp block that is appended to 'this' in + // destruction of the AppendWindow. + template <typename T> + T* getAppendWindow(int32_t size, ScratchPtr<T>& scratchPtr) { + const int32_t bytes = sizeof(T) * size; + if (!current_) { + extend(bytes); + } + auto available = current_->size - current_->position; + if (available >= bytes) { + current_->position += bytes; + return reinterpret_cast<T*>( + current_->buffer + current_->position - bytes); + } + // If the tail is not large enough, make temp of the right size + // in scratch. Extend the stream so that there is guaranteed space to copy + // the scratch to the stream. This copy takes place in destruction of + // AppendWindow and must not allocate so that it is noexcept. + ensureSpace(bytes); + return scratchPtr.get(size); + } + void extend(int32_t bytes); + // Calls extend() enough times to make sure 'bytes' bytes can be + // appended without new allocation. Does not change the append + // position. + void ensureSpace(int32_t bytes); + int32_t newRangeSize(int32_t bytes) const; - void updateEnd() { + void updateEnd() const { if (!ranges_.empty() && current_ == &ranges_.back() && current_->position > lastRangeEnd_) { lastRangeEnd_ = current_->position; @@ -272,22 +417,43 @@ class ByteStream { // of 'ranges_'. In a write situation, all non-last ranges are full // and the last may be partly full. The position in the last range // is not necessarily the end if there has been a seek. - int32_t lastRangeEnd_{0}; + mutable int32_t lastRangeEnd_{0}; + + template <typename T> + friend class AppendWindow; }; -template <> -inline Timestamp ByteStream::read<Timestamp>() { - Timestamp value; - readBytes(reinterpret_cast<uint8_t*>(&value), sizeof(value)); - return value; -} +/// A scoped wrapper that provides 'size' T's of writable space in 'stream'. +/// Normally gives an address into 'stream's buffer but can use 'scratch' to +/// make a contiguous piece if stream does not have a suitable run. +template <typename T> +class AppendWindow { + public: + AppendWindow(ByteOutputStream& stream, Scratch& scratch) + : stream_(stream), scratchPtr_(scratch) {} + + ~AppendWindow() noexcept { + if (scratchPtr_.size()) { + try { + stream_.appendStringView(std::string_view( + reinterpret_cast<const char*>(scratchPtr_.get()), + scratchPtr_.size() * sizeof(T))); + } catch (const std::exception& e) { + // This is impossible because construction ensures there is space for + // the bytes in the stream.
+ LOG(FATAL) << "throw from AppendWindo append: " << e.what(); + } + } + } -template <> -inline int128_t ByteStream::read() { - int128_t value; - readBytes(reinterpret_cast(&value), sizeof(value)); - return value; -} + T* get(int32_t size) { + return stream_.getAppendWindow(size, scratchPtr_); + } + + private: + ByteOutputStream& stream_; + ScratchPtr scratchPtr_; +}; class IOBufOutputStream : public OutputStream { public: @@ -297,12 +463,12 @@ class IOBufOutputStream : public OutputStream { int32_t initialSize = memory::AllocationTraits::kPageSize) : OutputStream(listener), arena_(std::make_shared(&pool)), - out_(std::make_unique(arena_.get())) { + out_(std::make_unique(arena_.get())) { out_->startWrite(initialSize); } void write(const char* s, std::streamsize count) override { - out_->appendStringPiece(folly::StringPiece(s, count)); + out_->appendStringView(std::string_view(s, count)); if (listener_) { listener_->onWrite(s, count); } @@ -318,7 +484,7 @@ class IOBufOutputStream : public OutputStream { private: std::shared_ptr arena_; - std::unique_ptr out_; + std::unique_ptr out_; }; } // namespace facebook::velox diff --git a/velox/common/memory/CMakeLists.txt b/velox/common/memory/CMakeLists.txt index dc0e618d93f16..2bb6e2d2d1542 100644 --- a/velox/common/memory/CMakeLists.txt +++ b/velox/common/memory/CMakeLists.txt @@ -15,7 +15,7 @@ if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) endif() -add_library( +velox_add_library( velox_memory Allocation.cpp AllocationPool.cpp @@ -31,9 +31,10 @@ add_library( SharedArbitrator.cpp StreamArena.cpp) -target_link_libraries( +velox_link_libraries( velox_memory PUBLIC velox_common_base + velox_common_config velox_exception velox_flag_definitions velox_time diff --git a/velox/common/memory/CompactDoubleList.h b/velox/common/memory/CompactDoubleList.h index 0ed4b5f36f4f5..d048d38e44e0a 100644 --- a/velox/common/memory/CompactDoubleList.h +++ b/velox/common/memory/CompactDoubleList.h @@ -21,9 +21,11 @@ namespace facebook::velox { -// Circular double linked list with 6 byte pointers. Used for free -// list in HashStringAllocator so that we get a minimum allocation -// payload size of 16 bytes. (12 bits for the links, 4 for trailer). +/// Circular double linked list with 6 byte pointers. Used for free list in +/// HashStringAllocator so that we get a minimum allocation payload size of 16 +/// bytes. (12 bits for the links, 4 for trailer). +/// +/// NOTE: this class is not thread-safe. class CompactDoubleList { public: CompactDoubleList() { @@ -36,12 +38,12 @@ class CompactDoubleList { void operator=(const CompactDoubleList& other) = delete; void operator=(CompactDoubleList&& other) = delete; - // Return true if 'this' is the only element. + /// Returns true if 'this' is the only element. bool empty() const { return next() == this; } - // inserts 'entry' after 'this' + /// Inserts 'entry' after 'this' void insert(CompactDoubleList* entry) { entry->setNext(next()); entry->setPrevious(this); @@ -49,7 +51,7 @@ class CompactDoubleList { setNext(entry); } - // Unlinks 'this' from its list. Throws if 'this' is the only element. + /// Unlinks 'this' from its list. Throws if 'this' is the only element. void remove() { VELOX_CHECK(!empty()); previous()->setNext(next()); @@ -64,6 +66,17 @@ class CompactDoubleList { return loadPointer(previousLow_, previousHigh_); } + /// Updates links after the next() of 'this' has been moved to 'newNext'. 
Sets + /// the next link of this, the previous link of 'newNext' and the previous + /// link of the next() of the moved 'newNext'. The use case is taking the + /// head of a free list block without a full remove of block plus reinsert of + /// the remainder of the block. + void nextMoved(CompactDoubleList* newNext) { + setNext(newNext); + VELOX_CHECK(newNext->previous() == this); + newNext->next()->setPrevious(newNext); + } + private: static constexpr uint8_t kPointerSignificantBits = 48; @@ -81,11 +94,11 @@ class CompactDoubleList { } void storePointer(CompactDoubleList* pointer, uint32_t& low, uint16_t& high) { - DCHECK_EQ( + VELOX_DCHECK_EQ( reinterpret_cast<uint64_t>(pointer) & ~bits::lowMask(kPointerSignificantBits), 0); - uint64_t data = reinterpret_cast<uint64_t>(pointer); + const uint64_t data = reinterpret_cast<uint64_t>(pointer); low = static_cast<uint32_t>(data); high = static_cast<uint16_t>(data >> 32); } @@ -97,6 +110,4 @@ class CompactDoubleList { uint16_t previousHigh_; }; -; } // namespace facebook::velox diff --git a/velox/common/memory/HashStringAllocator.cpp b/velox/common/memory/HashStringAllocator.cpp index e2159af6f67e3..8cb6781a16e3c 100644 --- a/velox/common/memory/HashStringAllocator.cpp +++ b/velox/common/memory/HashStringAllocator.cpp @@ -20,22 +20,21 @@ namespace facebook::velox { namespace { -/// Returns the size of the previous free block. The size is stored in the -/// last 4 bytes of the free block, e.g. 4 bytes just before the current -/// header. +// Returns the size of the previous free block. The size is stored in the last 4 +// bytes of the free block, i.e. 4 bytes just before the current header. uint32_t* previousFreeSize(HashStringAllocator::Header* header) { return reinterpret_cast<uint32_t*>(header) - 1; } -/// Returns the header of the previous free block or nullptr if previous block -/// is not free. -HashStringAllocator::Header* FOLLY_NULLABLE -getPreviousFree(HashStringAllocator::Header* FOLLY_NONNULL header) { +// Returns the header of the previous free block or nullptr if previous block is +// not free. +HashStringAllocator::Header* getPreviousFree( + HashStringAllocator::Header* header) { if (!header->isPreviousFree()) { return nullptr; } - auto numBytes = *previousFreeSize(header); - auto previous = reinterpret_cast<HashStringAllocator::Header*>( + const auto numBytes = *previousFreeSize(header); + auto* previous = reinterpret_cast<HashStringAllocator::Header*>( header->begin() - numBytes - 2 * sizeof(HashStringAllocator::Header)); VELOX_CHECK_EQ(previous->size(), numBytes); VELOX_CHECK(previous->isFree()); @@ -43,13 +42,12 @@ getPreviousFree(HashStringAllocator::Header* FOLLY_NONNULL header) { return previous; } -/// Sets kFree flag in the 'header' and writes the size of the block to the -/// last 4 bytes of the block. Sets kPreviousFree flag in the next block's -/// 'header'. +// Sets kFree flag in the 'header' and writes the size of the block to the last +// 4 bytes of the block. Sets kPreviousFree flag in the next block's 'header'.
+void markAsFree(HashStringAllocator::Header* header) { header->setFree(); - auto nextHeader = header->next(); - if (nextHeader) { + auto* nextHeader = header->next(); + if (nextHeader != nullptr) { nextHeader->setPreviousFree(); *previousFreeSize(nextHeader) = header->size(); } @@ -88,76 +86,135 @@ HashStringAllocator::~HashStringAllocator() { } void HashStringAllocator::clear() { - numFree_ = 0; - freeBytes_ = 0; - freeNonEmpty_ = 0; - for (auto& pair : allocationsFromPool_) { - pool()->free(pair.first, pair.second); - } - allocationsFromPool_.clear(); + state_.numFree() = 0; + state_.freeBytes() = 0; + std::fill( + std::begin(state_.freeNonEmpty()), std::end(state_.freeNonEmpty()), 0); + for (auto& pair : state_.allocationsFromPool()) { + const auto size = pair.second; + pool()->free(pair.first, size); + state_.sizeFromPool() -= size; + state_.currentBytes() -= size; + } + state_.allocationsFromPool().clear(); for (auto i = 0; i < kNumFreeLists; ++i) { - new (&free_[i]) CompactDoubleList(); + new (&state_.freeLists()[i]) CompactDoubleList(); + } + +#ifndef NDEBUG + static const auto kHugePageSize = memory::AllocationTraits::kHugePageSize; + for (auto i = 0; i < state_.pool().numRanges(); ++i) { + const auto range = state_.pool().rangeAt(i); + const auto rangeSize = range.size(); + if (rangeSize >= kHugePageSize) { + VELOX_CHECK_EQ(0, rangeSize % kHugePageSize); + } + + for (int64_t blockOffset = 0; blockOffset < rangeSize; + blockOffset += kHugePageSize) { + auto blockRange = folly::Range<char*>( + range.data() + blockOffset, + std::min<int64_t>(rangeSize, kHugePageSize)); + const auto size = blockRange.size() - simd::kPadding; + auto* end = castToHeader(blockRange.data() + size); + auto* header = castToHeader(blockRange.data()); + while (header != end) { + VELOX_CHECK_GE(reinterpret_cast<char*>(header), blockRange.data()); + VELOX_CHECK_LT( + reinterpret_cast<char*>(header), reinterpret_cast<char*>(end)); + VELOX_CHECK_LE( + reinterpret_cast<char*>(header->end()), + reinterpret_cast<char*>(end)); + + // Continued block & Non-free block.
+ if (!header->isFree()) { + state_.currentBytes() -= blockBytes(header); + } + header = castToHeader(header->end()); + } + } } - pool_.clear(); + + VELOX_DCHECK_EQ(state_.currentBytes(), 0); + VELOX_DCHECK_EQ(state_.sizeFromPool(), 0); +#endif + state_.pool().clear(); + + state_.currentBytes() = 0; + state_.sizeFromPool() = 0; } void* HashStringAllocator::allocateFromPool(size_t size) { - auto ptr = pool()->allocate(size); - cumulativeBytes_ += size; - allocationsFromPool_[ptr] = size; - sizeFromPool_ += size; + auto* ptr = pool()->allocate(size); + state_.currentBytes() += size; + state_.allocationsFromPool()[ptr] = size; + state_.sizeFromPool() += size; return ptr; } void HashStringAllocator::freeToPool(void* ptr, size_t size) { - auto it = allocationsFromPool_.find(ptr); + auto it = state_.allocationsFromPool().find(ptr); VELOX_CHECK( - it != allocationsFromPool_.end(), + it != state_.allocationsFromPool().end(), "freeToPool for block not allocated from pool of HashStringAllocator"); VELOX_CHECK_EQ( size, it->second, "Bad size in HashStringAllocator::freeToPool()"); - allocationsFromPool_.erase(it); - sizeFromPool_ -= size; - cumulativeBytes_ -= size; + state_.allocationsFromPool().erase(it); + state_.sizeFromPool() -= size; + state_.currentBytes() -= size; pool()->free(ptr, size); } // static -void HashStringAllocator::prepareRead(const Header* begin, ByteStream& stream) { +std::unique_ptr<ByteInputStream> HashStringAllocator::prepareRead( + const Header* begin, + size_t maxBytes) { std::vector<ByteRange> ranges; - auto header = const_cast<Header*>(begin); + auto* header = const_cast<Header*>(begin); + + size_t totalBytes{0}; for (;;) { ranges.push_back(ByteRange{ reinterpret_cast<uint8_t*>(header->begin()), header->usableSize(), 0}); + totalBytes += ranges.back().size; if (!header->isContinued()) { break; } + + if (totalBytes >= maxBytes) { + break; + } + header = header->nextContinued(); } - stream.resetInput(std::move(ranges)); + return std::make_unique<BufferInputStream>(std::move(ranges)); } HashStringAllocator::Position HashStringAllocator::newWrite( - ByteStream& stream, + ByteOutputStream& stream, int32_t preferredSize) { - VELOX_CHECK( - !currentHeader_, + VELOX_CHECK_NULL( + state_.currentHeader(), "Do not call newWrite before finishing the previous write to " "HashStringAllocator"); - currentHeader_ = allocate(preferredSize, false); + state_.currentHeader() = allocate(preferredSize, false); - stream.setRange(ByteRange{ - reinterpret_cast<uint8_t*>(currentHeader_->begin()), - currentHeader_->size(), - 0}); + stream.setRange( + ByteRange{ + reinterpret_cast<uint8_t*>(state_.currentHeader()->begin()), + state_.currentHeader()->size(), + 0}, + 0); - startPosition_ = Position::atOffset(currentHeader_, 0); + state_.startPosition() = Position::atOffset(state_.currentHeader(), 0); - return startPosition_; + return state_.startPosition(); } -void HashStringAllocator::extendWrite(Position position, ByteStream& stream) { - auto header = position.header; +void HashStringAllocator::extendWrite( + Position position, + ByteOutputStream& stream) { + auto* header = position.header; const auto offset = position.offset(); VELOX_CHECK_GE( offset, 0, "Starting extendWrite outside of the current range"); @@ -171,110 +228,127 @@ void HashStringAllocator::extendWrite(Position position, ByteStream& stream) { header->clearContinued(); } - stream.setRange(ByteRange{ - reinterpret_cast<uint8_t*>(position.position), - static_cast<int32_t>(header->end() - position.position), - 0}); - currentHeader_ = header; - startPosition_ = position; + stream.setRange( + ByteRange{ + reinterpret_cast<uint8_t*>(position.header->begin()), +
position.header->size(), + static_cast<int32_t>(position.position - position.header->begin())}, + 0); + state_.currentHeader() = header; + state_.startPosition() = position; } std::pair<HashStringAllocator::Position, HashStringAllocator::Position> -HashStringAllocator::finishWrite(ByteStream& stream, int32_t numReserveBytes) { - VELOX_CHECK( - currentHeader_, "Must call newWrite or extendWrite before finishWrite"); - auto writePosition = stream.writePosition(); - const auto offset = writePosition - currentHeader_->begin(); +HashStringAllocator::finishWrite( + ByteOutputStream& stream, + int32_t numReserveBytes) { + VELOX_CHECK_NOT_NULL( + state_.currentHeader(), + "Must call newWrite or extendWrite before finishWrite"); + auto* writePosition = stream.writePosition(); + const auto offset = writePosition - state_.currentHeader()->begin(); VELOX_CHECK_GE( offset, 0, "finishWrite called with writePosition out of range"); VELOX_CHECK_LE( offset, - currentHeader_->usableSize(), + state_.currentHeader()->usableSize(), "finishWrite called with writePosition out of range"); - Position currentPosition = Position::atOffset(currentHeader_, offset); - if (currentHeader_->isContinued()) { - free(currentHeader_->nextContinued()); - currentHeader_->clearContinued(); + const Position currentPosition = + Position::atOffset(state_.currentHeader(), offset); + if (state_.currentHeader()->isContinued()) { + free(state_.currentHeader()->nextContinued()); + state_.currentHeader()->clearContinued(); } // Free remainder of block if there is a lot left over. freeRestOfBlock( - currentHeader_, - writePosition - currentHeader_->begin() + numReserveBytes); - currentHeader_ = nullptr; + state_.currentHeader(), + writePosition - state_.currentHeader()->begin() + numReserveBytes); + state_.currentHeader() = nullptr; // The starting position may have shifted if it was at the end of the block // and the block was extended. Calculate the new position. - if (startPosition_.header->isContinued()) { - auto header = startPosition_.header; - const auto offset = startPosition_.offset(); + if (state_.startPosition().header->isContinued()) { + auto* header = state_.startPosition().header; + const auto offset = state_.startPosition().offset(); const auto extra = offset - header->usableSize(); if (extra > 0) { - auto newHeader = header->nextContinued(); - auto newPosition = newHeader->begin() + extra; - startPosition_ = {newHeader, newPosition}; + auto* newHeader = header->nextContinued(); + auto* newPosition = newHeader->begin() + extra; + state_.startPosition() = {newHeader, newPosition}; } } - return {startPosition_, currentPosition}; + return {state_.startPosition(), currentPosition}; } void HashStringAllocator::newSlab() { - constexpr int32_t kSimdPadding = simd::kPadding - sizeof(Header); + constexpr int32_t kSimdPadding = simd::kPadding - kHeaderSize; const int64_t needed = + state_.pool().allocatedBytes() >= state_.pool().hugePageThreshold() ? memory::AllocationTraits::kHugePageSize : kUnitSize; - auto run = pool_.allocateFixed(needed); - // We check we got exactly the requested amount. checkConsistency() - // depends on slabs made here coinciding with ranges from - // AllocationPool::rangeAt(). Sometimes the last range can be - // several huge pages for severl huge page sized arenas but - // checkConsistency() can interpret that.
- VELOX_CHECK_EQ(0, pool_.freeBytes()); - auto available = needed - sizeof(Header) - kSimdPadding; - + auto* run = state_.pool().allocateFixed(needed); VELOX_CHECK_NOT_NULL(run); + // We check we got exactly the requested amount. checkConsistency() depends on + // slabs made here coinciding with ranges from AllocationPool::rangeAt(). + // Sometimes the last range can be several huge pages for several huge page + // sized arenas but checkConsistency() can interpret that. + VELOX_CHECK_EQ(state_.pool().freeBytes(), 0); + const auto available = needed - kHeaderSize - kSimdPadding; VELOX_CHECK_GT(available, 0); - // Write end marker. + + // Write end marker. *reinterpret_cast<uint32_t*>(run + available) = Header::kArenaEnd; - cumulativeBytes_ += available; + state_.currentBytes() += available; - // Add the new memory to the free list: Placement construct a header - // that covers the space from start to the end marker and add this - // to free list. - free(new (run) Header(available - sizeof(Header))); + // Add the new memory to the free list: Placement construct a header that + // covers the space from start to the end marker and add this to free list. + free(new (run) Header(available - kHeaderSize)); } void HashStringAllocator::newRange( int32_t bytes, + ByteRange* lastRange, ByteRange* range, bool contiguous) { - // Allocates at least kMinContiguous or to the end of the current - // run. At the end of the write the unused space will be made - // free. - VELOX_CHECK( - currentHeader_, + // Allocates at least kMinContiguous or to the end of the current run. At the + // end of the write, the unused space will be made free. + VELOX_CHECK_NOT_NULL( + state_.currentHeader(), "Must have called newWrite or extendWrite before newRange"); - auto newHeader = allocate(bytes, contiguous); + auto* newHeader = allocate(bytes, contiguous); - auto lastWordPtr = reinterpret_cast<void**>( - currentHeader_->end() - Header::kContinuedPtrSize); + // Copy the last word of the current range to the head of new range, and then + // use the space to store the new range pointer. + auto** lastWordPtr = reinterpret_cast<void**>( + state_.currentHeader()->end() - Header::kContinuedPtrSize); *reinterpret_cast<void**>(newHeader->begin()) = *lastWordPtr; *lastWordPtr = newHeader; - currentHeader_->setContinued(); - currentHeader_ = newHeader; + state_.currentHeader()->setContinued(); + state_.currentHeader() = newHeader; + if (lastRange) { + // The last bytes of the last range are no longer payload. So do not count + // them in size and do not overwrite them if overwriting the multi-range + // entry. Set position at the new end.
+ lastRange->size -= Header::kContinuedPtrSize; + lastRange->position = std::min(lastRange->size, lastRange->position); + } *range = ByteRange{ - reinterpret_cast<uint8_t*>(currentHeader_->begin()), - currentHeader_->size(), + reinterpret_cast<uint8_t*>(state_.currentHeader()->begin()), + state_.currentHeader()->size(), Header::kContinuedPtrSize}; } -void HashStringAllocator::newRange(int32_t bytes, ByteRange* range) { - newRange(bytes, range, false); +void HashStringAllocator::newRange( + int32_t bytes, + ByteRange* lastRange, + ByteRange* range) { + newRange(bytes, lastRange, range, false); } void HashStringAllocator::newContiguousRange(int32_t bytes, ByteRange* range) { - newRange(bytes, range, true); + newRange(bytes, nullptr, range, true); } // static @@ -284,241 +358,164 @@ StringView HashStringAllocator::contiguousString( if (view.isInline()) { return view; } - auto header = headerOf(view.data()); + auto* header = headerOf(view.data()); if (view.size() <= header->size()) { return view; } - ByteStream stream; - prepareRead(headerOf(view.data()), stream); + auto stream = prepareRead(headerOf(view.data())); storage.resize(view.size()); - stream.readBytes(storage.data(), view.size()); + stream->readBytes(storage.data(), view.size()); return StringView(storage); } void HashStringAllocator::freeRestOfBlock(Header* header, int32_t keepBytes) { keepBytes = std::max(keepBytes, kMinAlloc); - int32_t freeSize = header->size() - keepBytes - sizeof(Header); + const int32_t freeSize = header->size() - keepBytes - kHeaderSize; if (freeSize <= kMinAlloc) { return; } header->setSize(keepBytes); - auto newHeader = new (header->end()) Header(freeSize); + auto* newHeader = new (header->end()) Header(freeSize); free(newHeader); } -// Free list sizes align with size of containers. + 20 allows for padding for an -// alignment of 16 bytes.
-int32_t HashStringAllocator::freeListSizes_[kNumFreeLists + 7] = { - 30, - 50, - 72, - 8 * 16 + 20, - 100, - 16 * 16 + 20, - 32 * 16 + 20, - 64 * 16 + 20, - 128 * 16 + 20, - std::numeric_limits<int32_t>::max(), - std::numeric_limits<int32_t>::max(), - std::numeric_limits<int32_t>::max(), - std::numeric_limits<int32_t>::max(), - std::numeric_limits<int32_t>::max(), - std::numeric_limits<int32_t>::max(), - std::numeric_limits<int32_t>::max(), -}; - -// static -folly::Range<const int32_t*> HashStringAllocator::freeListSizeClasses() { - return folly::Range<const int32_t*>(&freeListSizes_[0], kNumFreeLists); +int32_t HashStringAllocator::freeListIndex(int size) { + return std::min(size - kMinAlloc, kNumFreeLists - 1); } -int32_t HashStringAllocator::freeListIndex(int32_t size, uint32_t mask) { - static_assert(sizeof(freeListSizes_) >= sizeof(xsimd::batch<int32_t>)); - auto vsize = xsimd::broadcast(size); - if constexpr (sizeof(freeListSizes_) == sizeof(xsimd::batch<int32_t>)) { - auto sizes = xsimd::load_unaligned(freeListSizes_); - auto bits = simd::toBitMask(vsize < sizes) & mask; - return count_trailing_zeros(bits); - } else { - for (int offset = 0; offset <= kNumFreeLists; offset += vsize.size) { - auto sizes = xsimd::load_unaligned(freeListSizes_ + offset); - auto bits = simd::toBitMask(vsize < sizes) & mask; - if (bits) { - return offset + count_trailing_zeros(bits); - } - mask >>= vsize.size; - } - return count_trailing_zeros(0); +void HashStringAllocator::removeFromFreeList(Header* header) { + VELOX_CHECK(header->isFree()); + header->clearFree(); + const auto index = freeListIndex(header->size()); + reinterpret_cast<CompactDoubleList*>(header->begin())->remove(); + if (state_.freeLists()[index].empty()) { + bits::clearBit(state_.freeNonEmpty(), index); } } -HashStringAllocator::Header* FOLLY_NULLABLE -HashStringAllocator::allocate(int32_t size, bool exactSize) { +HashStringAllocator::Header* HashStringAllocator::allocate( + int32_t size, + bool exactSize) { if (size > kMaxAlloc && exactSize) { - VELOX_CHECK(size <= Header::kSizeMask); - auto header = - reinterpret_cast<Header*>(allocateFromPool(size + sizeof(Header))); + VELOX_CHECK_LE(size, Header::kSizeMask); + auto* header = castToHeader(allocateFromPool(size + kHeaderSize)); new (header) Header(size); return header; } - auto header = allocateFromFreeLists(size, exactSize, exactSize); - if (!header) { + + auto* header = allocateFromFreeLists(size, exactSize, exactSize); + if (header == nullptr) { newSlab(); header = allocateFromFreeLists(size, exactSize, exactSize); - VELOX_CHECK(header != nullptr); + VELOX_CHECK_NOT_NULL(header); VELOX_CHECK_GT(header->size(), 0); } - return header; + return header; } -HashStringAllocator::Header* FOLLY_NULLABLE -HashStringAllocator::allocateFromFreeLists( +HashStringAllocator::Header* HashStringAllocator::allocateFromFreeLists( int32_t preferredSize, bool mustHaveSize, bool isFinalSize) { - preferredSize = std::max(kMinAlloc, preferredSize); - if (!numFree_) { + if (state_.numFree() == 0) { return nullptr; } - auto index = freeListIndex(preferredSize, freeNonEmpty_); - while (index < kNumFreeLists) { - if (auto header = allocateFromFreeList( - preferredSize, mustHaveSize, isFinalSize, index)) { - return header; - } - // Go to the next larger size non-empty free list.
- index = count_trailing_zeros(freeNonEmpty_ & ~bits::lowMask(index + 1)); + preferredSize = std::max(kMinAlloc, preferredSize); + const auto index = freeListIndex(preferredSize); + auto available = + bits::findFirstBit(state_.freeNonEmpty(), index, kNumFreeLists); + if (!mustHaveSize && available == -1) { + available = bits::findLastBit(state_.freeNonEmpty(), 0, index); } - if (mustHaveSize) { + if (available == -1) { return nullptr; } - index = freeListIndex(preferredSize) - 1; - for (; index >= 0; --index) { - if (auto header = - allocateFromFreeList(preferredSize, false, isFinalSize, index)) { - return header; - } - } - return nullptr; + auto* header = + allocateFromFreeList(preferredSize, mustHaveSize, isFinalSize, available); + VELOX_CHECK_NOT_NULL(header); + return header; } -HashStringAllocator::Header* FOLLY_NULLABLE -HashStringAllocator::allocateFromFreeList( +HashStringAllocator::Header* HashStringAllocator::allocateFromFreeList( int32_t preferredSize, bool mustHaveSize, bool isFinalSize, int32_t freeListIndex) { - constexpr int32_t kMaxCheckedForFit = 5; - int32_t counter = 0; - if (mustHaveSize && largestInFreeList_[freeListIndex] < preferredSize) { + auto* item = state_.freeLists()[freeListIndex].next(); + if (item == &state_.freeLists()[freeListIndex]) { return nullptr; } - int32_t largestFreeSize = 0; - Header* largest = nullptr; - Header* found = nullptr; - for (auto* item = free_[freeListIndex].next(); item != &free_[freeListIndex]; - item = item->next()) { - auto header = headerOf(item); - VELOX_CHECK(header->isFree()); - auto size = header->size(); - if (size >= preferredSize) { - found = header; - break; - } - if (!largest || size > largest->size()) { - largest = header; - } - ++counter; - if (!mustHaveSize && counter > kMaxCheckedForFit) { - break; - } - } - numFreeListNoFit_ += counter; - if (!mustHaveSize && !found) { - found = largest; - } - if (!found) { - // We have traversed the complete free list and therefore know the largest - // size. Either the list is empty or mustHaveSize is true and there is no - // block large enough. - largestInFreeList_[freeListIndex] = largest ? largest->size() : 0; - return nullptr; - } - --numFree_; - freeBytes_ -= found->size() + sizeof(Header); + auto* found = headerOf(item); + VELOX_CHECK( + found->isFree() && (!mustHaveSize || found->size() >= preferredSize)); + --state_.numFree(); + state_.freeBytes() -= blockBytes(found); removeFromFreeList(found); - - auto next = found->next(); - if (next) { + auto* next = found->next(); + if (next != nullptr) { next->clearPreviousFree(); } - cumulativeBytes_ += found->size(); + state_.currentBytes() += blockBytes(found); if (isFinalSize) { freeRestOfBlock(found, preferredSize); } return found; } -void HashStringAllocator::free(Header* _header) { - Header* header = _header; - if (header->size() > kMaxAlloc && !pool_.isInCurrentRange(header) && - allocationsFromPool_.find(header) != allocationsFromPool_.end()) { - // A large free can either be a rest of block or a standalone allocation. 
- VELOX_CHECK(!header->isContinued()); - freeToPool(header, header->size() + sizeof(Header)); - return; - } - +void HashStringAllocator::free(Header* header) { + Header* headerToFree = header; do { Header* continued = nullptr; - if (header->isContinued()) { - continued = header->nextContinued(); - header->clearContinued(); + if (headerToFree->isContinued()) { + continued = headerToFree->nextContinued(); + headerToFree->clearContinued(); } - VELOX_CHECK(!header->isFree()); - freeBytes_ += header->size() + sizeof(Header); - cumulativeBytes_ -= header->size(); - Header* next = header->next(); - if (next) { - VELOX_CHECK(!next->isPreviousFree()); - if (next->isFree()) { - --numFree_; - removeFromFreeList(next); - header->setSize(header->size() + next->size() + sizeof(Header)); - next = reinterpret_cast<Header*>(header->end()); - VELOX_CHECK(next->isArenaEnd() || !next->isFree()); - } - } - if (header->isPreviousFree()) { - auto previousFree = getPreviousFree(header); - removeFromFreeList(previousFree); - previousFree->setSize( - previousFree->size() + header->size() + sizeof(Header)); - - header = previousFree; + if (headerToFree->size() > kMaxAlloc && + !state_.pool().isInCurrentRange(headerToFree) && + state_.allocationsFromPool().find(headerToFree) != + state_.allocationsFromPool().end()) { + freeToPool(headerToFree, headerToFree->size() + kHeaderSize); } else { - ++numFree_; - } - auto freedSize = header->size(); - auto freeIndex = freeListIndex(freedSize); - freeNonEmpty_ |= 1 << freeIndex; - if (largestInFreeList_[freeIndex] < freedSize) { - largestInFreeList_[freeIndex] = freedSize; + VELOX_CHECK(!headerToFree->isFree()); + state_.freeBytes() += blockBytes(headerToFree); + state_.currentBytes() -= blockBytes(headerToFree); + Header* next = headerToFree->next(); + if (next != nullptr) { + VELOX_CHECK(!next->isPreviousFree()); + if (next->isFree()) { + --state_.numFree(); + removeFromFreeList(next); + headerToFree->setSize( + headerToFree->size() + next->size() + kHeaderSize); + next = castToHeader(headerToFree->end()); + VELOX_CHECK(next->isArenaEnd() || !next->isFree()); + } + } + if (headerToFree->isPreviousFree()) { + auto* previousFree = getPreviousFree(headerToFree); + removeFromFreeList(previousFree); + previousFree->setSize( + previousFree->size() + headerToFree->size() + kHeaderSize); + + headerToFree = previousFree; + } else { + ++state_.numFree(); + } + const auto freedSize = headerToFree->size(); + const auto freeIndex = freeListIndex(freedSize); + bits::setBit(state_.freeNonEmpty(), freeIndex); + state_.freeLists()[freeIndex].insert( + reinterpret_cast<CompactDoubleList*>(headerToFree->begin())); + markAsFree(headerToFree); } - free_[freeIndex].insert( - reinterpret_cast<CompactDoubleList*>(header->begin())); - markAsFree(header); - header = continued; - } while (header); + headerToFree = continued; + } while (headerToFree != nullptr); } // static -int64_t HashStringAllocator::offset( - Header* FOLLY_NONNULL header, - Position position) { +int64_t HashStringAllocator::offset(Header* header, Position position) { static const int64_t kOutOfRange = -1; if (!position.isSet()) { return kOutOfRange; @@ -526,7 +523,7 @@ int64_t HashStringAllocator::offset( int64_t size = 0; for (;;) { - assert(header); + VELOX_CHECK_NOT_NULL(header); const auto length = header->usableSize(); const auto offset = position.position - header->begin(); if (offset >= 0 && offset <= length) { @@ -542,12 +539,12 @@ int64_t HashStringAllocator::offset( // static HashStringAllocator::Position HashStringAllocator::seek( -
Header* header, int64_t offset) { int64_t size = 0; for (;;) { - assert(header); - auto length = header->usableSize(); + VELOX_CHECK_NOT_NULL(header); + const auto length = header->usableSize(); if (offset <= size + length) { return Position::atOffset(header, offset - size); } @@ -561,12 +558,12 @@ HashStringAllocator::Position HashStringAllocator::seek( // static int64_t HashStringAllocator::available(const Position& position) { - auto header = position.header; + auto* header = position.header; const auto startOffset = position.offset(); // startOffset bytes from the first block are already used. int64_t size = -startOffset; for (;;) { - assert(header); + VELOX_CHECK_NOT_NULL(header); size += header->usableSize(); if (!header->isContinued()) { return size; } @@ -580,31 +577,106 @@ void HashStringAllocator::ensureAvailable(int32_t bytes, Position& position) { return; } - ByteStream stream(this); + ByteOutputStream stream(this); extendWrite(position, stream); static char data[128]; - while (bytes) { - auto written = std::min<int64_t>(bytes, sizeof(data)); + while (bytes > 0) { + const auto written = std::min<int64_t>(bytes, sizeof(data)); stream.append(folly::StringPiece(data, written)); bytes -= written; } position = finishWrite(stream, 0).first; } +inline bool HashStringAllocator::storeStringFast( + const char* bytes, + int32_t numBytes, + char* destination) { + const auto roundedBytes = std::max(numBytes, kMinAlloc); + + Header* header = nullptr; + if (state_.freeLists()[kNumFreeLists - 1].empty()) { + if (roundedBytes >= kMaxAlloc) { + return false; + } + const auto index = freeListIndex(roundedBytes); + const auto available = + bits::findFirstBit(state_.freeNonEmpty(), index, kNumFreeLists); + if (available < 0) { + return false; + } + header = allocateFromFreeList(roundedBytes, true, true, available); + VELOX_CHECK_NOT_NULL(header); + } else { + auto& freeList = state_.freeLists()[kNumFreeLists - 1]; + header = headerOf(freeList.next()); + const auto spaceTaken = roundedBytes + kHeaderSize; + if (spaceTaken > header->size()) { + return false; + } + if (header->size() - spaceTaken > kMaxAlloc) { + // The entry after allocation stays in the largest free list. + // The size at the end of the block is changed in place. + reinterpret_cast<uint32_t*>(header->end())[-1] -= spaceTaken; + auto* freeHeader = new (header->begin() + roundedBytes) + Header(header->size() - spaceTaken); + freeHeader->setFree(); + header->clearFree(); + ::memcpy(freeHeader->begin(), header->begin(), sizeof(CompactDoubleList)); + freeList.nextMoved( + reinterpret_cast<CompactDoubleList*>(freeHeader->begin())); + header->setSize(roundedBytes); + state_.freeBytes() -= spaceTaken; + state_.currentBytes() += spaceTaken; + } else { + header = + allocateFromFreeList(roundedBytes, true, true, kNumFreeLists - 1); + if (!header) { + return false; + } + } + } + + simd::memcpy(header->begin(), bytes, numBytes); + *reinterpret_cast<StringView*>(destination) = + StringView(reinterpret_cast<char*>(header->begin()), numBytes); + return true; +} + +void HashStringAllocator::copyMultipartNoInline( + const StringView& srcStr, + char* group, + int32_t offset) { + const auto numBytes = srcStr.size(); + if (storeStringFast(srcStr.data(), numBytes, group + offset)) { + return; + } + // Write the string as non-contiguous chunks. + ByteOutputStream stream(this, false, false); + auto position = newWrite(stream, numBytes); + stream.appendStringView(srcStr); + finishWrite(stream, 0); + + // The stringView has a pointer to the first byte and the total + // size. Read with contiguousString().
+ *reinterpret_cast<StringView*>(group + offset) = + StringView(reinterpret_cast<char*>(position.position), numBytes); +} + std::string HashStringAllocator::toString() const { std::ostringstream out; - out << "allocated: " << cumulativeBytes_ << " bytes" << std::endl; - out << "free: " << freeBytes_ << " bytes in " << numFree_ << " blocks" - << std::endl; - out << "standalone allocations: " << sizeFromPool_ << " bytes in " - << allocationsFromPool_.size() << " allocations" << std::endl; - out << "ranges: " << pool_.numRanges() << std::endl; + out << "allocated: " << state_.currentBytes() << " bytes" << std::endl; + out << "free: " << state_.freeBytes() << " bytes in " << state_.numFree() + << " blocks" << std::endl; + out << "standalone allocations: " << state_.sizeFromPool() << " bytes in " + << state_.allocationsFromPool().size() << " allocations" << std::endl; + out << "ranges: " << state_.pool().numRanges() << std::endl; static const auto kHugePageSize = memory::AllocationTraits::kHugePageSize; - for (auto i = 0; i < pool_.numRanges(); ++i) { - auto topRange = pool_.rangeAt(i); + for (auto i = 0; i < state_.pool().numRanges(); ++i) { + auto topRange = state_.pool().rangeAt(i); auto topRangeSize = topRange.size(); out << "range " << i << ": " << topRangeSize << " bytes" << std::endl; @@ -618,8 +690,8 @@ std::string HashStringAllocator::toString() const { std::min<int64_t>(topRangeSize, kHugePageSize)); auto size = range.size() - simd::kPadding; - auto end = reinterpret_cast<Header*>(range.data() + size); - auto header = reinterpret_cast<Header*>(range.data()); + auto end = castToHeader(range.data() + size); + auto header = castToHeader(range.data()); while (header != nullptr && header != end) { out << "\t" << header->toString() << std::endl; header = header->next(); @@ -636,8 +708,8 @@ int64_t HashStringAllocator::checkConsistency() const { uint64_t numFree = 0; uint64_t freeBytes = 0; int64_t allocatedBytes = 0; - for (auto i = 0; i < pool_.numRanges(); ++i) { - auto topRange = pool_.rangeAt(i); + for (auto i = 0; i < state_.pool().numRanges(); ++i) { + auto topRange = state_.pool().rangeAt(i); auto topRangeSize = topRange.size(); if (topRangeSize >= kHugePageSize) { VELOX_CHECK_EQ(0, topRangeSize % kHugePageSize); @@ -649,10 +721,10 @@ int64_t HashStringAllocator::checkConsistency() const { auto range = folly::Range<char*>( topRange.data() + subRangeStart, std::min<int64_t>(topRangeSize, kHugePageSize)); - auto size = range.size() - simd::kPadding; + const auto size = range.size() - simd::kPadding; bool previousFree = false; - auto end = reinterpret_cast<Header*>(range.data() + size); - auto header = reinterpret_cast<Header*>(range.data()); + auto* end = castToHeader(range.data() + size); + auto* header = castToHeader(range.data()); while (header != end) { VELOX_CHECK_GE(reinterpret_cast<char*>(header), range.data()); VELOX_CHECK_LT( @@ -665,59 +737,60 @@ int64_t HashStringAllocator::checkConsistency() const { if (header->isFree()) { VELOX_CHECK(!previousFree); VELOX_CHECK(!header->isContinued()); - if (header->next()) { + if (header->next() != nullptr) { VELOX_CHECK_EQ( header->size(), *(reinterpret_cast<int32_t*>(header->end()) - 1)); } ++numFree; - freeBytes += sizeof(Header) + header->size(); + freeBytes += blockBytes(header); } else if (header->isContinued()) { - // If the content of the header is continued, check the - // continue header is readable and not free. - auto continued = header->nextContinued(); + // If the content of the header is continued, check the continued + // header is readable and not free.
+ auto* continued = header->nextContinued(); VELOX_CHECK(!continued->isFree()); - allocatedBytes += header->size() - sizeof(void*); + allocatedBytes += blockBytes(header); } else { - allocatedBytes += header->size(); + allocatedBytes += blockBytes(header); } previousFree = header->isFree(); - header = reinterpret_cast<Header*>(header->end()); + header = castToHeader(header->end()); } } } - VELOX_CHECK_EQ(numFree, numFree_); - VELOX_CHECK_EQ(freeBytes, freeBytes_); + VELOX_CHECK_EQ(numFree, state_.numFree()); + VELOX_CHECK_EQ(freeBytes, state_.freeBytes()); uint64_t numInFreeList = 0; uint64_t bytesInFreeList = 0; for (auto i = 0; i < kNumFreeLists; ++i) { - bool hasData = freeNonEmpty_ & (1 << i); - bool listNonEmpty = !free_[i].empty(); + const bool hasData = bits::isBitSet(state_.freeNonEmpty(), i); + const bool listNonEmpty = !state_.freeLists()[i].empty(); VELOX_CHECK_EQ(hasData, listNonEmpty); - for (auto free = free_[i].next(); free != &free_[i]; free = free->next()) { + for (auto* free = state_.freeLists()[i].next(); + free != &state_.freeLists()[i]; + free = free->next()) { ++numInFreeList; - auto size = headerOf(free)->size(); - if (i > 0) { - VELOX_CHECK_GE(size, freeListSizes_[i - 1]); + VELOX_CHECK( + free->next()->previous() == free, + "free list previous link inconsistent"); + const auto size = headerOf(free)->size(); + VELOX_CHECK_GE(size, kMinAlloc); + if (size - kMinAlloc < kNumFreeLists - 1) { + VELOX_CHECK_EQ(size - kMinAlloc, i); + } else { + VELOX_CHECK_GE(size - kMinAlloc, kNumFreeLists - 1); } - VELOX_CHECK_LT(size, freeListSizes_[i]); - bytesInFreeList += size + sizeof(Header); + bytesInFreeList += size + kHeaderSize; } } - VELOX_CHECK_EQ(numInFreeList, numFree_); - VELOX_CHECK_EQ(bytesInFreeList, freeBytes_); + VELOX_CHECK_EQ(numInFreeList, state_.numFree()); + VELOX_CHECK_EQ(bytesInFreeList, state_.freeBytes()); return allocatedBytes; } bool HashStringAllocator::isEmpty() const { - return sizeFromPool_ == 0 && checkConsistency() == 0; + return state_.sizeFromPool() == 0 && checkConsistency() == 0; } - -void HashStringAllocator::checkEmpty() const { - VELOX_CHECK_EQ(0, sizeFromPool_); - VELOX_CHECK_EQ(0, checkConsistency()); -} - } // namespace facebook::velox diff --git a/velox/common/memory/HashStringAllocator.h b/velox/common/memory/HashStringAllocator.h index a7293201168ba..485da69966d76 100644 --- a/velox/common/memory/HashStringAllocator.h +++ b/velox/common/memory/HashStringAllocator.h @@ -27,26 +27,26 @@ namespace facebook::velox { -// Implements an arena backed by MappedMemory::Allocation. This is for backing -// ByteStream or for allocating single blocks. Blocks can be individually freed. -// Adjacent frees are coalesced and free blocks are kept in a free list. -// Allocated blocks are prefixed with a Header. This has a size and flags. -// kContinue means that last 8 bytes are a pointer to another Header after which -// the contents of this allocation continue. kFree means the block is free. A -// free block has pointers to the next and previous free block via a -// CompactDoubleList struct immediately after the header. The last 4 bytes of a -// free block contain its length. kPreviousFree means that the block immediately -// below is free. In this case the uint32_t below the header has the size of the -// previous free block. The last word of a Allocation::PageRun backing a -// HashStringAllocator is set to kArenaEnd. +/// Implements an arena backed by memory::Allocation. This is for backing +/// ByteOutputStream or for allocating single blocks.
Blocks can be individually +/// freed. Adjacent frees are coalesced and free blocks are kept in a free list. +/// Allocated blocks are prefixed with a Header. This has a size and flags. +/// kContinue means that last 8 bytes are a pointer to another Header after +/// which the contents of this allocation continue. kFree means the block is +/// free. A free block has pointers to the next and previous free block via a +/// CompactDoubleList struct immediately after the header. The last 4 bytes of a +/// free block contain its length. kPreviousFree means that the block +/// immediately below is free. In this case the uint32_t below the header has +/// the size of the previous free block. The last word of a Allocation::PageRun +/// backing a HashStringAllocator is set to kArenaEnd. class HashStringAllocator : public StreamArena { public: - // The minimum allocation must have space after the header for the - // free list pointers and the trailing length. + /// The minimum allocation must have space after the header for the free list + /// pointers and the trailing length. static constexpr int32_t kMinAlloc = sizeof(CompactDoubleList) + sizeof(uint32_t); - // Sizes larger than this will come direct from 'pool(). + /// Sizes larger than this will come directly from pool(). static constexpr int32_t kMaxAlloc = memory::AllocationTraits::kPageSize / 4 * 3; @@ -58,12 +58,12 @@ class HashStringAllocator : public StreamArena { static constexpr uint32_t kSizeMask = (1U << 29) - 1; static constexpr uint32_t kContinuedPtrSize = sizeof(void*); - // Marker at end of a PageRun. Distinct from valid headers since - // all the 3 high bits are set, which is not valid for a header. + /// Marker at end of a PageRun. Distinct from valid headers since all the 3 + /// high bits are set, which is not valid for a header. static constexpr uint32_t kArenaEnd = 0xf0aeab0d; explicit Header(uint32_t size) : data_(size) { - VELOX_CHECK(size <= kSizeMask); + VELOX_CHECK_LE(size, kSizeMask); } bool isContinued() const { @@ -115,22 +115,22 @@ class HashStringAllocator : public StreamArena { } void setSize(int32_t size) { - VELOX_CHECK(size <= kSizeMask); + VELOX_CHECK_LE(size, kSizeMask); data_ = size | (data_ & ~kSizeMask); } - char* FOLLY_NONNULL begin() { + char* begin() { return reinterpret_cast<char*>(this + 1); } - char* FOLLY_NONNULL end() { + char* end() { return begin() + size(); } /// Returns the Header of the block that is physically next to this block or /// null if this is the last block of the arena. - Header* FOLLY_NULLABLE next() { - auto next = reinterpret_cast<Header*>(end()); + Header* next() { + auto* next = castToHeader(end()); return next->data_ == kArenaEnd ? nullptr : next; } @@ -149,8 +149,8 @@ class HashStringAllocator : public StreamArena { }; struct Position { - Header* FOLLY_NULLABLE header{nullptr}; - char* FOLLY_NULLABLE position{nullptr}; + Header* header{nullptr}; + char* position{nullptr}; int32_t offset() const { VELOX_DCHECK_NOT_NULL(header); @@ -174,60 +174,37 @@ class HashStringAllocator : public StreamArena { } }; - explicit HashStringAllocator(memory::MemoryPool* FOLLY_NONNULL pool) - : StreamArena(pool), pool_(pool) {} + explicit HashStringAllocator(memory::MemoryPool* pool) + : StreamArena(pool), state_(pool) {} ~HashStringAllocator(); - // Copies a StringView at 'offset' in 'group' to storage owned by - // the hash table. Updates the StringView.
- void copy(char* FOLLY_NONNULL group, int32_t offset) { - StringView* string = reinterpret_cast<StringView*>(group + offset); - if (string->isInline()) { + // Copies the StringView 'srcStr' to storage owned by 'this'. Creates a new + // StringView at 'offset' in 'group' pointing to the copy. A large string may + // be copied into non-contiguous allocation pieces. The size in the StringView + // is the sum of the sizes. The pieces are linked via Headers, the first + // header is below the first byte of the StringView's data. StringViews + // written by this are to be read with contiguousString(). This is nearly + // always zero copy but will accommodate the odd extra large string. + void copyMultipart(const StringView& str, char* group, int32_t offset) { + if (str.isInline()) { + *reinterpret_cast<StringView*>(group + offset) = str; return; } - auto data = pool_.allocateFixed(string->size()); - memcpy(data, string->data(), string->size()); - *string = StringView(data, string->size()); + copyMultipartNoInline(str, group, offset); } - // Copies a StringView at 'offset' in 'group' to storage owned by - // 'this'. Updates the StringView. A large string may be copied into - // non-contiguous allocation pieces. The size in the StringView is - // the sum of the sizes. The pieces are linked via Headers, the - // first header is below the first byte of the StringView's - // data. StringViews written by this are to be read with - // contiguousString(). This is nearly always zero copy but will - // accommodate the odd extra large string. - void copyMultipart(char* FOLLY_NONNULL group, int32_t offset) { - auto string = reinterpret_cast<StringView*>(group + offset); - if (string->isInline()) { - return; - } - auto numBytes = string->size(); - - // Write the string as non-contiguous chunks. - ByteStream stream(this, false, false); - auto position = newWrite(stream, numBytes); - stream.appendStringPiece(folly::StringPiece(string->data(), numBytes)); - finishWrite(stream, 0); - - // The stringView has a pointer to the first byte and the total - // size. Read with contiguousString(). - *string = StringView(reinterpret_cast<char*>(position.position), numBytes); - } - - // Returns a contiguous view on 'view', where 'view' comes from - // copyMultipart(). Uses 'storage' to own a possible temporary - // copy. Making a temporary copy only happens for non-contiguous - // strings. + /// Returns a contiguous view on 'view', where 'view' comes from + /// copyMultipart(). Uses 'storage' to own a possible temporary copy. Making a + /// temporary copy only happens for non-contiguous strings. static StringView contiguousString(StringView view, std::string& storage); - // Allocates 'size' contiguous bytes preceded by a Header. Returns - // the address of Header. - Header* FOLLY_NONNULL allocate(int32_t size) { - VELOX_CHECK( - !currentHeader_, "Do not call allocate() when a write is in progress"); + /// Allocates 'size' contiguous bytes preceded by a Header. Returns the + /// address of Header. + Header* allocate(int32_t size) { + VELOX_CHECK_NULL( + state_.currentHeader(), + "Do not call allocate() when a write is in progress"); return allocate(std::max(size, kMinAlloc), true); } @@ -240,178 +217,181 @@ class HashStringAllocator : public StreamArena { /// match. void freeToPool(void* ptr, size_t size); - // Returns the header immediately below 'data'. - static Header* FOLLY_NONNULL headerOf(const void* FOLLY_NONNULL data) { + /// Returns the header immediately below 'data'.
+ static Header* headerOf(const void* data) { + return castToHeader(data) - 1; + } + + /// Returns the header below 'data'. + static Header* castToHeader(const void* data) { return reinterpret_cast<Header*>( - const_cast<char*>(reinterpret_cast<const char*>(data))) - - 1; + const_cast<char*>(reinterpret_cast<const char*>(data))); } - // Sets 'stream' to range over the data in the range of 'header' and - // possible continuation ranges. - static void prepareRead( - const Header* FOLLY_NONNULL header, - ByteStream& stream); + /// Returns the byte size of block pointed by 'header'. + inline size_t blockBytes(const Header* header) const { + return header->size() + kHeaderSize; + } + + /// Returns ByteInputStream over the data in the range of 'header' and + /// possible continuation ranges. + /// @param maxBytes If provided, the returned stream will cover at most that + /// many bytes. + static std::unique_ptr<ByteInputStream> prepareRead( + const Header* header, + size_t maxBytes = std::numeric_limits<size_t>::max()); - // Returns the number of payload bytes between 'header->begin()' and - // 'position'. - static int64_t offset(Header* FOLLY_NONNULL header, Position position); + /// Returns the number of payload bytes between 'header->begin()' and + /// 'position'. + static int64_t offset(Header* header, Position position); - // Returns a position 'offset' bytes after 'header->begin()'. - static Position seek(Header* FOLLY_NONNULL header, int64_t offset); + /// Returns a position 'offset' bytes after 'header->begin()'. + static Position seek(Header* header, int64_t offset); - // Returns the number of bytes that can be written starting at 'position' - // without allocating more space. + /// Returns the number of bytes that can be written starting at 'position' + /// without allocating more space. static int64_t available(const Position& position); - // Ensures that one can write at least 'bytes' data starting at - // 'position' without allocating more space. 'position' can be - // changed but will logically point at the same data. Data to the - // right of 'position is not preserved. + /// Ensures that one can write at least 'bytes' data starting at 'position' + /// without allocating more space. 'position' can be changed but will + /// logically point at the same data. Data to the right of 'position' is not + /// preserved. void ensureAvailable(int32_t bytes, Position& position); - // Sets stream to write to this pool. The write can span multiple - // non-contiguous runs. Each contiguous run will have at least - // kMinContiguous bytes of contiguous space. finishWrite finalizes - // the allocation information after the write is done. - // Returns the position at the start of the allocated block. - Position newWrite(ByteStream& stream, int32_t preferredSize = kMinContiguous); + /// Sets stream to write to this pool. The write can span multiple + /// non-contiguous runs. Each contiguous run will have at least kMinContiguous + /// bytes of contiguous space. finishWrite finalizes the allocation + /// information after the write is done. Returns the position at the start of + /// the allocated block. + Position newWrite( + ByteOutputStream& stream, + int32_t preferredSize = kMinContiguous); // Sets 'stream' to write starting at 'position'. If new ranges have to // be allocated when writing, headers will be updated accordingly. - void extendWrite(Position position, ByteStream& stream); + void extendWrite(Position position, ByteOutputStream& stream); - // Completes a write prepared with newWrite or - // extendWrite.
Up to 'numReserveBytes' unused bytes, if available, are left - after the end of the write to accommodate another write. Returns a pair of - positions: (1) position at the start of this 'write', (2) position - immediately after the last written byte. + /// Completes a write prepared with newWrite or extendWrite. Up to + /// 'numReserveBytes' unused bytes, if available, are left after the end of + /// the write to accommodate another write. Returns a pair of positions: (1) + /// position at the start of this 'write', (2) position immediately after the + /// last written byte. std::pair<Position, Position> finishWrite( - ByteStream& stream, + ByteOutputStream& stream, int32_t numReserveBytes); /// Allocates a new range for a stream writing to 'this'. Sets the last word /// of the previous range to point to the new range and copies the overwritten - /// word as the first word of the new range. + /// word as the first word of the new range. If 'lastRange' is non-null, we + /// are continuing an existing entry and setting the last word of the + /// previous entry to point to the new one. In this case, we decrement the size + /// in 'lastEntry' by the size of the continue pointer, so that the sum of the + /// sizes reflects the payload size without any overheads. Furthermore, + /// rewriting a multirange entry is safe because a write spanning multiple + /// ranges will not overwrite the next pointer. /// /// May allocate less than 'bytes'. - void newRange(int32_t bytes, ByteRange* FOLLY_NONNULL range) override; + void newRange(int32_t bytes, ByteRange* lastRange, ByteRange* range) override; /// Allocates a new range of at least 'bytes' size. void newContiguousRange(int32_t bytes, ByteRange* range); - void newTinyRange(int32_t bytes, ByteRange* FOLLY_NONNULL range) override { - newRange(bytes, range); + void newTinyRange(int32_t bytes, ByteRange* lastRange, ByteRange* range) + override { + newRange(bytes, lastRange, range); } - // Returns the total memory footprint of 'this'. + /// Returns the total memory footprint of 'this'. int64_t retainedSize() const { - return pool_.allocatedBytes() + sizeFromPool_; + return state_.pool().allocatedBytes() + state_.sizeFromPool(); } - // Adds the allocation of 'header' and any extensions (if header has - // kContinued set) to the free list. - void free(Header* FOLLY_NONNULL header); + /// Adds the allocation of 'header' and any extensions (if header has + /// kContinued set) to the free list. + void free(Header* header); - // Returns a lower bound on bytes available without growing - // 'this'. This is the sum of free block sizes minus size of pointer - // for each. We subtract the pointer because in the worst case we - // would have one allocation that chains many small free blocks - // together via kContinued. + /// Returns a lower bound on bytes available without growing 'this'. This is + /// the sum of free block sizes minus size of pointer for each. We subtract + /// the pointer because in the worst case we would have one allocation that + /// chains many small free blocks together via kContinued. uint64_t freeSpace() const { - int64_t minFree = freeBytes_ - numFree_ * (sizeof(Header) + sizeof(void*)); + const int64_t minFree = state_.freeBytes() - + state_.numFree() * (kHeaderSize + Header::kContinuedPtrSize); VELOX_CHECK_GE(minFree, 0, "Guaranteed free space cannot be negative"); return minFree; } - // Frees all memory associated with 'this' and leaves 'this' ready for reuse.
- void clear(); + /// Frees all memory associated with 'this' and leaves 'this' ready for reuse. + void clear() override; - memory::MemoryPool* FOLLY_NONNULL pool() const { - return pool_.pool(); + memory::MemoryPool* pool() const { + return state_.pool().pool(); } - uint64_t cumulativeBytes() const { - return cumulativeBytes_; + uint64_t currentBytes() const { + return state_.currentBytes(); } - // Returns the starting sizes of free lists. Allocating one of these - // sizes will always be fast because all elements of the free list - // in question will fit. - static folly::Range<const int32_t*> freeListSizeClasses(); - - // Checks the free space accounting and consistency of - // Headers. Throws when detects corruption. Returns the number of allocated - // payload bytes, excluding headers, continue links and other overhead. + /// Checks the free space accounting and consistency of Headers. Throws when + /// it detects corruption. Returns the number of allocated payload bytes, + /// excluding headers, continue links and other overhead. int64_t checkConsistency() const; /// Returns 'true' if this is empty. The implementation includes a call to /// checkConsistency() which makes it slow. Do not use in hot paths. bool isEmpty() const; - /// Throws if 'this' is not empty. Checks consistency of - /// 'this'. This is a fast check for RowContainer users freeing the - /// variable length data they store. Can be used in non-debug - /// builds. - void checkEmpty() const; + std::string toString() const; - // Returns the cumulative number of free list items allocations have looked at - // and skipped because too small. - int64_t numFreeListNoFit() const { - return numFreeListNoFit_; - } + /// Effectively makes this immutable while executing 'f'; any attempt to + /// access state_ in a mutable way while 'f' is executing will cause an + /// exception to be thrown. + template <typename F> + void freezeAndExecute(F&& f) { + state_.freeze(); - std::string toString() const; + SCOPE_EXIT { + state_.unfreeze(); + }; + + f(); + } private: static constexpr int32_t kUnitSize = 16 * memory::AllocationTraits::kPageSize; static constexpr int32_t kMinContiguous = 48; - static constexpr int32_t kNumFreeLists = 10; - - // different sizes have different free lists. Sizes below first size - // go to freeLists_[0]. Sizes >= freeListSize_[i] go to freeLists_[i - // + 1]. The sizes match the size progression for growing F14 - // containers. Static array of multiple of 8 ints for simd. - static int32_t freeListSizes_[HashStringAllocator::kNumFreeLists + 7]; + static constexpr int32_t kNumFreeLists = kMaxAlloc - kMinAlloc + 2; + static constexpr uint32_t kHeaderSize = sizeof(Header); - // The largest size present in each free list. This is updated when freeing - // and when failing to find a large enough block in the free list in question. - int32_t largestInFreeList_[HashStringAllocator::kNumFreeLists] = {}; - - void newRange(int32_t bytes, ByteRange* range, bool contiguous); + void newRange( + int32_t bytes, + ByteRange* lastRange, + ByteRange* range, + bool contiguous); // Adds a new standard size slab to the free list. This // grows the footprint in MemoryAllocator but does not allocate // anything yet. Throws if it fails to grow.
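kNumFreeLists changing from a fixed 10 to kMaxAlloc - kMinAlloc + 2 means every allocation size now has its own exact free list, plus one catch-all. A standalone sketch of the implied index mapping; the constants here are illustrative stand-ins, not the values defined elsewhere in this header:

```cpp
#include <algorithm>
#include <cstdint>

constexpr int32_t kMinAlloc = 24;  // illustrative only
constexpr int32_t kMaxAlloc = 248; // illustrative only
// One list per size in [kMinAlloc, kMaxAlloc] plus a final catch-all list.
constexpr int32_t kNumFreeLists = kMaxAlloc - kMinAlloc + 2;

int32_t freeListIndex(int32_t size) {
  // Sizes at or below kMinAlloc share list 0; sizes above kMaxAlloc all land
  // in the last list; everything in between maps one-to-one.
  return std::min(std::max(size, kMinAlloc), kMaxAlloc + 1) - kMinAlloc;
}
```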
void newSlab(); - void removeFromFreeList(Header* FOLLY_NONNULL header) { - VELOX_CHECK(header->isFree()); - header->clearFree(); - auto index = freeListIndex(header->size()); - reinterpret_cast<CompactDoubleList*>(header->begin())->remove(); - if (free_[index].empty()) { - freeNonEmpty_ &= ~(1 << index); - } - } + void removeFromFreeList(Header* header); + + // Allocates a block of specified size. If exactSize is false, the block may + // be smaller or larger. Checks free list before allocating new memory. + Header* allocate(int32_t size, bool exactSize); - /// Allocates a block of specified size. If exactSize is false, the block may - /// be smaller or larger. Checks free list before allocating new memory. - Header* FOLLY_NULLABLE allocate(int32_t size, bool exactSize); - - // Allocates memory from free list. Returns nullptr if no memory in - // free list, otherwise returns a header of a free block of some - // size. if 'mustHaveSize' is true, the block will not be smaller - // than 'preferredSize'. If 'isFinalSize' is true, this will not - // return a block that is much larger than preferredSize. Otherwise, - // the block can be larger and the user is expected to call - // freeRestOfBlock to finalize the allocation. - Header* FOLLY_NULLABLE allocateFromFreeLists( + // Allocates memory from free list. Returns nullptr if no memory in free list, + // otherwise returns a header of a free block of some size. If 'mustHaveSize' + // is true, the block will not be smaller than 'preferredSize'. If + // 'isFinalSize' is true, this will not return a block that is much larger + // than preferredSize. Otherwise, the block can be larger and the user is + // expected to call freeRestOfBlock to finalize the allocation. + Header* allocateFromFreeLists( int32_t preferredSize, bool mustHaveSize, bool isFinalSize); - Header* FOLLY_NULLABLE allocateFromFreeList( + Header* allocateFromFreeList( int32_t preferredSize, bool mustHaveSize, bool isFinalSize, @@ -420,68 +400,138 @@ class HashStringAllocator : public StreamArena { // Sets 'header' to be 'keepBytes' long and adds the remainder of // 'header's memory to free list. Does nothing if the resulting // blocks would be below minimum size. - void freeRestOfBlock(Header* FOLLY_NONNULL header, int32_t keepBytes); + void freeRestOfBlock(Header* header, int32_t keepBytes); - // Returns the free list index for 'size'. Masks out empty sizes that can be - // given by 'mask'. If 'mask' excludes all free lists, returns > - // kNumFreeLists. - int32_t freeListIndex(int32_t size, uint32_t mask = ~0); + void + copyMultipartNoInline(const StringView& str, char* group, int32_t offset); - // Circular list of free blocks. - CompactDoubleList free_[kNumFreeLists]; + // Fast path for storing a string as a single part. Returns true on success; + // has no effect if it returns false. + bool storeStringFast(const char* bytes, int32_t size, char* destination); - // Bitmap with a 1 if the corresponding list in 'free_' is not empty. - int32_t freeNonEmpty_{0}; + // Returns the free list index for 'size'. + int32_t freeListIndex(int size); + + /// A class that wraps all fields in the HashStringAllocator; its main + /// purpose is to simplify the freeze/unfreeze mechanic. Fields are exposed + /// via accessor methods; attempting to invoke a non-const accessor when the + /// HashStringAllocator is frozen will cause an exception to be thrown.
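The State wrapper declared next enforces the freeze mechanic that freezeAndExecute() (earlier in this class) exposes publicly. A hypothetical call, assuming an allocator instance 'hsa':

```cpp
// While the lambda runs, State::freeze() makes every mutable accessor throw,
// so an accidental allocation surfaces as an error instead of a silent
// mutation.
hsa.freezeAndExecute([&]() {
  const auto bytes = hsa.currentBytes(); // const accessors remain usable
  // hsa.allocate(64); // would throw: "The HashStringAllocator is immutable."
});
```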
+ class State { + public: + explicit State(memory::MemoryPool* pool) : pool_(pool) {} + + void freeze() { + VELOX_CHECK( + mutable_, + "Attempting to freeze an already frozen HashStringAllocator."); + mutable_ = false; + } + + void unfreeze() { + VELOX_CHECK( + !mutable_, + "Attempting to unfreeze an already unfrozen HashStringAllocator."); + mutable_ = true; + } + + private: +// Every field has two accessors, one that returns a reference and one that +// returns a const reference. The one that returns a reference ensures that the +// HashStringAllocator isn't frozen first. +#define DECLARE_GETTERS(TYPE, NAME) \ + public: \ + inline TYPE& NAME() { \ + assertMutability(); \ + return NAME##_; \ + } \ + \ + inline TYPE const& NAME() const { \ + return NAME##_; \ + } - // Count of elements in 'free_'. This is 0 when all free_[i].next() == - // &free_[i]. - uint64_t numFree_ = 0; +// Declare a default initialized field. +#define DECLARE_FIELD(TYPE, NAME) \ + DECLARE_GETTERS(TYPE, NAME) \ + \ + private: \ + TYPE NAME##_; - // Sum of the size of blocks in 'free_', excluding headers. - uint64_t freeBytes_ = 0; +// Declare a field initialized with a specific value. +#define DECLARE_FIELD_WITH_INIT_VALUE(TYPE, NAME, VALUE) \ + DECLARE_GETTERS(TYPE, NAME) \ + \ + private: \ + TYPE NAME##_{VALUE}; - // Counter of allocated bytes. The difference of two point in time values - // tells how much memory has been consumed by activity between these points in - // time. Incremented by allocation and decremented by free. Used for tracking - // the row by row space usage in a RowContainer. - uint64_t cumulativeBytes_{0}; + typedef CompactDoubleList FreeList[kNumFreeLists]; + typedef uint64_t FreeNonEmptyBitMap[bits::nwords(kNumFreeLists)]; + typedef folly::F14FastMap AllocationsFromPool; - // Pointer to Header for the range being written. nullptr if a write is not in - // progress. - Position startPosition_; - Header* FOLLY_NULLABLE currentHeader_ = nullptr; + // Circular list of free blocks. + DECLARE_FIELD(FreeList, freeLists); - // Pool for getting new slabs. - memory::AllocationPool pool_; + // Bitmap with a 1 if the corresponding list in 'free_' is not empty. + DECLARE_FIELD_WITH_INIT_VALUE(FreeNonEmptyBitMap, freeNonEmpty, {}); - // Map from pointer to size for large blocks allocated from pool(). - folly::F14FastMap allocationsFromPool_; + // Count of elements in 'free_'. This is 0 when all free_[i].next() == + // &free_[i]. + DECLARE_FIELD_WITH_INIT_VALUE(uint64_t, numFree, 0); - // Sum of sizes in 'allocationsFromPool_'. - int64_t sizeFromPool_{0}; + // Sum of the size of blocks in 'free_', excluding headers. + DECLARE_FIELD_WITH_INIT_VALUE(uint64_t, freeBytes, 0); - // Count of times a free list item was skipped because it did not fit - // requested size. - int64_t numFreeListNoFit_{0}; + // Counter of allocated bytes. The difference of two point in time values + // tells how much memory has been consumed by activity between these points + // in time. Incremented by allocation and decremented by free. Used for + // tracking the row by row space usage in a RowContainer. + DECLARE_FIELD_WITH_INIT_VALUE(uint64_t, currentBytes, 0); + + // Pointer to Header for the range being written. nullptr if a write is not + // in progress. + DECLARE_FIELD(Position, startPosition); + DECLARE_FIELD_WITH_INIT_VALUE(Header*, currentHeader, nullptr); + + // Pool for getting new slabs. + DECLARE_FIELD(memory::AllocationPool, pool); + + // Map from pointer to size for large blocks allocated from pool(). 
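To make the macro machinery concrete: DECLARE_FIELD_WITH_INIT_VALUE(uint64_t, numFree, 0) above expands to roughly the following State members, which is the whole freeze mechanism in miniature:

```cpp
// Hand-written expansion of DECLARE_FIELD_WITH_INIT_VALUE(uint64_t, numFree, 0).
public:
  inline uint64_t& numFree() {
    assertMutability(); // throws when the allocator is frozen
    return numFree_;
  }

  inline uint64_t const& numFree() const { // reads stay legal while frozen
    return numFree_;
  }

private:
  uint64_t numFree_{0};
```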
+ DECLARE_FIELD(AllocationsFromPool, allocationsFromPool); + + // Sum of sizes in 'allocationsFromPool_'. + DECLARE_FIELD_WITH_INIT_VALUE(int64_t, sizeFromPool, 0); + +#undef DECLARE_FIELD_WITH_INIT_VALUE +#undef DECLARE_FIELD +#undef DECLARE_GETTERS + + void assertMutability() const { + VELOX_CHECK(mutable_, "The HashStringAllocator is immutable."); + } + + tsan_atomic<bool> mutable_ = true; + }; + + // This should be the only field in HashStringAllocator; any additional fields + // should be added as private members of State exposed through accessors. + State state_; }; -// Utility for keeping track of allocation between two points in -// time. A counter on a row supplied at construction is incremented -// by the change in allocation between construction and -// destruction. This is a scoped guard to use around setting -// variable length data in a RowContainer or similar. +/// Utility for keeping track of allocation between two points in time. A +/// counter on a row supplied at construction is incremented by the change in +/// allocation between construction and destruction. This is a scoped guard to +/// use around setting variable length data in a RowContainer or similar. template <typename T, typename TCounter> class RowSizeTracker { public: - // Will update the counter at pointer cast to TCounter* - // with the change in allocation during the lifetime of 'this' + /// Will update the counter at pointer cast to TCounter* with the change in + /// allocation during the lifetime of 'this'. RowSizeTracker(T& counter, HashStringAllocator& allocator) - : allocator_(allocator), - size_(allocator_.cumulativeBytes()), + : allocator_(&allocator), + size_(allocator_->currentBytes()), counter_(counter) {} ~RowSizeTracker() { - auto delta = allocator_.cumulativeBytes() - size_; + auto delta = allocator_->currentBytes() - size_; if (delta) { saturatingIncrement(&counter_, delta); } @@ -489,23 +539,23 @@ class RowSizeTracker { private: // Increments T at *pointer without wrapping around at overflow. - void saturatingIncrement(T* FOLLY_NONNULL pointer, int64_t delta) { + void saturatingIncrement(T* pointer, int64_t delta) { auto value = *reinterpret_cast<TCounter*>(pointer) + delta; *reinterpret_cast<TCounter*>(pointer) = std::min<uint64_t>(value, std::numeric_limits<TCounter>::max()); } - HashStringAllocator& allocator_; + HashStringAllocator* const allocator_; const uint64_t size_; T& counter_; };
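A hypothetical RowSizeTracker use; the allocator instance and the helper writing the variable-length data are assumed:

```cpp
// 'rowSize' picks up however many bytes the allocator hands out while the
// guard is alive; the destructor applies the currentBytes() delta with
// saturation instead of wrapping on overflow.
uint32_t rowSize = 0;
{
  RowSizeTracker<uint32_t, uint32_t> tracker(rowSize, hsa); // 'hsa' assumed
  storeVariableLengthRow(hsa); // hypothetical helper allocating from 'hsa'
} // rowSize now reflects the bytes allocated above, capped at UINT32_MAX
```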
-// An Allocator based by HashStringAllocator to use with STL containers. +/// An allocator backed by HashStringAllocator to use with STL containers. template <class T> struct StlAllocator { using value_type = T; - explicit StlAllocator(HashStringAllocator* FOLLY_NONNULL allocator) + explicit StlAllocator(HashStringAllocator* allocator) : allocator_{allocator} { VELOX_CHECK(allocator); } @@ -516,7 +566,7 @@ struct StlAllocator { VELOX_CHECK_NOT_NULL(allocator_); } - T* FOLLY_NONNULL allocate(std::size_t n) { + T* allocate(std::size_t n) { if (n * sizeof(T) > HashStringAllocator::kMaxAlloc) { return reinterpret_cast<T*>(allocator_->allocateFromPool(n * sizeof(T))); } @@ -524,14 +574,14 @@ struct StlAllocator { allocator_->allocate(checkedMultiply(n, sizeof(T)))->begin()); } - void deallocate(T* FOLLY_NONNULL p, std::size_t n) noexcept { + void deallocate(T* p, std::size_t n) noexcept { if (n * sizeof(T) > HashStringAllocator::kMaxAlloc) { return allocator_->freeToPool(p, n * sizeof(T)); } allocator_->free(HashStringAllocator::headerOf(p)); } - HashStringAllocator* FOLLY_NONNULL allocator() const { + HashStringAllocator* allocator() const { return allocator_; } @@ -544,12 +594,12 @@ struct StlAllocator { } private: - HashStringAllocator* FOLLY_NONNULL allocator_; + HashStringAllocator* allocator_; }; -// An allocator backed by HashStringAllocator that guaratees a configurable -// alignment. The alignment must be a power of 2 and not be 0. This allocator -// can be used with folly F14 containers that requires 16-bytes alignment. +/// An allocator backed by HashStringAllocator that guarantees a configurable +/// alignment. The alignment must be a power of 2 and not be 0. This allocator +/// can be used with folly F14 containers that require 16-byte alignment. template <class T, uint8_t Alignment> struct AlignedStlAllocator { using value_type = T; @@ -566,7 +616,7 @@ struct AlignedStlAllocator { using other = AlignedStlAllocator<U, Alignment>; }; - explicit AlignedStlAllocator(HashStringAllocator* FOLLY_NONNULL allocator) + explicit AlignedStlAllocator(HashStringAllocator* allocator) : allocator_{allocator}, poolAligned_(allocator_->pool()->alignment() >= Alignment) { VELOX_CHECK(allocator); @@ -579,37 +629,43 @@ struct AlignedStlAllocator { VELOX_CHECK(allocator_); } - T* FOLLY_NONNULL allocate(std::size_t n) { - if (n * sizeof(T) > HashStringAllocator::kMaxAlloc && poolAligned_) { - return reinterpret_cast<T*>(allocator_->allocateFromPool(n * sizeof(T))); + T* allocate(std::size_t n) { + if (n * sizeof(T) > HashStringAllocator::kMaxAlloc) { + if (poolAligned_) { + return reinterpret_cast<T*>( + allocator_->allocateFromPool(n * sizeof(T))); + } else { + auto paddedSize = calculatePaddedSize(n); + // Allocate the memory from pool directly. + auto ptr = + reinterpret_cast<T*>(allocator_->allocateFromPool(paddedSize)); + + return alignPtr((char*)ptr, n, paddedSize); + } } - // Allocate extra Alignment bytes for alignment and 4 bytes to store the - // delta between unaligned and aligned pointers. - auto size = - checkedPlus<size_t>(Alignment + 4, checkedMultiply(n, sizeof(T))); - auto ptr = reinterpret_cast<T*>(allocator_->allocate(size)->begin()); - // Align 'ptr + 4'. - void* alignedPtr = (char*)ptr + 4; - size -= 4; - std::align(Alignment, n * sizeof(T), alignedPtr, size); + auto paddedSize = calculatePaddedSize(n); + auto ptr = reinterpret_cast<T*>(allocator_->allocate(paddedSize)->begin()); - // Write alignment delta just before the aligned pointer.
- int32_t delta = (char*)alignedPtr - (char*)ptr - 4; - *reinterpret_cast((char*)alignedPtr - 4) = delta; - - return reinterpret_cast(alignedPtr); + return alignPtr((char*)ptr, n, paddedSize); } - void deallocate(T* FOLLY_NONNULL p, std::size_t n) noexcept { + void deallocate(T* p, std::size_t n) noexcept { if (n * sizeof(T) > HashStringAllocator::kMaxAlloc) { - return allocator_->freeToPool(p, n * sizeof(T)); + if (poolAligned_) { + return allocator_->freeToPool(p, n * sizeof(T)); + } else { + auto delta = *reinterpret_cast((char*)p - 4); + return allocator_->freeToPool( + (char*)p - 4 - delta, calculatePaddedSize(n)); + } } + auto delta = *reinterpret_cast((char*)p - 4); allocator_->free(HashStringAllocator::headerOf((char*)p - 4 - delta)); } - HashStringAllocator* FOLLY_NONNULL allocator() const { + HashStringAllocator* allocator() const { return allocator_; } @@ -626,7 +682,30 @@ struct AlignedStlAllocator { } private: - HashStringAllocator* FOLLY_NONNULL allocator_; + // Pad the memory user requested by some padding to facilitate memory + // alignment later. Memory layout: + // - padding(length is stored in `delta`) + // - delta(4 bytes storing the size of padding) + // - the aligned ptr + FOLLY_ALWAYS_INLINE std::size_t calculatePaddedSize(std::size_t n) { + return checkedPlus(Alignment + 4, checkedMultiply(n, sizeof(T))); + } + + FOLLY_ALWAYS_INLINE T* + alignPtr(char* ptr, std::size_t allocateCount, std::size_t& paddedSize) { + // Align 'ptr + 4'. + void* alignedPtr = ptr + 4; + paddedSize -= 4; + std::align(Alignment, allocateCount * sizeof(T), alignedPtr, paddedSize); + + // Write alignment delta just before the aligned pointer. + int32_t delta = (char*)alignedPtr - ptr - 4; + *reinterpret_cast((char*)alignedPtr - 4) = delta; + + return reinterpret_cast(alignedPtr); + } + + HashStringAllocator* allocator_; const bool poolAligned_; }; diff --git a/velox/common/memory/MallocAllocator.cpp b/velox/common/memory/MallocAllocator.cpp index c8cdb42ca6d7c..ff44791763ace 100644 --- a/velox/common/memory/MallocAllocator.cpp +++ b/velox/common/memory/MallocAllocator.cpp @@ -20,100 +20,99 @@ #include namespace facebook::velox::memory { -MallocAllocator::MallocAllocator(size_t capacity) - : kind_(MemoryAllocator::Kind::kMalloc), capacity_(capacity) {} +MallocAllocator::MallocAllocator(size_t capacity, uint32_t reservationByteLimit) + : kind_(MemoryAllocator::Kind::kMalloc), + capacity_(capacity), + reservationByteLimit_(reservationByteLimit), + reserveFunc_( + [this](uint32_t& counter, uint32_t increment, std::mutex& lock) { + return incrementUsageWithReservationFunc(counter, increment, lock); + }), + releaseFunc_( + [&](uint32_t& counter, uint32_t decrement, std::mutex& lock) { + decrementUsageWithReservationFunc(counter, decrement, lock); + return true; + }), + reservations_(std::thread::hardware_concurrency()) {} + +MallocAllocator::~MallocAllocator() { + // TODO: Remove the check when memory leak issue is resolved. 
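calculatePaddedSize()/alignPtr() above implement a stored-delta alignment scheme. A standalone model of the same technique over plain malloc, so it can be read and run in isolation; only the pointer arithmetic mirrors the allocator code, the plumbing does not:

```cpp
#include <cstdint>
#include <cstdlib>
#include <memory>

// Over-allocate by Alignment + 4 bytes, align past a 4-byte slot, then stash
// the distance back to the raw pointer in that slot for the free path.
// Error handling is elided.
template <size_t Alignment>
void* alignedAlloc(size_t payloadBytes) {
  size_t padded = payloadBytes + Alignment + 4;
  char* raw = static_cast<char*>(::malloc(padded));
  void* aligned = raw + 4; // keep the 4-byte delta slot below the result
  size_t space = padded - 4;
  std::align(Alignment, payloadBytes, aligned, space);
  const auto delta =
      static_cast<int32_t>(static_cast<char*>(aligned) - raw - 4);
  *reinterpret_cast<int32_t*>(static_cast<char*>(aligned) - 4) = delta;
  return aligned;
}

void alignedFree(void* p) {
  const int32_t delta = *reinterpret_cast<int32_t*>(static_cast<char*>(p) - 4);
  ::free(static_cast<char*>(p) - 4 - delta); // recover the raw pointer
}
```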
+ if (FLAGS_velox_memory_leak_check_enabled) { + VELOX_CHECK( + ((allocatedBytes_ - reservations_.read()) == 0) && + (numAllocated_ == 0) && (numMapped_ == 0), + "{}", + toString()); + } +} bool MallocAllocator::allocateNonContiguousWithoutRetry( - MachinePageCount numPages, - Allocation& out, - ReservationCallback reservationCB, - MachinePageCount minSizeClass) { - const uint64_t freedBytes = freeNonContiguous(out); - if (numPages == 0) { - if (freedBytes != 0 && reservationCB != nullptr) { - reservationCB(freedBytes, false); - } + const SizeMix& sizeMix, + Allocation& out) { + freeNonContiguous(out); + if (sizeMix.totalPages == 0) { return true; } - const SizeMix mix = allocationSize(numPages, minSizeClass); - const auto totalBytes = AllocationTraits::pageBytes(mix.totalPages); - if (!incrementUsage(totalBytes)) { - if (freedBytes != 0 && reservationCB != nullptr) { - reservationCB(freedBytes, false); - } + const auto totalBytes = AllocationTraits::pageBytes(sizeMix.totalPages); + if (testingHasInjectedFailure(InjectedFailure::kCap) || + !incrementUsage(totalBytes)) { + const auto errorMsg = fmt::format( + "Exceeded memory allocator limit when allocating {} new pages" + ", the memory allocator capacity is {}", + sizeMix.totalPages, + succinctBytes(capacity_)); + VELOX_MEM_LOG_EVERY_MS(WARNING, 1000) << errorMsg; + setAllocatorFailureMessage(errorMsg); return false; } - uint64_t bytesToAllocate = 0; - if (reservationCB != nullptr) { - bytesToAllocate = AllocationTraits::pageBytes(mix.totalPages) - freedBytes; - try { - reservationCB(bytesToAllocate, true); - } catch (std::exception& e) { - VELOX_MEM_LOG(WARNING) - << "Failed to reserve " << succinctBytes(bytesToAllocate) - << " for non-contiguous allocation of " << numPages - << " pages, then release " << succinctBytes(freedBytes) - << " from the old allocation"; - // If the new memory reservation fails, we need to release the memory - // reservation of the freed memory of previously allocation. - reservationCB(freedBytes, false); - decrementUsage(totalBytes); - std::rethrow_exception(std::current_exception()); - } - } - - std::vector pages; - pages.reserve(mix.numSizes); - for (int32_t i = 0; i < mix.numSizes; ++i) { - // Trigger allocation failure by breaking out the loop. - if (testingHasInjectedFailure(InjectedFailure::kAllocate)) { - break; - } + std::vector buffers; + buffers.reserve(sizeMix.numSizes); + for (int32_t i = 0; i < sizeMix.numSizes; ++i) { MachinePageCount numSizeClassPages = - mix.sizeCounts[i] * sizeClassSizes_[mix.sizeIndices[i]]; - void* ptr; - stats_.recordAllocate( - AllocationTraits::pageBytes(sizeClassSizes_[mix.sizeIndices[i]]), - mix.sizeCounts[i], - [&]() { - ptr = ::malloc( - AllocationTraits::pageBytes(numSizeClassPages)); // NOLINT - }); + sizeMix.sizeCounts[i] * sizeClassSizes_[sizeMix.sizeIndices[i]]; + void* ptr = nullptr; + // Trigger allocation failure by skipping malloc + if (!testingHasInjectedFailure(InjectedFailure::kAllocate)) { + stats_.recordAllocate( + AllocationTraits::pageBytes(sizeClassSizes_[sizeMix.sizeIndices[i]]), + sizeMix.sizeCounts[i], + [&]() { + ptr = ::malloc( + AllocationTraits::pageBytes(numSizeClassPages)); // NOLINT + }); + } if (ptr == nullptr) { // Failed to allocate memory from memory. 
+ const auto errorMsg = fmt::format( + "Malloc failed to allocate {} of memory while allocating for " + "non-contiguous allocation of {} pages", + succinctBytes(AllocationTraits::pageBytes(numSizeClassPages)), + sizeMix.totalPages); + VELOX_MEM_LOG(WARNING) << errorMsg; + setAllocatorFailureMessage(errorMsg); break; } - pages.emplace_back(ptr); + buffers.push_back(ptr); out.append(reinterpret_cast(ptr), numSizeClassPages); // NOLINT } - if (pages.size() != mix.numSizes) { + if (buffers.size() != sizeMix.numSizes) { // Failed to allocate memory using malloc. Free any malloced pages and // return false. - for (auto ptr : pages) { - ::free(ptr); + for (auto* buffer : buffers) { + ::free(buffer); } out.clear(); - if (reservationCB != nullptr) { - VELOX_MEM_LOG(WARNING) - << "Failed to allocate memory for non-contiguous allocation of " - << numPages << " pages, then release " - << succinctBytes(bytesToAllocate + freedBytes) - << " of memory reservation including the old allocation"; - reservationCB(bytesToAllocate + freedBytes, false); - } + VELOX_MEM_LOG(WARNING) + << "Failed to allocate memory for non-contiguous allocation of " + << sizeMix.totalPages << " pages"; decrementUsage(totalBytes); return false; } - { - std::lock_guard l(mallocsMutex_); - mallocs_.insert(pages.begin(), pages.end()); - } - // Successfully allocated all pages. - numAllocated_.fetch_add(mix.totalPages); + numAllocated_.fetch_add(sizeMix.totalPages); return true; } @@ -121,12 +120,10 @@ bool MallocAllocator::allocateContiguousWithoutRetry( MachinePageCount numPages, Allocation* collateral, ContiguousAllocation& allocation, - ReservationCallback reservationCB, MachinePageCount maxPages) { bool result; stats_.recordAllocate(AllocationTraits::pageBytes(numPages), 1, [&]() { - result = allocateContiguousImpl( - numPages, collateral, allocation, reservationCB, maxPages); + result = allocateContiguousImpl(numPages, collateral, allocation, maxPages); }); return result; } @@ -135,17 +132,14 @@ bool MallocAllocator::allocateContiguousImpl( MachinePageCount numPages, Allocation* collateral, ContiguousAllocation& allocation, - ReservationCallback reservationCB, MachinePageCount maxPages) { if (maxPages == 0) { maxPages = numPages; } else { VELOX_CHECK_LE(numPages, maxPages); } - MachinePageCount numCollateralPages = 0; if (collateral != nullptr) { - numCollateralPages = - freeNonContiguous(*collateral) / AllocationTraits::kPageSize; + freeNonContiguous(*collateral); } auto numContiguousCollateralPages = allocation.numPages(); if (numContiguousCollateralPages > 0) { @@ -159,41 +153,22 @@ bool MallocAllocator::allocateContiguousImpl( decrementUsage(AllocationTraits::pageBytes(numContiguousCollateralPages)); allocation.clear(); } - const auto totalCollateralPages = - numCollateralPages + numContiguousCollateralPages; - const auto totalCollateralBytes = - AllocationTraits::pageBytes(totalCollateralPages); if (numPages == 0) { - if (totalCollateralBytes != 0 && reservationCB != nullptr) { - reservationCB(totalCollateralBytes, false); - } return true; } const auto totalBytes = AllocationTraits::pageBytes(numPages); - if (!incrementUsage(totalBytes)) { - if (totalCollateralBytes != 0 && reservationCB != nullptr) { - reservationCB(totalCollateralBytes, false); - } + if (testingHasInjectedFailure(InjectedFailure::kCap) || + !incrementUsage(totalBytes)) { + const auto errorMsg = fmt::format( + "Exceeded memory allocator limit when allocating {} new pages, the " + "memory allocator capacity is {}", + numPages, + succinctBytes(capacity_)); + 
setAllocatorFailureMessage(errorMsg); + VELOX_MEM_LOG_EVERY_MS(WARNING, 1000) << errorMsg; return false; } - const int64_t numNeededPages = numPages - totalCollateralPages; - if (reservationCB != nullptr) { - try { - reservationCB(AllocationTraits::pageBytes(numNeededPages), true); - } catch (std::exception& e) { - // If the new memory reservation fails, we need to release the memory - // reservation of the freed contiguous and non-contiguous memory. - VELOX_MEM_LOG(WARNING) - << "Failed to reserve " << AllocationTraits::pageBytes(numNeededPages) - << " bytes for contiguous allocation of " << numPages - << " pages, then release " << succinctBytes(totalCollateralBytes) - << " from the old allocations"; - reservationCB(totalCollateralBytes, false); - decrementUsage(totalBytes); - std::rethrow_exception(std::current_exception()); - } - } numAllocated_.fetch_add(numPages); numMapped_.fetch_add(numPages); void* data = ::mmap( @@ -216,28 +191,20 @@ int64_t MallocAllocator::freeNonContiguous(Allocation& allocation) { if (allocation.empty()) { return 0; } - MachinePageCount numFreed = 0; + MachinePageCount freedPages{0}; for (int32_t i = 0; i < allocation.numRuns(); ++i) { Allocation::PageRun run = allocation.runAt(i); - numFreed += run.numPages(); void* ptr = run.data(); - { - std::lock_guard l(mallocsMutex_); - const auto ret = mallocs_.erase(ptr); - VELOX_CHECK_EQ(ret, 1, "Bad free page pointer: {}", ptr); - } - stats_.recordFree( - std::min( - AllocationTraits::pageBytes(sizeClassSizes_.back()), - AllocationTraits::pageBytes(run.numPages())), - [&]() { - ::free(ptr); // NOLINT - }); + const int64_t numPages = run.numPages(); + freedPages += numPages; + stats_.recordFree(AllocationTraits::pageBytes(numPages), [&]() { + ::free(ptr); // NOLINT + }); } - const auto freedBytes = AllocationTraits::pageBytes(numFreed); + const auto freedBytes = AllocationTraits::pageBytes(freedPages); decrementUsage(freedBytes); - numAllocated_.fetch_sub(numFreed); + numAllocated_.fetch_sub(freedPages); allocation.clear(); return freedBytes; } @@ -267,19 +234,16 @@ void MallocAllocator::freeContiguousImpl(ContiguousAllocation& allocation) { bool MallocAllocator::growContiguousWithoutRetry( MachinePageCount increment, - ContiguousAllocation& allocation, - ReservationCallback reservationCB) { - VELOX_CHECK_LE( - allocation.size() + increment * AllocationTraits::kPageSize, - allocation.maxSize()); - if (reservationCB != nullptr) { - // May throw. If does, there is nothing to revert. 
- reservationCB(AllocationTraits::pageBytes(increment), true); - } + ContiguousAllocation& allocation) { if (!incrementUsage(AllocationTraits::pageBytes(increment))) { - if (reservationCB != nullptr) { - reservationCB(AllocationTraits::pageBytes(increment), false); - } + const auto errorMsg = fmt::format( + "Exceeded memory allocator limit when allocating {} new pages for " + "total allocation of {} pages, the memory allocator capacity is {}", + increment, + allocation.numPages(), + succinctBytes(capacity_)); + setAllocatorFailureMessage(errorMsg); + VELOX_MEM_LOG_EVERY_MS(WARNING, 1000) << errorMsg; return false; } numAllocated_ += increment; @@ -295,6 +259,13 @@ void* MallocAllocator::allocateBytesWithoutRetry( uint64_t bytes, uint16_t alignment) { if (!incrementUsage(bytes)) { + auto errorMsg = fmt::format( + "Failed to allocateBytes {}: Exceeded memory allocator " + "limit of {}", + succinctBytes(bytes), + succinctBytes(capacity_)); + VELOX_MEM_LOG_EVERY_MS(WARNING, 1000) << errorMsg; + setAllocatorFailureMessage(errorMsg); return nullptr; } if (!isAlignmentValid(bytes, alignment)) { @@ -315,6 +286,13 @@ void* MallocAllocator::allocateBytesWithoutRetry( void* MallocAllocator::allocateZeroFilledWithoutRetry(uint64_t bytes) { if (!incrementUsage(bytes)) { + auto errorMsg = fmt::format( + "Failed to allocateZeroFilled {}: Exceeded memory allocator " + "limit of {}", + succinctBytes(bytes), + succinctBytes(capacity_)); + VELOX_MEM_LOG_EVERY_MS(WARNING, 1000) << errorMsg; + setAllocatorFailureMessage(errorMsg); return nullptr; } void* result = std::calloc(1, bytes); diff --git a/velox/common/memory/MallocAllocator.h b/velox/common/memory/MallocAllocator.h index 0b5adb0a8d653..ade9536f0fb6b 100644 --- a/velox/common/memory/MallocAllocator.h +++ b/velox/common/memory/MallocAllocator.h @@ -16,6 +16,7 @@ #pragma once +#include "velox/common/base/ConcurrentCounter.h" #include "velox/common/memory/Memory.h" #include "velox/common/memory/MemoryAllocator.h" @@ -25,17 +26,9 @@ namespace facebook::velox::memory { /// The implementation of MemoryAllocator using malloc. class MallocAllocator : public MemoryAllocator { public: - explicit MallocAllocator(size_t capacity); - - ~MallocAllocator() override { - // TODO: Remove the check when memory leak issue is resolved. - if (FLAGS_velox_memory_leak_check_enabled) { - VELOX_CHECK( - (allocatedBytes_ == 0) && (numAllocated_ == 0) && (numMapped_ == 0), - "{}", - toString()); - } - } + MallocAllocator(size_t capacity, uint32_t reservationByteLimit); + + ~MallocAllocator() override; void registerCache(const std::shared_ptr& cache) override { VELOX_CHECK_NULL(cache_); @@ -62,13 +55,18 @@ class MallocAllocator : public MemoryAllocator { bool growContiguousWithoutRetry( MachinePageCount increment, - ContiguousAllocation& allocation, - ReservationCallback reservationCB = nullptr) override; + ContiguousAllocation& allocation) override; void freeBytes(void* p, uint64_t bytes) noexcept override; + MachinePageCount unmap(MachinePageCount targetPages) override { + // NOTE: MallocAllocator doesn't support unmap as it delegates all the + // memory allocations to std::malloc. 
+ return 0; + } + size_t totalUsedBytes() const override { - return allocatedBytes_; + return allocatedBytes_ - reservations_.read(); } MachinePageCount numAllocated() const override { @@ -85,23 +83,19 @@ private: bool allocateNonContiguousWithoutRetry( - MachinePageCount numPages, - Allocation& out, - ReservationCallback reservationCB = nullptr, - MachinePageCount minSizeClass = 0) override; + const SizeMix& sizeMix, + Allocation& out) override; bool allocateContiguousWithoutRetry( MachinePageCount numPages, - Allocation* FOLLY_NULLABLE collateral, + Allocation* collateral, ContiguousAllocation& allocation, - ReservationCallback reservationCB = nullptr, MachinePageCount maxPages = 0) override; bool allocateContiguousImpl( MachinePageCount numPages, - Allocation* FOLLY_NULLABLE collateral, + Allocation* collateral, ContiguousAllocation& allocation, - ReservationCallback reservationCB, MachinePageCount maxPages); void freeContiguousImpl(ContiguousAllocation& allocation); @@ -110,13 +104,52 @@ void* allocateZeroFilledWithoutRetry(uint64_t bytes) override; - /// Increment current usage and check current allocator consistency to make - /// sure current usage does not go above 'capacity_'. If it goes above - /// 'capacity_', the increment will not be applied. Returns true if within - /// capacity, false otherwise. - /// - /// NOTE: This method should always be called BEFORE actual allocation. + // Increments current usage and checks the current 'allocatedBytes_' counter + // to make sure current usage does not go above 'capacity_'. If it goes above + // 'capacity_', the increment will not be applied. Returns true if within + // capacity, false otherwise. + // + // NOTE: This method should always be called BEFORE the actual allocation. inline bool incrementUsage(int64_t bytes) { + if (bytes < reservationByteLimit_) { + return incrementUsageWithReservation(bytes); + } + return incrementUsageWithoutReservation(bytes); + } + + // Increments the memory usage in the local sharded counter from + // 'reservations_' for memory allocation with size < 'reservationByteLimit_' + // without updating the global 'allocatedBytes_' counter. If there are not + // enough reserved bytes in the local sharded counter, then 'reserveFunc_' is + // called to reserve 'reservationByteLimit_' bytes from the global counter at + // a time. + inline bool incrementUsageWithReservation(uint32_t bytes) { + return reservations_.update(bytes, reserveFunc_); + } + + inline bool incrementUsageWithReservationFunc( + uint32_t& counter, + uint32_t increment, + std::mutex& lock) { + VELOX_CHECK_LT(increment, reservationByteLimit_); + std::lock_guard<std::mutex> l(lock); + if (counter > increment) { + counter -= increment; + return true; + } + if (!incrementUsageWithoutReservation(reservationByteLimit_)) { + return false; + } + counter += reservationByteLimit_; + counter -= increment; + VELOX_CHECK_GT(counter, 0); + return true; + } + + // Increments the memory usage from the global 'allocatedBytes_' counter + // directly. + inline bool incrementUsageWithoutReservation(int64_t bytes) { + VELOX_CHECK_GE(bytes, reservationByteLimit_); const auto originalBytes = allocatedBytes_.fetch_add(bytes); // We don't do the check when capacity_ is 0, meaning unlimited capacity. if (capacity_ != 0 && originalBytes + bytes > capacity_) { @@ -126,11 +159,45 @@ class MallocAllocator : public MemoryAllocator { return true; } - /// Decrement current usage and check current allocator consistency to make - /// sure current usage does not go below 0. Throws if usage goes below 0. - /// - /// NOTE: This method should always be called AFTER actual free. + // Decrements current usage and checks the current 'allocatedBytes_' counter + // to make sure current usage does not go below 0. Throws if usage goes below + // 0. + // + // NOTE: This method should always be called AFTER the actual free. inline void decrementUsage(int64_t bytes) { + if (bytes < reservationByteLimit_) { + decrementUsageWithReservation(bytes); + return; + } + decrementUsageWithoutReservation(bytes); + } + + // Decrements the memory usage in the local sharded counter from + // 'reservations_' for memory free with size < 'reservationByteLimit_' + // without updating the global 'allocatedBytes_' counter. If there is more + // than 2 * 'reservationByteLimit_' free reserved bytes in the local sharded + // counter, then 'releaseFunc_' is called to release 'reservationByteLimit_' + // bytes back to the global counter. + inline void decrementUsageWithReservation(int64_t bytes) { + reservations_.update(bytes, releaseFunc_); + } + + inline void decrementUsageWithReservationFunc( + uint32_t& counter, + uint32_t decrement, + std::mutex& lock) { + VELOX_CHECK_LT(decrement, reservationByteLimit_); + std::lock_guard<std::mutex> l(lock); + counter += decrement; + if (counter >= 2 * reservationByteLimit_) { + decrementUsageWithoutReservation(reservationByteLimit_); + counter -= reservationByteLimit_; + } + VELOX_CHECK_LT(counter, 2 * reservationByteLimit_); + } + + // Decrements the memory usage from the global 'allocatedBytes_' counter + // directly. + inline void decrementUsageWithoutReservation(int64_t bytes) { const auto originalBytes = allocatedBytes_.fetch_sub(bytes); if (originalBytes - bytes < 0) { // In case of inconsistency while freeing memory, do not revert in this @@ -145,18 +212,18 @@ class MallocAllocator : public MemoryAllocator { const Kind kind_; - /// Capacity in bytes. Total allocation byte is not allowed to exceed this - /// value. + // Capacity in bytes. The total allocated bytes are not allowed to exceed + // this value. const size_t capacity_; + const uint32_t reservationByteLimit_; - /// Current total allocated bytes by this 'MallocAllocator'. - std::atomic<size_t> allocatedBytes_{0}; + const ConcurrentCounter::UpdateFn reserveFunc_; + const ConcurrentCounter::UpdateFn releaseFunc_; - /// Mutex for 'mallocs_'. - std::mutex mallocsMutex_; + ConcurrentCounter reservations_; - /// Tracks malloc'd pointers to detect bad frees. - std::unordered_set<void*> mallocs_; + // Current total allocated bytes by this 'MallocAllocator'.
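The reservation scheme above trades a bounded amount of capacity slack for far fewer updates to the shared 'allocatedBytes_' counter. A standalone model of the protocol with a single shard and an illustrative limit; ConcurrentCounter's per-thread sharding is elided:

```cpp
#include <atomic>
#include <cstdint>
#include <mutex>

struct ReservationModel {
  static constexpr uint32_t kLimit = 1 << 20; // reservationByteLimit_
  const int64_t capacity = 1LL << 30;         // illustrative allocator cap
  std::atomic<int64_t> globalUsed{0};         // allocatedBytes_
  std::mutex mutex;                           // per-shard in the real code
  uint32_t reserved{0};                       // shard-local reservation

  bool increment(uint32_t bytes) { // bytes < kLimit
    std::lock_guard<std::mutex> l(mutex);
    if (reserved > bytes) { // fast path: no global counter traffic
      reserved -= bytes;
      return true;
    }
    if (globalUsed.fetch_add(kLimit) + kLimit > capacity) {
      globalUsed.fetch_sub(kLimit); // refill failed, roll back
      return false;
    }
    reserved += kLimit - bytes; // batch-refilled from the global counter
    return true;
  }

  void decrement(uint32_t bytes) { // bytes < kLimit
    std::lock_guard<std::mutex> l(mutex);
    reserved += bytes;
    if (reserved >= 2 * kLimit) { // spill a chunk back to the global counter
      globalUsed.fetch_sub(kLimit);
      reserved -= kLimit;
    }
  }
};
```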
+ std::atomic allocatedBytes_{0}; std::shared_ptr cache_; }; diff --git a/velox/common/memory/Memory.cpp b/velox/common/memory/Memory.cpp index 882a27ebb4777..6eb2437d2f8e7 100644 --- a/velox/common/memory/Memory.cpp +++ b/velox/common/memory/Memory.cpp @@ -16,93 +16,257 @@ #include "velox/common/memory/Memory.h" +#include + +#include "velox/common/base/Counters.h" +#include "velox/common/base/StatsReporter.h" +#include "velox/common/memory/MallocAllocator.h" +#include "velox/common/memory/MmapAllocator.h" + DECLARE_int32(velox_memory_num_shared_leaf_pools); namespace facebook::velox::memory { namespace { -constexpr folly::StringPiece kDefaultRootName{"__default_root__"}; -constexpr folly::StringPiece kDefaultLeafName("__default_leaf__"); +constexpr std::string_view kSysRootName{"__sys_root__"}; +constexpr std::string_view kSysSharedLeafNamePrefix{"__sys_shared_leaf__"}; + +struct SingletonState { + ~SingletonState() { + delete instance.load(std::memory_order_acquire); + } + + std::atomic instance{nullptr}; + std::mutex mutex; +}; + +SingletonState& singletonState() { + static SingletonState state; + return state; +} + +std::shared_ptr createAllocator( + const MemoryManagerOptions& options) { + if (options.useMmapAllocator) { + MmapAllocator::Options mmapOptions; + mmapOptions.capacity = options.allocatorCapacity; + mmapOptions.largestSizeClass = options.largestSizeClassPages; + mmapOptions.useMmapArena = options.useMmapArena; + mmapOptions.mmapArenaCapacityRatio = options.mmapArenaCapacityRatio; + return std::make_shared(mmapOptions); + } else { + return std::make_shared( + options.allocatorCapacity, + options.allocationSizeThresholdWithReservation); + } +} + +std::unique_ptr createArbitrator( + const MemoryManagerOptions& options) { + // TODO: consider to reserve a small amount of memory to compensate for the + // non-reclaimable cache memory which are pinned by query accesses if + // enabled. + + // TODO(jtan6): [Config Refactor] clean up the if condition after Prestissimo + // switched to use extra configs map. + if (options.extraArbitratorConfigs.empty()) { + std::unordered_map extraArbitratorConfigs; + try { + // The literal string is temporary in order to not depend on + // SharedArbitrator class. After Prestissimo switches, this part of the + // code will be removed. 
+ extraArbitratorConfigs["reserved-capacity"] = + folly::to(options.arbitratorReservedCapacity) + "B"; + extraArbitratorConfigs["memory-pool-initial-capacity"] = + folly::to(options.memoryPoolInitCapacity) + "B"; + extraArbitratorConfigs["memory-pool-reserved-capacity"] = + folly::to(options.memoryPoolReservedCapacity) + "B"; + extraArbitratorConfigs["memory-pool-transfer-capacity"] = + folly::to(options.memoryPoolTransferCapacity) + "B"; + extraArbitratorConfigs["fast-exponential-growth-capacity-limit"] = + folly::to(options.fastExponentialGrowthCapacityLimit) + + "B"; + extraArbitratorConfigs["slow-capacity-grow-pct"] = + folly::to(options.slowCapacityGrowPct); + extraArbitratorConfigs["memory-pool-min-free-capacity"] = + folly::to(options.memoryPoolMinFreeCapacity) + "B"; + extraArbitratorConfigs["memory-pool-min-free-capacity-pct"] = + folly::to(options.memoryPoolMinFreeCapacityPct); + extraArbitratorConfigs["memory-reclaim-max-wait-time"] = + folly::to(options.memoryReclaimWaitMs) + "ms"; + extraArbitratorConfigs["global-arbitration-enabled"] = + folly::to(options.globalArbitrationEnabled); + extraArbitratorConfigs["check-usage-leak"] = + folly::to(options.checkUsageLeak); + } catch (const std::exception& e) { + VELOX_USER_FAIL("Failed to parse extra arbitrator configs: {}", e.what()); + } + return MemoryArbitrator::create( + {.kind = options.arbitratorKind, + .capacity = + std::min(options.arbitratorCapacity, options.allocatorCapacity), + .arbitrationStateCheckCb = options.arbitrationStateCheckCb, + .extraConfigs = extraArbitratorConfigs}); + } else { + return MemoryArbitrator::create( + {.kind = options.arbitratorKind, + .capacity = + std::min(options.arbitratorCapacity, options.allocatorCapacity), + .arbitrationStateCheckCb = options.arbitrationStateCheckCb, + .extraConfigs = options.extraArbitratorConfigs}); + } +} + +std::vector> createSharedLeafMemoryPools( + MemoryPool& sysPool) { + VELOX_CHECK_EQ(sysPool.name(), kSysRootName); + std::vector> leafPools; + const size_t numSharedPools = + std::max(1, FLAGS_velox_memory_num_shared_leaf_pools); + leafPools.reserve(numSharedPools); + for (size_t i = 0; i < numSharedPools; ++i) { + leafPools.emplace_back( + sysPool.addLeafChild(fmt::format("{}{}", kSysSharedLeafNamePrefix, i))); + } + return leafPools; +} } // namespace MemoryManager::MemoryManager(const MemoryManagerOptions& options) - : capacity_{options.capacity}, - allocator_{options.allocator->shared_from_this()}, - // TODO: consider to reserve a small amount of memory to compensate for - // the unreclaimable cache memory which are pinned by query accesses if - // enabled. 
- arbitrator_(MemoryArbitrator::create( - {.kind = options.arbitratorKind, - .capacity = std::min(options.queryMemoryCapacity, options.capacity), - .memoryPoolInitCapacity = options.memoryPoolInitCapacity, - .memoryPoolTransferCapacity = options.memoryPoolTransferCapacity, - .arbitrationStateCheckCb = options.arbitrationStateCheckCb})), + : allocator_{createAllocator(options)}, + arbitrator_(createArbitrator(options)), alignment_(std::max(MemoryAllocator::kMinAlignment, options.alignment)), checkUsageLeak_(options.checkUsageLeak), debugEnabled_(options.debugEnabled), + coreOnAllocationFailureEnabled_(options.coreOnAllocationFailureEnabled), + disableMemoryPoolTracking_(options.disableMemoryPoolTracking), poolDestructionCb_([&](MemoryPool* pool) { dropPool(pool); }), - defaultRoot_{std::make_shared( + sysRoot_{std::make_shared( this, - kDefaultRootName.str(), + std::string(kSysRootName), MemoryPool::Kind::kAggregate, nullptr, nullptr, - nullptr, // NOTE: the default root memory pool has no capacity limit, and it is // used for system usage in production such as disk spilling. MemoryPool::Options{ .alignment = alignment_, .maxCapacity = kMaxMemory, .trackUsage = options.trackDefaultUsage, - .checkUsageLeak = options.checkUsageLeak, - .debugEnabled = options.debugEnabled})} { + .debugEnabled = options.debugEnabled, + .coreOnAllocationFailureEnabled = + options.coreOnAllocationFailureEnabled})}, + spillPool_{addLeafPool("__sys_spilling__")}, + tracePool_{addLeafPool("__sys_tracing__")}, + sharedLeafPools_(createSharedLeafMemoryPools(*sysRoot_)) { VELOX_CHECK_NOT_NULL(allocator_); VELOX_CHECK_NOT_NULL(arbitrator_); - VELOX_CHECK_EQ( - allocator_->capacity(), - capacity_, - "MemoryAllocator capacity {} must be the same as MemoryManager capacity {}.", - allocator_->capacity(), - capacity_); - VELOX_USER_CHECK_GE(capacity_, 0); + VELOX_USER_CHECK_GE(capacity(), 0); + VELOX_CHECK_GE(allocator_->capacity(), arbitrator_->capacity()); MemoryAllocator::alignmentCheck(0, alignment_); - defaultRoot_->grow(defaultRoot_->maxCapacity()); - const size_t numSharedPools = - std::max(1, FLAGS_velox_memory_num_shared_leaf_pools); - sharedLeafPools_.reserve(numSharedPools); - for (size_t i = 0; i < numSharedPools; ++i) { - sharedLeafPools_.emplace_back( - addLeafPool(fmt::format("default_shared_leaf_pool_{}", i))); - } + const bool ret = sysRoot_->grow(sysRoot_->maxCapacity(), 0); + VELOX_CHECK( + ret, + "Failed to set max capacity {} for {}", + succinctBytes(sysRoot_->maxCapacity()), + sysRoot_->name()); + VELOX_CHECK_EQ( + sharedLeafPools_.size(), + std::max(1, FLAGS_velox_memory_num_shared_leaf_pools)); } MemoryManager::~MemoryManager() { - if (checkUsageLeak_) { - VELOX_CHECK_EQ( - numPools(), + if (pools_.size() != 0) { + const auto errMsg = fmt::format( + "pools_.size() != 0 ({} vs {}). 
There are unexpected alive memory " + "pools allocated by user on memory manager destruction:\n{}", + pools_.size(), 0, - "There are {} unexpected alive memory pools allocated by user on memory manager destruction:\n{}", - numPools(), - toString()); + toString(true)); + if (checkUsageLeak_) { + VELOX_FAIL(errMsg); + } else { + LOG(ERROR) << errMsg; + } } } // static -MemoryManager& MemoryManager::getInstance(const MemoryManagerOptions& options) { - static MemoryManager manager{options}; - return manager; +MemoryManager& MemoryManager::deprecatedGetInstance( + const MemoryManagerOptions& options) { + auto& state = singletonState(); + if (auto* instance = state.instance.load(std::memory_order_acquire)) { + return *instance; + } + + std::lock_guard l(state.mutex); + auto* instance = state.instance.load(std::memory_order_acquire); + if (instance != nullptr) { + return *instance; + } + instance = new MemoryManager(options); + state.instance.store(instance, std::memory_order_release); + return *instance; +} + +// static +void MemoryManager::initialize(const MemoryManagerOptions& options) { + auto& state = singletonState(); + std::lock_guard l(state.mutex); + auto* instance = state.instance.load(std::memory_order_acquire); + VELOX_CHECK_NULL( + instance, + "The memory manager has already been set: {}", + instance->toString()); + instance = new MemoryManager(options); + state.instance.store(instance, std::memory_order_release); +} + +// static. +MemoryManager* MemoryManager::getInstance() { + auto* instance = singletonState().instance.load(std::memory_order_acquire); + VELOX_CHECK_NOT_NULL(instance, "The memory manager is not set"); + return instance; +} + +// static. +MemoryManager& MemoryManager::testingSetInstance( + const MemoryManagerOptions& options) { + auto& state = singletonState(); + std::lock_guard l(state.mutex); + auto* instance = new MemoryManager(options); + delete state.instance.exchange(instance, std::memory_order_acq_rel); + return *instance; } int64_t MemoryManager::capacity() const { - return capacity_; + return allocator_->capacity(); } uint16_t MemoryManager::alignment() const { return alignment_; } +std::shared_ptr MemoryManager::createRootPool( + std::string poolName, + std::unique_ptr& reclaimer, + MemoryPool::Options& options) { + auto pool = std::make_shared( + this, + poolName, + MemoryPool::Kind::kAggregate, + nullptr, + std::move(reclaimer), + options); + VELOX_CHECK_EQ(pool->capacity(), 0); + arbitrator_->addPool(pool); + RECORD_HISTOGRAM_METRIC_VALUE( + kMetricMemoryPoolInitialCapacityBytes, pool->capacity()); + return pool; +} + std::shared_ptr MemoryManager::addRootPool( const std::string& name, - int64_t capacity, + int64_t maxCapacity, std::unique_ptr reclaimer) { std::string poolName = name; if (poolName.empty()) { @@ -112,26 +276,28 @@ std::shared_ptr MemoryManager::addRootPool( MemoryPool::Options options; options.alignment = alignment_; - options.maxCapacity = capacity; + options.maxCapacity = maxCapacity; options.trackUsage = true; - options.checkUsageLeak = checkUsageLeak_; options.debugEnabled = debugEnabled_; + options.coreOnAllocationFailureEnabled = coreOnAllocationFailureEnabled_; - folly::SharedMutex::WriteHolder guard{mutex_}; - if (pools_.find(poolName) != pools_.end()) { - VELOX_FAIL("Duplicate root pool name found: {}", poolName); + auto pool = createRootPool(poolName, reclaimer, options); + if (!disableMemoryPoolTracking_) { + try { + std::unique_lock guard{mutex_}; + if (pools_.find(poolName) != pools_.end()) { + VELOX_FAIL("Duplicate root pool 
name found: {}", poolName); + } + pools_.emplace(poolName, pool); + } catch (const VeloxRuntimeError& ex) { + arbitrator_->removePool(pool.get()); + throw; + } } - auto pool = std::make_shared( - this, - poolName, - MemoryPool::Kind::kAggregate, - nullptr, - std::move(reclaimer), - poolDestructionCb_, - options); - pools_.emplace(poolName, pool); - VELOX_CHECK_EQ(pool->capacity(), 0); - arbitrator_->reserveMemory(pool.get(), capacity); + // NOTE: we need to set destruction callback at the end to avoid potential + // deadlock or failure because of duplicate memory pool name or unexpected + // failure to add memory pool to the arbitrator. + pool->setDestructionCallback(poolDestructionCb_); return pool; } @@ -143,33 +309,33 @@ std::shared_ptr MemoryManager::addLeafPool( static std::atomic poolId{0}; poolName = fmt::format("default_leaf_{}", poolId++); } - return defaultRoot_->addLeafChild(poolName, threadSafe, nullptr); + return sysRoot_->addLeafChild(poolName, threadSafe, nullptr); } -bool MemoryManager::growPool(MemoryPool* pool, uint64_t incrementBytes) { - VELOX_CHECK_NOT_NULL(pool); - VELOX_CHECK_NE(pool->capacity(), kMaxMemory); - return arbitrator_->growMemory(pool, getAlivePools(), incrementBytes); -} - -uint64_t MemoryManager::shrinkPools(uint64_t targetBytes) { - return arbitrator_->shrinkMemory(getAlivePools(), targetBytes); +uint64_t MemoryManager::shrinkPools( + uint64_t targetBytes, + bool allowSpill, + bool allowAbort) { + return arbitrator_->shrinkCapacity(targetBytes, allowSpill, allowAbort); } void MemoryManager::dropPool(MemoryPool* pool) { VELOX_CHECK_NOT_NULL(pool); - folly::SharedMutex::WriteHolder guard{mutex_}; + VELOX_DCHECK_EQ(pool->reservedBytes(), 0); + arbitrator_->removePool(pool); + if (disableMemoryPoolTracking_) { + return; + } + std::unique_lock guard{mutex_}; auto it = pools_.find(pool->name()); if (it == pools_.end()) { VELOX_FAIL("The dropped memory pool {} not found", pool->name()); } pools_.erase(it); - arbitrator_->releaseMemory(pool); } MemoryPool& MemoryManager::deprecatedSharedLeafPool() { const auto idx = std::hash{}(std::this_thread::get_id()); - folly::SharedMutex::ReadHolder guard{mutex_}; return *sharedLeafPools_.at(idx % sharedLeafPools_.size()); } @@ -178,35 +344,45 @@ int64_t MemoryManager::getTotalBytes() const { } size_t MemoryManager::numPools() const { - size_t numPools = defaultRoot_->getChildCount(); - VELOX_CHECK_GE(numPools, 0); + size_t numPools = sysRoot_->getChildCount(); { - folly::SharedMutex::ReadHolder guard{mutex_}; + std::shared_lock guard{mutex_}; numPools += pools_.size() - sharedLeafPools_.size(); } return numPools; } -MemoryAllocator& MemoryManager::allocator() { - return *allocator_; +MemoryAllocator* MemoryManager::allocator() { + return allocator_.get(); } MemoryArbitrator* MemoryManager::arbitrator() { return arbitrator_.get(); } -std::string MemoryManager::toString() const { +std::string MemoryManager::toString(bool detail) const { + const int64_t allocatorCapacity = capacity(); std::stringstream out; out << "Memory Manager[capacity " - << (capacity_ == kMaxMemory ? "UNLIMITED" : succinctBytes(capacity_)) + << (allocatorCapacity == kMaxMemory ? 
"UNLIMITED" + : succinctBytes(allocatorCapacity)) << " alignment " << succinctBytes(alignment_) << " usedBytes " << succinctBytes(getTotalBytes()) << " number of pools " << numPools() << "\n"; out << "List of root pools:\n"; - out << "\t" << defaultRoot_->name() << "\n"; + if (detail) { + out << sysRoot_->treeMemoryUsage(false); + } else { + out << "\t" << sysRoot_->name() << "\n"; + } std::vector> pools = getAlivePools(); for (const auto& pool : pools) { - out << "\t" << pool->name() << "\n"; + if (detail) { + out << pool->treeMemoryUsage(false); + } else { + out << "\t" << pool->name() << "\n"; + } + out << "\trefcount " << pool.use_count() << "\n"; } out << allocator_->toString() << "\n"; out << arbitrator_->toString(); @@ -216,7 +392,7 @@ std::string MemoryManager::toString() const { std::vector> MemoryManager::getAlivePools() const { std::vector> pools; - folly::SharedMutex::ReadHolder guard{mutex_}; + std::shared_lock guard{mutex_}; pools.reserve(pools_.size()); for (const auto& entry : pools_) { auto pool = entry.second.lock(); @@ -227,27 +403,38 @@ std::vector> MemoryManager::getAlivePools() const { return pools; } -MemoryManager& defaultMemoryManager() { +void initializeMemoryManager(const MemoryManagerOptions& options) { + MemoryManager::initialize(options); +} + +MemoryManager* memoryManager() { return MemoryManager::getInstance(); } -std::shared_ptr addDefaultLeafMemoryPool( +MemoryManager& deprecatedDefaultMemoryManager() { + return MemoryManager::deprecatedGetInstance(); +} + +std::shared_ptr deprecatedAddDefaultLeafMemoryPool( const std::string& name, bool threadSafe) { - auto& memoryManager = defaultMemoryManager(); + auto& memoryManager = deprecatedDefaultMemoryManager(); return memoryManager.addLeafPool(name, threadSafe); } MemoryPool& deprecatedSharedLeafPool() { - return defaultMemoryManager().deprecatedSharedLeafPool(); + return deprecatedDefaultMemoryManager().deprecatedSharedLeafPool(); } memory::MemoryPool* spillMemoryPool() { - static auto pool = memory::addDefaultLeafMemoryPool("_sys.spilling"); - return pool.get(); + return memory::MemoryManager::getInstance()->spillPool(); } bool isSpillMemoryPool(memory::MemoryPool* pool) { return pool == spillMemoryPool(); } + +memory::MemoryPool* traceMemoryPool() { + return memory::MemoryManager::getInstance()->tracePool(); +} } // namespace facebook::velox::memory diff --git a/velox/common/memory/Memory.h b/velox/common/memory/Memory.h index 1f6fa29836de6..5daea43bd1379 100644 --- a/velox/common/memory/Memory.h +++ b/velox/common/memory/Memory.h @@ -63,21 +63,6 @@ struct MemoryManagerOptions { /// Specifies the default memory allocation alignment. uint16_t alignment{MemoryAllocator::kMaxAlignment}; - /// Specifies the max memory capacity in bytes. MemoryManager will not - /// enforce capacity. This will be used by MemoryArbitrator - int64_t capacity{MemoryAllocator::kDefaultCapacityBytes}; - - /// Memory capacity for query/task memory pools. This capacity setting should - /// be equal or smaller than 'capacity'. The difference between 'capacity' and - /// 'queryMemoryCapacity' is reserved for system usage such as cache and - /// spilling. - /// - /// NOTE: - /// - if 'queryMemoryCapacity' is greater than 'capacity', the behavior - /// will be equivalent to as if they are equal, meaning no reservation - /// capacity for system usage. - int64_t queryMemoryCapacity{kMaxMemory}; - /// If true, enable memory usage tracking in the default memory pool. 
bool trackDefaultUsage{ FLAGS_velox_enable_memory_usage_track_in_default_memory_pool}; @@ -93,10 +78,85 @@ struct MemoryManagerOptions { /// testing purpose. bool debugEnabled{FLAGS_velox_memory_pool_debug_enabled}; - /// Specifies the backing memory allocator. - MemoryAllocator* allocator{MemoryAllocator::getInstance()}; + /// Terminates the process and generates a core file on an allocation failure. + bool coreOnAllocationFailureEnabled{false}; + + /// Disables the memory manager's tracking on memory pools. + bool disableMemoryPoolTracking{false}; + + /// ================== 'MemoryAllocator' settings ================== + + /// Specifies the max memory allocation capacity in bytes enforced by + /// MemoryAllocator, default unlimited. + int64_t allocatorCapacity{kMaxMemory}; + + /// If true, uses MmapAllocator for memory allocation, which manages the + /// physical memory allocation on its own through mmap techniques. If false, + /// uses MallocAllocator, which delegates the memory allocation to + /// std::malloc. + bool useMmapAllocator{false}; + + /// Number of pages in the largest size class in MmapAllocator. + int32_t largestSizeClassPages{256}; + + /// If true, allocations larger than the largest size class size will be + /// delegated to ManagedMmapArena. Otherwise a system mmap call will be issued + /// for each such allocation. + /// + /// NOTE: this only applies for MmapAllocator. + bool useMmapArena{false}; + + /// Used to determine MmapArena capacity. The ratio represents the + /// 'allocatorCapacity' to single MmapArena capacity ratio. + /// + /// NOTE: this only applies for MmapAllocator. + int32_t mmapArenaCapacityRatio{10}; + + /// If not zero, reserve 'smallAllocationReservePct'% of space from + /// 'allocatorCapacity' for ad hoc small allocations. Those allocations are + /// delegated to std::malloc. If 'maxMallocBytes' is 0, this value will be + /// disregarded. + /// + /// NOTE: this only applies for MmapAllocator. + uint32_t smallAllocationReservePct{0}; + + /// The allocation threshold less than which an allocation is delegated to + /// std::malloc(). If it is zero, then we don't delegate any allocation to + /// std::malloc, and 'smallAllocationReservePct' will be automatically set to + /// 0 disregarding any passed in value. + /// + /// NOTE: this only applies for MmapAllocator. + int32_t maxMallocBytes{3072}; + + /// The memory allocations with size smaller than this threshold check the + /// capacity with the local sharded counter to reduce the lock contention on + /// the global allocation counter. The sharded local counters reserve/release + /// memory capacity from the global counter in batches. With this + /// optimization, we don't have to update the global counter for each + /// individual small memory allocation. If it is zero, then this optimization + /// is disabled. The default is 1MB. + /// + /// NOTE: this only applies for MallocAllocator. + uint32_t allocationSizeThresholdWithReservation{1 << 20};
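A hypothetical configuration exercising the allocator options above; the values are illustrative, not recommendations:

```cpp
memory::MemoryManagerOptions options;
options.allocatorCapacity = 16LL << 30; // hard cap enforced by the allocator
options.useMmapAllocator = true;        // MmapAllocator instead of malloc
options.largestSizeClassPages = 256;
options.useMmapArena = true;            // big allocations via ManagedMmapArena
options.mmapArenaCapacityRatio = 10;
options.maxMallocBytes = 3072;          // tiny allocations still use malloc
options.smallAllocationReservePct = 10; // capacity carved out for them
```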
+ /// + /// NOTE: + /// - if 'arbitratorCapacity' is greater than 'allocatorCapacity', the + /// behavior will be as if they were equal, meaning no reserved + /// capacity for system usage. + int64_t arbitratorCapacity{kMaxMemory}; + + /// Memory capacity reserved to ensure that a query has minimal memory + /// capacity to run. This capacity should be less than 'arbitratorCapacity'. + /// A query's minimal memory capacity is defined by + /// 'memoryPoolReservedCapacity'. + int64_t arbitratorReservedCapacity{0}; /// The string kind of memory arbitrator used in the memory manager. /// @@ -104,12 +164,67 @@ struct MemoryManagerOptions { /// Otherwise MemoryArbitrator::create returns a nullptr. std::string arbitratorKind{}; - /// The initial memory capacity to reserve for a newly created memory pool. + /// The initial memory capacity to reserve for a newly created query memory + /// pool. uint64_t memoryPoolInitCapacity{256 << 20}; + /// The minimal query memory pool capacity that is ensured during arbitration. + /// During arbitration, the memory arbitrator keeps each participant's memory + /// pool capacity at or above this value on a best-effort basis, for smoother + /// query execution and fewer arbitration requests. + uint64_t memoryPoolReservedCapacity{0}; + /// The minimal memory capacity to transfer out of or into a memory pool /// during the memory arbitration. - uint64_t memoryPoolTransferCapacity{32 << 20}; + uint64_t memoryPoolTransferCapacity{128 << 20}; + + /// When growing capacity, the growth bytes will be adjusted in the + /// following way: + /// - If 2 * current capacity is less than or equal to + /// 'fastExponentialGrowthCapacityLimit', grow through the fast path by at + /// least doubling the current capacity, when conditions allow (see the + /// NOTE section below). + /// - If 2 * current capacity is greater than + /// 'fastExponentialGrowthCapacityLimit', grow through the slow path by + /// growing capacity by at least 'slowCapacityGrowPct' * current capacity + /// if allowed (see the NOTE section below). + /// + /// NOTE: if the originally requested growth bytes are larger than the + /// adjusted growth bytes, or the adjusted growth bytes reach the max + /// capacity limit, the adjustment is not applied. + /// + /// NOTE: capacity growth adjustment is only enabled if both + /// 'fastExponentialGrowthCapacityLimit' and 'slowCapacityGrowPct' are set, + /// otherwise it is disabled. + uint64_t fastExponentialGrowthCapacityLimit{512 << 20}; + double slowCapacityGrowPct{0.25}; + + /// When shrinking capacity, the shrink bytes will be adjusted such that, + /// AFTER the shrink, the stricter (whichever is smaller) of the following + /// conditions is met, in order to better fit the pool's current memory + /// usage: + /// - Free capacity is greater than or equal to capacity * + /// 'memoryPoolMinFreeCapacityPct' + /// - Free capacity is greater than or equal to 'memoryPoolMinFreeCapacity' + /// + /// NOTE: if the originally requested shrink bytes would already leave more + /// free capacity than the above two conditions require, the adjustment is + /// not applied. + /// + /// NOTE: capacity shrink adjustment is enabled only when both + /// 'memoryPoolMinFreeCapacityPct' and 'memoryPoolMinFreeCapacity' are set. + uint64_t memoryPoolMinFreeCapacity{128 << 20}; + double memoryPoolMinFreeCapacityPct{0.25}; + + /// Specifies the max time to wait for memory reclaim by arbitration. The + /// memory reclaim might fail if the max wait time is exceeded.
If it is + /// zero, then there is no timeout. The default is 5 mins. + uint64_t memoryReclaimWaitMs{300'000}; + + /// If true, allows the memory arbitrator to reclaim used memory across query + /// memory pools. + bool globalArbitrationEnabled{false}; /// Provided by the query system to validate the state after a memory pool /// enters arbitration if not null. For instance, Prestissimo provides @@ -118,11 +233,16 @@ struct MemoryManagerOptions { /// potential deadlock when reclaim memory from the task of the request memory /// pool. MemoryArbitrationStateCheckCB arbitrationStateCheckCb{nullptr}; + + /// TODO(jtan6): [Config Refactor] Remove above shared arbitrator specific + /// configs after Prestissimo switches to using the extra configs map. + /// + /// Additional configs that are arbitrator implementation specific. + std::unordered_map<std::string, std::string> extraArbitratorConfigs{}; }; -/// 'MemoryManager' is responsible for managing the memory pools. For now, users -/// wanting multiple different allocators would need to instantiate different -/// MemoryManager classes and manage them across static boundaries. +/// 'MemoryManager' is responsible for creating the allocator and arbitrator, +/// and for managing the memory pools. class MemoryManager { public: explicit MemoryManager( @@ -130,11 +250,23 @@ class MemoryManager { ~MemoryManager(); - /// Tries to get the singleton memory manager. If not previously initialized, - /// the process singleton manager will be initialized. - FOLLY_EXPORT static MemoryManager& getInstance( + /// Creates the process-wide memory manager using the specified options. + /// Throws if the memory manager has already been created by an earlier call. + static void initialize(const MemoryManagerOptions& options); + + /// Returns the process-wide memory manager. Throws if 'initialize' hasn't + /// been called yet. + static MemoryManager* getInstance(); + + /// Deprecated. Do not use. Remove once existing call sites are updated. + /// Returns the process-wide default memory manager instance if it exists, + /// otherwise creates one based on the specified 'options'. + FOLLY_EXPORT static MemoryManager& deprecatedGetInstance( const MemoryManagerOptions& options = MemoryManagerOptions{}); + /// Used by tests to override the process-wide memory manager. + static MemoryManager& testingSetInstance(const MemoryManagerOptions& options); + /// Returns the memory capacity of this memory manager which puts a hard cap /// on memory usage, and any allocation that exceeds this capacity throws. int64_t capacity() const; @@ -142,12 +274,12 @@ class MemoryManager { /// Returns the memory allocation alignment of this memory manager. uint16_t alignment() const; - /// Creates a root memory pool with specified 'name' and 'capacity'. If 'name' - /// is missing, the memory manager generates a default name internally to - /// ensure uniqueness. + /// Creates a root memory pool with specified 'name' and 'maxCapacity'. If + /// 'name' is missing, the memory manager generates a default name internally + /// to ensure uniqueness. std::shared_ptr<MemoryPool> addRootPool( const std::string& name = "", - int64_t capacity = kMaxMemory, + int64_t maxCapacity = kMaxMemory, std::unique_ptr<MemoryReclaimer> reclaimer = nullptr); /// Creates a leaf memory pool for direct memory allocation use with specified @@ -160,13 +292,17 @@ class MemoryManager { const std::string& name = "", bool threadSafe = true); - /// Invoked to grows a memory pool's free capacity with at least - /// 'incrementBytes'. The function returns true on success, otherwise false.
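Read together, `fastExponentialGrowthCapacityLimit` and `slowCapacityGrowPct` define a piecewise growth policy. The sketch below mirrors the documented adjustment only; it is not the arbitrator's actual code, all names are local to the sketch, and the caller is still responsible for clamping to the pool's max capacity:

```cpp
#include <algorithm>
#include <cstdint>

// Mirrors the documented growth adjustment policy; illustrative only.
uint64_t adjustedGrowthBytes(
    uint64_t currentCapacity,
    uint64_t requestBytes,
    uint64_t fastLimit, // fastExponentialGrowthCapacityLimit
    double slowPct) {   // slowCapacityGrowPct
  if (fastLimit == 0 || slowPct == 0) {
    return requestBytes; // Adjustment disabled.
  }
  const uint64_t minGrowth = (2 * currentCapacity <= fastLimit)
      ? currentCapacity // Fast path: at least double the current capacity.
      : static_cast<uint64_t>(currentCapacity * slowPct); // Slow path.
  // The larger of the request and the policy minimum; if this reaches the
  // pool's max capacity, the adjustment is dropped (not shown here).
  return std::max(requestBytes, minGrowth);
}
```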
- bool growPool(MemoryPool* pool, uint64_t incrementBytes); - /// Invoked to shrink alive pools to free 'targetBytes' capacity. The function - /// returns the actual freed memory capacity in bytes. - uint64_t shrinkPools(uint64_t targetBytes); + /// returns the actual freed memory capacity in bytes. If 'targetBytes' is + /// zero, then try to reclaim all the memory from the alive pools. If + /// 'allowSpill' is true, it reclaims the used memory by spilling. If + /// 'allowAbort' is true, it reclaims the used memory by aborting the queries + /// with the most memory usage. If both are true, it first reclaims the used + /// memory by spilling and then abort queries to reach the reclaim target. + uint64_t shrinkPools( + uint64_t targetBytes = 0, + bool allowSpill = true, + bool allowAbort = false); /// Default unmanaged leaf pool with no threadsafe stats support. Libraries /// using this method can get a pool that is shared with other threads. The @@ -186,17 +322,29 @@ class MemoryManager { /// leaf memory pools. size_t numPools() const; - MemoryAllocator& allocator(); + MemoryAllocator* allocator(); MemoryArbitrator* arbitrator(); - /// Returns debug string of this memory manager. - std::string toString() const; + /// Returns debug string of this memory manager. If 'detail' is true, it + /// returns the detailed tree memory usage from all the top level root memory + /// pools. + std::string toString(bool detail = false) const; /// Returns the memory manger's internal default root memory pool for testing /// purpose. MemoryPool& testingDefaultRoot() const { - return *defaultRoot_; + return *sysRoot_; + } + + /// Returns the process wide leaf memory pool used for disk spilling. + MemoryPool* spillPool() { + return spillPool_.get(); + } + + /// Returns the process wide leaf memory pool used for query tracing. + MemoryPool* tracePool() const { + return tracePool_.get(); } const std::vector>& testingSharedLeafPools() { @@ -204,41 +352,62 @@ class MemoryManager { } private: + std::shared_ptr createRootPool( + std::string poolName, + std::unique_ptr& reclaimer, + MemoryPool::Options& options); + void dropPool(MemoryPool* pool); // Returns the shared references to all the alive memory pools in 'pools_'. std::vector> getAlivePools() const; - // Specifies the total memory capacity. Memory manager itself doesn't enforce - // the capacity but relies on memory allocator and memory arbitrator to do the - // enforcement. Memory allocator ensures physical memory allocations are - // within capacity limit. Memory arbitrator ensures that total allocated - // memory pool capacity is within the limit. - const int64_t capacity_; const std::shared_ptr allocator_; + // If not null, used to arbitrate the memory capacity among 'pools_'. const std::unique_ptr arbitrator_; const uint16_t alignment_; const bool checkUsageLeak_; const bool debugEnabled_; - // The destruction callback set for the allocated root memory pools which are + const bool coreOnAllocationFailureEnabled_; + const bool disableMemoryPoolTracking_; + + // The destruction callback set for the allocated root memory pools which are // tracked by 'pools_'. It is invoked on the root pool destruction and removes // the pool from 'pools_'. 
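The extended `shrinkPools()` contract above makes the reclaim escalation explicit. A hedged usage sketch for a system reacting to memory pressure (the helper name is hypothetical):

```cpp
// Hypothetical helper: spill first, abort heavy queries only as a last resort.
uint64_t relieveMemoryPressure(
    facebook::velox::memory::MemoryManager* manager, uint64_t needBytes) {
  uint64_t freed =
      manager->shrinkPools(needBytes, /*allowSpill=*/true, /*allowAbort=*/false);
  if (freed < needBytes) {
    freed += manager->shrinkPools(
        needBytes - freed, /*allowSpill=*/true, /*allowAbort=*/true);
  }
  return freed;
}
```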
const MemoryPoolImpl::DestructionCallback poolDestructionCb_; - const std::shared_ptr defaultRoot_; - std::vector> sharedLeafPools_; + const std::shared_ptr sysRoot_; + const std::shared_ptr spillPool_; + const std::shared_ptr tracePool_; + const std::vector> sharedLeafPools_; mutable folly::SharedMutex mutex_; + // All user root pools allocated from 'this'. std::unordered_map> pools_; }; -MemoryManager& defaultMemoryManager(); +/// Initializes the process-wide memory manager based on the specified +/// 'options'. +/// +/// NOTE: user should only call this once on query system startup. Otherwise, +/// the function throws. +void initializeMemoryManager(const MemoryManagerOptions& options); + +/// Returns the process-wide memory manager. +/// +/// NOTE: user should have already initialized memory manager by calling. +/// Otherwise, the function throws. +MemoryManager* memoryManager(); +/// Deprecated. Do not use. +MemoryManager& deprecatedDefaultMemoryManager(); + +/// Deprecated. Do not use. /// Creates a leaf memory pool from the default memory manager for memory /// allocation use. If 'threadSafe' is true, then creates a leaf memory pool /// with thread-safe memory usage tracking. -std::shared_ptr addDefaultLeafMemoryPool( +std::shared_ptr deprecatedAddDefaultLeafMemoryPool( const std::string& name = "", bool threadSafe = true); @@ -257,6 +426,9 @@ memory::MemoryPool* spillMemoryPool(); /// Returns true if the provided 'pool' is the spilling memory pool. bool isSpillMemoryPool(memory::MemoryPool* pool); +/// Returns the system-wide memory pool for tracing memory usage. +memory::MemoryPool* traceMemoryPool(); + FOLLY_ALWAYS_INLINE int32_t alignmentPadding(void* address, int32_t alignment) { auto extra = reinterpret_cast(address) % alignment; return extra == 0 ? 
0 : alignment - extra; diff --git a/velox/common/memory/MemoryAllocator.cpp b/velox/common/memory/MemoryAllocator.cpp index f2da6e71f0818..2dd22b2ff5777 100644 --- a/velox/common/memory/MemoryAllocator.cpp +++ b/velox/common/memory/MemoryAllocator.cpp @@ -18,6 +18,7 @@ #include "velox/common/memory/MallocAllocator.h" #include +#include #include #include @@ -28,9 +29,39 @@ DECLARE_bool(velox_memory_use_hugepages); namespace facebook::velox::memory { -std::shared_ptr<MemoryAllocator> MemoryAllocator::instance_; -MemoryAllocator* MemoryAllocator::customInstance_; -std::mutex MemoryAllocator::initMutex_; +// static +std::vector<MachinePageCount> MemoryAllocator::makeSizeClassSizes( + MachinePageCount largest) { + VELOX_CHECK_LE(256, largest); + VELOX_CHECK_EQ(largest, bits::nextPowerOfTwo(largest)); + std::vector<MachinePageCount> sizes; + for (auto size = 1; size <= largest; size *= 2) { + sizes.push_back(size); + } + return sizes; +} + +namespace { +std::string& cacheFailureMessage() { + thread_local std::string message; + return message; +} + +std::string& allocatorFailureMessage() { + thread_local std::string errMsg; + return errMsg; +} +} // namespace + +void setCacheFailureMessage(std::string message) { + cacheFailureMessage() = std::move(message); +} + +std::string getAndClearCacheFailureMessage() { + auto errMsg = std::move(cacheFailureMessage()); + cacheFailureMessage().clear(); // ensure it's in a valid state + return errMsg; +} std::string MemoryAllocator::kindString(Kind kind) { switch (kind) { @@ -57,78 +88,49 @@ MemoryAllocator::SizeMix MemoryAllocator::allocationSize( "Requesting minimum size {} larger than largest size class {}", minSizeClass, sizeClassSizes_.back()); - MemoryAllocator::SizeMix mix; - int32_t needed = numPages; - int32_t pagesToAlloc = 0; + int32_t neededPages = numPages; + MachinePageCount pagesToAlloc{0}; for (int32_t sizeIndex = sizeClassSizes_.size() - 1; sizeIndex >= 0; --sizeIndex) { - const int32_t size = sizeClassSizes_[sizeIndex]; + const MachinePageCount classPageSize = sizeClassSizes_[sizeIndex]; const bool isSmallest = sizeIndex == 0 || sizeClassSizes_[sizeIndex - 1] < minSizeClass; // If the size is less than 1/8 of the size from the next larger, // use the next larger size. - if (size > (needed + (needed / 8)) && !isSmallest) { + if (classPageSize > (neededPages + (neededPages / 8)) && !isSmallest) { continue; } - int32_t numUnits = std::max(1, needed / size); - needed -= numUnits * size; - if (isSmallest && needed > 0) { - // If needed / size had a remainder, add one more unit. Do this - // if the present size class is the smallest or 'minSizeClass' - // size. - ++numUnits; - needed -= size; - } - if (FOLLY_UNLIKELY(numUnits * size > Allocation::PageRun::kMaxPagesInRun)) { - VELOX_MEM_ALLOC_ERROR(fmt::format( - "Too many pages {} to allocate, the number of units {} at size class of {} exceeds the PageRun limit {}", - numPages, - numUnits, - size, - Allocation::PageRun::kMaxPagesInRun)); + const MachinePageCount maxNumClassPages = + Allocation::PageRun::kMaxPagesInRun / classPageSize; + MachinePageCount numClassPages = std::min<MachinePageCount>( + maxNumClassPages, + std::max<MachinePageCount>(1, neededPages / classPageSize)); + neededPages -= numClassPages * classPageSize; + if (isSmallest && neededPages > 0 && numClassPages < maxNumClassPages) { + // If needed / size had a remainder, add one more unit. Do this if the + // present size class is the smallest or 'minSizeClass' size.
+ ++numClassPages; + neededPages -= classPageSize; } - mix.sizeCounts[mix.numSizes] = numUnits; - pagesToAlloc += numUnits * size; - mix.sizeIndices[mix.numSizes++] = sizeIndex; - if (needed <= 0) { + VELOX_CHECK_LE( + classPageSize * numClassPages, Allocation::PageRun::kMaxPagesInRun); + + mix.sizeCounts.push_back(numClassPages); + mix.sizeIndices.push_back(sizeIndex); + ++mix.numSizes; + pagesToAlloc += numClassPages * classPageSize; + if (neededPages <= 0) { break; } + if (FOLLY_UNLIKELY(numClassPages == maxNumClassPages)) { + ++sizeIndex; + } } mix.totalPages = pagesToAlloc; return mix; } -// static -MemoryAllocator* MemoryAllocator::getInstance() { - std::lock_guard l(initMutex_); - if (customInstance_ != nullptr) { - return customInstance_; - } - if (instance_ != nullptr) { - return instance_.get(); - } - instance_ = createDefaultInstance(); - return instance_.get(); -} - -// static -std::shared_ptr MemoryAllocator::createDefaultInstance() { - return std::make_shared(kDefaultCapacityBytes); -} - -// static -void MemoryAllocator::setDefaultInstance(MemoryAllocator* instance) { - std::lock_guard l(initMutex_); - customInstance_ = instance; -} - -// static -void MemoryAllocator::testingDestroyInstance() { - std::lock_guard l(initMutex_); - instance_ = nullptr; -} - // static bool MemoryAllocator::isAlignmentValid( uint64_t allocateBytes, @@ -154,8 +156,7 @@ void MemoryAllocator::alignmentCheck( MachinePageCount MemoryAllocator::roundUpToSizeClassSize( size_t bytes, const std::vector& sizes) { - auto pages = bits::roundUp(bytes, AllocationTraits::kPageSize) / - AllocationTraits::kPageSize; + auto pages = AllocationTraits::numPages(bytes); VELOX_CHECK_LE(pages, sizes.back()); return *std::lower_bound(sizes.begin(), sizes.end(), pages); } @@ -173,26 +174,57 @@ bool MemoryAllocator::allocateNonContiguous( Allocation& out, ReservationCallback reservationCB, MachinePageCount minSizeClass) { + const MachinePageCount numPagesToFree = out.numPages(); + const uint64_t bytesToFree = AllocationTraits::pageBytes(numPagesToFree); + auto cleanupAllocAndReleaseReservation = [&](uint64_t reservationBytes) { + if (!out.empty()) { + freeNonContiguous(out); + } + if (reservationCB != nullptr && reservationBytes > 0) { + reservationCB(reservationBytes, false); + } + }; + if (numPages == 0) { + cleanupAllocAndReleaseReservation(bytesToFree); + return true; + } + + const SizeMix mix = allocationSize(numPages, minSizeClass); + if (reservationCB != nullptr) { + if (mix.totalPages >= numPagesToFree) { + const uint64_t numNeededPages = mix.totalPages - numPagesToFree; + try { + reservationCB(AllocationTraits::pageBytes(numNeededPages), true); + } catch (const std::exception&) { + VELOX_MEM_LOG_EVERY_MS(WARNING, 1'000) + << "Exceeded memory reservation limit when reserve " + << numNeededPages << " new pages when allocate " << mix.totalPages + << " pages"; + cleanupAllocAndReleaseReservation(bytesToFree); + std::rethrow_exception(std::current_exception()); + } + } else { + const uint64_t numExtraPages = numPagesToFree - mix.totalPages; + reservationCB(AllocationTraits::pageBytes(numExtraPages), false); + } + } + + const auto totalBytesReserved = AllocationTraits::pageBytes(mix.totalPages); + bool success = false; if (cache() == nullptr) { - return allocateNonContiguousWithoutRetry( - numPages, out, reservationCB, minSizeClass); + success = allocateNonContiguousWithoutRetry(mix, out); + } else { + success = cache()->makeSpace( + pagesToAcquire(numPages, out.numPages()), [&](Allocation& acquired) { + 
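To make the rewritten `allocationSize()` concrete, the standalone sketch below replays its greedy size-class walk for an 11-page request against the default classes; it mirrors the algorithm described above rather than calling the (protected) Velox method:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const std::vector<int32_t> classes{256, 128, 64, 32, 16, 8, 4, 2, 1};
  int32_t needed = 11; // Illustrative request: 11 machine pages.
  for (int32_t size : classes) {
    const bool smallest = (size == 1);
    // Skip a class that overshoots the remainder by more than 1/8.
    if (size > needed + needed / 8 && !smallest) {
      continue;
    }
    int32_t units = std::max(1, needed / size);
    needed -= units * size;
    if (smallest && needed > 0) {
      ++units; // Round the remainder up to one more smallest-class run.
      needed -= size;
    }
    std::cout << units << " run(s) of " << size << " page(s)\n";
    if (needed <= 0) {
      break;
    }
  }
  // Prints 1 run of 8, 1 run of 2 and 1 run of 1: 11 pages in three runs.
  return 0;
}
```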
freeNonContiguous(acquired); + return allocateNonContiguousWithoutRetry(mix, out); + }); } - bool success = cache()->makeSpace( - pagesToAcquire(numPages, out.numPages()), [&](Allocation& acquired) { - freeNonContiguous(acquired); - return allocateNonContiguousWithoutRetry( - numPages, out, reservationCB, minSizeClass); - }); if (!success) { // There can be a failure where allocation was never called because there // never was a chance based on numAllocated() and capacity(). Make sure old // data is still freed. - if (!out.empty()) { - if (reservationCB) { - reservationCB(AllocationTraits::pageBytes(out.numPages()), false); - } - freeNonContiguous(out); - } + cleanupAllocAndReleaseReservation(totalBytesReserved); } return success; } @@ -203,38 +235,66 @@ bool MemoryAllocator::allocateContiguous( ContiguousAllocation& allocation, ReservationCallback reservationCB, MachinePageCount maxPages) { - if (cache() == nullptr) { - return allocateContiguousWithoutRetry( - numPages, collateral, allocation, reservationCB, maxPages); - } - auto numCollateralPages = + const MachinePageCount numCollateralPages = allocation.numPages() + (collateral ? collateral->numPages() : 0); - bool success = cache()->makeSpace( - pagesToAcquire(numPages, numCollateralPages), [&](Allocation& acquired) { - freeNonContiguous(acquired); - return allocateContiguousWithoutRetry( - numPages, collateral, allocation, reservationCB, maxPages); - }); - if (!success) { - // never was a chance based on numAllocated() and capacity(). Make sure old - // data is still freed. - int64_t bytes = 0; - ; - if (collateral && !collateral->empty()) { - if (reservationCB) { - bytes += AllocationTraits::pageBytes(collateral->numPages()); - } + const uint64_t totalCollateralBytes = + AllocationTraits::pageBytes(numCollateralPages); + auto cleanupCollateralAndReleaseReservation = [&](uint64_t reservationBytes) { + if ((collateral != nullptr) && !collateral->empty()) { freeNonContiguous(*collateral); } if (!allocation.empty()) { - if (reservationCB) { - bytes += allocation.size(); - } freeContiguous(allocation); } - if (bytes) { - reservationCB(bytes, false); + if ((reservationCB) != nullptr && (reservationBytes > 0)) { + reservationCB(reservationBytes, false); } + }; + + if (numPages == 0) { + cleanupCollateralAndReleaseReservation(totalCollateralBytes); + return true; + } + + if (reservationCB != nullptr) { + if (numPages >= numCollateralPages) { + const int64_t numNeededPages = numPages - numCollateralPages; + try { + reservationCB(AllocationTraits::pageBytes(numNeededPages), true); + } catch (const std::exception& e) { + VELOX_MEM_LOG_EVERY_MS(WARNING, 1'000) + << "Exceeded memory reservation limit when reserve " + << numNeededPages << " new pages when allocate " << numPages + << " pages, error: " << e.what(); + cleanupCollateralAndReleaseReservation(totalCollateralBytes); + std::rethrow_exception(std::current_exception()); + } + } else { + const uint64_t numExtraPages = numCollateralPages - numPages; + reservationCB(AllocationTraits::pageBytes(numExtraPages), false); + } + } + + const uint64_t totalBytesReserved = AllocationTraits::pageBytes(numPages); + bool success = false; + if (cache() == nullptr) { + success = allocateContiguousWithoutRetry( + numPages, collateral, allocation, maxPages); + } else { + success = cache()->makeSpace( + pagesToAcquire(numPages, numCollateralPages), + [&](Allocation& acquired) { + freeNonContiguous(acquired); + return allocateContiguousWithoutRetry( + numPages, collateral, allocation, maxPages); + }); + } + + 
if (!success) { + // There can be a failure where allocation was never called because there + // never was a chance based on numAllocated() and capacity(). Make sure old + // data is still freed. + cleanupCollateralAndReleaseReservation(totalBytesReserved); } return success; } @@ -243,13 +303,29 @@ bool MemoryAllocator::growContiguous( MachinePageCount increment, ContiguousAllocation& allocation, ReservationCallback reservationCB) { + VELOX_CHECK_LE( + allocation.size() + increment * AllocationTraits::kPageSize, + allocation.maxSize()); + if (increment == 0) { + return true; + } + if (reservationCB != nullptr) { + // May throw. If it does, there is nothing to revert. + reservationCB(AllocationTraits::pageBytes(increment), true); + } + bool success = false; if (cache() == nullptr) { - return growContiguousWithoutRetry(increment, allocation, reservationCB); + success = growContiguousWithoutRetry(increment, allocation); + } else { + success = cache()->makeSpace(increment, [&](Allocation& acquired) { + freeNonContiguous(acquired); + return growContiguousWithoutRetry(increment, allocation); + }); } - return cache()->makeSpace(increment, [&](Allocation& acquired) { - freeNonContiguous(acquired); - return growContiguousWithoutRetry(increment, allocation, reservationCB); - }); + if (!success && reservationCB != nullptr) { + reservationCB(AllocationTraits::pageBytes(increment), false); + } + return success; } void* MemoryAllocator::allocateBytes(uint64_t bytes, uint16_t alignment) { @@ -301,14 +377,17 @@ std::string Stats::toString() const { std::stringstream out; int64_t totalClocks = 0; int64_t totalBytes = 0; + int64_t totalAllocations = 0; for (auto i = 0; i < sizes.size(); ++i) { totalClocks += sizes[i].clocks(); totalBytes += sizes[i].totalBytes; + totalAllocations += sizes[i].numAllocations; } out << fmt::format( - "Alloc: {}MB {} Gigaclocks, {}MB advised\n", + "Alloc: {}MB {} Gigaclocks Allocations={}, advised={} MB\n", totalBytes >> 20, totalClocks >> 30, + totalAllocations, numAdvise >> 8); // Sort the size classes by decreasing clocks. 
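The reworked paths above now invoke `reservationCB` once, up front, for the net page delta, and release that reservation if the allocation ultimately fails. A hedged caller-side sketch; the counter is a stand-in for whatever accounting the caller maintains, and the callback's (bytes, reserve) shape follows the calls shown above:

```cpp
#include <atomic>
#include <cstdint>

// Stand-in accounting for a caller of allocateNonContiguous()/allocateContiguous().
std::atomic<int64_t> callerReservedBytes{0};

auto reservationCB = [](uint64_t bytes, bool reserve) {
  if (reserve) {
    // May throw to reject the reservation; the allocator then frees any old
    // pages it was asked to recycle, releases the reservation, and rethrows.
    callerReservedBytes += bytes;
  } else {
    callerReservedBytes -= bytes;
  }
};
```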
@@ -323,10 +402,11 @@ std::string Stats::toString() const { break; } out << fmt::format( - "Size {}K: {}MB {} Megaclocks\n", + "Size {}K: {}MB {} Megaclocks {} Allocations\n", sizes[i].size * 4, sizes[i].totalBytes >> 20, - sizes[i].clocks() >> 20); + sizes[i].clocks() >> 20, + sizes[i].numAllocations); } return out.str(); } @@ -353,4 +433,82 @@ void MemoryAllocator::useHugePages( #endif } +void MemoryAllocator::setAllocatorFailureMessage(std::string message) { + allocatorFailureMessage() = std::move(message); +} + +std::string MemoryAllocator::getAndClearFailureMessage() { + auto allocatorErrMsg = std::move(allocatorFailureMessage()); + allocatorFailureMessage().clear(); + if (cache()) { + if (allocatorErrMsg.empty()) { + return getAndClearCacheFailureMessage(); + } + allocatorErrMsg = + fmt::format("{} {}", allocatorErrMsg, getAndClearCacheFailureMessage()); + } + return allocatorErrMsg; +} + +namespace { +struct TraceState { + struct rusage rusage; + Stats allocatorStats; + int64_t ioTotal; + struct timeval tv; +}; + +int64_t toUsec(struct timeval tv) { + return tv.tv_sec * 1000000LL + tv.tv_usec; +} + +int32_t elapsedUsec(struct timeval end, struct timeval begin) { + return toUsec(end) - toUsec(begin); +} +} // namespace + +void MemoryAllocator::getTracingHooks( + std::function& init, + std::function& report, + std::function ioVolume) { + auto allocator = shared_from_this(); + auto state = std::make_shared(); + init = [state, allocator, ioVolume]() { + getrusage(RUSAGE_SELF, &state->rusage); + struct timezone tz; + gettimeofday(&state->tv, &tz); + state->allocatorStats = allocator->stats(); + state->ioTotal = ioVolume ? ioVolume() : 0; + }; + report = [state, allocator, ioVolume]() -> std::string { + struct rusage rusage; + getrusage(RUSAGE_SELF, &rusage); + auto newStats = allocator->stats(); + float u = elapsedUsec(rusage.ru_utime, state->rusage.ru_utime); + float s = elapsedUsec(rusage.ru_stime, state->rusage.ru_stime); + auto m = allocator->stats() - state->allocatorStats; + float flt = rusage.ru_minflt - state->rusage.ru_minflt; + struct timeval tv; + struct timezone tz; + gettimeofday(&tv, &tz); + float elapsed = elapsedUsec(tv, state->tv); + int64_t io = 0; + if (ioVolume) { + io = ioVolume() - state->ioTotal; + } + std::stringstream out; + out << std::endl + << std::endl + << fmt::format( + "user%={} sys%={} minflt/s={}, io={} MB/s\n", + 100 * u / elapsed, + 100 * s / elapsed, + flt / (elapsed / 1000000), + io / (elapsed)); + out << m.toString() << std::endl; + out << allocator->toString() << std::endl; + return out.str(); + }; +} + } // namespace facebook::velox::memory diff --git a/velox/common/memory/MemoryAllocator.h b/velox/common/memory/MemoryAllocator.h index bba5db0bc1280..4a7ac706c7151 100644 --- a/velox/common/memory/MemoryAllocator.h +++ b/velox/common/memory/MemoryAllocator.h @@ -23,14 +23,13 @@ #include #include +#include #include #include "velox/common/base/CheckedArithmetic.h" #include "velox/common/base/Exceptions.h" #include "velox/common/memory/Allocation.h" #include "velox/common/time/Timer.h" -DECLARE_bool(velox_use_malloc); -DECLARE_int32(velox_memory_pool_mb); DECLARE_bool(velox_time_allocations); namespace facebook::velox::memory { @@ -141,11 +140,12 @@ struct Stats { class MemoryAllocator; -/// A general cache interface using 'MemroyAllocator' to allocate memory, that +/// A general cache interface using 'MemoryAllocator' to allocate memory, that /// is also able to free up memory upon request by shrinking itself. 
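The new `getTracingHooks()` above packages rusage, wall time, and allocator stats deltas into two closures. A hedged wiring sketch (the workload function is hypothetical, `ioVolume` is omitted, and the `std::function` signatures follow the definitions above):

```cpp
std::function<void()> init;
std::function<std::string()> report;
allocator->getTracingHooks(init, report); // 'allocator' is a shared_ptr.

init();                 // Snapshot rusage, wall clock, and allocator stats.
runTracedWorkload();    // Hypothetical traced phase.
LOG(INFO) << report();  // user%/sys%, minflt/s, and allocator stats delta.
```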
class Cache { public: virtual ~Cache() = default; + /// This method should be implemented so that it tries to /// accommodate the passed in 'allocate' by freeing up space from /// 'this' if needed. 'numPages' is the number of pages 'allocate @@ -158,9 +158,24 @@ class Cache { memory::MachinePageCount numPages, std::function allocate) = 0; + /// This method is implemented to shrink the cache space with the specified + /// 'targetBytes'. The method returns the actually freed cache space in bytes. + virtual uint64_t shrink(uint64_t targetBytes) = 0; + virtual MemoryAllocator* allocator() const = 0; }; +/// Sets a thread level failure message describing cache state. Used +/// for example to expose why space could not be freed from +/// cache. This is defined here with the abstract Cache base class +/// and not the cache implementation because allocator cannot depend +/// on cache. +void setCacheFailureMessage(std::string message); + +/// Returns and clears a thread local message set with +/// setCacheFailureMessage(). +std::string getAndClearCacheFailureMessage(); + /// This class provides interface for the actual memory allocations from memory /// pool. It allocates runs of machine pages from predefined size classes, and /// supports both contiguous and non-contiguous memory allocations. An @@ -183,35 +198,18 @@ class MemoryAllocator : public std::enable_shared_from_this { kMalloc, /// The memory allocator kind which is implemented by MmapAllocator. It /// manages the large chunk of memory allocations on its own by leveraging - /// mmap and madvice, to optimize the memory fragmentation in the long + /// mmap and madvise, to optimize the memory fragmentation in the long /// running service such as Prestissimo. kMmap, }; static std::string kindString(Kind kind); - /// Returns the process-wide default instance or an application-supplied - /// custom instance set via setDefaultInstance(). - static MemoryAllocator* getInstance(); - - /// Overrides the process-wide default instance. The caller keeps ownership - /// and must not destroy the instance until it is empty. Calling this with - /// nullptr restores the initial process-wide default instance. - static void setDefaultInstance(MemoryAllocator* instance); - - /// Creates a default MemoryAllocator instance but does not set this to - /// process default. - static std::shared_ptr createDefaultInstance(); - - static void testingDestroyInstance(); - virtual ~MemoryAllocator() = default; static constexpr int32_t kMaxSizeClasses = 12; static constexpr uint16_t kMinAlignment = alignof(max_align_t); static constexpr uint16_t kMaxAlignment = 64; - static constexpr uint64_t kDefaultCapacityBytes = - std::numeric_limits::max(); /// Returns the kind of this memory allocator. For AsyncDataCache, it returns /// the kind of the delegated memory allocator underneath. @@ -222,7 +220,7 @@ class MemoryAllocator : public std::enable_shared_from_this { /// the same as 'this'. virtual void registerCache(const std::shared_ptr& cache) = 0; - using ReservationCallback = std::function; + using ReservationCallback = std::function; /// Returns the capacity of the allocator in bytes. virtual size_t capacity() const = 0; @@ -317,6 +315,12 @@ class MemoryAllocator : public std::enable_shared_from_this { /// reallocateBytes. virtual void freeBytes(void* p, uint64_t size) noexcept = 0; + /// Unmaps the unused memory space to return the backing physical pages back + /// to the operating system. 
This only works for MmapAllocator implementation + /// which manages the physical memory on its own by mmap. The function returns + /// the number of actual unmapped physical pages. + virtual MachinePageCount unmap(MachinePageCount targetPages) = 0; + /// Checks internal consistency of allocation data structures. Returns true if /// OK. virtual bool checkConsistency() const = 0; @@ -387,8 +391,45 @@ class MemoryAllocator : public std::enable_shared_from_this { isPersistentFailureInjection_ = false; } + /// Sets a thread level failure message describing the reason for the last + /// allocation failure. + void setAllocatorFailureMessage(std::string message); + + /// Returns extra information after returning false from any of the allocate + /// functions. The error message is scoped to the most recent call on the + /// thread. The message is cleared after return. + std::string getAndClearFailureMessage(); + + void getTracingHooks( + std::function& init, + std::function& report, + std::function ioVolume = nullptr); + protected: - explicit MemoryAllocator() = default; + MemoryAllocator(MachinePageCount largestSizeClassPages = 256) + : sizeClassSizes_(makeSizeClassSizes(largestSizeClassPages)) {} + + static std::vector makeSizeClassSizes( + MachinePageCount largest); + + /// Represents a mix of blocks of different sizes for covering a single + /// allocation. + struct SizeMix { + // Index into 'sizeClassSizes_' + std::vector sizeIndices; + // Number of items of the class of the corresponding element in + // '"sizeIndices'. + std::vector sizeCounts; + // Number of valid elements in 'sizeCounts' and 'sizeIndices'. + int32_t numSizes{0}; + // Total number of pages. + int32_t totalPages{0}; + + SizeMix() { + sizeIndices.reserve(kMaxSizeClasses); + sizeCounts.reserve(kMaxSizeClasses); + } + }; /// The actual memory allocation function implementation without retry /// attempts by making space from cache. @@ -396,14 +437,11 @@ class MemoryAllocator : public std::enable_shared_from_this { MachinePageCount numPages, Allocation* collateral, ContiguousAllocation& allocation, - ReservationCallback reservationCB = nullptr, MachinePageCount maxPages = 0) = 0; virtual bool allocateNonContiguousWithoutRetry( - MachinePageCount numPages, - Allocation& out, - ReservationCallback reservationCB, - MachinePageCount minSizeClass) = 0; + const SizeMix& sizeMix, + Allocation& out) = 0; virtual void* allocateBytesWithoutRetry( uint64_t bytes, @@ -413,8 +451,7 @@ class MemoryAllocator : public std::enable_shared_from_this { virtual bool growContiguousWithoutRetry( MachinePageCount increment, - ContiguousAllocation& allocation, - ReservationCallback reservationCB = nullptr) = 0; + ContiguousAllocation& allocation) = 0; // 'Cache' getter. The cache is only responsible for freeing up memory space // by shrinking itself when there is not enough space upon allocating. The @@ -426,20 +463,6 @@ class MemoryAllocator : public std::enable_shared_from_this { size_t bytes, const std::vector& sizes); - // Represents a mix of blocks of different sizes for covering a single - // allocation. - struct SizeMix { - // Index into 'sizeClassSizes_' - std::array sizeIndices{}; - // Number of items of the class of the corresponding element in - // '"sizeIndices'. - std::array sizeCounts{}; - // Number of valid elements in 'sizeCounts' and 'sizeIndices'. - int32_t numSizes{0}; - // Total number of pages. - int32_t totalPages{0}; - }; - // Returns a mix of standard sizes and allocation counts for covering // 'numPages' worth of memory. 
'minSizeClass' is the size of the // smallest usable size class. @@ -457,8 +480,8 @@ class MemoryAllocator : public std::enable_shared_from_this { return true; } - // If 'data' is sufficiently large, enables/disables adaptive huge pages for - // the address raneg. + // If 'data' is sufficiently large, enables/disables adaptive huge pages + // for the address range. void useHugePages(const ContiguousAllocation& data, bool enable); // The machine page counts corresponding to different sizes in order @@ -466,17 +489,17 @@ class MemoryAllocator : public std::enable_shared_from_this { const std::vector sizeClassSizes_{1, 2, 4, 8, 16, 32, 64, 128, 256}; - // Tracks the number of allocated pages. Allocated pages are the memory pages - // that are currently being used. + // Tracks the number of allocated pages. Allocated pages are the memory + // pages that are currently being used. std::atomic numAllocated_{0}; // Tracks the number of mapped pages. Mapped pages are the memory pages that // meet following requirements: // 1. They are obtained from the operating system from mmap calls directly, // without going through std::malloc. - // 2. They are currently being allocated (used) or they were allocated (used) - // and freed in the past but haven't been returned to the operating system by - // 'this' (via madvise calls). + // 2. They are currently being allocated (used) or they were allocated + // (used) and freed in the past but haven't been returned to the operating + // system by 'this' (via madvise calls). std::atomic numMapped_{0}; // Indicates if the failure injection is persistent or transient. @@ -486,15 +509,16 @@ class MemoryAllocator : public std::enable_shared_from_this { bool isPersistentFailureInjection_{false}; Stats stats_; - - private: - static std::mutex initMutex_; - // Singleton instance. - static std::shared_ptr instance_; - // Application-supplied custom implementation of MemoryAllocator to be - // returned by getInstance(). 
- static MemoryAllocator* customInstance_; }; std::ostream& operator<<(std::ostream& out, const MemoryAllocator::Kind& kind); } // namespace facebook::velox::memory +template <> +struct fmt::formatter + : fmt::formatter { + auto format( + facebook::velox::memory::MemoryAllocator::InjectedFailure s, + format_context& ctx) const { + return formatter::format(static_cast(s), ctx); + } +}; diff --git a/velox/common/memory/MemoryArbitrator.cpp b/velox/common/memory/MemoryArbitrator.cpp index c5d596aa34c41..5bfff0ec64afb 100644 --- a/velox/common/memory/MemoryArbitrator.cpp +++ b/velox/common/memory/MemoryArbitrator.cpp @@ -18,23 +18,26 @@ #include +#include "velox/common/base/Counters.h" +#include "velox/common/base/RuntimeMetrics.h" +#include "velox/common/base/StatsReporter.h" #include "velox/common/memory/Memory.h" -#include "velox/common/memory/SharedArbitrator.h" namespace facebook::velox::memory { namespace { class FactoryRegistry { public: - void registerFactory( + bool registerFactory( const std::string& kind, MemoryArbitrator::Factory factory) { std::lock_guard l(mutex_); VELOX_USER_CHECK( map_.find(kind) == map_.end(), - "Arbitrator factory for kind {} already registered", - kind) + "Arbitrator factory for kind {} is already registered", + kind); map_[kind] = std::move(factory); + return true; } MemoryArbitrator::Factory& getFactory(const std::string& kind) { @@ -42,7 +45,7 @@ class FactoryRegistry { VELOX_USER_CHECK( map_.find(kind) != map_.end(), "Arbitrator factory for kind {} not registered", - kind) + kind); return map_[kind]; } @@ -51,7 +54,7 @@ class FactoryRegistry { VELOX_USER_CHECK( map_.find(kind) != map_.end(), "Arbitrator factory for kind {} not registered", - kind) + kind); return map_.erase(kind); } @@ -88,32 +91,34 @@ class NoopArbitrator : public MemoryArbitrator { return "NOOP"; } - // Noop arbitrator has no memory capacity limit so no operation needed for - // memory pool capacity reserve. - void reserveMemory(MemoryPool* pool, uint64_t /*unused*/) override { - pool->grow(pool->maxCapacity()); + void addPool(const std::shared_ptr& pool) override { + VELOX_CHECK_EQ(pool->capacity(), 0); + growPool(pool.get(), pool->maxCapacity(), 0); } - // Noop arbitrator has no memory capacity limit so no operation needed for - // memory pool capacity release. - void releaseMemory(MemoryPool* /*unused*/) override { - // No-op + void removePool(MemoryPool* pool) override { + VELOX_CHECK_EQ(pool->reservedBytes(), 0); } // Noop arbitrator has no memory capacity limit so no operation needed for // memory pool capacity grow. - bool growMemory( - MemoryPool* /*unused*/, - const std::vector>& /*unused*/, - uint64_t /*unused*/) override { + bool growCapacity(MemoryPool* /*unused*/, uint64_t /*unused*/) override { return false; } + // Noop arbitrator has no memory capacity limit so no operation needed for + // memory pool capacity release. + uint64_t shrinkCapacity(MemoryPool* pool, uint64_t /*unused*/) override { + // No-op + return 0; + } + // Noop arbitrator has no memory capacity limit so no operation needed for // memory pool capacity shrink. 
- uint64_t shrinkMemory( - const std::vector<std::shared_ptr<MemoryPool>>& /*unused*/, - uint64_t /*unused*/) override { + uint64_t shrinkCapacity( + uint64_t /* unused */, + bool /* unused */, + bool /* unused */) override { return 0; } @@ -144,28 +149,62 @@ std::unique_ptr<MemoryArbitrator> MemoryArbitrator::create( return factory(config); } -void MemoryArbitrator::registerFactory( +bool MemoryArbitrator::registerFactory( const std::string& kind, MemoryArbitrator::Factory factory) { - arbitratorFactories().registerFactory(kind, std::move(factory)); + return arbitratorFactories().registerFactory(kind, std::move(factory)); } void MemoryArbitrator::unregisterFactory(const std::string& kind) { arbitratorFactories().unregisterFactory(kind); } -void MemoryArbitrator::registerAllFactories() { - SharedArbitrator::registerFactory(); +/*static*/ bool MemoryArbitrator::growPool( + MemoryPool* pool, + uint64_t growBytes, + uint64_t reservationBytes) { + return pool->grow(growBytes, reservationBytes); } -void MemoryArbitrator::unregisterAllFactories() { - SharedArbitrator::unregisterFactory(); +/*static*/ uint64_t MemoryArbitrator::shrinkPool( + MemoryPool* pool, + uint64_t targetBytes) { + return pool->shrink(targetBytes); } std::unique_ptr<MemoryReclaimer> MemoryReclaimer::create() { return std::unique_ptr<MemoryReclaimer>(new MemoryReclaimer()); } +// static +uint64_t MemoryReclaimer::run( + const std::function<int64_t()>& func, + Stats& stats) { + VELOX_CHECK(underMemoryArbitration()); + uint64_t execTimeUs{0}; + int64_t reclaimedBytes{0}; + { + MicrosecondTimer timer{&execTimeUs}; + reclaimedBytes = func(); + } + VELOX_CHECK_GE(reclaimedBytes, 0); + stats.reclaimExecTimeUs += execTimeUs; + stats.reclaimedBytes += reclaimedBytes; + RECORD_HISTOGRAM_METRIC_VALUE( + kMetricMemoryReclaimExecTimeMs, execTimeUs / 1'000); + RECORD_HISTOGRAM_METRIC_VALUE(kMetricMemoryReclaimedBytes, reclaimedBytes); + RECORD_METRIC_VALUE(kMetricMemoryReclaimCount); + addThreadLocalRuntimeStat( + "memoryReclaimWallNanos", + RuntimeCounter(execTimeUs * 1'000, RuntimeCounter::Unit::kNanos)); + addThreadLocalRuntimeStat( + "memoryReclaimCount", RuntimeCounter(1, RuntimeCounter::Unit::kNone)); + addThreadLocalRuntimeStat( + "reclaimedMemoryBytes", + RuntimeCounter(reclaimedBytes, RuntimeCounter::Unit::kBytes)); + return reclaimedBytes; +} + bool MemoryReclaimer::reclaimableBytes( const MemoryPool& pool, uint64_t& reclaimableBytes) const { @@ -175,17 +214,20 @@ bool MemoryReclaimer::reclaimableBytes( } bool reclaimable{false}; pool.visitChildren([&](MemoryPool* pool) { - uint64_t poolReclaimableBytes{0}; - reclaimable |= pool->reclaimableBytes(poolReclaimableBytes); - reclaimableBytes += poolReclaimableBytes; + auto reclaimableBytesOpt = pool->reclaimableBytes(); + reclaimable |= reclaimableBytesOpt.has_value(); + reclaimableBytes += reclaimableBytesOpt.value_or(0); return true; }); VELOX_CHECK(reclaimable || reclaimableBytes == 0); return reclaimable; } -uint64_t -MemoryReclaimer::reclaim(MemoryPool* pool, uint64_t targetBytes, Stats& stats) { +uint64_t MemoryReclaimer::reclaim( + MemoryPool* pool, + uint64_t targetBytes, + uint64_t maxWaitMs, + Stats& stats) { if (pool->kind() == MemoryPool::Kind::kLeaf) { return 0; } @@ -194,15 +236,18 @@ MemoryReclaimer::reclaim(MemoryPool* pool, uint64_t targetBytes, Stats& stats) { // child pool with most reservation first.
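With `registerAllFactories()` gone, each arbitrator kind registers itself through `registerFactory()`; note that the registry's `VELOX_USER_CHECK` shown earlier still throws on a duplicate kind even though the new signature returns bool. A hedged registration sketch (the "CUSTOM" kind and factory function are hypothetical):

```cpp
// Hypothetical factory for a query system's own arbitrator implementation.
std::unique_ptr<facebook::velox::memory::MemoryArbitrator> makeCustomArbitrator(
    const facebook::velox::memory::MemoryArbitrator::Config& config);

// Done once at startup, e.g. from a static initializer.
static const bool kCustomArbitratorRegistered =
    facebook::velox::memory::MemoryArbitrator::registerFactory(
        "CUSTOM", makeCustomArbitrator);

// Selected later via MemoryManagerOptions::arbitratorKind = "CUSTOM".
```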
struct Candidate { std::shared_ptr pool; - int64_t reservedBytes; + int64_t reclaimableBytes; }; std::vector candidates; - candidates.reserve(pool->children_.size()); - for (auto& entry : pool->children_) { - auto child = entry.second.lock(); - if (child != nullptr) { - const int64_t reservedBytes = child->reservedBytes(); - candidates.push_back(Candidate{std::move(child), reservedBytes}); + { + std::shared_lock guard{pool->poolMutex_}; + candidates.reserve(pool->children_.size()); + for (auto& entry : pool->children_) { + auto child = entry.second.lock(); + if (child != nullptr) { + const int64_t reclaimableBytes = child->reclaimableBytes().value_or(0); + candidates.push_back(Candidate{std::move(child), reclaimableBytes}); + } } } @@ -210,12 +255,15 @@ MemoryReclaimer::reclaim(MemoryPool* pool, uint64_t targetBytes, Stats& stats) { candidates.begin(), candidates.end(), [](const auto& lhs, const auto& rhs) { - return lhs.reservedBytes > rhs.reservedBytes; + return lhs.reclaimableBytes > rhs.reclaimableBytes; }); uint64_t reclaimedBytes{0}; for (const auto& candidate : candidates) { - const auto bytes = candidate.pool->reclaim(targetBytes, stats); + if (candidate.reclaimableBytes == 0) { + break; + } + const auto bytes = candidate.pool->reclaim(targetBytes, maxWaitMs, stats); reclaimedBytes += bytes; if (targetBytes != 0) { if (bytes >= targetBytes) { @@ -246,11 +294,17 @@ void MemoryReclaimer::abort(MemoryPool* pool, const std::exception_ptr& error) { void MemoryReclaimer::Stats::reset() { numNonReclaimableAttempts = 0; + reclaimExecTimeUs = 0; + reclaimedBytes = 0; + reclaimWaitTimeUs = 0; } bool MemoryReclaimer::Stats::operator==( const MemoryReclaimer::Stats& other) const { - return numNonReclaimableAttempts == other.numNonReclaimableAttempts; + return numNonReclaimableAttempts == other.numNonReclaimableAttempts && + reclaimExecTimeUs == other.reclaimExecTimeUs && + reclaimedBytes == other.reclaimedBytes && + reclaimWaitTimeUs == other.reclaimWaitTimeUs; } bool MemoryReclaimer::Stats::operator!=( @@ -258,6 +312,15 @@ bool MemoryReclaimer::Stats::operator!=( return !(*this == other); } +MemoryReclaimer::Stats& MemoryReclaimer::Stats::operator+=( + const MemoryReclaimer::Stats& other) { + numNonReclaimableAttempts += other.numNonReclaimableAttempts; + reclaimExecTimeUs += other.reclaimExecTimeUs; + reclaimedBytes += other.reclaimedBytes; + reclaimWaitTimeUs += other.reclaimWaitTimeUs; + return *this; +} + MemoryArbitrator::Stats::Stats( uint64_t _numRequests, uint64_t _numSucceeded, @@ -269,8 +332,10 @@ MemoryArbitrator::Stats::Stats( uint64_t _numReclaimedBytes, uint64_t _maxCapacityBytes, uint64_t _freeCapacityBytes, + uint64_t _freeReservedCapacityBytes, uint64_t _reclaimTimeUs, - uint64_t _numNonReclaimableAttempts) + uint64_t _numNonReclaimableAttempts, + uint64_t _numShrinks) : numRequests(_numRequests), numSucceeded(_numSucceeded), numAborted(_numAborted), @@ -281,24 +346,30 @@ MemoryArbitrator::Stats::Stats( numReclaimedBytes(_numReclaimedBytes), maxCapacityBytes(_maxCapacityBytes), freeCapacityBytes(_freeCapacityBytes), + freeReservedCapacityBytes(_freeReservedCapacityBytes), reclaimTimeUs(_reclaimTimeUs), - numNonReclaimableAttempts(_numNonReclaimableAttempts) {} + numNonReclaimableAttempts(_numNonReclaimableAttempts), + numShrinks(_numShrinks) {} std::string MemoryArbitrator::Stats::toString() const { return fmt::format( - "STATS[numRequests {} numSucceeded {} numAborted {} numFailures {} numNonReclaimableAttempts {} queueTime {} arbitrationTime {} reclaimTime {} shrunkMemory 
{} reclaimedMemory {} maxCapacity {} freeCapacity {}]", + "STATS[numRequests {} numAborted {} numFailures {} " + "numNonReclaimableAttempts {} numShrinks {} " + "queueTime {} arbitrationTime {} reclaimTime {} shrunkMemory {} " + "reclaimedMemory {} maxCapacity {} freeCapacity {} freeReservedCapacity {}]", numRequests, - numSucceeded, numAborted, numFailures, numNonReclaimableAttempts, + numShrinks, succinctMicros(queueTimeUs), succinctMicros(arbitrationTimeUs), succinctMicros(reclaimTimeUs), succinctBytes(numShrunkBytes), succinctBytes(numReclaimedBytes), succinctBytes(maxCapacityBytes), - succinctBytes(freeCapacityBytes)); + succinctBytes(freeCapacityBytes), + succinctBytes(freeReservedCapacityBytes)); } MemoryArbitrator::Stats MemoryArbitrator::Stats::operator-( @@ -314,9 +385,11 @@ MemoryArbitrator::Stats MemoryArbitrator::Stats::operator-( result.numReclaimedBytes = numReclaimedBytes - other.numReclaimedBytes; result.maxCapacityBytes = maxCapacityBytes; result.freeCapacityBytes = freeCapacityBytes; + result.freeReservedCapacityBytes = freeReservedCapacityBytes; result.reclaimTimeUs = reclaimTimeUs - other.reclaimTimeUs; result.numNonReclaimableAttempts = numNonReclaimableAttempts - other.numNonReclaimableAttempts; + result.numShrinks = numShrinks - other.numShrinks; return result; } @@ -332,8 +405,10 @@ bool MemoryArbitrator::Stats::operator==(const Stats& other) const { numReclaimedBytes, maxCapacityBytes, freeCapacityBytes, + freeReservedCapacityBytes, reclaimTimeUs, - numNonReclaimableAttempts) == + numNonReclaimableAttempts, + numShrinks) == std::tie( other.numRequests, other.numSucceeded, @@ -345,8 +420,10 @@ bool MemoryArbitrator::Stats::operator==(const Stats& other) const { other.numReclaimedBytes, other.maxCapacityBytes, other.freeCapacityBytes, + other.freeReservedCapacityBytes, other.reclaimTimeUs, - other.numNonReclaimableAttempts); + other.numNonReclaimableAttempts, + other.numShrinks); } bool MemoryArbitrator::Stats::operator!=(const Stats& other) const { @@ -354,7 +431,6 @@ bool MemoryArbitrator::Stats::operator!=(const Stats& other) const { } bool MemoryArbitrator::Stats::operator<(const Stats& other) const { - uint32_t eqCount{0}; uint32_t gtCount{0}; uint32_t ltCount{0}; #define UPDATE_COUNTER(counter) \ @@ -363,8 +439,6 @@ bool MemoryArbitrator::Stats::operator<(const Stats& other) const { ++ltCount; \ } else if (counter > other.counter) { \ ++gtCount; \ - } else { \ - ++eqCount; \ } \ } while (0); @@ -378,6 +452,7 @@ bool MemoryArbitrator::Stats::operator<(const Stats& other) const { UPDATE_COUNTER(numReclaimedBytes); UPDATE_COUNTER(reclaimTimeUs); UPDATE_COUNTER(numNonReclaimableAttempts); + UPDATE_COUNTER(numShrinks); #undef UPDATE_COUNTER VELOX_CHECK( !((gtCount > 0) && (ltCount > 0)), @@ -400,7 +475,7 @@ bool MemoryArbitrator::Stats::operator<=(const Stats& other) const { } ScopedMemoryArbitrationContext::ScopedMemoryArbitrationContext( - const MemoryPool& requestor) + const MemoryPool* requestor) : savedArbitrationCtx_(arbitrationCtx), currentArbitrationCtx_({.requestor = requestor}) { arbitrationCtx = ¤tArbitrationCtx_; @@ -410,11 +485,64 @@ ScopedMemoryArbitrationContext::~ScopedMemoryArbitrationContext() { arbitrationCtx = savedArbitrationCtx_; } -MemoryArbitrationContext* memoryArbitrationContext() { +const MemoryArbitrationContext* memoryArbitrationContext() { return arbitrationCtx; } +ScopedMemoryPoolArbitrationCtx::ScopedMemoryPoolArbitrationCtx(MemoryPool* pool) + : pool_(pool) { + VELOX_CHECK_NOT_NULL(pool_); + pool_->enterArbitration(); +} + 
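`ScopedMemoryArbitrationContext` (now taking a `const MemoryPool*`) and the new `ScopedMemoryPoolArbitrationCtx` bracket an arbitration request so that code deep in the reclaim path can assert where it runs; a minimal sketch:

```cpp
void arbitrationScopeExample(facebook::velox::memory::MemoryPool* requestor) {
  using namespace facebook::velox::memory;
  VELOX_CHECK(!underMemoryArbitration());
  {
    ScopedMemoryArbitrationContext ctx(requestor);
    VELOX_CHECK(underMemoryArbitration());
    // Reclaim work here; memoryArbitrationContext()->requestor identifies
    // the pool whose allocation triggered this arbitration.
  }
  VELOX_CHECK(!underMemoryArbitration());
}
```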
+ScopedMemoryPoolArbitrationCtx::~ScopedMemoryPoolArbitrationCtx() { + pool_->leaveArbitration(); +} + bool underMemoryArbitration() { return memoryArbitrationContext() != nullptr; } + +void testingRunArbitration( + uint64_t targetBytes, + bool allowSpill, + MemoryManager* manager) { + if (manager == nullptr) { + manager = memory::memoryManager(); + } + manager->shrinkPools(targetBytes, allowSpill); +} + +void testingRunArbitration( + MemoryPool* pool, + uint64_t targetBytes, + bool allowSpill) { + { + ScopedMemoryPoolArbitrationCtx arbitrationCtx{pool}; + static_cast<MemoryPoolImpl*>(pool)->testingManager()->shrinkPools( + targetBytes, allowSpill); + } + static_cast<MemoryPoolImpl*>(pool)->testingCheckIfAborted(); +} + +ScopedReclaimedBytesRecorder::ScopedReclaimedBytesRecorder( + MemoryPool* pool, + int64_t* reclaimedBytes) + : pool_(pool), + reclaimedBytes_(reclaimedBytes), + reservedBytesBeforeReclaim_(pool_->reservedBytes()) { + VELOX_CHECK_NOT_NULL(reclaimedBytes_); + VELOX_CHECK_EQ(*reclaimedBytes_, 0); +} + +ScopedReclaimedBytesRecorder::~ScopedReclaimedBytesRecorder() { + if (std::uncaught_exceptions() > 0) { + // NOTE: if an exception is in flight, e.g. one triggered by the memory + // reclaim itself, then we don't set the reclaimed memory bytes. + *reclaimedBytes_ = 0; + return; + } + const int64_t reservedBytesAfterReclaim = pool_->reservedBytes(); + *reclaimedBytes_ = reservedBytesBeforeReclaim_ - reservedBytesAfterReclaim; +} } // namespace facebook::velox::memory diff --git a/velox/common/memory/MemoryArbitrator.h b/velox/common/memory/MemoryArbitrator.h index 09bc3a92d0eed..a76164dc3310a 100644 --- a/velox/common/memory/MemoryArbitrator.h +++ b/velox/common/memory/MemoryArbitrator.h @@ -18,9 +18,12 @@ #include +#include "velox/common/base/AsyncSource.h" #include "velox/common/base/Exceptions.h" +#include "velox/common/base/Portability.h" #include "velox/common/base/SuccinctPrinter.h" #include "velox/common/future/VeloxPromise.h" +#include "velox/common/time/Timer.h" namespace facebook::velox::memory { @@ -55,46 +58,34 @@ class MemoryArbitrator { /// manager. int64_t capacity; - /// The initial memory capacity to reserve for a newly created memory pool. - uint64_t memoryPoolInitCapacity{256 << 20}; - - /// The minimal memory capacity to transfer out of or into a memory pool - /// during the memory arbitration. - uint64_t memoryPoolTransferCapacity{32 << 20}; - - /// Provided by the query system to validate the state after a memory pool - /// enters arbitration if not null. For instance, Prestissimo provides - /// callback to check if a memory arbitration request is issued from a + /// Callback to check if a memory arbitration request is issued from a /// driver thread, then the driver should be put in suspended state to avoid /// the potential deadlock when reclaim memory from the task of the request /// memory pool. MemoryArbitrationStateCheckCB arbitrationStateCheckCb{nullptr}; + + /// Additional configs that are arbitrator implementation specific. + std::unordered_map<std::string, std::string> extraConfigs{}; }; using Factory = std::function<std::unique_ptr<MemoryArbitrator>( const MemoryArbitrator::Config& config)>; - /// Register factory for a specific 'kind' of memory arbitrator + /// Registers a factory for a specific 'kind' of memory arbitrator. /// MemoryArbitrator::Create looks up the registry to find the factory to /// create arbitrator instance based on the kind specified in arbitrator /// config. /// /// NOTE: we only allow the same 'kind' of memory arbitrator to be registered - /// once. The function throws an error if 'kind' is already registered.
- static void registerFactory(const std::string& kind, Factory factory); + /// once. The function returns false if 'kind' is already registered. + static bool registerFactory(const std::string& kind, Factory factory); - /// Unregister the registered factory for a specifc kind. + /// Unregisters the registered factory for a specific kind. /// /// NOTE: the function throws if the specified arbitrator 'kind' is not /// registered. static void unregisterFactory(const std::string& kind); - /// Register all the supported memory arbitrator kinds. - static void registerAllFactories(); - - /// Unregister all the supported memory arbitrator kinds. - static void unregisterAllFactories(); - /// Invoked by the memory manager to create an instance of memory arbitrator /// based on the kind specified in 'config'. The arbitrator kind must be /// registered through MemoryArbitrator::registerFactory(), otherwise the @@ -112,43 +103,51 @@ class MemoryArbitrator { virtual ~MemoryArbitrator() = default; - /// Invoked by the memory manager to reserve up to 'bytes' memory capacity - /// without actually freeing memory for a newly created memory pool. The - /// function will set the memory pool's capacity based on the actually - /// reserved memory. - /// - /// NOTE: the memory arbitrator can decides how much memory capacity is - /// actually reserved for a newly created memory pool. The latter can trigger - /// the memory arbitration on demand when actual memory allocation happens. - virtual void reserveMemory(MemoryPool* pool, uint64_t bytes) = 0; + /// Invoked by the memory manager to add a newly created memory pool. The + /// memory arbitrator allocates the initial capacity for 'pool' and + /// dynamically adjusts its capacity based on query memory needs through + /// memory arbitration. + virtual void addPool(const std::shared_ptr<MemoryPool>& pool) = 0; - /// Invoked by the memory manager to return back all the reserved memory - /// capacity of a destroying memory pool. - virtual void releaseMemory(MemoryPool* pool) = 0; + /// Invoked by the memory manager to remove a destroyed memory pool. The + /// memory arbitrator frees up all its capacity and stops memory arbitration + /// operation on it. + virtual void removePool(MemoryPool* pool) = 0; /// Invoked by the memory manager to grow a memory pool's capacity. - /// 'pool' is the memory pool to request to grow. 'candidates' is a list - /// of query root pools to participate in the memory arbitration. The memory - /// arbitrator picks up a number of pools to either shrink its memory capacity - /// without actually freeing memory or reclaim its used memory to free up - /// enough memory for 'requestor' to grow. Different arbitrators use different - /// policies to select the candidate pools. The shared memory arbitrator used - /// by both Prestissimo and Prestissimo-on-Spark, selects the candidates with - /// more memory capacity. + /// 'pool' is the memory pool to request to grow. The memory arbitrator picks + /// up a number of pools to either shrink their memory capacity without + /// actually freeing memory or reclaim their used memory to free up enough + /// memory for 'requestor' to grow. + virtual bool growCapacity(MemoryPool* pool, uint64_t requestBytes) = 0; + + /// Invoked by the memory manager to shrink up to 'targetBytes' free capacity + /// from a memory 'pool', and return it back to the arbitrator. If + /// 'targetBytes' is zero, we shrink all the free capacity from the memory + /// pool. The function returns the actual freed capacity from 'pool'.
+ virtual uint64_t shrinkCapacity(MemoryPool* pool, uint64_t targetBytes) = 0; + + /// Invoked by the memory manager to globally shrink memory from + /// memory pools by reclaiming only used memory, to reduce system memory + /// pressure. The freed memory capacity is given back to the arbitrator. If + /// 'targetBytes' is zero, then it tries to reclaim all the memory from the + /// memory pools. The function returns the actual freed memory capacity in + /// bytes. If 'allowSpill' is true, it reclaims the used memory by spilling. + /// If 'allowAbort' is true, it reclaims the used memory by aborting the + /// queries with the most memory usage. If both are true, it first reclaims + /// the used memory by spilling and then aborts queries to reach the reclaim + /// target. /// - /// NOTE: the memory manager keeps 'candidates' valid during the arbitration - /// processing. - virtual bool growMemory( - MemoryPool* pool, - const std::vector<std::shared_ptr<MemoryPool>>& candidatePools, - uint64_t targetBytes) = 0; - - /// Invoked by the memory manager to shrink memory from a given list of memory - /// pools. The freed memory capacity is given back to the arbitrator. The - /// function returns the actual freed memory capacity in bytes. - virtual uint64_t shrinkMemory( - const std::vector<std::shared_ptr<MemoryPool>>& pools, - uint64_t targetBytes) = 0; + /// NOTE: The actual reclaimed used memory (hence system memory) may be less + /// than 'targetBytes' due to the accounting of free capacity reclaimed. This + /// is okay because when this method is called, the system is normally under + /// memory pressure, and there normally isn't much free capacity to reclaim. + /// So the reclaimed used memory in this case should be very close to + /// 'targetBytes' if enough used memory is reclaimable. We should improve this + /// in the future. + virtual uint64_t shrinkCapacity( + uint64_t targetBytes, + bool allowSpill = true, + bool allowAbort = false) = 0; /// The internal execution stats of the memory arbitrator. struct Stats { @@ -173,12 +172,16 @@ class MemoryArbitrator { uint64_t maxCapacityBytes{0}; /// The free memory capacity in bytes. uint64_t freeCapacityBytes{0}; + /// The free reserved memory capacity in bytes. + uint64_t freeReservedCapacityBytes{0}; /// The sum of all reclaim operation durations during arbitration in /// microseconds. uint64_t reclaimTimeUs{0}; /// The total number of times of the reclaim attempts that end up failing /// due to reclaiming at non-reclaimable stage. uint64_t numNonReclaimableAttempts{0}; + /// The total number of memory capacity shrinks. + uint64_t numShrinks{0}; Stats( uint64_t _numRequests, @@ -191,8 +194,10 @@ uint64_t _numReclaimedBytes, uint64_t _maxCapacityBytes, uint64_t _freeCapacityBytes, + uint64_t _freeReservedCapacityBytes, uint64_t _reclaimTimeUs, - uint64_t _numNonReclaimableAttempts); + uint64_t _numNonReclaimableAttempts, + uint64_t _numShrinks); Stats() = default; @@ -220,16 +225,24 @@ class MemoryArbitrator { protected: explicit MemoryArbitrator(const Config& config) : capacity_(config.capacity), - memoryPoolInitCapacity_(config.memoryPoolInitCapacity), - memoryPoolTransferCapacity_(config.memoryPoolTransferCapacity), arbitrationStateCheckCb_(config.arbitrationStateCheckCb) {} + /// Helper utilities used by the memory arbitrator implementations to call + /// protected methods of memory pool.
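A short sketch contrasting the two overloads (illustrative only; in Velox these are driven by the memory manager, and the names below are assumptions):

```cpp
// Illustrative sketch (not from this diff) of the two shrinkCapacity() forms.
using facebook::velox::memory::MemoryArbitrator;
using facebook::velox::memory::MemoryPool;

void relievePressure(MemoryArbitrator* arbitrator, MemoryPool* root) {
  // Per-pool form: returns up to 1GB of *free* (unused) capacity from 'root'
  // back to the arbitrator; used memory is untouched.
  arbitrator->shrinkCapacity(root, 1ULL << 30);

  // Global form: reclaims up to 1GB of *used* memory, first by spilling and
  // then, if still short of the target, by aborting the largest queries.
  arbitrator->shrinkCapacity(
      1ULL << 30, /*allowSpill=*/true, /*allowAbort=*/true);
}
```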
+ static bool + growPool(MemoryPool* pool, uint64_t growBytes, uint64_t reservationBytes); + + static uint64_t shrinkPool(MemoryPool* pool, uint64_t targetBytes); + const uint64_t capacity_; - const uint64_t memoryPoolInitCapacity_; - const uint64_t memoryPoolTransferCapacity_; const MemoryArbitrationStateCheckCB arbitrationStateCheckCb_; }; +/// Formatter for fmt. +FOLLY_ALWAYS_INLINE std::string format_as(MemoryArbitrator::Stats stats) { + return stats.toString(); +} + FOLLY_ALWAYS_INLINE std::ostream& operator<<( std::ostream& o, const MemoryArbitrator::Stats& stats) { @@ -262,16 +275,29 @@ class MemoryReclaimer { /// due to reclaiming at non-reclaimable stage. uint64_t numNonReclaimableAttempts{0}; + /// The total execution time to do the reclaim in microseconds. + uint64_t reclaimExecTimeUs{0}; + + /// The total reclaimed memory bytes. + uint64_t reclaimedBytes{0}; + + /// The total time of task pause during reclaim in microseconds. + uint64_t reclaimWaitTimeUs{0}; + void reset(); bool operator==(const Stats& other) const; bool operator!=(const Stats& other) const; + Stats& operator+=(const Stats& other); }; virtual ~MemoryReclaimer() = default; static std::unique_ptr<MemoryReclaimer> create(); + /// Invokes the memory reclaim function 'func' and records the execution + /// 'stats'. + static uint64_t run(const std::function<int64_t()>& func, Stats& stats); + /// Invoked by the memory arbitrator before entering the memory arbitration /// processing. The default implementation does nothing but user can override /// this if needs. For example, an operator memory reclaimer needed to put the @@ -300,10 +326,18 @@ class MemoryReclaimer { /// Invoked by the memory arbitrator to reclaim from memory 'pool' with /// specified 'targetBytes'. It is expected to reclaim at least that amount of /// memory bytes but there is no guarantees. If 'targetBytes' is zero, then it - /// reclaims all the reclaimable memory from the memory 'pool'. The function - /// returns the actual reclaimed memory bytes. - virtual uint64_t - reclaim(MemoryPool* pool, uint64_t targetBytes, Stats& stats); + /// reclaims all the reclaimable memory from the memory 'pool'. 'maxWaitMs' + /// specifies the max time to wait for reclaim if not zero. The memory + /// reclaim might fail if it exceeds the timeout. The function returns the + /// actual reclaimed memory bytes. + /// + /// NOTE: 'maxWaitMs' is optional and the actual memory reclaim implementation + /// can choose to respect this timeout or not on its own. + virtual uint64_t reclaim( + MemoryPool* pool, + uint64_t targetBytes, + uint64_t maxWaitMs, + Stats& stats); /// Invoked by the memory arbitrator to abort memory 'pool' and the associated /// query execution when encounters non-recoverable memory reclaim error or @@ -318,20 +352,74 @@ class MemoryReclaimer { MemoryReclaimer() = default; }; +/// Helper class used to measure the memory bytes reclaimed from a memory pool +/// by a memory reclaim function. +class ScopedReclaimedBytesRecorder { + public: + ScopedReclaimedBytesRecorder(MemoryPool* pool, int64_t* reclaimedBytes); + + ~ScopedReclaimedBytesRecorder(); + + private: + MemoryPool* const pool_; + int64_t* const reclaimedBytes_; + const int64_t reservedBytesBeforeReclaim_; +}; + +/// The object is used to set/clear non-reclaimable section of an operation in +/// the middle of its execution. It allows the memory arbitrator to reclaim +/// memory from a running operator which is waiting for memory arbitration. +/// 'nonReclaimableSection' points to the corresponding flag of the associated +/// operator.
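With the `format_as()` hook above, arbitrator stats can be passed to fmt directly; a small illustration (assuming fmt >= 10, where free `format_as` functions are picked up via ADL; the guarded formatter specialization later in this diff covers older versions):

```cpp
// Illustrative only: formatting Stats through the format_as() hook above.
#include <fmt/format.h>
#include "velox/common/memory/MemoryArbitrator.h"

std::string describe(
    const facebook::velox::memory::MemoryArbitrator::Stats& stats) {
  return fmt::format("arbitrator stats: {}", stats);  // calls stats.toString()
}
```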
+class ReclaimableSectionGuard { + public: + explicit ReclaimableSectionGuard(tsan_atomic<bool>* nonReclaimableSection) + : nonReclaimableSection_(nonReclaimableSection), + oldNonReclaimableSectionValue_(*nonReclaimableSection_) { + *nonReclaimableSection_ = false; + } + + ~ReclaimableSectionGuard() { + *nonReclaimableSection_ = oldNonReclaimableSectionValue_; + } + + private: + tsan_atomic<bool>* const nonReclaimableSection_; + const bool oldNonReclaimableSectionValue_; +}; + +class NonReclaimableSectionGuard { + public: + explicit NonReclaimableSectionGuard(tsan_atomic<bool>* nonReclaimableSection) + : nonReclaimableSection_(nonReclaimableSection), + oldNonReclaimableSectionValue_(*nonReclaimableSection_) { + *nonReclaimableSection_ = true; + } + + ~NonReclaimableSectionGuard() { + *nonReclaimableSection_ = oldNonReclaimableSectionValue_; + } + + private: + tsan_atomic<bool>* const nonReclaimableSection_; + const bool oldNonReclaimableSectionValue_; +}; + /// The memory arbitration context which is set on per-thread local variable by /// memory arbitrator. It is used to indicate a running thread is under memory /// arbitration processing or not. This helps to enable sanity check such as all /// the memory reservations during memory arbitration should come from the /// spilling memory pool. struct MemoryArbitrationContext { - const MemoryPool& requestor; + const MemoryPool* requestor; }; /// Object used to set/restore the memory arbitration context when a thread is /// under memory arbitration processing. class ScopedMemoryArbitrationContext { public: - explicit ScopedMemoryArbitrationContext(const MemoryPool& requestor); + explicit ScopedMemoryArbitrationContext(const MemoryPool* requestor); + ~ScopedMemoryArbitrationContext(); private: @@ -339,10 +427,73 @@ class ScopedMemoryArbitrationContext { MemoryArbitrationContext currentArbitrationCtx_; }; +/// Object used to set up the arbitration context for a memory pool. +class ScopedMemoryPoolArbitrationCtx { + public: + explicit ScopedMemoryPoolArbitrationCtx(MemoryPool* pool); + + ~ScopedMemoryPoolArbitrationCtx(); + + private: + MemoryPool* const pool_; +}; + /// Returns the memory arbitration context set by a per-thread local variable if /// the running thread is under memory arbitration processing. -MemoryArbitrationContext* memoryArbitrationContext(); +const MemoryArbitrationContext* memoryArbitrationContext(); /// Returns true if the running thread is under memory arbitration or not. bool underMemoryArbitration(); + +/// Creates an async memory reclaim task with memory arbitration context set. +/// This is to avoid recursive memory arbitration during memory reclaim. +/// +/// NOTE: this must be called under memory arbitration. +template <typename Item> +std::shared_ptr<AsyncSource<Item>> createAsyncMemoryReclaimTask( + std::function<std::unique_ptr<Item>()> task) { + auto* arbitrationCtx = memory::memoryArbitrationContext(); + return std::make_shared<AsyncSource<Item>>( + [asyncTask = std::move(task), arbitrationCtx]() -> std::unique_ptr<Item> { + std::unique_ptr<ScopedMemoryArbitrationContext> restoreArbitrationCtx; + if (arbitrationCtx != nullptr) { + restoreArbitrationCtx = + std::make_unique<ScopedMemoryArbitrationContext>( + arbitrationCtx->requestor); + } + return asyncTask(); + }); +} + +/// The function triggers memory arbitration by shrinking memory pools from +/// 'manager' by invoking its shrinkPools() API. If 'manager' is not set, then it +/// shrinks from the process wide memory manager. If 'targetBytes' is zero, then +/// it reclaims all the memory from 'manager' if possible. If 'allowSpill' is +/// true, then it may reclaim the used memory by spilling.
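A sketch of the guard pair in use (hypothetical operator code; the flag mirrors the per-operator field the class comment above refers to):

```cpp
// Sketch only (not from this diff): protecting a critical update with
// NonReclaimableSectionGuard. tsan_atomic comes from
// velox/common/base/Portability.h; under non-TSAN builds it is a plain alias.
#include "velox/common/memory/MemoryArbitrator.h"

using namespace facebook::velox;

struct FakeOperator {
  tsan_atomic<bool> nonReclaimableSection_{false};

  void updateInternalState() {
    // While this guard is alive, the arbitrator must not reclaim from this
    // operator; the previous flag value is restored on scope exit.
    memory::NonReclaimableSectionGuard guard(&nonReclaimableSection_);
    // ... mutate hash tables / buffers that reclaim must not observe ...
  }
};
```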
+class MemoryManager; +void testingRunArbitration( + uint64_t targetBytes = 0, + bool allowSpill = true, + MemoryManager* manager = nullptr); + +/// The function triggers memory arbitration by shrinking memory pools from +/// the 'manager' of 'pool' by invoking its shrinkPools() API. If 'targetBytes' +/// is zero, then it reclaims all the memory from 'manager' if possible. If +/// 'allowSpill' is true, then it may reclaim the used memory by spilling. +void testingRunArbitration( + MemoryPool* pool, + uint64_t targetBytes = 0, + bool allowSpill = true); } // namespace facebook::velox::memory + +#if FMT_VERSION < 100100 +template <> +struct fmt::formatter<facebook::velox::memory::MemoryArbitrator::Stats> + : formatter<std::string> { + auto format( + facebook::velox::memory::MemoryArbitrator::Stats s, + format_context& ctx) const { + return formatter<std::string>::format(s.toString(), ctx); + } +}; +#endif diff --git a/velox/common/memory/MemoryPool.cpp b/velox/common/memory/MemoryPool.cpp index ab2c3b026f471..2e647dc59a7a3 100644 --- a/velox/common/memory/MemoryPool.cpp +++ b/velox/common/memory/MemoryPool.cpp @@ -16,14 +16,22 @@ #include "velox/common/memory/MemoryPool.h" +#include #include +#include "velox/common/base/Counters.h" +#include "velox/common/base/StatsReporter.h" #include "velox/common/base/SuccinctPrinter.h" #include "velox/common/memory/Memory.h" #include "velox/common/testutil/TestValue.h" #include +DEFINE_bool( + velox_memory_pool_capacity_transfer_across_tasks, + false, + "Whether to allow memory capacity transfer between memory pools from different tasks, which might happen in use cases like Spark-Gluten"); + DECLARE_bool(velox_suppress_memory_capacity_exceeding_error_message); using facebook::velox::common::testutil::TestValue; @@ -60,18 +68,24 @@ namespace { struct MemoryUsage { std::string name; uint64_t currentUsage; + uint64_t reservedUsage; uint64_t peakUsage; bool operator>(const MemoryUsage& other) const { - return std::tie(currentUsage, peakUsage, name) > - std::tie(other.currentUsage, other.peakUsage, other.name); + return std::tie(reservedUsage, currentUsage, peakUsage, name) > + std::tie( + other.reservedUsage, + other.currentUsage, + other.peakUsage, + other.name); } std::string toString() const { return fmt::format( - "{} usage {} peak {}", + "{} usage {} reserved {} peak {}", name, succinctBytes(currentUsage), + succinctBytes(reservedUsage), succinctBytes(peakUsage)); } }; @@ -103,20 +117,25 @@ void treeMemoryUsageVisitor( MemoryPool* pool, size_t indent, MemoryUsageHeap& topLeafMemUsages, + bool skipEmptyPool, std::stringstream& out) { const MemoryPool::Stats stats = pool->stats(); - // Avoid logging empty pools. - if (stats.empty()) { + // Avoid logging empty pools if 'skipEmptyPool' is true.
+ if (stats.empty() && skipEmptyPool) { return; } const MemoryUsage usage{ .name = pool->name(), - .currentUsage = stats.currentBytes, + .currentUsage = stats.usedBytes, + .reservedUsage = stats.reservedBytes, .peakUsage = stats.peakBytes, }; out << std::string(indent, ' ') << usage.toString() << "\n"; if (pool->kind() == MemoryPool::Kind::kLeaf) { + if (stats.empty()) { + return; + } static const size_t kTopNLeafMessages = 10; topLeafMemUsages.push(usage); if (topLeafMemUsages.size() > kTopNLeafMessages) { @@ -124,11 +143,11 @@ void treeMemoryUsageVisitor( } return; } - pool->visitChildren( - [&, indent = indent + kCapMessageIndentSize](MemoryPool* pool) { - treeMemoryUsageVisitor(pool, indent, topLeafMemUsages, out); - return true; - }); + pool->visitChildren([&, indent = indent + kCapMessageIndentSize]( + MemoryPool* pool) { + treeMemoryUsageVisitor(pool, indent, topLeafMemUsages, skipEmptyPool, out); + return true; + }); } std::string capacityToString(int64_t capacity) { @@ -151,8 +170,9 @@ std::string capacityToString(int64_t capacity) { std::string MemoryPool::Stats::toString() const { return fmt::format( - "currentBytes:{} peakBytes:{} cumulativeBytes:{} numAllocs:{} numFrees:{} numReserves:{} numReleases:{} numShrinks:{} numReclaims:{} numCollisions:{}", - succinctBytes(currentBytes), + "usedBytes:{} reservedBytes:{} peakBytes:{} cumulativeBytes:{} numAllocs:{} numFrees:{} numReserves:{} numReleases:{} numShrinks:{} numReclaims:{} numCollisions:{} numCapacityGrowths:{}", + succinctBytes(usedBytes), + succinctBytes(reservedBytes), succinctBytes(peakBytes), succinctBytes(cumulativeBytes), numAllocs, @@ -161,28 +181,33 @@ std::string MemoryPool::Stats::toString() const { numReleases, numShrinks, numReclaims, - numCollisions); + numCollisions, + numCapacityGrowths); } bool MemoryPool::Stats::operator==(const MemoryPool::Stats& other) const { return std::tie( - currentBytes, + usedBytes, + reservedBytes, peakBytes, cumulativeBytes, numAllocs, numFrees, numReserves, numReleases, - numCollisions) == + numCollisions, + numCapacityGrowths) == std::tie( - other.currentBytes, + other.usedBytes, + other.reservedBytes, other.peakBytes, other.cumulativeBytes, other.numAllocs, other.numFrees, other.numReserves, other.numReleases, - other.numCollisions); + other.numCollisions, + other.numCapacityGrowths); } std::ostream& operator<<(std::ostream& os, const MemoryPool::Stats& stats) { @@ -201,8 +226,8 @@ MemoryPool::MemoryPool( maxCapacity_(parent_ == nullptr ? 
options.maxCapacity : kMaxMemory), trackUsage_(options.trackUsage), threadSafe_(options.threadSafe), - checkUsageLeak_(options.checkUsageLeak), - debugEnabled_(options.debugEnabled) { + debugEnabled_(options.debugEnabled), + coreOnAllocationFailureEnabled_(options.coreOnAllocationFailureEnabled) { VELOX_CHECK(!isRoot() || !isLeaf()); VELOX_CHECK_GT( maxCapacity_, 0, "Memory pool {} max capacity can't be zero", name_); @@ -213,6 +238,7 @@ MemoryPool::~MemoryPool() { VELOX_CHECK(children_.empty()); } +// static std::string MemoryPool::kindString(Kind kind) { switch (kind) { case Kind::kLeaf: @@ -249,7 +275,7 @@ MemoryPool* MemoryPool::root() const { } uint64_t MemoryPool::getChildCount() const { - folly::SharedMutex::ReadHolder guard{poolMutex_}; + std::shared_lock guard{poolMutex_}; return children_.size(); } @@ -257,7 +283,7 @@ void MemoryPool::visitChildren( const std::function<bool(MemoryPool*)>& visitor) const { std::vector<std::shared_ptr<MemoryPool>> children; { - folly::SharedMutex::ReadHolder guard{poolMutex_}; + std::shared_lock guard{poolMutex_}; children.reserve(children_.size()); for (auto& entry : children_) { auto child = entry.second.lock(); @@ -267,16 +293,15 @@ } } - // NOTE: we should call 'visitor' on child pool object out of - // 'poolMutex_' to avoid potential recursive locking issues. Firstly, the - // user provided 'visitor' might try to acquire this memory pool lock again. - // Secondly, the shared child pool reference created from the weak pointer - // might be the last reference if some other threads drop all the external - // references during this time window. Then drop of this last shared reference - // after 'visitor' call will trigger child memory pool destruction in that - // case. The child memory pool destructor will remove its weak pointer - // reference from the parent pool which needs to acquire this memory pool lock - // again. + // NOTE: we should call 'visitor' on child pool object out of 'poolMutex_' to + // avoid potential recursive locking issues. Firstly, the user provided + // 'visitor' might try to acquire this memory pool lock again. Secondly, the + // shared child pool reference created from the weak pointer might be the last + // reference if some other threads drop all the external references during + // this time window. Then drop of this last shared reference after 'visitor' + // call will trigger child memory pool destruction in that case. The child + // memory pool destructor will remove its weak pointer reference from the + // parent pool which needs to acquire this memory pool lock again. for (auto& child : children) { if (!visitor(child.get())) { return; } } @@ -287,51 +312,65 @@ std::shared_ptr<MemoryPool> MemoryPool::addLeafChild( const std::string& name, bool threadSafe, - std::unique_ptr<MemoryReclaimer> reclaimer) { + std::unique_ptr<MemoryReclaimer> _reclaimer) { CHECK_POOL_MANAGEMENT_OP(addLeafChild); + // NOTE: we shall only set a reclaimer in a child pool if its parent has also + // set one. Otherwise the pool is mis-configured.
+ VELOX_CHECK( + reclaimer() != nullptr || _reclaimer == nullptr, + "Child memory pool {} shall only set memory reclaimer if its parent {} has also set one", + name, + name_); - folly::SharedMutex::WriteHolder guard{poolMutex_}; + std::unique_lock guard{poolMutex_}; VELOX_CHECK_EQ( children_.count(name), 0, "Leaf child memory pool {} already exists in {}", name, - toString()); + name_); auto child = genChild( shared_from_this(), name, MemoryPool::Kind::kLeaf, threadSafe, - std::move(reclaimer)); + std::move(_reclaimer)); children_.emplace(name, child); return child; } std::shared_ptr<MemoryPool> MemoryPool::addAggregateChild( const std::string& name, - std::unique_ptr<MemoryReclaimer> reclaimer) { + std::unique_ptr<MemoryReclaimer> _reclaimer) { CHECK_POOL_MANAGEMENT_OP(addAggregateChild); + // NOTE: we shall only set a reclaimer in a child pool if its parent has also + // set one. Otherwise the pool is mis-configured. + VELOX_CHECK( + reclaimer() != nullptr || _reclaimer == nullptr, + "Child memory pool {} shall only set memory reclaimer if its parent {} has also set one", + name, + name_); - folly::SharedMutex::WriteHolder guard{poolMutex_}; + std::unique_lock guard{poolMutex_}; VELOX_CHECK_EQ( children_.count(name), 0, "Child memory pool {} already exists in {}", name, - toString()); + name_); auto child = genChild( shared_from_this(), name, MemoryPool::Kind::kAggregate, true, - std::move(reclaimer)); + std::move(_reclaimer)); children_.emplace(name, child); return child; } void MemoryPool::dropChild(const MemoryPool* child) { CHECK_POOL_MANAGEMENT_OP(dropChild); - folly::SharedMutex::WriteHolder guard{poolMutex_}; + std::unique_lock guard{poolMutex_}; const auto ret = children_.erase(child->name()); VELOX_CHECK_EQ( ret, @@ -341,6 +380,20 @@ void MemoryPool::dropChild(const MemoryPool* child) { toString()); } +bool MemoryPool::aborted() const { + if (parent_ != nullptr) { + return parent_->aborted(); + } + return aborted_; +} + +std::exception_ptr MemoryPool::abortError() const { + if (parent_ != nullptr) { + return parent_->abortError(); + } + return abortError_; +} + size_t MemoryPool::preferredSize(size_t size) { if (size < 8) { return 8; } @@ -365,26 +418,17 @@ MemoryPoolImpl::MemoryPoolImpl( Kind kind, std::shared_ptr<MemoryPool> parent, std::unique_ptr<MemoryReclaimer> reclaimer, - DestructionCallback destructionCb, const Options& options) : MemoryPool{name, kind, parent, options}, manager_{memoryManager}, - allocator_{&manager_->allocator()}, - destructionCb_(std::move(destructionCb)), + allocator_{manager_->allocator()}, + arbitrator_{manager_->arbitrator()}, debugPoolNameRegex_(debugEnabled_ ? *(debugPoolNameRegex().rlock()) : ""), reclaimer_(std::move(reclaimer)), // The memory manager sets the capacity through grow() according to the // actually used memory arbitration policy. capacity_(parent_ != nullptr ? kMaxMemory : 0) { VELOX_CHECK(options.threadSafe || isLeaf()); - // NOTE: we shall only set reclaimer in a child pool if its parent has also - // set. Otherwise. it should be mis-configured.
- VELOX_CHECK( - parent_ == nullptr || parent_->reclaimer() != nullptr || - reclaimer_ == nullptr, - "Child memory pool {} shall only set memory reclaimer if its parent {} has also set", - name_, - parent_->name()); } MemoryPoolImpl::~MemoryPoolImpl() { @@ -392,13 +436,31 @@ MemoryPoolImpl::~MemoryPoolImpl() { if (parent_ != nullptr) { toImpl(parent_)->dropChild(this); } - if (checkUsageLeak_) { - VELOX_CHECK( - (usedReservationBytes_ == 0) && (reservationBytes_ == 0) && - (minReservationBytes_ == 0), - "Bad memory usage track state: {}", - toString()); + + if (isLeaf()) { + if (usedReservationBytes_ > 0) { + VELOX_MEM_LOG(ERROR) << "Memory leak (Used memory): " << toString(); + RECORD_METRIC_VALUE( + kMetricMemoryPoolUsageLeakBytes, usedReservationBytes_); + } + + if (minReservationBytes_ > 0) { + VELOX_MEM_LOG(ERROR) << "Memory leak (Reserved Memory): " << toString(); + RECORD_METRIC_VALUE( + kMetricMemoryPoolReservationLeakBytes, minReservationBytes_); + } + } + VELOX_DCHECK( + (usedReservationBytes_ == 0) && (reservationBytes_ == 0) && + (minReservationBytes_ == 0), + "Bad memory usage track state: {}", + toString()); + + if (isRoot()) { + RECORD_HISTOGRAM_METRIC_VALUE( + kMetricMemoryPoolCapacityGrowCount, numCapacityGrowths_); } + if (destructionCb_ != nullptr) { destructionCb_(this); } @@ -411,7 +473,8 @@ MemoryPool::Stats MemoryPoolImpl::stats() const { MemoryPool::Stats MemoryPoolImpl::statsLocked() const { Stats stats; - stats.currentBytes = currentBytesLocked(); + stats.usedBytes = usedBytes(); + stats.reservedBytes = reservationBytes_; stats.peakBytes = peakBytes_; stats.cumulativeBytes = cumulativeBytes_; stats.numAllocs = numAllocs_; @@ -419,6 +482,7 @@ MemoryPool::Stats MemoryPoolImpl::statsLocked() const { stats.numReserves = numReserves_; stats.numReleases = numReleases_; stats.numCollisions = numCollisions_; + stats.numCapacityGrowths = numCapacityGrowths_; return stats; } @@ -429,11 +493,12 @@ void* MemoryPoolImpl::allocate(int64_t size) { void* buffer = allocator_->allocateBytes(alignedSize, alignment_); if (FOLLY_UNLIKELY(buffer == nullptr)) { release(alignedSize); - VELOX_MEM_ALLOC_ERROR(fmt::format( - "{} failed with {} from {}", + handleAllocationFailure(fmt::format( + "{} failed with {} from {} {}", __FUNCTION__, succinctBytes(size), - toString())); + toString(), + allocator_->getAndClearFailureMessage())); } DEBUG_RECORD_ALLOC(buffer, size); return buffer; @@ -447,12 +512,13 @@ void* MemoryPoolImpl::allocateZeroFilled(int64_t numEntries, int64_t sizeEach) { void* buffer = allocator_->allocateZeroFilled(alignedSize); if (FOLLY_UNLIKELY(buffer == nullptr)) { release(alignedSize); - VELOX_MEM_ALLOC_ERROR(fmt::format( - "{} failed with {} entries and {} each from {}", + handleAllocationFailure(fmt::format( + "{} failed with {} entries and {} each from {} {}", __FUNCTION__, numEntries, succinctBytes(sizeEach), - toString())); + toString(), + allocator_->getAndClearFailureMessage())); } DEBUG_RECORD_ALLOC(buffer, size); return buffer; @@ -466,12 +532,13 @@ void* MemoryPoolImpl::reallocate(void* p, int64_t size, int64_t newSize) { void* newP = allocator_->allocateBytes(alignedNewSize, alignment_); if (FOLLY_UNLIKELY(newP == nullptr)) { release(alignedNewSize); - VELOX_MEM_ALLOC_ERROR(fmt::format( - "{} failed with new {} and old {} from {}", + handleAllocationFailure(fmt::format( + "{} failed with new {} and old {} from {} {}", __FUNCTION__, succinctBytes(newSize), succinctBytes(size), - toString())); + toString(), + allocator_->getAndClearFailureMessage())); } 
DEBUG_RECORD_ALLOC(newP, newSize); if (p != nullptr) { @@ -505,7 +572,7 @@ void MemoryPoolImpl::allocateNonContiguous( if (!allocator_->allocateNonContiguous( numPages, out, - [this](int64_t allocBytes, bool preAllocate) { + [this](uint64_t allocBytes, bool preAllocate) { if (preAllocate) { reserve(allocBytes); } else { @@ -514,8 +581,12 @@ }, minSizeClass)) { VELOX_CHECK(out.empty()); - VELOX_MEM_ALLOC_ERROR(fmt::format( - "{} failed with {} pages from {}", __FUNCTION__, numPages, toString())); + handleAllocationFailure(fmt::format( + "{} failed with {} pages from {} {}", + __FUNCTION__, + numPages, + toString(), + allocator_->getAndClearFailureMessage())); } DEBUG_RECORD_ALLOC(out); VELOX_CHECK(!out.empty()); @@ -553,7 +624,7 @@ void MemoryPoolImpl::allocateContiguous( numPages, nullptr, out, - [this](int64_t allocBytes, bool preAlloc) { + [this](uint64_t allocBytes, bool preAlloc) { if (preAlloc) { reserve(allocBytes); } else { @@ -562,8 +633,12 @@ }, maxPages)) { VELOX_CHECK(out.empty()); - VELOX_MEM_ALLOC_ERROR(fmt::format( - "{} failed with {} pages from {}", __FUNCTION__, numPages, toString())); + handleAllocationFailure(fmt::format( + "{} failed with {} pages from {} {}", + __FUNCTION__, + numPages, + toString(), + allocator_->getAndClearFailureMessage())); } DEBUG_RECORD_ALLOC(out); VELOX_CHECK(!out.empty()); @@ -584,18 +659,19 @@ void MemoryPoolImpl::growContiguous( MachinePageCount increment, ContiguousAllocation& allocation) { if (!allocator_->growContiguous( - increment, allocation, [this](int64_t allocBytes, bool preAlloc) { + increment, allocation, [this](uint64_t allocBytes, bool preAlloc) { if (preAlloc) { reserve(allocBytes); } else { release(allocBytes); } })) { - VELOX_MEM_ALLOC_ERROR(fmt::format( - "{} failed with {} pages from {}", + handleAllocationFailure(fmt::format( + "{} failed with {} pages from {} {}", __FUNCTION__, increment, - toString())); + toString(), + allocator_->getAndClearFailureMessage())); } if (FOLLY_UNLIKELY(debugEnabled_)) { recordGrowDbg(allocation.data(), allocation.size()); } @@ -610,6 +686,38 @@ int64_t MemoryPoolImpl::capacity() const { return capacity_; } +int64_t MemoryPoolImpl::usedBytes() const { + if (isLeaf()) { + return usedReservationBytes_; + } + if (reservedBytes() == 0) { + return 0; + } + int64_t usedBytes{0}; + visitChildren([&](MemoryPool* pool) { + usedBytes += pool->usedBytes(); + return true; + }); + return usedBytes; +} + +int64_t MemoryPoolImpl::releasableReservation() const { + if (isLeaf()) { + std::lock_guard l(mutex_); + return std::max<int64_t>( + 0, reservationBytes_ - quantizedSize(usedReservationBytes_)); + } + if (reservedBytes() == 0) { + return 0; + } + int64_t releasableBytes{0}; + visitChildren([&](MemoryPool* pool) { + releasableBytes += pool->releasableReservation(); + return true; + }); + return releasableBytes; +} + std::shared_ptr<MemoryPool> MemoryPoolImpl::genChild( std::shared_ptr<MemoryPool> parent, const std::string& name, @@ -622,13 +730,12 @@ std::shared_ptr<MemoryPool> MemoryPoolImpl::genChild( kind, parent, std::move(reclaimer), - nullptr, Options{ .alignment = alignment_, .trackUsage = trackUsage_, .threadSafe = threadSafe, - .checkUsageLeak = checkUsageLeak_, - .debugEnabled = debugEnabled_}); + .debugEnabled = debugEnabled_, + .coreOnAllocationFailureEnabled = coreOnAllocationFailureEnabled_}); } bool MemoryPoolImpl::maybeReserve(uint64_t increment) { @@ -640,7 +747,7 @@ bool MemoryPoolImpl::maybeReserve(uint64_t increment) { const auto reservationToAdd =
bits::roundUp(increment, kGrowthQuantum); try { reserve(reservationToAdd, true); - } catch (const std::exception& e) { + } catch (const std::exception&) { if (aborted()) { // NOTE: we shall throw to stop the query execution if the root memory // pool has been aborted. It is also unsafe to proceed as the memory abort @@ -654,13 +761,6 @@ bool MemoryPoolImpl::maybeReserve(uint64_t increment) { } void MemoryPoolImpl::reserve(uint64_t size, bool reserveOnly) { - if (FOLLY_UNLIKELY(underMemoryArbitration() && !isSpillMemoryPool(this))) { - VELOX_FAIL( - "Unexpected non-spilling memory reservation from memory pool: {}, arbitration request pool: {}", - name(), - memoryArbitrationContext()->requestor.name()); - } - if (FOLLY_LIKELY(trackUsage_)) { if (FOLLY_LIKELY(threadSafe_)) { reserveThreadSafe(size, reserveOnly); @@ -698,7 +798,7 @@ void MemoryPoolImpl::reserveThreadSafe(uint64_t size, bool reserveOnly) { "facebook::velox::memory::MemoryPoolImpl::reserveThreadSafe", this); try { incrementReservationThreadSafe(this, increment); - } catch (const std::exception& e) { + } catch (const std::exception&) { // When race with concurrent memory reservation free, we might end up with // unused reservation but no used reservation if a retry memory // reservation attempt run into memory capacity exceeded error. @@ -732,63 +832,81 @@ bool MemoryPoolImpl::incrementReservationThreadSafe( } } - { - std::lock_guard l(mutex_); - if (maybeIncrementReservationLocked(size)) { - return true; - } + if (maybeIncrementReservation(size)) { + return true; } + VELOX_CHECK_NULL(parent_); - if (manager_->growPool(requestor, size)) { + if (growCapacity(requestor, size)) { TestValue::adjust( "facebook::velox::memory::MemoryPoolImpl::incrementReservationThreadSafe::AfterGrowCallback", this); - // NOTE: the memory reservation might still fail even if the memory grow - // callback succeeds. The reason is that we don't hold the root tracker's - // mutex lock while running the grow callback. Therefore, there is a - // possibility in theory that a concurrent memory reservation request - // might steal away the increased memory capacity after the grow callback - // finishes and before we increase the reservation. If it happens, we can - // simply fall back to retry the memory reservation from the leaf memory - // pool which should happen rarely. - return maybeIncrementReservation(size); + // NOTE: if memory arbitration succeeds, it should have already committed + // the reservation 'size' in the root memory pool. + return true; } VELOX_MEM_POOL_CAP_EXCEEDED(fmt::format( - "Exceeded memory pool cap of {} with max {} when requesting {}, memory " - "manager cap is {}, requestor '{}' with current usage {}\n{}", + "Exceeded memory pool capacity after attempt to grow capacity " + "through arbitration. 
Requestor pool name '{}', request size {}, memory " + "pool capacity {}, memory pool max capacity {}, memory manager capacity " + "{}, current usage {}\n{}", + requestor->name(), + succinctBytes(size), capacityToString(capacity()), capacityToString(maxCapacity_), - succinctBytes(size), capacityToString(manager_->capacity()), - requestor->name(), - succinctBytes(requestor->currentBytes()), + succinctBytes(requestor->usedBytes()), treeMemoryUsage())); } -bool MemoryPoolImpl::maybeIncrementReservation(uint64_t size) { - std::lock_guard l(mutex_); - return maybeIncrementReservationLocked(size); +bool MemoryPoolImpl::growCapacity(MemoryPool* requestor, uint64_t size) { + VELOX_CHECK(requestor->isLeaf()); + ++numCapacityGrowths_; + + bool success{false}; + { + ScopedMemoryPoolArbitrationCtx arbitrationCtx(requestor); + success = arbitrator_->growCapacity(this, size); + } + // The memory pool might have been aborted during the time it leaves the + // arbitration no matter the arbitration succeed or not. + if (FOLLY_UNLIKELY(aborted())) { + if (success) { + // Release the reservation committed by the memory arbitration on success. + decrementReservation(size); + } + VELOX_CHECK_NOT_NULL(abortError()); + std::rethrow_exception(abortError()); + } + return success; } -bool MemoryPoolImpl::maybeIncrementReservationLocked(uint64_t size) { +bool MemoryPoolImpl::maybeIncrementReservation(uint64_t size) { + std::lock_guard l(mutex_); if (isRoot()) { - if (aborted()) { - // This memory pool has been aborted by the memory arbitrator. Abort to - // prevent this pool from triggering memory arbitration. The associated - // query should also abort soon. - VELOX_MEM_POOL_ABORTED("This memory pool has been aborted."); - } - if (reservationBytes_ + size > capacity_) { + checkIfAborted(); + + // NOTE: we allow memory pool to overuse its memory during the memory + // arbitration process. The memory arbitration process itself needs to + // ensure the memory pool usage of the memory pool is within the capacity + // limit after the arbitration operation completes. 
+ if (FOLLY_UNLIKELY( + (reservationBytes_ + size > capacity_) && + !underMemoryArbitration())) { return false; } } - reservationBytes_ += size; + incrementReservationLocked(size); + return true; +} + +void MemoryPoolImpl::incrementReservationLocked(uint64_t bytes) { + reservationBytes_ += bytes; if (!isLeaf()) { - cumulativeBytes_ += size; + cumulativeBytes_ += bytes; maybeUpdatePeakBytesLocked(reservationBytes_); } - return true; } void MemoryPoolImpl::release() { @@ -849,9 +967,9 @@ void MemoryPoolImpl::decrementReservation(uint64_t size) noexcept { sanityCheckLocked(); } -std::string MemoryPoolImpl::treeMemoryUsage() const { +std::string MemoryPoolImpl::treeMemoryUsage(bool skipEmptyPool) const { if (parent_ != nullptr) { - return parent_->treeMemoryUsage(); + return parent_->treeMemoryUsage(skipEmptyPool); } if (FLAGS_velox_suppress_memory_capacity_exceeding_error_message) { return ""; } @@ -862,14 +980,15 @@ const Stats stats = statsLocked(); const MemoryUsage usage{ .name = name(), - .currentUsage = stats.currentBytes, + .currentUsage = stats.usedBytes, + .reservedUsage = stats.reservedBytes, .peakUsage = stats.peakBytes}; out << usage.toString() << "\n"; } MemoryUsageHeap topLeafMemUsages; visitChildren([&, indent = kCapMessageIndentSize](MemoryPool* pool) { - treeMemoryUsageVisitor(pool, indent, topLeafMemUsages, out); + treeMemoryUsageVisitor(pool, indent, topLeafMemUsages, skipEmptyPool, out); return true; }); @@ -892,7 +1011,11 @@ uint64_t MemoryPoolImpl::freeBytes() const { if (capacity_ == kMaxMemory) { return 0; } - VELOX_CHECK_GE(capacity_, reservationBytes_); + if (capacity_ < reservationBytes_) { + // NOTE: the memory reservation could be temporarily larger than its + // capacity if this memory pool is under memory arbitration processing. + return 0; + } return capacity_ - reservationBytes_; } @@ -915,21 +1038,27 @@ MemoryReclaimer* MemoryPoolImpl::reclaimer() const { return reclaimer_.get(); } -bool MemoryPoolImpl::reclaimableBytes(uint64_t& reclaimableBytes) const { - reclaimableBytes = 0; +std::optional<uint64_t> MemoryPoolImpl::reclaimableBytes() const { if (reclaimer() == nullptr) { - return false; + return std::nullopt; } - return reclaimer()->reclaimableBytes(*this, reclaimableBytes); + + uint64_t reclaimableBytes = 0; + if (!reclaimer()->reclaimableBytes(*this, reclaimableBytes)) { + return std::nullopt; + } + + return reclaimableBytes; } uint64_t MemoryPoolImpl::reclaim( uint64_t targetBytes, + uint64_t maxWaitMs, memory::MemoryReclaimer::Stats& stats) { if (reclaimer() == nullptr) { return 0; } - return reclaimer()->reclaim(this, targetBytes, stats); + return reclaimer()->reclaim(this, targetBytes, maxWaitMs, stats); } void MemoryPoolImpl::enterArbitration() { @@ -946,7 +1075,7 @@ void MemoryPoolImpl::leaveArbitration() noexcept { uint64_t MemoryPoolImpl::shrink(uint64_t targetBytes) { if (parent_ != nullptr) { - return parent_->shrink(targetBytes); + return toImpl(parent_)->shrink(targetBytes); } std::lock_guard l(mutex_); // We don't expect to shrink a memory pool without capacity limit.
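The protected `growPool()`/`shrinkPool()` helpers introduced on MemoryArbitrator earlier in this diff pair with the `grow()` implementation in the next hunk. A sketch of how an arbitrator implementation might move free capacity between pools (hypothetical subclass; the remaining overrides are elided):

```cpp
// Sketch only (not from this diff): transferring free capacity between two
// root pools from inside a MemoryArbitrator subclass. shrinkPool() takes free
// capacity from 'from'; growPool() atomically raises the capacity of 'to'
// and, if requested, commits 'reservationBytes' of it, so a failed grow
// leaves 'to' unchanged and we can hand the capacity back to 'from'.
class MyArbitrator : public facebook::velox::memory::MemoryArbitrator {
  bool transferCapacity(
      facebook::velox::memory::MemoryPool* from,
      facebook::velox::memory::MemoryPool* to,
      uint64_t bytes,
      uint64_t reservationBytes) {
    const uint64_t freed = shrinkPool(from, bytes);
    if (!growPool(to, freed, reservationBytes)) {
      growPool(from, freed, 0);  // undo: return the capacity to 'from'
      return false;
    }
    return true;
  }
  // ... pure-virtual overrides elided ...
};
```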
@@ -959,27 +1088,29 @@ uint64_t MemoryPoolImpl::shrink(uint64_t targetBytes) { return freeBytes; } -uint64_t MemoryPoolImpl::grow(uint64_t bytes) noexcept { +bool MemoryPoolImpl::grow(uint64_t growBytes, uint64_t reservationBytes) { if (parent_ != nullptr) { - return parent_->grow(bytes); + return toImpl(parent_)->grow(growBytes, reservationBytes); } // TODO: add to prevent from growing beyond the max capacity and the // corresponding support in memory arbitrator. std::lock_guard l(mutex_); // We don't expect to grow a memory pool without capacity limit. VELOX_CHECK_NE(capacity_, kMaxMemory, "Can't grow with unlimited capacity"); - VELOX_CHECK_LE( - capacity_ + bytes, maxCapacity_, "Can't grow beyond the max capacity"); - capacity_ += bytes; - VELOX_CHECK_GE(capacity_, bytes); - return capacity_; -} + if (capacity_ + growBytes > maxCapacity_) { + return false; + } + if (reservationBytes_ + reservationBytes > capacity_ + growBytes) { + return false; + } -bool MemoryPoolImpl::aborted() const { - if (parent_ != nullptr) { - return parent_->aborted(); + capacity_ += growBytes; + VELOX_CHECK_GE(capacity_, growBytes); + if (reservationBytes > 0) { + incrementReservationLocked(reservationBytes); + VELOX_CHECK_LE(reservationBytes, reservationBytes_); } - return aborted_; + return true; } void MemoryPoolImpl::abort(const std::exception_ptr& error) { @@ -991,10 +1122,37 @@ void MemoryPoolImpl::abort(const std::exception_ptr& error) { if (reclaimer() == nullptr) { VELOX_FAIL("Can't abort the memory pool {} without reclaimer", name_); } - aborted_ = true; + setAbortError(error); reclaimer()->abort(this, error); } +void MemoryPoolImpl::setAbortError(const std::exception_ptr& error) { + VELOX_CHECK( + !aborted_, + "Trying to set another abort error on an already aborted pool."); + abortError_ = error; + aborted_ = true; +} + +void MemoryPoolImpl::checkIfAborted() const { + if (FOLLY_UNLIKELY(aborted())) { + VELOX_CHECK_NOT_NULL(abortError()); + std::rethrow_exception(abortError()); + } +} + +void MemoryPoolImpl::setDestructionCallback( + const DestructionCallback& callback) { + VELOX_CHECK_NOT_NULL(callback); + VELOX_CHECK( + isRoot(), + "Only root memory pool allows to set destruction callbacks: {}", + name_); + std::lock_guard l(mutex_); + VELOX_CHECK_NULL(destructionCb_); + destructionCb_ = callback; +} + void MemoryPoolImpl::testingSetCapacity(int64_t bytes) { if (parent_ != nullptr) { return toImpl(parent_)->testingSetCapacity(bytes); } @@ -1003,6 +1161,14 @@ void MemoryPoolImpl::testingSetCapacity(int64_t bytes) { capacity_ = bytes; } +void MemoryPoolImpl::testingSetReservation(int64_t bytes) { + if (parent_ != nullptr) { + return toImpl(parent_)->testingSetReservation(bytes); + } + std::lock_guard l(mutex_); + reservationBytes_ = bytes; +} + bool MemoryPoolImpl::needRecordDbg(bool /* isAlloc */) { if (!debugPoolNameRegex_.empty()) { return RE2::FullMatch(name_, debugPoolNameRegex_); } @@ -1016,10 +1182,10 @@ void MemoryPoolImpl::recordAllocDbg(const void* addr, uint64_t size) { if (!needRecordDbg(true)) { return; } - const auto stackTrace = process::StackTrace().toString(); std::lock_guard l(debugAllocMutex_); debugAllocRecords_.emplace( - reinterpret_cast<uint64_t>(addr), AllocationRecord{size, stackTrace}); + reinterpret_cast<uint64_t>(addr), + AllocationRecord{size, process::StackTrace()}); } void MemoryPoolImpl::recordAllocDbg(const Allocation& allocation) { @@ -1060,7 +1226,7 @@ void MemoryPoolImpl::recordFreeDbg(const void* addr, uint64_t size) { "{}\n", size, allocRecord.size, - allocRecord.callStack,
allocRecord.callStack.toString(), freeStackTrace)); } debugAllocRecords_.erase(addrUint64); @@ -1105,12 +1271,51 @@ void MemoryPoolImpl::leakCheckDbg() { std::ostream oss(&buf); oss << "Detected total of " << debugAllocRecords_.size() << " leaked allocations:\n"; + struct AllocationStats { + uint64_t size{0}; + uint64_t numAllocations{0}; + }; + std::unordered_map<std::string, AllocationStats> sizeAggregatedRecords; for (const auto& itr : debugAllocRecords_) { const auto& allocationRecord = itr.second; - oss << "======== Leaked memory allocation of " << allocationRecord.size - << " bytes ========\n" - << allocationRecord.callStack; + const auto stackStr = allocationRecord.callStack.toString(); + if (sizeAggregatedRecords.count(stackStr) == 0) { + sizeAggregatedRecords[stackStr] = AllocationStats(); + } + sizeAggregatedRecords[stackStr].size += allocationRecord.size; + ++sizeAggregatedRecords[stackStr].numAllocations; + } + std::vector<std::pair<std::string, AllocationStats>> sortedRecords( + sizeAggregatedRecords.begin(), sizeAggregatedRecords.end()); + std::sort( + sortedRecords.begin(), + sortedRecords.end(), + [](const std::pair<std::string, AllocationStats>& a, + const std::pair<std::string, AllocationStats>& b) { + return a.second.size > b.second.size; + }); + for (const auto& pair : sortedRecords) { + oss << "======== Leaked memory from " << pair.second.numAllocations + << " total allocations of " << succinctBytes(pair.second.size) + << " total size ========\n" + << pair.first << "\n"; } VELOX_FAIL(buf.str()); } + +void MemoryPoolImpl::handleAllocationFailure( + const std::string& failureMessage) { + if (coreOnAllocationFailureEnabled_) { + VELOX_MEM_LOG(ERROR) << failureMessage; + // SIGBUS is one of the standard signals in Linux that triggers a core dump. + // Normally it is raised by the operating system when a misaligned memory + // access occurs. On x86 and aarch64 misaligned access is allowed by default, + // hence this signal should never occur naturally. Raising a signal other + // than SIGABRT makes it easier to distinguish an allocation failure from + // any other crash. + raise(SIGBUS); + } + + VELOX_MEM_ALLOC_ERROR(failureMessage); +} } // namespace facebook::velox::memory diff --git a/velox/common/memory/MemoryPool.h b/velox/common/memory/MemoryPool.h index b12e0799a29c2..1f3b33ed3a906 100644 --- a/velox/common/memory/MemoryPool.h +++ b/velox/common/memory/MemoryPool.h @@ -22,6 +22,7 @@ #include #include +#include #include "velox/common/base/BitUtil.h" #include "velox/common/base/Exceptions.h" #include "velox/common/base/Portability.h" @@ -32,6 +33,11 @@ DECLARE_bool(velox_memory_leak_check_enabled); DECLARE_bool(velox_memory_pool_debug_enabled); +DECLARE_bool(velox_memory_pool_capacity_transfer_across_tasks); + +namespace facebook::velox::exec { +class ParallelMemoryReclaimer; +} namespace facebook::velox::memory { #define VELOX_MEM_POOL_CAP_EXCEEDED(errorMessage) \ @@ -56,9 +62,6 @@ class MemoryManager; constexpr int64_t kMaxMemory = std::numeric_limits<int64_t>::max(); -/// Sets the memory reclaimer to the provided memory pool. -using SetMemoryReclaimer = std::function<void(MemoryPool* pool)>; - /// This class provides the memory allocation interfaces for a query execution. /// Each query execution entity creates a dedicated memory pool object. The /// memory pool objects from a query are organized as a tree with four levels @@ -79,33 +82,33 @@ using SetMemoryReclaimer = std::function<void(MemoryPool* pool)>; /// one per each query plan node. The task pool is the parent of all the node /// pools from the task's physical query plan fragment. The node pool is created /// by the first operator instantiated for the corresponding plan node.
It is -/// owned by Task via 'childPools_' +/// owned by Task via 'childPools_'. /// /// The bottom level consists of per-operator pools. These are children of the /// node pool that corresponds to the plan node from which the operator is /// created. Operator and node pools are owned by the Task via 'childPools_'. /// -/// The query pool is created from MemoryManager::getChild() as a child of a -/// singleton root pool object (system pool). There is only one system pool for -/// a velox process. Hence each query pool objects forms a subtree rooted from -/// the system pool. +/// The query pool is created from MemoryManager::addRootPool(); it has no +/// parent and is the root node of its corresponding subtree. Each query pool is +/// owned by QueryCtx (such as in Prestissimo), and the memory manager also +/// tracks the currently alive query pools in MemoryManager::pools_ through weak +/// pointers. /// /// Each child pool object holds a shared reference to its parent pool object. -/// The parent object tracks its child pool objects through the raw pool object -/// pointer protected by a mutex. The child pool object destruction first -/// removes its raw pointer from its parent through dropChild() and then drops -/// the shared reference on the parent. +/// The parent object tracks its child pool objects through weak pointers +/// protected by a mutex. The child pool object destruction first removes its +/// weak pointer from its parent through dropChild() and then drops the shared +/// reference on the parent. /// /// NOTE: for the users that integrate at expression evaluation level, we don't /// need to build the memory pool hierarchy as described above. Users can either -/// create a single memory pool from MemoryManager::getChild() to share with +/// create a single memory pool from MemoryManager::addLeafPool() to share with /// all the concurrent expression evaluations or create one dedicated memory /// pool for each expression evaluation if they need per-expression memory quota /// enforcement. /// /// In addition to providing memory allocation functions, the memory pool object -/// also provides memory usage accounting through MemoryUsageTracker. This will -/// be merged into memory pool object later. +/// also provides memory usage accounting. class MemoryPool : public std::enable_shared_from_this<MemoryPool> { public: /// Defines the kinds of a memory pool. @@ -135,7 +138,7 @@ class MemoryPool : public std::enable_shared_from_this<MemoryPool> { /// tracking and the capacity enforcement on top of that, but are sensitive /// to its cpu cost so we provide an options for user to turn it off. We can /// only turn on/off this feature at the root memory pool and automatically - /// applies to all its child pools , and we don't support to selectively + /// applies to all its child pools, and we don't support to selectively /// enable it on a subset of memory pools. bool trackUsage{true}; @@ -148,18 +151,13 @@ class MemoryPool : public std::enable_shared_from_this<MemoryPool> { /// memory pools from the same root memory pool independently. bool threadSafe{true}; - /// TODO: deprecate this flag after all the existing memory leak use cases - /// have been fixed. - /// - /// If true, checks the memory usage leak on destruction. - /// - /// NOTE: user can turn on/off the memory leak check of each individual - /// memory pools from the same root memory pool independently.
- bool checkUsageLeak{FLAGS_velox_memory_leak_check_enabled}; - /// If true, tracks the allocation and free call stacks to detect the source /// of memory leak for testing purpose. bool debugEnabled{FLAGS_velox_memory_pool_debug_enabled}; + + /// Terminates the process and generates a core file on an allocation + /// failure. + bool coreOnAllocationFailureEnabled{false}; }; /// Constructs a named memory pool with specified 'name', 'parent' and 'kind'. @@ -177,6 +175,7 @@ class MemoryPool : public std::enable_shared_from_this<MemoryPool> { virtual ~MemoryPool(); /// Tree methods used to access and manage the memory hierarchy. + /// Returns the name of this memory pool. virtual const std::string& name() const; @@ -208,17 +207,9 @@ class MemoryPool : public std::enable_shared_from_this<MemoryPool> { return threadSafe_; } - /// Returns true if this memory pool checks memory leak on destruction. - /// Used only for test purposes. - virtual bool testingCheckUsageLeak() const { - return checkUsageLeak_; - } - - /// Invoked to traverse the memory pool subtree rooted at this, and calls - /// 'visitor' on each visited child memory pool with the parent pool's - /// 'poolMutex_' reader lock held. The 'visitor' must not access the - /// parent memory pool to avoid the potential recursive locking issues. Note - /// that the traversal stops if 'visitor' returns false. + /// Invoked to visit the memory pool's direct children, and calls 'visitor' on + /// each visited child memory pool. Note that the traversal stops if 'visitor' + /// returns false. virtual void visitChildren( const std::function<bool(MemoryPool*)>& visitor) const; @@ -247,9 +238,7 @@ class MemoryPool : public std::enable_shared_from_this<MemoryPool> { virtual void* allocateZeroFilled(int64_t numEntries, int64_t sizeEach) = 0; /// Re-allocates from an existing buffer with 'newSize' and update memory - /// usage counting accordingly. If 'newSize' is larger than the current buffer - /// 'size', the function will allocate a new buffer and free the old buffer. - /// If the new allocation fails, this method will throw and not free 'p'. + /// usage counting accordingly. virtual void* reallocate(void* p, int64_t size, int64_t newSize) = 0; /// Frees an allocated buffer. @@ -328,21 +317,30 @@ class MemoryPool : public std::enable_shared_from_this<MemoryPool> { /// 'capacity()' is fixed and set to 'maxCapacity()' on creation. virtual int64_t capacity() const = 0; - /// Returns the currently used memory in bytes of this memory pool. - virtual int64_t currentBytes() const = 0; + /// Returns the currently used memory in bytes of this memory pool. For + /// non-leaf memory pool, the function returns the aggregated used memory from + /// all its child memory pools. + virtual int64_t usedBytes() const = 0; /// Returns the peak memory usage in bytes of this memory pool. virtual int64_t peakBytes() const = 0; - /// Returns the reserved but not used memory reservation in bytes of this - /// memory pool. + /// Returns the reserved but not used memory in bytes of this memory pool. /// - /// NOTE: this is always zero for non-leaf memory pool as it only aggregate + /// NOTE: this is always zero for non-leaf memory pool as it only aggregates /// the memory reservations from its child memory pools but not /// differentiating whether the aggregated reservations have been actually /// used in child pools or not. virtual int64_t availableReservation() const = 0; + /// Returns the reserved but not used memory in bytes that can be released by + /// calling 'release()'.
This might be different from 'availableReservation()' + /// because leaf memory pool makes quantized memory reservation. + /// + /// NOTE: For non-leaf memory pool, it returns the aggregated releasable + /// memory reservations from all its leaf memory pools. + virtual int64_t releasableReservation() const = 0; + /// Returns the reserved memory reservation in bytes including both used and /// unused reservations. virtual int64_t reservedBytes() const = 0; @@ -366,16 +364,6 @@ class MemoryPool : public std::enable_shared_from_this<MemoryPool> { /// without actually freeing the used memory. virtual uint64_t freeBytes() const = 0; - /// Invoked to free up to the specified amount of free memory by reducing - /// this memory pool's capacity without actually freeing any used memory. The - /// function returns the actually freed memory capacity in bytes. If - /// 'targetBytes' is zero, the function frees all the free memory capacity. - virtual uint64_t shrink(uint64_t targetBytes = 0) = 0; - - /// Invoked to increase the memory pool's capacity by 'bytes'. The function - /// returns the memory pool's capacity after the growth. - virtual uint64_t grow(uint64_t bytes) noexcept = 0; - /// Sets the memory reclaimer for this memory pool. /// /// NOTE: this shall only be called at most once if the memory pool hasn't set @@ -385,31 +373,23 @@ class MemoryPool : public std::enable_shared_from_this<MemoryPool> { /// Returns the memory reclaimer of this memory pool if not null. virtual MemoryReclaimer* reclaimer() const = 0; - /// Invoked by the memory arbitrator to enter memory arbitration processing. - /// It is a noop if 'reclaimer_' is not set, otherwise invoke the reclaimer's - /// corresponding method. - virtual void enterArbitration() = 0; - - /// Invoked by the memory arbitrator to leave memory arbitration processing. - /// It is a noop if 'reclaimer_' is not set, otherwise invoke the reclaimer's - /// corresponding method. - virtual void leaveArbitration() noexcept = 0; - - /// Returns how many bytes is reclaimable from this memory pool. The function - /// returns true if this memory pool is reclaimable, and returns the estimated - /// reclaimable bytes in 'reclaimableBytes'. If 'reclaimer_' is not set, the - /// function returns false, otherwise invoke the reclaimer's corresponding - /// method. - virtual bool reclaimableBytes(uint64_t& reclaimableBytes) const = 0; + /// Estimates the number of reclaimable bytes in this memory pool. If the + /// 'reclaimer' is not set, the function returns std::nullopt. Otherwise, it + /// invokes the corresponding method of the reclaimer. + virtual std::optional<uint64_t> reclaimableBytes() const = 0; /// Invoked by the memory arbitrator to reclaim memory from this memory pool /// with specified reclaim target bytes. If 'targetBytes' is zero, then it /// tries to reclaim all the reclaimable memory from the memory pool. It is /// noop if the reclaimer is not set, otherwise invoke the reclaimer's - /// corresponding method. The function returns the actually freed capacity - /// from the root of this memory pool. + /// corresponding method. If not zero, 'maxWaitMs' specifies the max time in + /// milliseconds to wait for reclaim. The memory reclaim might fail if it + /// exceeds the timeout. The function returns the actually freed capacity from + /// the root of this memory pool. virtual uint64_t reclaim( uint64_t targetBytes, + uint64_t maxWaitMs, memory::MemoryReclaimer::Stats& stats) = 0; /// Invoked by the memory arbitrator to abort a root memory pool.
The function @@ -422,12 +402,14 @@ virtual void abort(const std::exception_ptr& error) = 0; /// Returns true if this memory pool has been aborted. - virtual bool aborted() const = 0; + virtual bool aborted() const; /// The memory pool's execution stats. struct Stats { /// The current memory usage. - uint64_t currentBytes{0}; + uint64_t usedBytes{0}; + /// The current reserved memory. + uint64_t reservedBytes{0}; /// The peak memory usage. uint64_t peakBytes{0}; /// The accumulative memory usage. @@ -454,16 +436,21 @@ class MemoryPool : public std::enable_shared_from_this<MemoryPool> { /// The number of internal memory reservation collisions caused by /// concurrent memory requests. uint64_t numCollisions{0}; + /// The number of memory capacity growth attempts through the memory + /// arbitration. + /// + /// NOTE: this only applies to the root memory pool. + uint64_t numCapacityGrowths{0}; bool operator==(const Stats& rhs) const; std::string toString() const; - /// Returns true if the current bytes is zero. + /// Returns true if the current and reserved bytes are zero. /// Note that peak or cumulative bytes might be non-zero and we are still /// empty at this moment. bool empty() const { - return currentBytes == 0; + return usedBytes == 0 && reservedBytes == 0; } }; @@ -473,8 +460,9 @@ class MemoryPool : public std::enable_shared_from_this<MemoryPool> { virtual std::string toString() const = 0; /// Invoked to generate a descriptive memory usage summary of the entire tree. - /// MemoryPoolImpl::treeMemoryUsage() - virtual std::string treeMemoryUsage() const = 0; + /// If 'skipEmptyPool' is true, then skip printing the child memory pools + /// with empty memory usage. + virtual std::string treeMemoryUsage(bool skipEmptyPool = true) const = 0; /// Indicates if this is a leaf memory pool or not. FOLLY_ALWAYS_INLINE bool isLeaf() const { @@ -502,8 +490,35 @@ class MemoryPool : public std::enable_shared_from_this<MemoryPool> { protected: static constexpr uint64_t kMB = 1 << 20; - /// Invoked by addChild() to create a child memory pool object. 'parent' is - /// a shared pointer created from this. + /// Invoked by the memory arbitrator to enter memory arbitration processing. + /// It is a noop if 'reclaimer' is not set, otherwise invoke the reclaimer's + /// corresponding method. + virtual void enterArbitration() = 0; + + /// Invoked by the memory arbitrator to leave memory arbitration processing. + /// It is a noop if 'reclaimer' is not set, otherwise invoke the reclaimer's + /// corresponding method. + virtual void leaveArbitration() noexcept = 0; + + /// Invoked to free up to the specified amount of free memory by reducing + /// this memory pool's capacity without actually freeing any used memory. The + /// function returns the actually freed memory capacity in bytes. If + /// 'targetBytes' is zero, the function frees all the free memory capacity. + virtual uint64_t shrink(uint64_t targetBytes = 0) = 0; + + /// Invoked to increase the memory pool's capacity by 'growBytes' and commit + /// the reservation by 'reservationBytes'. The function makes the two updates + /// atomic. The function returns true if the updates succeed; otherwise it + /// returns false and neither change is applied. + /// + /// NOTE: this should only be called by memory arbitrator when a root memory + /// pool tries to grow its capacity for a new reservation request which + /// exceeds its current capacity limit.
+ virtual bool grow(uint64_t growBytes, uint64_t reservationBytes = 0) = 0; + + /// Invoked by addLeafChild() and addAggregateChild() to create a child memory + /// pool object. 'parent' is a shared pointer created from this, i.e., + /// shared_from_this(). virtual std::shared_ptr<MemoryPool> genChild( std::shared_ptr<MemoryPool> parent, const std::string& name, @@ -511,6 +526,8 @@ bool threadSafe, std::unique_ptr<MemoryReclaimer> reclaimer) = 0; + virtual std::exception_ptr abortError() const; + /// Invoked only on destruction to remove this memory pool from its parent's /// child memory pool tracking. virtual void dropChild(const MemoryPool* child); @@ -522,8 +539,8 @@ const int64_t maxCapacity_; const bool trackUsage_; const bool threadSafe_; - const bool checkUsageLeak_; const bool debugEnabled_; + const bool coreOnAllocationFailureEnabled_; /// Indicates if the memory pool has been aborted by the memory arbitrator or /// not. @@ -531,15 +548,22 @@ /// NOTE: this flag is only set for a root memory pool if it has a memory /// reclaimer. We process a query abort request from the root memory pool. std::atomic<bool> aborted_{false}; + /// Saves the abort error exception, which is only set if 'aborted_' is true. + std::exception_ptr abortError_{nullptr}; mutable folly::SharedMutex poolMutex_; - // NOTE: we use raw pointer instead of weak pointer here to minimize - // visitChildren() cost as we don't have to upgrade the weak pointer and copy - // out the upgraded shared pointers. std::unordered_map<std::string, std::weak_ptr<MemoryPool>> children_; - friend class TestMemoryReclaimer; friend class MemoryReclaimer; + friend class velox::exec::ParallelMemoryReclaimer; + friend class MemoryManager; + friend class MemoryArbitrator; + friend class ScopedMemoryPoolArbitrationCtx; + + VELOX_FRIEND_TEST(MemoryPoolTest, shrinkAndGrowAPIs); + VELOX_FRIEND_TEST(MemoryPoolTest, grow); + VELOX_FRIEND_TEST(MemoryPoolTest, growFailures); + VELOX_FRIEND_TEST(MemoryPoolTest, grownonContiguousAllocateFailures); }; std::ostream& operator<<(std::ostream& out, MemoryPool::Kind kind); @@ -548,6 +572,8 @@ std::ostream& operator<<(std::ostream& os, const MemoryPool::Stats& stats); class MemoryPoolImpl : public MemoryPool { public: + /// The callback invoked on root memory pool destruction. It is set by the + /// memory manager to remove the pool from 'MemoryManager::pools_'.
using DestructionCallback = std::function; MemoryPoolImpl( @@ -555,8 +581,7 @@ class MemoryPoolImpl : public MemoryPool { const std::string& name, Kind kind, std::shared_ptr parent, - std::unique_ptr reclaimer = nullptr, - DestructionCallback destructionCb = nullptr, + std::unique_ptr reclaimer, const Options& options = Options{}); ~MemoryPoolImpl() override; @@ -593,10 +618,7 @@ class MemoryPoolImpl : public MemoryPool { int64_t capacity() const override; - int64_t currentBytes() const override { - std::lock_guard l(mutex_); - return currentBytesLocked(); - } + int64_t usedBytes() const override; int64_t peakBytes() const override { std::lock_guard l(mutex_); @@ -608,8 +630,9 @@ class MemoryPoolImpl : public MemoryPool { return availableReservationLocked(); } + int64_t releasableReservation() const override; + int64_t reservedBytes() const override { - std::lock_guard l(mutex_); return reservationBytes_; } @@ -623,65 +646,73 @@ class MemoryPoolImpl : public MemoryPool { MemoryReclaimer* reclaimer() const override; - void enterArbitration() override; - - void leaveArbitration() noexcept override; - - bool reclaimableBytes(uint64_t& reclaimableBytes) const override; - - uint64_t reclaim(uint64_t targetBytes, memory::MemoryReclaimer::Stats& stats) - override; - - uint64_t shrink(uint64_t targetBytes = 0) override; + std::optional reclaimableBytes() const override; - uint64_t grow(uint64_t bytes) noexcept override; + uint64_t reclaim( + uint64_t targetBytes, + uint64_t maxWaitMs, + memory::MemoryReclaimer::Stats& stats) override; void abort(const std::exception_ptr& error) override; - bool aborted() const override; + void setDestructionCallback(const DestructionCallback& callback); std::string toString() const override { std::lock_guard l(mutex_); return toStringLocked(); } - // Detailed debug pool state printout by traversing the pool structure from - // the root memory pool. - // - // Exceeded memory cap of 5.00MB when requesting 2.00MB - // default_root_1 usage 5.00MB peak 5.00MB - // task.test_cursor 1 usage 5.00MB peak 5.00MB - // node.N/A usage 0B peak 0B - // op.N/A.0.0.CallbackSink usage 0B peak 0B - // node.2 usage 4.00MB peak 4.00MB - // op.2.0.0.Aggregation usage 3.77MB peak 3.77MB - // node.1 usage 1.00MB peak 1.00MB - // op.1.0.0.FilterProject usage 12.00KB peak 12.00KB - // node.3 usage 0B peak 0B - // op.3.0.0.OrderBy usage 0B peak 0B - // node.0 usage 0B peak 0B - // op.0.0.0.Values usage 0B peak 0B - // - // Top 5 leaf memory pool usages: - // op.2.0.0.Aggregation usage 3.77MB peak 3.77MB - // op.1.0.0.FilterProject usage 12.00KB peak 12.00KB - // op.N/A.0.0.CallbackSink usage 0B peak 0B - // op.3.0.0.OrderBy usage 0B peak 0B - // op.0.0.0.Values usage 0B peak 0B - std::string treeMemoryUsage() const override; + /// Detailed debug pool state printout by traversing the pool structure from + /// the root memory pool. 
+ /// + /// Exceeded memory cap of 5.00MB when requesting 2.00MB + /// default_root_1 usage 5.00MB peak 5.00MB + /// task.test_cursor 1 usage 5.00MB peak 5.00MB + /// node.N/A usage 0B peak 0B + /// op.N/A.0.0.CallbackSink usage 0B peak 0B + /// node.2 usage 4.00MB peak 4.00MB + /// op.2.0.0.Aggregation usage 3.77MB peak 3.77MB + /// node.1 usage 1.00MB peak 1.00MB + /// op.1.0.0.FilterProject usage 12.00KB peak 12.00KB + /// node.3 usage 0B peak 0B + /// op.3.0.0.OrderBy usage 0B peak 0B + /// node.0 usage 0B peak 0B + /// op.0.0.0.Values usage 0B peak 0B + /// + /// Top 5 leaf memory pool usages: + /// op.2.0.0.Aggregation usage 3.77MB peak 3.77MB + /// op.1.0.0.FilterProject usage 12.00KB peak 12.00KB + /// op.N/A.0.0.CallbackSink usage 0B peak 0B + /// op.3.0.0.OrderBy usage 0B peak 0B + /// op.0.0.0.Values usage 0B peak 0B + std::string treeMemoryUsage(bool skipEmptyPool = true) const override; Stats stats() const override; void testingSetCapacity(int64_t bytes); + void testingSetReservation(int64_t bytes); + + MemoryManager* testingManager() const { + return manager_; + } + MemoryAllocator* testingAllocator() const { return allocator_; } + void testingCheckIfAborted() const { + checkIfAborted(); + } + + uint64_t testingMinReservationBytes() const { + return minReservationBytes_; + } + /// Structure to store allocation details in debug mode. struct AllocationRecord { uint64_t size; - std::string callStack; + process::StackTrace callStack; }; std::unordered_map& testingDebugAllocRecords() { @@ -693,6 +724,14 @@ class MemoryPoolImpl : public MemoryPool { } private: + void enterArbitration() override; + + void leaveArbitration() noexcept override; + + uint64_t shrink(uint64_t targetBytes = 0) override; + + bool grow(uint64_t growBytes, uint64_t reservationBytes = 0) override; + FOLLY_ALWAYS_INLINE static MemoryPoolImpl* toImpl(MemoryPool* pool) { return static_cast(pool); } @@ -718,10 +757,6 @@ class MemoryPoolImpl : public MemoryPool { return parent_ != nullptr ? toImpl(parent_)->capacity_ : capacity_; } - FOLLY_ALWAYS_INLINE int64_t currentBytesLocked() const { - return isLeaf() ? usedReservationBytes_ : reservationBytes_; - } - FOLLY_ALWAYS_INLINE int64_t availableReservationLocked() const { return !isLeaf() ? 0 @@ -822,7 +857,8 @@ class MemoryPoolImpl : public MemoryPool { // Tries to increment the reservation 'size' if it is within the limit and // returns true, otherwise the function returns false. bool maybeIncrementReservation(uint64_t size); - bool maybeIncrementReservationLocked(uint64_t size); + + void incrementReservationLocked(uint64_t bytes); // Release memory reservation for an allocation free or memory release with // specified 'size'. If 'releaseOnly' is true, then we only release the unused @@ -833,6 +869,11 @@ class MemoryPoolImpl : public MemoryPool { void releaseThreadSafe(uint64_t size, bool releaseOnly); + // Invoked to grow capacity of the root memory pool from the memory + // arbitrator. 'requestor' is the leaf memory pool that triggers the memory + // capacity growth. 'size' is the memory capacity growth in bytes. + bool growCapacity(MemoryPool* requestor, uint64_t size); + FOLLY_ALWAYS_INLINE void releaseNonThreadSafe( uint64_t size, bool releaseOnly) { @@ -874,6 +915,13 @@ class MemoryPoolImpl : public MemoryPool { } } + void setAbortError(const std::exception_ptr& error); + + // Check if this memory pool has been aborted. If already aborted, we rethrow + // the preserved abort error to prevent this pool from triggering additional + // memory arbitration. 
The associated query should also abort soon. + void checkIfAborted() const; + Stats statsLocked() const; FOLLY_ALWAYS_INLINE std::string toStringLocked() const { @@ -894,7 +942,7 @@ } else { out << "unlimited capacity "; } - out << "used " << succinctBytes(currentBytesLocked()) << " available " + out << "used " << succinctBytes(usedBytes()) << " available " << succinctBytes(availableReservationLocked()); out << " reservation [used " << succinctBytes(usedReservationBytes_) << ", reserved " << succinctBytes(reservationBytes_) << ", min " @@ -949,22 +997,26 @@ // pool is enabled. void leakCheckDbg(); + void handleAllocationFailure(const std::string& failureMessage); + MemoryManager* const manager_; MemoryAllocator* const allocator_; - const DestructionCallback destructionCb_; + MemoryArbitrator* const arbitrator_; // Regex for filtering on 'name_' when debug mode is enabled. This allows us // to only track the callsites of memory allocations for memory pools whose // name matches the specified regular expression 'debugPoolNameRegex_'. const std::string debugPoolNameRegex_; // Serializes updates on 'reservationBytes_', 'usedReservationBytes_' // and 'minReservationBytes_' to make reservation decisions based on a consistent // read/write of those counters. incrementReservation()/decrementReservation() // work based on atomic 'reservationBytes_' without mutex as children updating // the same parent do not have to be serialized. mutable std::mutex mutex_; + DestructionCallback destructionCb_; + // Used by memory arbitration to reclaim memory from the associated query // object if not null. For example, a memory pool can reclaim the used memory // from a spillable operator through disk spilling. If null, we can't reclaim @@ -991,20 +1043,26 @@ // Stats counters. // The number of memory allocations. - std::atomic<uint64_t> numAllocs_{0}; + std::atomic_uint64_t numAllocs_{0}; // The number of memory frees. - std::atomic<uint64_t> numFrees_{0}; + std::atomic_uint64_t numFrees_{0}; // The number of external memory reservations made through maybeReserve(). - std::atomic<uint64_t> numReserves_{0}; + std::atomic_uint64_t numReserves_{0}; // The number of external memory releases made through release(). - std::atomic<uint64_t> numReleases_{0}; + std::atomic_uint64_t numReleases_{0}; // The number of internal memory reservation collisions caused by concurrent // memory reservation requests. - std::atomic<uint64_t> numCollisions_{0}; + std::atomic_uint64_t numCollisions_{0}; + + // The number of memory capacity growth attempts through the memory + // arbitration. + // + // NOTE: this only applies to the root memory pool. + std::atomic_uint64_t numCapacityGrowths_{0}; // Mutex for 'debugAllocRecords_'.
std::mutex debugAllocMutex_; @@ -1047,3 +1105,13 @@ class StlAllocator { } }; } // namespace facebook::velox::memory + +template <> +struct fmt::formatter + : formatter { + auto format(facebook::velox::memory::MemoryPool::Kind s, format_context& ctx) + const { + return formatter::format( + facebook::velox::memory::MemoryPool::kindString(s), ctx); + } +}; diff --git a/velox/common/memory/MmapAllocator.cpp b/velox/common/memory/MmapAllocator.cpp index 1db2ce06a7416..8ab440e212526 100644 --- a/velox/common/memory/MmapAllocator.cpp +++ b/velox/common/memory/MmapAllocator.cpp @@ -18,12 +18,15 @@ #include +#include "velox/common/base/Counters.h" #include "velox/common/base/Portability.h" +#include "velox/common/base/StatsReporter.h" #include "velox/common/memory/Memory.h" namespace facebook::velox::memory { MmapAllocator::MmapAllocator(const Options& options) - : kind_(MemoryAllocator::Kind::kMmap), + : MemoryAllocator(options.largestSizeClass), + kind_(MemoryAllocator::Kind::kMmap), useMmapArena_(options.useMmapArena), maxMallocBytes_(options.maxMallocBytes), mallocReservedBytes_( @@ -52,75 +55,53 @@ MmapAllocator::~MmapAllocator() { } bool MmapAllocator::allocateNonContiguousWithoutRetry( - MachinePageCount numPages, - Allocation& out, - ReservationCallback reservationCB, - MachinePageCount minSizeClass) { - const MachinePageCount numFreed = freeInternal(out); - const auto bytesFreed = AllocationTraits::pageBytes(numFreed); + const SizeMix& sizeMix, + Allocation& out) { + const MachinePageCount numFreed = freeNonContiguousInternal(out); if (numFreed != 0) { numAllocated_.fetch_sub(numFreed); } - if (numPages == 0) { - if ((bytesFreed != 0) && (reservationCB != nullptr)) { - reservationCB(bytesFreed, false); - } + if (sizeMix.totalPages == 0) { return true; } - const SizeMix mix = allocationSize(numPages, minSizeClass); - if (testingHasInjectedFailure(InjectedFailure::kCap)) { - if ((bytesFreed != 0) && (reservationCB != nullptr)) { - reservationCB(bytesFreed, false); - } - return false; - } - if (numAllocated_ + mix.totalPages > capacity_) { - VELOX_MEM_LOG_EVERY_MS(WARNING, 1000) - << "Exceeding memory allocator limit when allocate " << mix.totalPages - << " pages with capacity of " << capacity_ << " pages"; - if ((bytesFreed != 0) && (reservationCB != nullptr)) { - reservationCB(bytesFreed, false); - } + if (numAllocated_ + sizeMix.totalPages > capacity_ || + testingHasInjectedFailure(InjectedFailure::kCap)) { + const std::string errorMsg = fmt::format( + "Exceeded memory allocator limit when allocating {} pages with " + "capacity of {} pages", + sizeMix.totalPages, + capacity_); + VELOX_MEM_LOG_EVERY_MS(WARNING, 1000) << errorMsg; + setAllocatorFailureMessage(errorMsg); return false; } - if (numAllocated_.fetch_add(mix.totalPages) + mix.totalPages > capacity_) { - VELOX_MEM_LOG_EVERY_MS(WARNING, 1000) - << "Exceeded memory allocator limit when allocate " << mix.totalPages - << " pages with capacity of " << capacity_ << " pages"; - numAllocated_.fetch_sub(mix.totalPages); - if ((bytesFreed != 0) && (reservationCB != nullptr)) { - reservationCB(bytesFreed, false); - } + if (numAllocated_.fetch_add(sizeMix.totalPages) + sizeMix.totalPages > + capacity_) { + const std::string errorMsg = fmt::format( + "Exceeding memory allocator limit when allocating {} pages with " + "capacity of {} pages", + sizeMix.totalPages, + capacity_); + VELOX_MEM_LOG_EVERY_MS(WARNING, 1000) << errorMsg; + setAllocatorFailureMessage(errorMsg); + numAllocated_.fetch_sub(sizeMix.totalPages); return false; } 
++numAllocations_; - numAllocatedPages_ += mix.totalPages; - const int64_t numNeededPages = mix.totalPages - numFreed; - if (reservationCB != nullptr) { - try { - reservationCB(AllocationTraits::pageBytes(numNeededPages), true); - } catch (const std::exception& e) { - VELOX_MEM_LOG_EVERY_MS(WARNING, 1000) - << "Exceeded memory reservation limit when reserve " << numNeededPages - << " new pages when allocate " << mix.totalPages << " pages"; - numAllocated_.fetch_sub(mix.totalPages); - reservationCB(bytesFreed, false); - std::rethrow_exception(std::current_exception()); - } - } + numAllocatedPages_ += sizeMix.totalPages; MachinePageCount newMapsNeeded = 0; - for (int i = 0; i < mix.numSizes; ++i) { + for (int i = 0; i < sizeMix.numSizes; ++i) { bool success; stats_.recordAllocate( - AllocationTraits::pageBytes(sizeClassSizes_[mix.sizeIndices[i]]), - mix.sizeCounts[i], + AllocationTraits::pageBytes(sizeClassSizes_[sizeMix.sizeIndices[i]]), + sizeMix.sizeCounts[i], [&]() { - success = sizeClasses_[mix.sizeIndices[i]]->allocate( - mix.sizeCounts[i], newMapsNeeded, out); + success = sizeClasses_[sizeMix.sizeIndices[i]]->allocate( + sizeMix.sizeCounts[i], newMapsNeeded, out); }); - if (success && ((i > 0) || (mix.numSizes == 1)) && + if (success && ((i > 0) || (sizeMix.numSizes == 1)) && testingHasInjectedFailure(InjectedFailure::kAllocate)) { // Trigger memory allocation failure in the middle of the size class // allocation series. @@ -129,14 +110,15 @@ bool MmapAllocator::allocateNonContiguousWithoutRetry( if (!success) { // This does not normally happen since any size class can accommodate // all the capacity. 'allocatedPages_' must be out of sync. - VELOX_MEM_LOG(WARNING) << "Failed allocation in size class " << i - << " for " << mix.sizeCounts[i] << " pages"; - const auto failedPages = mix.totalPages - out.numPages(); + const std::string errorMsg = fmt::format( + "Failed allocation in size class {} for {} pages", + i, + sizeMix.sizeCounts[i]); + VELOX_MEM_LOG(WARNING) << errorMsg; + setAllocatorFailureMessage(errorMsg); + const auto failedPages = sizeMix.totalPages - out.numPages(); freeNonContiguous(out); numAllocated_.fetch_sub(failedPages); - if (reservationCB != nullptr) { - reservationCB(AllocationTraits::pageBytes(mix.totalPages), false); - } return false; } } @@ -148,13 +130,14 @@ bool MmapAllocator::allocateNonContiguousWithoutRetry( return true; } - VELOX_MEM_LOG(WARNING) << "Could not advise away enough for " << newMapsNeeded - << " pages with total allocation of << " - << mix.totalPages << " pages"; + const std::string errorMsg = fmt::format( + "Could not advise away enough for {} pages for total allocation " + "of {} pages", + newMapsNeeded, + sizeMix.totalPages); + VELOX_MEM_LOG(WARNING) << errorMsg; + setAllocatorFailureMessage(errorMsg); freeNonContiguous(out); - if (reservationCB != nullptr) { - reservationCB(AllocationTraits::pageBytes(mix.totalPages), false); - } return false; } @@ -172,7 +155,6 @@ bool MmapAllocator::ensureEnoughMappedPages(int32_t newMappedNeeded) { // We need to advise away a number of pages or we fail the alloc. 
const auto target = totalMaps - capacity_; const auto numAdvised = adviseAway(target); - numAdvisedPages_ += numAdvised; if (numAdvised >= target) { numMapped_.fetch_sub(numAdvised); return true; @@ -182,13 +164,21 @@ bool MmapAllocator::ensureEnoughMappedPages(int32_t newMappedNeeded) { } int64_t MmapAllocator::freeNonContiguous(Allocation& allocation) { - const auto numFreed = freeInternal(allocation); + const auto numFreed = freeNonContiguousInternal(allocation); numAllocated_.fetch_sub(numFreed); return AllocationTraits::pageBytes(numFreed); } -MachinePageCount MmapAllocator::freeInternal(Allocation& allocation) { - MachinePageCount numFreed = 0; +MachinePageCount MmapAllocator::unmap(MachinePageCount targetPages) { + std::lock_guard l(sizeClassBalanceMutex_); + const auto numAdvised = adviseAway(targetPages); + numMapped_.fetch_sub(numAdvised); + return numAdvised; +} + +MachinePageCount MmapAllocator::freeNonContiguousInternal( + Allocation& allocation) { + MachinePageCount numFreed{0}; if (allocation.empty()) { return numFreed; } @@ -219,12 +209,10 @@ bool MmapAllocator::allocateContiguousWithoutRetry( MachinePageCount numPages, Allocation* collateral, ContiguousAllocation& allocation, - ReservationCallback reservationCB, MachinePageCount maxPages) { bool result; stats_.recordAllocate(AllocationTraits::pageBytes(numPages), 1, [&]() { - result = allocateContiguousImpl( - numPages, collateral, allocation, reservationCB, maxPages); + result = allocateContiguousImpl(numPages, collateral, allocation, maxPages); }); return result; } @@ -233,7 +221,6 @@ bool MmapAllocator::allocateContiguousImpl( MachinePageCount numPages, Allocation* collateral, ContiguousAllocation& allocation, - ReservationCallback reservationCB, MachinePageCount maxPages) { if (maxPages == 0) { maxPages = numPages; @@ -257,7 +244,7 @@ bool MmapAllocator::allocateContiguousImpl( // 'allocation' cover the new size, as other threads might grab the // transiently free pages. if (collateral != nullptr) { - numCollateralPages = freeInternal(*collateral); + numCollateralPages = freeNonContiguousInternal(*collateral); } const auto numLargeCollateralPages = allocation.numPages(); if (numLargeCollateralPages > 0) { @@ -275,34 +262,12 @@ bool MmapAllocator::allocateContiguousImpl( } const auto totalCollateralPages = numCollateralPages + numLargeCollateralPages; - const auto totalCollateralBytes = - AllocationTraits::pageBytes(totalCollateralPages); if (numPages == 0) { - if (totalCollateralBytes != 0 && reservationCB != nullptr) { - reservationCB(totalCollateralBytes, false); - } return true; } const auto numCollateralUnmap = numLargeCollateralPages; const int64_t newPages = numPages - totalCollateralPages; - if (reservationCB != nullptr) { - try { - reservationCB(AllocationTraits::pageBytes(newPages), true); - } catch (const std::exception& e) { - VELOX_MEM_LOG_EVERY_MS(WARNING, 1000) - << "Exceeded memory reservation limit when reserve " << newPages - << " new pages when allocate " << numPages << " pages"; - numAllocated_ -= totalCollateralPages; - numMapped_ -= numCollateralUnmap; - numExternalMapped_ -= numCollateralUnmap; - - // We failed to grow by 'newPages. So we record the freeing off the whole - // collateral and the unmap of former 'allocation'. - reservationCB(totalCollateralBytes, false); - std::rethrow_exception(std::current_exception()); - } - } // Rolls back the counters on failure. 'mappedDecrement' is subtracted from // 'numMapped_' on top of other adjustment. 
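A pattern worth noting in the MmapAllocator.cpp hunks above: every failure path now records its formatted error through setAllocatorFailureMessage() before returning false, so the caller can attach the allocator's detail to its own error message instead of relying on interleaved logs. Below is a minimal, self-contained sketch of that technique using a thread-local message slot; the free-function names here are illustrative only, not the actual Velox plumbing.

```
// Sketch: a thread-local failure-message channel. A bool-returning failure
// path stores its detail here; the caller fetches it when raising an error.
#include <string>
#include <utility>

namespace {
thread_local std::string failureMessage;
} // namespace

// Records the most recent allocation failure for the current thread.
void setFailureMessage(std::string message) {
  failureMessage = std::move(message);
}

// Returns and clears the recorded message; empty if nothing was recorded.
std::string getAndClearFailureMessage() {
  std::string message;
  message.swap(failureMessage);
  return message;
}
```

Because the slot is thread-local, no locking is needed and a concurrent allocation on another thread cannot clobber the message between the failed call and the caller's read.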
@@ -315,10 +280,6 @@ bool MmapAllocator::allocateContiguousImpl( // were never allocated. numExternalMapped_ -= numPages; numMapped_ -= numCollateralUnmap + mappedDecrement; - - if (reservationCB != nullptr) { - reservationCB(AllocationTraits::pageBytes(numPages), false); - } }; numExternalMapped_ += numPages - numCollateralUnmap; @@ -329,10 +290,15 @@ bool MmapAllocator::allocateContiguousImpl( if (newPages > 0 && (numAllocated > capacity_ || testingHasInjectedFailure(InjectedFailure::kCap))) { - VELOX_MEM_LOG_EVERY_MS(WARNING, 1000) - << "Exceeded memory allocator limit when allocate " << newPages - << " new pages for total allocation of " << numPages - << " pages, the memory allocator capacity is " << capacity_ << " pages"; + const std::string errorMsg = fmt::format( + "Exceeded memory allocator limit when allocating {} new pages for " + "total allocation of {} pages, the memory allocator capacity is" + " {} pages", + newPages, + numPages, + capacity_); + VELOX_MEM_LOG_EVERY_MS(WARNING, 1000) << errorMsg; + setAllocatorFailureMessage(errorMsg); rollbackAllocation(0); return false; } @@ -341,9 +307,13 @@ bool MmapAllocator::allocateContiguousImpl( const int64_t numToMap = numPages - numCollateralUnmap; if (numToMap > 0) { if (!ensureEnoughMappedPages(numToMap)) { - VELOX_MEM_LOG(WARNING) - << "Could not advise away enough for " << numToMap - << " pages for total allocation of " << numPages << " pages"; + const std::string errorMsg = fmt::format( + "Could not advise away enough for {} pages for total allocation " + "of {} pages", + numToMap, + numPages); + VELOX_MEM_LOG(WARNING) << errorMsg; + setAllocatorFailureMessage(errorMsg); rollbackAllocation(0); return false; } @@ -369,11 +339,15 @@ bool MmapAllocator::allocateContiguousImpl( 0); } } - // TODO: add handling of MAP_FAILED. - if (data == nullptr) { - VELOX_MEM_LOG(ERROR) << "Mmap failed with " << numPages - << " pages, use MmapArena " - << (useMmapArena_ ? "true" : "false"); + if (data == nullptr || data == MAP_FAILED) { + const std::string errorMsg = fmt::format( + "Mmap failed with {} pages use MmapArena {}, errno {}, Mmap Allocator: {}", + numPages, + (useMmapArena_ ? "true" : "false"), + folly::errnoStr(errno), + toString()); + VELOX_MEM_LOG(ERROR) << errorMsg; + setAllocatorFailureMessage(errorMsg); // If the mmap failed, we have unmapped former 'allocation' and the extra to // be mapped. rollbackAllocation(numToMap); @@ -414,38 +388,33 @@ void MmapAllocator::freeContiguousImpl(ContiguousAllocation& allocation) { bool MmapAllocator::growContiguousWithoutRetry( MachinePageCount increment, - ContiguousAllocation& allocation, - ReservationCallback reservationCB) { - VELOX_CHECK_LE( - allocation.size() + increment * AllocationTraits::kPageSize, - allocation.maxSize()); - if (reservationCB != nullptr) { - // May throw. If does, there is nothing to revert. 
- reservationCB(AllocationTraits::pageBytes(increment), true); - } + ContiguousAllocation& allocation) { auto numAllocated = numAllocated_.fetch_add(increment) + increment; if (numAllocated > capacity_ || testingHasInjectedFailure(InjectedFailure::kCap)) { - VELOX_MEM_LOG_EVERY_MS(WARNING, 1000) - << "Exceeded memory allocator limit when adding " << increment - << " new pages for total allocation of " << allocation.numPages() - << " pages, the memory allocator capacity is " << capacity_ << " pages"; + const std::string errorMsg = fmt::format( + "Exceeded memory allocator limit when allocating {} new pages for " + "total allocation of {} pages, the memory allocator capacity is" + " {} pages", + increment, + allocation.numPages(), + capacity_); + VELOX_MEM_LOG_EVERY_MS(WARNING, 1000) << errorMsg; + setAllocatorFailureMessage(errorMsg); numAllocated_ -= increment; - if (reservationCB != nullptr) { - reservationCB(AllocationTraits::pageBytes(increment), false); - } return false; } // Check if need to advise away - if (!ensureEnoughMappedPages(increment) || - testingHasInjectedFailure(InjectedFailure::kMmap)) { - VELOX_MEM_LOG(WARNING) << "Could not advise away enough for " << increment - << " pages for growing allocation of " - << allocation.numPages() << " pages"; - if (reservationCB != nullptr) { - reservationCB(AllocationTraits::pageBytes(increment), false); - } + if (testingHasInjectedFailure(InjectedFailure::kMmap) || + !ensureEnoughMappedPages(increment)) { + const std::string errorMsg = fmt::format( + "Could not advise away enough for {} pages for growing allocation " + "of {} pages", + increment, + allocation.numPages()); + VELOX_MEM_LOG(WARNING) << errorMsg; + setAllocatorFailureMessage(errorMsg); numAllocated_.fetch_sub(increment); return false; } @@ -470,7 +439,7 @@ void* MmapAllocator::allocateBytesWithoutRetry( VELOX_MEM_LOG(ERROR) << "Failed to allocateBytes " << bytes << " bytes with " << alignment << " alignment"; } else { - numMallocBytes_.fetch_add(bytes); + numMallocBytes_ += bytes; } return result; } @@ -478,8 +447,8 @@ void* MmapAllocator::allocateBytesWithoutRetry( if (bytes <= AllocationTraits::pageBytes(sizeClassSizes_.back())) { Allocation allocation; const auto numPages = roundUpToSizeClassSize(bytes, sizeClassSizes_); - if (!allocateNonContiguousWithoutRetry( - numPages, allocation, nullptr, numPages)) { + const SizeMix mix = allocationSize(numPages, numPages); + if (!allocateNonContiguousWithoutRetry(mix, allocation)) { return nullptr; } auto run = allocation.runAt(0); @@ -492,8 +461,7 @@ void* MmapAllocator::allocateBytesWithoutRetry( } ContiguousAllocation allocation; - auto numPages = bits::roundUp(bytes, AllocationTraits::kPageSize) / - AllocationTraits::kPageSize; + auto numPages = AllocationTraits::numPages(bytes); if (!allocateContiguousWithoutRetry(numPages, nullptr, allocation)) { return nullptr; } @@ -506,7 +474,7 @@ void* MmapAllocator::allocateBytesWithoutRetry( void MmapAllocator::freeBytes(void* p, uint64_t bytes) noexcept { if (useMalloc(bytes)) { ::free(p); // NOLINT - numMallocBytes_.fetch_sub(bytes); + numMallocBytes_ -= bytes; return; } @@ -537,6 +505,7 @@ MachinePageCount MmapAllocator::adviseAway(MachinePageCount target) { break; } } + numAdvisedPages_ += numAway; return numAway; } @@ -625,20 +594,23 @@ std::string MmapAllocator::SizeClass::toString() const { int count = 0; int mappedCount = 0; int mappedFreeCount = 0; - for (int i = 0; i < pageBitmapSize_; ++i) { - count += __builtin_popcountll(pageAllocated_[i]); - mappedCount += 
__builtin_popcountll(pageMapped_[i]); - mappedFreeCount += - __builtin_popcountll(~pageAllocated_[i] & pageMapped_[i]); - } - auto mb = (AllocationTraits::pageBytes(count * unitSize_)) >> 20; - out << "[size " << unitSize_ << ": " << count << "(" << mb << "MB) allocated " - << mappedCount << " mapped"; - if (mappedFreeCount != numMappedFreePages_) { - out << "Mismatched count of mapped free pages " - << ". Actual= " << mappedFreeCount - << " vs recorded= " << numMappedFreePages_ - << ". Total mapped=" << mappedCount; + { + std::lock_guard l(mutex_); + for (int i = 0; i < pageBitmapSize_; ++i) { + count += __builtin_popcountll(pageAllocated_[i]); + mappedCount += __builtin_popcountll(pageMapped_[i]); + mappedFreeCount += + __builtin_popcountll(~pageAllocated_[i] & pageMapped_[i]); + } + auto mb = (AllocationTraits::pageBytes(count * unitSize_)) >> 20; + out << "[size " << unitSize_ << ": " << count << "(" << mb + << "MB) allocated " << mappedCount << " mapped"; + if (mappedFreeCount != numMappedFreePages_) { + out << "Mismatched count of mapped free pages " + << ". Actual= " << mappedFreeCount + << " vs recorded= " << numMappedFreePages_ + << ". Total mapped=" << mappedCount; + } } out << "]"; return out.str(); @@ -909,10 +881,10 @@ MachinePageCount MmapAllocator::SizeClass::free(Allocation& allocation) { const int firstBit = (runAddress - address_) / (AllocationTraits::kPageSize * unitSize_); for (auto page = firstBit; page < firstBit + numPages; ++page) { - if (!bits::isBitSet(pageAllocated_.data(), page)) { - // TODO: change this to a velox failure to catch the bug. + if (FOLLY_UNLIKELY(!bits::isBitSet(pageAllocated_.data(), page))) { VELOX_MEM_LOG(ERROR) << "Double free: page = " << page << " sizeclass = " << unitSize_; + RECORD_METRIC_VALUE(kMetricMemoryAllocatorDoubleFreeCount); continue; } if (bits::isBitSet(pageMapped_.data(), page)) { @@ -985,8 +957,13 @@ bool MmapAllocator::useMalloc(uint64_t bytes) { std::string MmapAllocator::toString() const { std::stringstream out; - out << "Memory Allocator[" << kindString(kind_) << " capacity " - << ((capacity_ == kMaxMemory) ? "UNLIMITED" : succinctBytes(capacity_)) + out << "Memory Allocator[" << kindString(kind_) << " total capacity " + << ((capacity_ == kMaxMemory) ? "UNLIMITED" : succinctBytes(capacity())) + << " free capacity " + << ((capacity_ == kMaxMemory) + ? "UNLIMITED" + : succinctBytes( + capacity() - AllocationTraits::pageBytes(numAllocated()))) << " allocated pages " << numAllocated_ << " mapped pages " << numMapped_ << " external mapped pages " << numExternalMapped_ << std::endl; for (auto& sizeClass : sizeClasses_) { diff --git a/velox/common/memory/MmapAllocator.h b/velox/common/memory/MmapAllocator.h index 40e9ca35cb544..fd17ee8fa765b 100644 --- a/velox/common/memory/MmapAllocator.h +++ b/velox/common/memory/MmapAllocator.h @@ -24,8 +24,11 @@ #include #include +#include + #include "velox/common/base/SimdUtil.h" #include "velox/common/memory/MemoryAllocator.h" +#include "velox/common/memory/MemoryPool.h" #include "velox/common/memory/MmapArena.h" namespace facebook::velox::memory { @@ -50,7 +53,9 @@ class MmapAllocator : public MemoryAllocator { public: struct Options { /// Capacity in bytes, default unlimited. - uint64_t capacity = kDefaultCapacityBytes; + uint64_t capacity{kMaxMemory}; + + int32_t largestSizeClass{256}; /// If set true, allocations larger than largest size class size will be /// delegated to ManagedMmapArena. 
Otherwise a system mmap call will be @@ -102,13 +107,14 @@ class MmapAllocator : public MemoryAllocator { bool growContiguousWithoutRetry( MachinePageCount increment, - ContiguousAllocation& allocation, - ReservationCallback reservationCB = nullptr) override; + ContiguousAllocation& allocation) override; void freeContiguous(ContiguousAllocation& allocation) override; int64_t freeNonContiguous(Allocation& allocation) override; + MachinePageCount unmap(MachinePageCount targetPages) override; + void freeBytes(void* p, uint64_t bytes) noexcept override; /// Checks internal consistency of allocation data structures. Returns true if @@ -128,7 +134,7 @@ class MmapAllocator : public MemoryAllocator { } size_t totalUsedBytes() const override { - return numMallocBytes_ + AllocationTraits::pageBytes(numAllocated_); + return numMallocBytes() + AllocationTraits::pageBytes(numAllocated_); } MachinePageCount numAllocated() const override { @@ -144,7 +150,7 @@ class MmapAllocator : public MemoryAllocator { } uint64_t numMallocBytes() const { - return numMallocBytes_; + return numMallocBytes_.readFull(); } Stats stats() const override { @@ -275,7 +281,7 @@ class MmapAllocator : public MemoryAllocator { const int32_t pageBitmapSize_; // Serializes access to all data members and private methods. - std::mutex mutex_; + mutable std::mutex mutex_; // Start of address range. uint8_t* address_; @@ -313,23 +319,19 @@ class MmapAllocator : public MemoryAllocator { }; bool allocateNonContiguousWithoutRetry( - MachinePageCount numPages, - Allocation& out, - ReservationCallback reservationCB = nullptr, - MachinePageCount minSizeClass = 0) override; + const SizeMix& sizeMix, + Allocation& out) override; bool allocateContiguousWithoutRetry( MachinePageCount numPages, Allocation* collateral, ContiguousAllocation& allocation, - ReservationCallback reservationCB = nullptr, MachinePageCount maxPages = 0) override; bool allocateContiguousImpl( MachinePageCount numPages, Allocation* collateral, ContiguousAllocation& allocation, - ReservationCallback reservationCB, MachinePageCount maxPages); void freeContiguousImpl(ContiguousAllocation& allocation); @@ -360,7 +362,7 @@ class MmapAllocator : public MemoryAllocator { // Frees 'allocation and returns the number of freed pages. Does not // update 'numAllocated'. - MachinePageCount freeInternal(Allocation& allocation); + MachinePageCount freeNonContiguousInternal(Allocation& allocation); void markAllMapped(const Allocation& allocation); @@ -412,7 +414,7 @@ class MmapAllocator : public MemoryAllocator { std::atomic numAllocations_ = 0; std::atomic numAllocatedPages_ = 0; std::atomic numAdvisedPages_ = 0; - std::atomic numMallocBytes_ = 0; + folly::ThreadCachedInt numMallocBytes_; // Allocations that are larger than largest size classes will be delegated to // ManagedMmapArenas, to avoid calling mmap on every allocation. 
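One header change above deserves a short illustration: numMallocBytes_ switches from a std::atomic counter to folly::ThreadCachedInt, and the numMallocBytes() getter now reads it via readFull(). A standalone sketch of the trade-off, assuming only folly is available (the local variable is illustrative, not the member itself):

```
#include <folly/ThreadCachedInt.h>

#include <cstdint>
#include <iostream>

int main() {
  // Hot-path updates go to a per-thread bucket, so frequent small malloc/free
  // accounting does not contend on one shared atomic cache line.
  folly::ThreadCachedInt<int64_t> mallocBytes;
  mallocBytes += 4096; // allocation path
  mallocBytes -= 1024; // free path
  // readFull() folds all per-thread buckets into an exact total; this is the
  // slower, exact read that the numMallocBytes() getter performs.
  std::cout << mallocBytes.readFull() << "\n"; // prints 3072
  return 0;
}
```

The cost of the swap falls on the read side: a fast read may lag behind recent updates, which is why the getter calls readFull() rather than exposing the raw member.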
diff --git a/velox/common/memory/MmapArena.cpp b/velox/common/memory/MmapArena.cpp index 9511f1ea14453..d72559972cac3 100644 --- a/velox/common/memory/MmapArena.cpp +++ b/velox/common/memory/MmapArena.cpp @@ -46,7 +46,7 @@ MmapArena::MmapArena(size_t capacityBytes) : byteSize_(capacityBytes) { capacityBytes); } address_ = reinterpret_cast(ptr); - addFreeBlock(reinterpret_cast(address_), byteSize_); + addFreeBlock(reinterpret_cast(address_), byteSize_); freeBytes_ = byteSize_; } @@ -60,7 +60,7 @@ void* MmapArena::allocate(uint64_t bytes) { } bytes = roundBytes(bytes); - // First match in the list that can give this many bytes + // First match in the list that can give this many bytes. auto lookupItr = freeLookup_.lower_bound(bytes); if (lookupItr == freeLookup_.end()) { VELOX_MEM_LOG_EVERY_MS(WARNING, 1000) @@ -71,8 +71,8 @@ void* MmapArena::allocate(uint64_t bytes) { } freeBytes_ -= bytes; - auto address = *(lookupItr->second.begin()); auto curFreeBytes = lookupItr->first; + auto address = *(lookupItr->second.begin()); void* result = reinterpret_cast(address); if (curFreeBytes == bytes) { removeFreeBlock(address, curFreeBytes); @@ -92,10 +92,10 @@ void MmapArena::free(void* address, uint64_t bytes) { ::madvise(address, bytes, MADV_DONTNEED); freeBytes_ += bytes; - const auto curAddr = reinterpret_cast(address); + const auto curAddr = reinterpret_cast(address); auto curIter = addFreeBlock(curAddr, bytes); auto prevIter = freeList_.end(); - uint64_t prevAddr; + uintptr_t prevAddr; uint64_t prevBytes; bool mergePrev = false; if (curIter != freeList_.begin()) { @@ -106,26 +106,28 @@ void MmapArena::free(void* address, uint64_t bytes) { VELOX_CHECK_LE( prevEndAddr, curAddr, - "New free node (addr:{} size:{}) overlaps with previous free node (addr:{} size:{}) in free list", + "New free block (addr:{} size:{}) overlaps with previous free block " + "(addr:{} size:{}) in free list", curAddr, bytes, prevAddr, prevBytes); - mergePrev = prevEndAddr == curAddr; + mergePrev = (prevEndAddr == curAddr); } - auto nextItr = std::next(curIter); - uint64_t nextAddr; + auto nextIter = std::next(curIter); + uintptr_t nextAddr; uint64_t nextBytes; bool mergeNext = false; - if (nextItr != freeList_.end()) { - nextAddr = nextItr->first; - nextBytes = nextItr->second; + if (nextIter != freeList_.end()) { + nextAddr = nextIter->first; + nextBytes = nextIter->second; auto curEndAddr = curAddr + bytes; VELOX_CHECK_LE( curEndAddr, nextAddr, - "New free node (addr:{} size:{}) overlaps with next free node (addr:{} size:{}) in free list", + "New free block (addr:{} size:{}) overlaps with next free block " + "(addr:{} size:{}) in free list", curAddr, bytes, nextAddr, @@ -142,38 +144,39 @@ void MmapArena::free(void* address, uint64_t bytes) { removeFromLookup(prevAddr, prevBytes); auto newFreeSize = curAddr - prevAddr + bytes; if (mergeNext) { - removeFreeBlock(nextItr); + removeFreeBlock(nextIter); newFreeSize = nextAddr - prevAddr + nextBytes; } - freeList_[prevIter->first] = newFreeSize; + freeList_[prevAddr] = newFreeSize; freeLookup_[newFreeSize].emplace(prevAddr); return; } if (mergeNext) { VELOX_DCHECK(!mergePrev); - removeFreeBlock(nextItr); + removeFreeBlock(nextIter); removeFromLookup(curAddr, bytes); const auto newFreeSize = nextAddr - curAddr + nextBytes; - freeList_[curIter->first] = newFreeSize; + freeList_[curAddr] = newFreeSize; freeLookup_[newFreeSize].emplace(curAddr); } } -void MmapArena::removeFromLookup(uint64_t addr, uint64_t bytes) { +void MmapArena::removeFromLookup(uintptr_t addr, uint64_t bytes) { 
freeLookup_[bytes].erase(addr); if (freeLookup_[bytes].empty()) { freeLookup_.erase(bytes); } } -std::map::iterator MmapArena::addFreeBlock( - uint64_t address, +std::map::iterator MmapArena::addFreeBlock( + uintptr_t address, uint64_t bytes) { auto insertResult = freeList_.emplace(address, bytes); VELOX_CHECK( insertResult.second, - "Trying to free a memory space that is already freed. Already in free list address {} size {}. Attempted to free address {} size {}", + "Trying to free a memory space that is already freed. Already in free " + "list address {} size {}. Attempted to free address {} size {}", address, freeList_[address], address, @@ -182,26 +185,24 @@ std::map::iterator MmapArena::addFreeBlock( return insertResult.first; } -void MmapArena::removeFreeBlock(uint64_t addr, uint64_t bytes) { +void MmapArena::removeFreeBlock(uintptr_t addr, uint64_t bytes) { freeList_.erase(addr); removeFromLookup(addr, bytes); } -void MmapArena::removeFreeBlock(std::map::iterator& iter) { +void MmapArena::removeFreeBlock(std::map::iterator& iter) { removeFromLookup(iter->first, iter->second); freeList_.erase(iter); } bool MmapArena::checkConsistency() const { uint64_t numErrors = 0; - uint64_t bytes = 0; - auto arenaEndAddress = reinterpret_cast(address_) + byteSize_; + auto arenaEndAddress = reinterpret_cast(address_) + byteSize_; auto iter = freeList_.begin(); auto end = freeList_.end(); - uint8_t* current = reinterpret_cast(address_); int64_t freeListTotalBytes = 0; while (iter != end) { - // Lookup list should contain the address + // Lookup list should contain the address. auto freeLookupIter = freeLookup_.find(iter->second); if (freeLookupIter == freeLookup_.end() || freeLookupIter->second.find(iter->first) == @@ -213,7 +214,7 @@ bool MmapArena::checkConsistency() const { numErrors++; } - // Verify current free block end + // Verify current free block end. auto blockEndAddress = iter->first + iter->second; if (blockEndAddress > arenaEndAddress) { LOG(WARNING) @@ -223,7 +224,7 @@ bool MmapArena::checkConsistency() const { numErrors++; } - // Verify next free block not overlapping + // Verify next free block not overlapping. auto next = std::next(iter); if (next != end && blockEndAddress > next->first) { LOG(WARNING) @@ -238,7 +239,7 @@ bool MmapArena::checkConsistency() const { iter++; } - // Check consistency of lookup list + // Check consistency of lookup list. int64_t freeLookupTotalBytes = 0; for (auto iter = freeLookup_.begin(); iter != freeLookup_.end(); iter++) { if (iter->second.empty()) { @@ -251,7 +252,7 @@ bool MmapArena::checkConsistency() const { freeLookupTotalBytes += (iter->first * iter->second.size()); } - // Check consistency of freeList_ and freeLookup_ in terms of bytes + // Check consistency of freeList_ and freeLookup_ in terms of bytes. if (freeListTotalBytes != freeLookupTotalBytes || freeListTotalBytes != freeBytes_) { LOG(WARNING) @@ -279,7 +280,7 @@ std::string MmapArena::toString() const { ManagedMmapArenas::ManagedMmapArenas(uint64_t singleArenaCapacity) : singleArenaCapacity_(singleArenaCapacity) { auto arena = std::make_shared(singleArenaCapacity); - arenas_.emplace(reinterpret_cast(arena->address()), arena); + arenas_.emplace(reinterpret_cast(arena->address()), arena); currentArena_ = arena; } @@ -293,14 +294,14 @@ void* ManagedMmapArenas::allocate(uint64_t bytes) { // it ever fails again then it means requested bytes is larger than a single // MmapArena's capacity. No further attempts will happen. 
auto newArena = std::make_shared(singleArenaCapacity_); - arenas_.emplace(reinterpret_cast(newArena->address()), newArena); + arenas_.emplace(reinterpret_cast(newArena->address()), newArena); currentArena_ = newArena; return currentArena_->allocate(bytes); } void ManagedMmapArenas::free(void* address, uint64_t bytes) { VELOX_CHECK(!arenas_.empty()); - const uint64_t addressU64 = reinterpret_cast(address); + const auto addressU64 = reinterpret_cast(address); auto iter = arenas_.lower_bound(addressU64); if (iter == arenas_.end() || iter->first != addressU64) { VELOX_CHECK(iter != arenas_.begin()); diff --git a/velox/common/memory/MmapArena.h b/velox/common/memory/MmapArena.h index c084509e2a6fa..c1132252d6c18 100644 --- a/velox/common/memory/MmapArena.h +++ b/velox/common/memory/MmapArena.h @@ -36,11 +36,12 @@ class MmapArena { /// MmapArena capacity should be multiple of kMinGrainSizeBytes. static constexpr uint64_t kMinGrainSizeBytes = 1024 * 1024; // 1M - MmapArena(size_t capacityBytes); + explicit MmapArena(size_t capacityBytes); ~MmapArena(); void* allocate(uint64_t bytes); void free(void* address, uint64_t bytes); + void* address() const { return reinterpret_cast(address_); } @@ -49,11 +50,11 @@ class MmapArena { return byteSize_; } - const std::map& freeList() const { + const std::map& freeList() const { return freeList_; } - const std::map>& freeLookup() const { + const std::map>& freeLookup() const { return freeLookup_; } @@ -66,7 +67,7 @@ class MmapArena { } /// Checks internal consistency of this MmapArena. Returns true if OK. May - /// return false if there are concurrent alocations and frees during the + /// return false if there are concurrent allocations and frees during the /// consistency check. This is a false positive but not dangerous. This is for /// test only bool checkConsistency() const; @@ -91,15 +92,15 @@ class MmapArena { // Rounds up size to the next power of 2. static uint64_t roundBytes(uint64_t bytes); - std::map::iterator addFreeBlock( - uint64_t addr, + std::map::iterator addFreeBlock( + uintptr_t addr, uint64_t bytes); - void removeFromLookup(uint64_t addr, uint64_t bytes); + void removeFromLookup(uintptr_t addr, uint64_t bytes); - void removeFreeBlock(uint64_t addr, uint64_t bytes); + void removeFreeBlock(uintptr_t addr, uint64_t bytes); - void removeFreeBlock(std::map::iterator& itr); + void removeFreeBlock(std::map::iterator& itr); // Total capacity size of this arena. const uint64_t byteSize_; @@ -111,11 +112,11 @@ class MmapArena { // A sorted list with each entry mapping from free block address to size of // the free block - std::map freeList_; + std::map freeList_; - // A sorted look up structure that stores the block size as key and a set of + // A sorted look-up structure that stores the block size as key and a set of // addresses of that size as value. - std::map> freeLookup_; + std::map> freeLookup_; }; /// A class that manages a set of MmapArenas. It is able to adapt itself by @@ -123,13 +124,13 @@ class MmapArena { /// fragmentation happens. 
class ManagedMmapArenas { public: - ManagedMmapArenas(uint64_t singleArenaCapacity); + explicit ManagedMmapArenas(uint64_t singleArenaCapacity); void* allocate(uint64_t bytes); void free(void* address, uint64_t bytes); - const std::map>& arenas() const { + const std::map>& arenas() const { return arenas_; } @@ -138,7 +139,7 @@ class ManagedMmapArenas { const uint64_t singleArenaCapacity_; // A sorted list of MmapArena by its initial address - std::map> arenas_; + std::map> arenas_; // All allocations should come from this MmapArena. When it is no longer able // to handle allocations it will be updated to a newly created MmapArena. diff --git a/velox/common/memory/SharedArbitrator.cpp b/velox/common/memory/SharedArbitrator.cpp index 611e9fa65fea6..beaa9b8cfe430 100644 --- a/velox/common/memory/SharedArbitrator.cpp +++ b/velox/common/memory/SharedArbitrator.cpp @@ -15,8 +15,12 @@ */ #include "velox/common/memory/SharedArbitrator.h" +#include #include "velox/common/base/Exceptions.h" +#include "velox/common/base/RuntimeMetrics.h" +#include "velox/common/config/Config.h" +#include "velox/common/memory/Memory.h" #include "velox/common/testutil/TestValue.h" #include "velox/common/time/Timer.h" @@ -24,11 +28,13 @@ using facebook::velox::common::testutil::TestValue; namespace facebook::velox::memory { +using namespace facebook::velox::memory; + namespace { // Returns the max capacity to grow of memory 'pool'. The calculation is based // on a memory pool's max capacity and its current capacity. -uint64_t maxGrowBytes(const MemoryPool& pool) { +uint64_t maxGrowCapacity(const MemoryPool& pool) { return pool.maxCapacity() - pool.capacity(); } @@ -58,69 +64,297 @@ std::string memoryPoolAbortMessage( return out.str(); } +template +T getConfig( + const std::unordered_map& configs, + const std::string_view& key, + const T& defaultValue) { + if (configs.count(std::string(key)) > 0) { + try { + return folly::to(configs.at(std::string(key))); + } catch (const std::exception& e) { + VELOX_USER_FAIL( + "Failed while parsing SharedArbitrator configs: {}", e.what()); + } + } + return defaultValue; +} } // namespace -SharedArbitrator::SharedArbitrator(const MemoryArbitrator::Config& config) - : MemoryArbitrator(config), freeCapacity_(capacity_) { +int64_t SharedArbitrator::ExtraConfig::getReservedCapacity( + const std::unordered_map& configs) { + return config::toCapacity( + getConfig( + configs, kReservedCapacity, std::string(kDefaultReservedCapacity)), + config::CapacityUnit::BYTE); +} + +uint64_t SharedArbitrator::ExtraConfig::getMemoryPoolInitialCapacity( + const std::unordered_map& configs) { + return config::toCapacity( + getConfig( + configs, + kMemoryPoolInitialCapacity, + std::string(kDefaultMemoryPoolInitialCapacity)), + config::CapacityUnit::BYTE); +} + +uint64_t SharedArbitrator::ExtraConfig::getMemoryPoolReservedCapacity( + const std::unordered_map& configs) { + return config::toCapacity( + getConfig( + configs, + kMemoryPoolReservedCapacity, + std::string(kDefaultMemoryPoolReservedCapacity)), + config::CapacityUnit::BYTE); +} + +uint64_t SharedArbitrator::ExtraConfig::getMemoryPoolTransferCapacity( + const std::unordered_map& configs) { + return config::toCapacity( + getConfig( + configs, + kMemoryPoolTransferCapacity, + std::string(kDefaultMemoryPoolTransferCapacity)), + config::CapacityUnit::BYTE); +} + +uint64_t SharedArbitrator::ExtraConfig::getMemoryReclaimMaxWaitTimeMs( + const std::unordered_map& configs) { + return std::chrono::duration_cast( + config::toDuration(getConfig( + configs, + 
kMemoryReclaimMaxWaitTime, + std::string(kDefaultMemoryReclaimMaxWaitTime)))) + .count(); +} + +uint64_t SharedArbitrator::ExtraConfig::getMemoryPoolMinFreeCapacity( + const std::unordered_map& configs) { + return config::toCapacity( + getConfig( + configs, + kMemoryPoolMinFreeCapacity, + std::string(kDefaultMemoryPoolMinFreeCapacity)), + config::CapacityUnit::BYTE); +} + +double SharedArbitrator::ExtraConfig::getMemoryPoolMinFreeCapacityPct( + const std::unordered_map& configs) { + return getConfig( + configs, + kMemoryPoolMinFreeCapacityPct, + kDefaultMemoryPoolMinFreeCapacityPct); +} + +bool SharedArbitrator::ExtraConfig::getGlobalArbitrationEnabled( + const std::unordered_map& configs) { + return getConfig( + configs, kGlobalArbitrationEnabled, kDefaultGlobalArbitrationEnabled); +} + +bool SharedArbitrator::ExtraConfig::getCheckUsageLeak( + const std::unordered_map& configs) { + return getConfig(configs, kCheckUsageLeak, kDefaultCheckUsageLeak); +} + +uint64_t +SharedArbitrator::ExtraConfig::getFastExponentialGrowthCapacityLimitBytes( + const std::unordered_map& configs) { + return config::toCapacity( + getConfig( + configs, + kFastExponentialGrowthCapacityLimit, + std::string(kDefaultFastExponentialGrowthCapacityLimit)), + config::CapacityUnit::BYTE); +} + +double SharedArbitrator::ExtraConfig::getSlowCapacityGrowPct( + const std::unordered_map& configs) { + return getConfig( + configs, kSlowCapacityGrowPct, kDefaultSlowCapacityGrowPct); +} + +SharedArbitrator::SharedArbitrator(const Config& config) + : MemoryArbitrator(config), + reservedCapacity_(ExtraConfig::getReservedCapacity(config.extraConfigs)), + memoryPoolInitialCapacity_( + ExtraConfig::getMemoryPoolInitialCapacity(config.extraConfigs)), + memoryPoolReservedCapacity_( + ExtraConfig::getMemoryPoolReservedCapacity(config.extraConfigs)), + memoryPoolTransferCapacity_( + ExtraConfig::getMemoryPoolTransferCapacity(config.extraConfigs)), + memoryReclaimWaitMs_( + ExtraConfig::getMemoryReclaimMaxWaitTimeMs(config.extraConfigs)), + globalArbitrationEnabled_( + ExtraConfig::getGlobalArbitrationEnabled(config.extraConfigs)), + checkUsageLeak_(ExtraConfig::getCheckUsageLeak(config.extraConfigs)), + fastExponentialGrowthCapacityLimit_( + ExtraConfig::getFastExponentialGrowthCapacityLimitBytes( + config.extraConfigs)), + slowCapacityGrowPct_( + ExtraConfig::getSlowCapacityGrowPct(config.extraConfigs)), + memoryPoolMinFreeCapacity_( + ExtraConfig::getMemoryPoolMinFreeCapacity(config.extraConfigs)), + memoryPoolMinFreeCapacityPct_( + ExtraConfig::getMemoryPoolMinFreeCapacityPct(config.extraConfigs)), + freeReservedCapacity_(reservedCapacity_), + freeNonReservedCapacity_(capacity_ - freeReservedCapacity_) { VELOX_CHECK_EQ(kind_, config.kind); + VELOX_CHECK_LE(reservedCapacity_, capacity_); + VELOX_CHECK_GE(slowCapacityGrowPct_, 0); + VELOX_CHECK_GE(memoryPoolMinFreeCapacityPct_, 0); + VELOX_CHECK_LE(memoryPoolMinFreeCapacityPct_, 1); + VELOX_CHECK_EQ( + fastExponentialGrowthCapacityLimit_ == 0, + slowCapacityGrowPct_ == 0, + "fastExponentialGrowthCapacityLimit_ {} and slowCapacityGrowPct_ {} " + "both need to be set (non-zero) at the same time to enable growth capacity " + "adjustment.", + fastExponentialGrowthCapacityLimit_, + slowCapacityGrowPct_); + VELOX_CHECK_EQ( + memoryPoolMinFreeCapacity_ == 0, + memoryPoolMinFreeCapacityPct_ == 0, + "memoryPoolMinFreeCapacity_ {} and memoryPoolMinFreeCapacityPct_ {} both " + "need to be set (non-zero) at the same time to enable shrink capacity " + "adjustment.", + memoryPoolMinFreeCapacity_, + 
memoryPoolMinFreeCapacityPct_); } std::string SharedArbitrator::Candidate::toString() const { return fmt::format( - "CANDIDATE[{} RECLAIMABLE[{}] RECLAIMABLE_BYTES[{}] FREE_BYTES[{}]]", - pool->root()->name(), - reclaimable, + "CANDIDATE[{}] RECLAIMABLE_BYTES[{}] FREE_BYTES[{}]]", + pool->name(), succinctBytes(reclaimableBytes), succinctBytes(freeBytes)); } -void SharedArbitrator::sortCandidatesByFreeCapacity( - std::vector& candidates) const { +SharedArbitrator::~SharedArbitrator() { + VELOX_CHECK(candidates_.empty()); + if (freeNonReservedCapacity_ + freeReservedCapacity_ != capacity_) { + const std::string errMsg = fmt::format( + "Unexpected free capacity leak in arbitrator: freeNonReservedCapacity_[{}] + freeReservedCapacity_[{}] != capacity_[{}])\\n{}", + freeNonReservedCapacity_, + freeReservedCapacity_, + capacity_, + toString()); + if (checkUsageLeak_) { + VELOX_FAIL(errMsg); + } else { + VELOX_MEM_LOG(ERROR) << errMsg; + } + } +} + +void SharedArbitrator::addPool(const std::shared_ptr& pool) { + VELOX_CHECK_EQ(pool->capacity(), 0); + { + std::unique_lock guard{poolLock_}; + VELOX_CHECK_EQ(candidates_.count(pool.get()), 0); + candidates_.emplace(pool.get(), pool); + } + + std::lock_guard l(stateLock_); + const uint64_t maxBytesToReserve = + std::min(maxGrowCapacity(*pool), memoryPoolInitialCapacity_); + const uint64_t minBytesToReserve = minGrowCapacity(*pool); + const uint64_t reservedBytes = + decrementFreeCapacityLocked(maxBytesToReserve, minBytesToReserve); + try { + checkedGrow(pool.get(), reservedBytes, 0); + } catch (const VeloxRuntimeError&) { + incrementFreeCapacityLocked(reservedBytes); + } +} + +void SharedArbitrator::removePool(MemoryPool* pool) { + VELOX_CHECK_EQ(pool->reservedBytes(), 0); + shrinkCapacity(pool); + + std::unique_lock guard{poolLock_}; + const auto ret = candidates_.erase(pool); + VELOX_CHECK_EQ(ret, 1); +} + +void SharedArbitrator::getCandidates( + ArbitrationOperation* op, + bool freeCapacityOnly) { + op->candidates.clear(); + + std::shared_lock guard{poolLock_}; + op->candidates.reserve(candidates_.size()); + for (const auto& candidate : candidates_) { + const bool selfCandidate = op->requestPool == candidate.first; + std::shared_ptr pool = candidate.second.lock(); + if (pool == nullptr) { + VELOX_CHECK(!selfCandidate); + continue; + } + op->candidates.push_back( + {pool, + freeCapacityOnly ? 
0 : reclaimableUsedCapacity(*pool, selfCandidate), + reclaimableFreeCapacity(*pool, selfCandidate), + pool->reservedBytes()}); + } + VELOX_CHECK(!op->candidates.empty()); +} + +void SharedArbitrator::sortCandidatesByReclaimableFreeCapacity( + std::vector& candidates) { std::sort( candidates.begin(), candidates.end(), - [&](const Candidate& lhs, const Candidate& rhs) { + [&](const SharedArbitrator::Candidate& lhs, + const SharedArbitrator::Candidate& rhs) { return lhs.freeBytes > rhs.freeBytes; }); TestValue::adjust( - "facebook::velox::memory::SharedArbitrator::sortCandidatesByFreeCapacity", + "facebook::velox::memory::SharedArbitrator::sortCandidatesByReclaimableFreeCapacity", &candidates); } -void SharedArbitrator::sortCandidatesByReclaimableMemory( - std::vector& candidates) const { +void SharedArbitrator::sortCandidatesByReclaimableUsedCapacity( + std::vector& candidates) { std::sort( candidates.begin(), candidates.end(), - [](const Candidate& lhs, const Candidate& rhs) { - if (!lhs.reclaimable) { - return false; - } - if (!rhs.reclaimable) { - return true; - } + [](const SharedArbitrator::Candidate& lhs, + const SharedArbitrator::Candidate& rhs) { return lhs.reclaimableBytes > rhs.reclaimableBytes; }); TestValue::adjust( - "facebook::velox::memory::SharedArbitrator::sortCandidatesByReclaimableMemory", + "facebook::velox::memory::SharedArbitrator::sortCandidatesByReclaimableUsedCapacity", &candidates); } +void SharedArbitrator::sortCandidatesByUsage( + std::vector& candidates) { + std::sort( + candidates.begin(), + candidates.end(), + [](const SharedArbitrator::Candidate& lhs, + const SharedArbitrator::Candidate& rhs) { + return lhs.reservedBytes > rhs.reservedBytes; + }); +} + const SharedArbitrator::Candidate& SharedArbitrator::findCandidateWithLargestCapacity( MemoryPool* requestor, uint64_t targetBytes, - const std::vector& candidates) const { + const std::vector& candidates) { VELOX_CHECK(!candidates.empty()); int32_t candidateIdx{-1}; - int64_t maxCapacity{-1}; + uint64_t maxCapacity{0}; for (int32_t i = 0; i < candidates.size(); ++i) { - const bool isCandidate = candidates[i].pool == requestor; + const bool isCandidate = candidates[i].pool.get() == requestor; // For capacity comparison, the requestor's capacity should include both its // current capacity and the capacity growth. - const int64_t capacity = + const uint64_t capacity = candidates[i].pool->capacity() + (isCandidate ? targetBytes : 0); if (i == 0) { candidateIdx = 0; @@ -145,302 +379,621 @@ SharedArbitrator::findCandidateWithLargestCapacity( return candidates[candidateIdx]; } -SharedArbitrator::~SharedArbitrator() { - VELOX_CHECK_EQ(freeCapacity_, capacity_, "{}", toString()); +void SharedArbitrator::updateArbitrationRequestStats() { + RECORD_METRIC_VALUE(kMetricArbitratorRequestsCount); + ++numRequests_; } -void SharedArbitrator::reserveMemory(MemoryPool* pool, uint64_t /*unused*/) { - const int64_t bytesToReserve = - std::min(maxGrowBytes(*pool), memoryPoolInitCapacity_); - std::lock_guard l(mutex_); - if (running_) { - // NOTE: if there is a running memory arbitration, then we shall skip - // reserving the free memory for the newly created memory pool but let it - // grow its capacity on-demand later through the memory arbitration. 
- return; +void SharedArbitrator::updateArbitrationFailureStats() { + RECORD_METRIC_VALUE(kMetricArbitratorFailuresCount); + ++numFailures_; +} + +int64_t SharedArbitrator::maxReclaimableCapacity( + const MemoryPool& pool, + bool isSelfReclaim) const { + // Checks if a query memory pool has likely finished processing. It is likely + // this pool has finished when it has 0 current usage and non-0 past usage. If + // there is a high chance this pool finished, then we don't have to respect + // the memory pool reserved capacity limit check. + // + // NOTE: for query system like Prestissimo, it holds a finished query state in + // minutes for query stats fetch request from the Presto coordinator. + if (isSelfReclaim || (pool.reservedBytes() == 0 && pool.peakBytes() != 0)) { + return pool.capacity(); } - const uint64_t reserveBytes = decrementFreeCapacityLocked(bytesToReserve); - pool->grow(reserveBytes); + return std::max(0, pool.capacity() - memoryPoolReservedCapacity_); } -void SharedArbitrator::releaseMemory(MemoryPool* pool) { - std::lock_guard l(mutex_); - const uint64_t freedBytes = pool->shrink(0); - incrementFreeCapacityLocked(freedBytes); +int64_t SharedArbitrator::reclaimableFreeCapacity( + const MemoryPool& pool, + bool isSelfReclaim) const { + const auto freeBytes = pool.freeBytes(); + if (freeBytes == 0) { + return 0; + } + return std::min( + isSelfReclaim ? freeBytes : getCapacityShrinkTarget(pool, freeBytes), + maxReclaimableCapacity(pool, isSelfReclaim)); +} + +int64_t SharedArbitrator::reclaimableUsedCapacity( + const MemoryPool& pool, + bool isSelfReclaim) const { + const auto maxReclaimableBytes = maxReclaimableCapacity(pool, isSelfReclaim); + const auto reclaimableBytes = pool.reclaimableBytes(); + return std::min(maxReclaimableBytes, reclaimableBytes.value_or(0)); +} + +int64_t SharedArbitrator::minGrowCapacity(const MemoryPool& pool) const { + return std::max( + 0, + std::min(pool.maxCapacity(), memoryPoolReservedCapacity_) - + pool.capacity()); +} + +uint64_t SharedArbitrator::decrementFreeCapacity( + uint64_t maxBytesToReserve, + uint64_t minBytesToReserve) { + uint64_t reservedBytes{0}; + { + std::lock_guard l(stateLock_); + reservedBytes = + decrementFreeCapacityLocked(maxBytesToReserve, minBytesToReserve); + } + return reservedBytes; } -std::vector SharedArbitrator::getCandidateStats( - const std::vector>& pools) { - std::vector candidates; - candidates.reserve(pools.size()); - for (const auto& pool : pools) { - uint64_t reclaimableBytes; - const bool reclaimable = pool->reclaimableBytes(reclaimableBytes); - candidates.push_back( - {reclaimable, reclaimableBytes, pool->freeBytes(), pool.get()}); +uint64_t SharedArbitrator::decrementFreeCapacityLocked( + uint64_t maxBytesToReserve, + uint64_t minBytesToReserve) { + uint64_t allocatedBytes = + std::min(freeNonReservedCapacity_, maxBytesToReserve); + freeNonReservedCapacity_ -= allocatedBytes; + if (allocatedBytes < minBytesToReserve) { + const uint64_t reservedBytes = std::min( + minBytesToReserve - allocatedBytes, freeReservedCapacity_); + freeReservedCapacity_ -= reservedBytes; + allocatedBytes += reservedBytes; } - return candidates; + return allocatedBytes; } -bool SharedArbitrator::growMemory( +uint64_t SharedArbitrator::getCapacityShrinkTarget( + const MemoryPool& pool, + uint64_t requestBytes) const { + VELOX_CHECK_NE(requestBytes, 0); + auto targetBytes = requestBytes; + if (memoryPoolMinFreeCapacity_ != 0) { + const auto minFreeBytes = std::min( + static_cast(pool.capacity() * memoryPoolMinFreeCapacityPct_), + 
memoryPoolMinFreeCapacity_); + const auto maxShrinkBytes = std::max( + 0LL, pool.freeBytes() - static_cast(minFreeBytes)); + targetBytes = std::min(targetBytes, static_cast(maxShrinkBytes)); + } + return targetBytes; +} + +uint64_t SharedArbitrator::shrinkCapacity( MemoryPool* pool, - const std::vector>& candidatePools, - uint64_t targetBytes) { - ScopedArbitration scopedArbitration(pool, this); - MemoryPool* requestor = pool->root(); - if (FOLLY_UNLIKELY(requestor->aborted())) { - ++numFailures_; - VELOX_MEM_POOL_ABORTED("The requestor has already been aborted"); - } - - if (FOLLY_UNLIKELY(!ensureCapacity(requestor, targetBytes))) { - ++numFailures_; - VELOX_MEM_LOG(ERROR) << "Can't grow " << requestor->name() + uint64_t requestBytes) { + std::lock_guard l(stateLock_); + ++numShrinks_; + const uint64_t freedBytes = shrinkPool( + pool, + requestBytes == 0 ? 0 : getCapacityShrinkTarget(*pool, requestBytes)); + incrementFreeCapacityLocked(freedBytes); + return freedBytes; +} + +uint64_t SharedArbitrator::shrinkCapacity( + uint64_t requestBytes, + bool allowSpill, + bool allowAbort) { + incrementGlobalArbitrationCount(); + const uint64_t targetBytes = requestBytes == 0 ? capacity_ : requestBytes; + ArbitrationOperation op(targetBytes); + ScopedArbitration scopedArbitration(this, &op); + + std::lock_guard exclusiveLock(arbitrationLock_); + getCandidates(&op); + + uint64_t reclaimedBytes{0}; + RECORD_METRIC_VALUE(kMetricArbitratorSlowGlobalArbitrationCount); + + if (allowSpill) { + uint64_t freedBytes{0}; + reclaimUsedMemoryFromCandidatesBySpill(&op, freedBytes); + reclaimedBytes += freedBytes; + if (freedBytes > 0) { + incrementFreeCapacity(freedBytes); + } + if (reclaimedBytes >= op.requestBytes) { + return reclaimedBytes; + } + if (allowAbort) { + // Candidate stats may change after spilling. + getCandidates(&op); + } + } + + if (allowAbort) { + uint64_t freedBytes{0}; + reclaimUsedMemoryFromCandidatesByAbort(&op, freedBytes); + reclaimedBytes += freedBytes; + if (freedBytes > 0) { + incrementFreeCapacity(freedBytes); + } + } + return reclaimedBytes; +} + +void SharedArbitrator::testingFreeCapacity(uint64_t capacity) { + std::lock_guard l(stateLock_); + incrementFreeCapacityLocked(capacity); +} + +uint64_t SharedArbitrator::testingNumRequests() const { + return numRequests_; +} + +uint64_t SharedArbitrator::getCapacityGrowthTarget( + const MemoryPool& pool, + uint64_t requestBytes) const { + if (fastExponentialGrowthCapacityLimit_ == 0 && slowCapacityGrowPct_ == 0) { + return std::max(requestBytes, memoryPoolTransferCapacity_); + } + uint64_t targetBytes{0}; + const auto capacity = pool.capacity(); + if (capacity * 2 <= fastExponentialGrowthCapacityLimit_) { + targetBytes = capacity; + } else { + targetBytes = capacity * slowCapacityGrowPct_; + } + return std::max( + std::max(requestBytes, targetBytes), memoryPoolTransferCapacity_); +} + +bool SharedArbitrator::growCapacity(MemoryPool* pool, uint64_t requestBytes) { + // NOTE: we shouldn't trigger the recursive memory capacity growth under + // memory arbitration context. 
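The growth adjustment in getCapacityGrowthTarget() above is easiest to see with numbers. A self-contained sketch of the same policy, using illustrative constants (512MB fast limit, 25% slow growth, 128MB transfer floor), not anyone's guaranteed production defaults:

```
#include <algorithm>
#include <cstdint>
#include <cstdio>

constexpr uint64_t kFastLimit = 512 << 20;         // fast-exponential-growth-capacity-limit
constexpr double kSlowGrowPct = 0.25;              // slow-capacity-grow-pct
constexpr uint64_t kTransferCapacity = 128 << 20;  // memory-pool-transfer-capacity

uint64_t growthTarget(uint64_t capacity, uint64_t requestBytes) {
  // Below the fast limit, at least double the capacity; above it, grow by 25%.
  const uint64_t adjusted = (capacity * 2 <= kFastLimit)
      ? capacity
      : static_cast<uint64_t>(capacity * kSlowGrowPct);
  // Never grow by less than the request or the transfer granularity.
  return std::max(std::max(requestBytes, adjusted), kTransferCapacity);
}

int main() {
  // 128MB pool asking for 1MB: grows by 128MB (fast path, doubles).
  std::printf("%llu MB\n", (unsigned long long)(growthTarget(128 << 20, 1 << 20) >> 20));
  // 1GB pool asking for 1MB: grows by 256MB (slow path, 25%).
  std::printf("%llu MB\n", (unsigned long long)(growthTarget(1ull << 30, 1 << 20) >> 20));
}
```

The coarser target means one arbitration round can absorb several future small requests, which is the stated goal of the adjustment.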
+ VELOX_CHECK(!underMemoryArbitration()); + + ArbitrationOperation op( + pool, requestBytes, getCapacityGrowthTarget(*pool, requestBytes)); + ScopedArbitration scopedArbitration(this, &op); + + bool needGlobalArbitration{false}; + if (!runLocalArbitration(&op, needGlobalArbitration)) { + return false; + } + if (!needGlobalArbitration) { + return true; + } + if (!globalArbitrationEnabled_) { + return false; + } + return runGlobalArbitration(&op); +} + +bool SharedArbitrator::runLocalArbitration( + ArbitrationOperation* op, + bool& needGlobalArbitration) { + needGlobalArbitration = false; + const std::chrono::steady_clock::time_point localArbitrationStartTime = + std::chrono::steady_clock::now(); + std::shared_lock sharedLock(arbitrationLock_); + TestValue::adjust( + "facebook::velox::memory::SharedArbitrator::runLocalArbitration", this); + op->localArbitrationLockWaitTimeUs = + std::chrono::duration_cast( + std::chrono::steady_clock::now() - localArbitrationStartTime) + .count(); + + checkIfAborted(op); + + if (maybeGrowFromSelf(op)) { + return true; + } + + if (!ensureCapacity(op)) { + updateArbitrationFailureStats(); + VELOX_MEM_LOG(ERROR) << "Can't grow " << op->requestPool->name() << " capacity to " - << succinctBytes(requestor->capacity() + targetBytes) + << succinctBytes( + op->requestPool->capacity() + op->requestBytes) << " which exceeds its max capacity " - << succinctBytes(requestor->maxCapacity()); + << succinctBytes(op->requestPool->maxCapacity()) + << ", current capacity " + << succinctBytes(op->requestPool->capacity()) + << ", request " << succinctBytes(op->requestBytes); return false; } + VELOX_CHECK(!op->requestPool->aborted()); + + if (maybeGrowFromSelf(op)) { + return true; + } + + uint64_t maxGrowTarget{0}; + uint64_t minGrowTarget{0}; + getGrowTargets(op, maxGrowTarget, minGrowTarget); + + uint64_t freedBytes = decrementFreeCapacity(maxGrowTarget, minGrowTarget); + auto freeGuard = folly::makeGuard([&]() { + // Returns the unused freed memory capacity back to the arbitrator. 
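The freeGuard above encodes an invariant that holds throughout this file: capacity taken from the arbitrator but not granted to a pool must flow back on every exit path. A minimal sketch of the same guard pattern without folly (names are illustrative):

```
#include <cstdint>
#include <utility>

// Minimal scope guard: runs the callback on every exit path, mirroring the
// folly::makeGuard usage above.
template <typename F>
class ScopeGuard {
 public:
  explicit ScopeGuard(F fn) : fn_(std::move(fn)) {}
  ~ScopeGuard() { fn_(); }

 private:
  F fn_;
};

uint64_t gFreeCapacity = 1 << 30; // stand-in for the arbitrator's free pool

bool tryGrow(uint64_t& poolCapacity, uint64_t request, uint64_t granted) {
  uint64_t leftover = granted;
  ScopeGuard guard([&] { gFreeCapacity += leftover; }); // runs on all paths
  if (granted < request) {
    return false; // all of 'granted' flows back via the guard
  }
  poolCapacity += granted;
  leftover = 0; // consumed; nothing to return
  return true;
}
```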
+ if (freedBytes > 0) { + incrementFreeCapacity(freedBytes); + } + }); + if (freedBytes >= op->requestBytes) { + checkedGrow(op->requestPool, freedBytes, op->requestBytes); + freedBytes = 0; + return true; + } + VELOX_CHECK_LT(freedBytes, maxGrowTarget); + + getCandidates(op, /*freeCapacityOnly=*/true); + freedBytes += + reclaimFreeMemoryFromCandidates(op, maxGrowTarget - freedBytes, true); + if (freedBytes >= op->requestBytes) { + const uint64_t bytesToGrow = std::min(maxGrowTarget, freedBytes); + checkedGrow(op->requestPool, bytesToGrow, op->requestBytes); + freedBytes -= bytesToGrow; + return true; + } + VELOX_CHECK_LT(freedBytes, maxGrowTarget); + + if (!globalArbitrationEnabled_) { + freedBytes += reclaim(op->requestPool, maxGrowTarget - freedBytes, true); + } + checkIfAborted(op); + + if (freedBytes >= op->requestBytes) { + const uint64_t bytesToGrow = std::min(maxGrowTarget, freedBytes); + checkedGrow(op->requestPool, bytesToGrow, op->requestBytes); + freedBytes -= bytesToGrow; + return true; + } + + needGlobalArbitration = true; + return true; +} + +bool SharedArbitrator::runGlobalArbitration(ArbitrationOperation* op) { + incrementGlobalArbitrationCount(); + const std::chrono::steady_clock::time_point globalArbitrationStartTime = + std::chrono::steady_clock::now(); + std::lock_guard exclusiveLock(arbitrationLock_); + TestValue::adjust( + "facebook::velox::memory::SharedArbitrator::runGlobalArbitration", this); + op->globalArbitrationLockWaitTimeUs = + std::chrono::duration_cast( + std::chrono::steady_clock::now() - globalArbitrationStartTime) + .count(); + checkIfAborted(op); + + if (maybeGrowFromSelf(op)) { + return true; + } - std::vector candidates; - candidates.reserve(candidatePools.size()); - int numRetries{0}; - for (;; ++numRetries) { - // Get refreshed stats before the memory arbitration retry. 
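Note the lock discipline introduced here: local arbitration runs take 'arbitrationLock_' shared, while the global run takes it exclusive. A condensed sketch of that reader-writer structure:

```
#include <shared_mutex>

// Many local arbitrations may run concurrently; a global arbitration
// excludes everything else.
std::shared_mutex arbitrationLock;

void localArbitration() {
  std::shared_lock<std::shared_mutex> lock(arbitrationLock); // concurrent
  // ... reclaim only from pools not currently under arbitration ...
}

void globalArbitration() {
  std::unique_lock<std::shared_mutex> lock(arbitrationLock); // exclusive
  // ... may spill or abort any candidate pool ...
}
```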
- candidates = getCandidateStats(candidatePools); - if (arbitrateMemory(requestor, candidates, targetBytes)) { - ++numSucceeded_; + int32_t attempts = 0; + for (;; ++attempts) { + if (arbitrateMemory(op)) { return true; } - if (numRetries > 0) { + if (attempts > 0) { break; } - VELOX_CHECK(!requestor->aborted()); - if (!handleOOM(requestor, targetBytes, candidates)) { + VELOX_CHECK(!op->requestPool->aborted()); + if (!handleOOM(op)) { break; } } VELOX_MEM_LOG(ERROR) << "Failed to arbitrate sufficient memory for memory pool " - << requestor->name() << ", request " << succinctBytes(targetBytes) - << " after " << numRetries - << " retries, Arbitrator state: " << toString(); - ++numFailures_; + << op->requestPool->name() << ", request " + << succinctBytes(op->requestBytes) << " after " << attempts + << " attempts, Arbitrator state: " << toString(); + updateArbitrationFailureStats(); return false; } -bool SharedArbitrator::checkCapacityGrowth( - const MemoryPool& pool, - uint64_t targetBytes) const { - return (maxGrowBytes(pool) >= targetBytes) && - (capacityAfterGrowth(pool, targetBytes) <= capacity_); +void SharedArbitrator::getGrowTargets( + ArbitrationOperation* op, + uint64_t& maxGrowTarget, + uint64_t& minGrowTarget) { + VELOX_CHECK(op->targetBytes.has_value()); + maxGrowTarget = + std::min(maxGrowCapacity(*op->requestPool), op->targetBytes.value()); + minGrowTarget = minGrowCapacity(*op->requestPool); } -bool SharedArbitrator::ensureCapacity( - MemoryPool* requestor, - uint64_t targetBytes) { - if ((targetBytes > capacity_) || (targetBytes > requestor->maxCapacity())) { +void SharedArbitrator::checkIfAborted(ArbitrationOperation* op) { + if (op->requestPool->aborted()) { + updateArbitrationFailureStats(); + VELOX_MEM_POOL_ABORTED("The requestor pool has been aborted"); + } +} + +bool SharedArbitrator::maybeGrowFromSelf(ArbitrationOperation* op) { + if (op->requestPool->freeBytes() >= op->requestBytes) { + if (growPool(op->requestPool, 0, op->requestBytes)) { + return true; + } + } + return false; +} + +bool SharedArbitrator::checkCapacityGrowth(ArbitrationOperation* op) const { + return (maxGrowCapacity(*op->requestPool) >= op->requestBytes) && + (capacityAfterGrowth(*op->requestPool, op->requestBytes) <= capacity_); +} + +bool SharedArbitrator::ensureCapacity(ArbitrationOperation* op) { + if ((op->requestBytes > capacity_) || + (op->requestBytes > op->requestPool->maxCapacity())) { return false; } - if (checkCapacityGrowth(*requestor, targetBytes)) { + if (checkCapacityGrowth(op)) { return true; } - const uint64_t reclaimedBytes = reclaim(requestor, targetBytes); + + const uint64_t reclaimedBytes = + reclaim(op->requestPool, op->requestBytes, true); // NOTE: return the reclaimed bytes back to the arbitrator and let the memory // arbitration process to grow the requestor's memory capacity accordingly. incrementFreeCapacity(reclaimedBytes); // Check if the requestor has been aborted in reclaim operation above. 
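The loop above allows exactly one extra attempt after an OOM-driven victim abort. A self-contained sketch of the same control flow, with the two steps passed in as callables (hypothetical names, not the Velox signatures):

```
#include <functional>

// One ordinary attempt, then at most one more after aborting a victim.
bool runWithOneRetry(
    const std::function<bool()>& arbitrate,
    const std::function<bool()>& abortLargestVictim) {
  for (int attempts = 0;; ++attempts) {
    if (arbitrate()) {
      return true; // grew the pool
    }
    if (attempts > 0) {
      return false; // already retried once after an abort
    }
    if (!abortLargestVictim()) {
      return false; // requestor itself was the victim; fail the arbitration
    }
    // Victim aborted and its capacity returned; retry the arbitration once.
  }
}
```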
- if (requestor->aborted()) { - ++numFailures_; + if (op->requestPool->aborted()) { + updateArbitrationFailureStats(); VELOX_MEM_POOL_ABORTED("The requestor pool has been aborted"); } - return checkCapacityGrowth(*requestor, targetBytes); + return checkCapacityGrowth(op); } -bool SharedArbitrator::handleOOM( - MemoryPool* requestor, - uint64_t targetBytes, - std::vector& candidates) { - MemoryPool* victim = - findCandidateWithLargestCapacity(requestor, targetBytes, candidates).pool; - if (requestor == victim) { +bool SharedArbitrator::handleOOM(ArbitrationOperation* op) { + MemoryPool* victim = findCandidateWithLargestCapacity( + op->requestPool, op->requestBytes, op->candidates) + .pool.get(); + if (op->requestPool == victim) { VELOX_MEM_LOG(ERROR) - << "Requestor memory pool " << requestor->name() + << "Requestor memory pool " << op->requestPool->name() << " is selected as victim memory pool so fail the memory arbitration"; return false; } VELOX_MEM_LOG(WARNING) << "Aborting victim memory pool " << victim->name() << " to free up memory for requestor " - << requestor->name(); + << op->requestPool->name(); try { - VELOX_MEM_POOL_ABORTED( - memoryPoolAbortMessage(victim, requestor, targetBytes)); - } catch (VeloxRuntimeError& e) { + if (victim == op->requestPool) { + VELOX_MEM_POOL_CAP_EXCEEDED( + memoryPoolAbortMessage(victim, op->requestPool, op->requestBytes)); + } else { + VELOX_MEM_POOL_ABORTED( + memoryPoolAbortMessage(victim, op->requestPool, op->requestBytes)); + } + } catch (VeloxRuntimeError&) { abort(victim, std::current_exception()); } // Free up all the unused capacity from the aborted memory pool and gives back // to the arbitrator. - incrementFreeCapacity(victim->shrink()); + incrementFreeCapacity(shrinkPool(victim, 0)); return true; } -bool SharedArbitrator::arbitrateMemory( - MemoryPool* requestor, - std::vector& candidates, - uint64_t targetBytes) { - VELOX_CHECK(!requestor->aborted()); - - const uint64_t growTarget = std::min( - maxGrowBytes(*requestor), - std::max(memoryPoolTransferCapacity_, targetBytes)); - uint64_t freedBytes = decrementFreeCapacity(growTarget); - if (freedBytes >= targetBytes) { - requestor->grow(freedBytes); - return true; - } - VELOX_CHECK_LT(freedBytes, growTarget); +void SharedArbitrator::checkedGrow( + MemoryPool* pool, + uint64_t growBytes, + uint64_t reservationBytes) { + const auto ret = growPool(pool, growBytes, reservationBytes); + VELOX_CHECK( + ret, + "Failed to grow pool {} with {} and commit {} used reservation", + pool->name(), + succinctBytes(growBytes), + succinctBytes(reservationBytes)); +} + +bool SharedArbitrator::arbitrateMemory(ArbitrationOperation* op) { + VELOX_CHECK(!op->requestPool->aborted()); + uint64_t maxGrowTarget{0}; + uint64_t minGrowTarget{0}; + getGrowTargets(op, maxGrowTarget, minGrowTarget); + uint64_t freedBytes = decrementFreeCapacity(maxGrowTarget, minGrowTarget); auto freeGuard = folly::makeGuard([&]() { // Returns the unused freed memory capacity back to the arbitrator. if (freedBytes > 0) { incrementFreeCapacity(freedBytes); } }); + if (freedBytes >= op->requestBytes) { + checkedGrow(op->requestPool, freedBytes, op->requestBytes); + freedBytes = 0; + return true; + } + VELOX_CHECK_LT(freedBytes, maxGrowTarget); + + // Get refreshed stats before the global memory arbitration run. 
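handleOOM()'s victim selection reduces to a max-scan in which the requestor competes with its own capacity plus the requested growth, so a requestor that already dominates the node gets selected (and the arbitration fails rather than aborting it). A simplified sketch; the surrounding tie-breaking logic is elided:

```
#include <cstdint>
#include <cstddef>
#include <vector>

struct PoolInfo {
  uint64_t capacity{0};
  bool isRequestor{false};
};

// Returns the index of the pool with the largest effective capacity.
size_t pickVictim(const std::vector<PoolInfo>& pools, uint64_t targetBytes) {
  size_t victim = 0;
  uint64_t maxCapacity = 0;
  for (size_t i = 0; i < pools.size(); ++i) {
    // The requestor's capacity counts its pending growth too.
    const uint64_t capacity =
        pools[i].capacity + (pools[i].isRequestor ? targetBytes : 0);
    if (i == 0 || capacity > maxCapacity) {
      victim = i;
      maxCapacity = capacity;
    }
  }
  return victim;
}
```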
+ getCandidates(op); freedBytes += - reclaimFreeMemoryFromCandidates(candidates, growTarget - freedBytes); - if (freedBytes >= targetBytes) { - const uint64_t bytesToGrow = std::min(growTarget, freedBytes); - requestor->grow(bytesToGrow); + reclaimFreeMemoryFromCandidates(op, maxGrowTarget - freedBytes, false); + if (freedBytes >= op->requestBytes) { + const uint64_t bytesToGrow = std::min(maxGrowTarget, freedBytes); + checkedGrow(op->requestPool, bytesToGrow, op->requestBytes); freedBytes -= bytesToGrow; return true; } + VELOX_CHECK_LT(freedBytes, maxGrowTarget); - VELOX_CHECK_LT(freedBytes, growTarget); - freedBytes += reclaimUsedMemoryFromCandidates( - requestor, candidates, growTarget - freedBytes); - if (requestor->aborted()) { - ++numFailures_; - VELOX_MEM_POOL_ABORTED("The requestor pool has been aborted."); - } - - VELOX_CHECK(!requestor->aborted()); + RECORD_METRIC_VALUE(kMetricArbitratorSlowGlobalArbitrationCount); + reclaimUsedMemoryFromCandidatesBySpill(op, freedBytes); + checkIfAborted(op); - if (freedBytes < targetBytes) { + if (freedBytes < op->requestBytes) { VELOX_MEM_LOG(WARNING) << "Failed to arbitrate sufficient memory for memory pool " - << requestor->name() << ", request " << succinctBytes(targetBytes) - << ", only " << succinctBytes(freedBytes) + << op->requestPool->name() << ", request " + << succinctBytes(op->requestBytes) << ", only " + << succinctBytes(freedBytes) << " has been freed, Arbitrator state: " << toString(); return false; } - const uint64_t bytesToGrow = std::min(freedBytes, growTarget); - requestor->grow(bytesToGrow); + const uint64_t bytesToGrow = std::min(freedBytes, maxGrowTarget); + checkedGrow(op->requestPool, bytesToGrow, op->requestBytes); freedBytes -= bytesToGrow; return true; } uint64_t SharedArbitrator::reclaimFreeMemoryFromCandidates( - std::vector& candidates, - uint64_t targetBytes) { - // Sort candidate memory pools based on their free capacity. - sortCandidatesByFreeCapacity(candidates); - - uint64_t freedBytes{0}; - for (const auto& candidate : candidates) { - VELOX_CHECK_LT(freedBytes, targetBytes); + ArbitrationOperation* op, + uint64_t reclaimTargetBytes, + bool isLocalArbitration) { + // Sort candidate memory pools based on their reclaimable free capacity. + sortCandidatesByReclaimableFreeCapacity(op->candidates); + + std::lock_guard l(stateLock_); + uint64_t reclaimedBytes{0}; + for (const auto& candidate : op->candidates) { + VELOX_CHECK_LT(reclaimedBytes, reclaimTargetBytes); if (candidate.freeBytes == 0) { break; } - const int64_t bytesToShrink = - std::min(targetBytes - freedBytes, candidate.freeBytes); - if (bytesToShrink <= 0) { + if (isLocalArbitration && (candidate.pool.get() != op->requestPool) && + isUnderArbitrationLocked(candidate.pool.get())) { + // If the reclamation is for local arbitration and the candidate pool is + // also under arbitration processing, then we can't reclaim from the + // candidate pool as it might cause concurrent changes to the candidate + // pool's capacity. 
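A condensed sketch of that free-capacity sweep: walk candidates in descending free-bytes order, skip pools that are mid-arbitration during a local run, and stop as soon as the target is met (simplified pool type, not Velox's):

```
#include <algorithm>
#include <cstdint>
#include <vector>

struct Pool {
  uint64_t freeBytes{0};
  bool underArbitration{false};
  // Shrink only unused capacity; used memory is never touched here.
  uint64_t shrink(uint64_t bytes) {
    const uint64_t freed = std::min(bytes, freeBytes);
    freeBytes -= freed;
    return freed;
  }
};

uint64_t reclaimFree(
    std::vector<Pool*>& candidates, uint64_t target, bool isLocalArbitration) {
  std::sort(candidates.begin(), candidates.end(), [](const Pool* l, const Pool* r) {
    return l->freeBytes > r->freeBytes;
  });
  uint64_t reclaimed = 0;
  for (Pool* pool : candidates) {
    if (pool->freeBytes == 0) {
      break; // sorted descending: nothing further has free capacity
    }
    if (isLocalArbitration && pool->underArbitration) {
      continue; // avoid racing a concurrent arbitration on this pool
    }
    reclaimed += pool->shrink(target - reclaimed);
    if (reclaimed >= target) {
      break;
    }
  }
  return reclaimed;
}
```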
+ continue; + } + const int64_t bytesToReclaim = std::min( + reclaimTargetBytes - reclaimedBytes, + reclaimableFreeCapacity( + *candidate.pool, candidate.pool.get() == op->requestPool)); + if (bytesToReclaim <= 0) { + continue; + } + reclaimedBytes += shrinkPool(candidate.pool.get(), bytesToReclaim); + if (reclaimedBytes >= reclaimTargetBytes) { break; } - freedBytes += candidate.pool->shrink(bytesToShrink); - if (freedBytes >= targetBytes) { + } + reclaimedFreeBytes_ += reclaimedBytes; + return reclaimedBytes; +} + +void SharedArbitrator::reclaimUsedMemoryFromCandidatesBySpill( + ArbitrationOperation* op, + uint64_t& freedBytes) { + // Sort candidate memory pools based on their reclaimable used capacity. + sortCandidatesByReclaimableUsedCapacity(op->candidates); + + for (const auto& candidate : op->candidates) { + VELOX_CHECK_LT(freedBytes, op->requestBytes); + if (candidate.reclaimableBytes == 0) { + break; + } + freedBytes += + reclaim(candidate.pool.get(), op->requestBytes - freedBytes, false); + if ((freedBytes >= op->requestBytes) || + (op->requestPool != nullptr && op->requestPool->aborted())) { break; } } - numShrunkBytes_ += freedBytes; - return freedBytes; } -uint64_t SharedArbitrator::reclaimUsedMemoryFromCandidates( - MemoryPool* requestor, - std::vector& candidates, - uint64_t targetBytes) { - // Sort candidate memory pools based on their reclaimable memory. - sortCandidatesByReclaimableMemory(candidates); - - int64_t freedBytes{0}; - for (const auto& candidate : candidates) { - VELOX_CHECK_LT(freedBytes, targetBytes); - if (!candidate.reclaimable || candidate.reclaimableBytes == 0) { +void SharedArbitrator::reclaimUsedMemoryFromCandidatesByAbort( + ArbitrationOperation* op, + uint64_t& freedBytes) { + sortCandidatesByUsage(op->candidates); + + for (const auto& candidate : op->candidates) { + VELOX_CHECK_LT(freedBytes, op->requestBytes); + if (candidate.pool->capacity() == 0) { break; } - const int64_t bytesToReclaim = std::max( - targetBytes - freedBytes, memoryPoolTransferCapacity_); - VELOX_CHECK_GT(bytesToReclaim, 0); - freedBytes += reclaim(candidate.pool, bytesToReclaim); - if ((freedBytes >= targetBytes) || requestor->aborted()) { + try { + VELOX_MEM_POOL_ABORTED(fmt::format( + "Memory pool aborted to reclaim used memory, current usage {}, " + "memory pool details:\n{}\n{}", + succinctBytes(candidate.reservedBytes), + candidate.pool->toString(), + candidate.pool->treeMemoryUsage())); + } catch (VeloxRuntimeError&) { + abort(candidate.pool.get(), std::current_exception()); + } + freedBytes += shrinkPool(candidate.pool.get(), 0); + if (freedBytes >= op->requestBytes) { break; } } - return freedBytes; } uint64_t SharedArbitrator::reclaim( MemoryPool* pool, - uint64_t targetBytes) noexcept { + uint64_t targetBytes, + bool isLocalArbitration) noexcept { + int64_t bytesToReclaim = std::min( + std::max(targetBytes, memoryPoolTransferCapacity_), + maxReclaimableCapacity(*pool, true)); + if (bytesToReclaim == 0) { + return 0; + } uint64_t reclaimDurationUs{0}; - uint64_t reclaimedBytes{0}; - uint64_t freedBytes{0}; + uint64_t reclaimedUsedBytes{0}; + uint64_t reclaimedFreeBytes{0}; MemoryReclaimer::Stats reclaimerStats; { MicrosecondTimer reclaimTimer(&reclaimDurationUs); - const uint64_t oldCapacity = pool->capacity(); try { - freedBytes = pool->shrink(targetBytes); - if (freedBytes < targetBytes) { - pool->reclaim(targetBytes - freedBytes, reclaimerStats); + reclaimedFreeBytes = shrinkPool(pool, bytesToReclaim); + bytesToReclaim -= reclaimedFreeBytes; + 
VELOX_CHECK_GE(bytesToReclaim, 0); + if (bytesToReclaim > 0) { + if (isLocalArbitration) { + incrementLocalArbitrationCount(); + } + pool->reclaim(bytesToReclaim, memoryReclaimWaitMs_, reclaimerStats); } } catch (const std::exception& e) { VELOX_MEM_LOG(ERROR) << "Failed to reclaim from memory pool " - << pool->name() << ", aborting it!"; + << pool->name() << ", aborting it: " << e.what(); abort(pool, std::current_exception()); - // Free up all the free capacity from the aborted pool as the associated - // query has failed at this point. - pool->shrink(); + reclaimedUsedBytes = shrinkPool(pool, 0); } - const uint64_t newCapacity = pool->capacity(); - VELOX_CHECK_GE(oldCapacity, newCapacity); - reclaimedBytes = oldCapacity - newCapacity; + reclaimedUsedBytes += shrinkPool(pool, bytesToReclaim); } - numReclaimedBytes_ += reclaimedBytes - freedBytes; - numShrunkBytes_ += freedBytes; + reclaimedUsedBytes_ += reclaimedUsedBytes; + reclaimedFreeBytes_ += reclaimedFreeBytes; reclaimTimeUs_ += reclaimDurationUs; numNonReclaimableAttempts_ += reclaimerStats.numNonReclaimableAttempts; VELOX_MEM_LOG(INFO) << "Reclaimed from memory pool " << pool->name() << " with target of " << succinctBytes(targetBytes) - << ", actually reclaimed " << succinctBytes(freedBytes) + << ", actually reclaimed " + << succinctBytes(reclaimedFreeBytes) << " free memory and " - << succinctBytes(reclaimedBytes - freedBytes) - << " used memory"; - return reclaimedBytes; + << succinctBytes(reclaimedUsedBytes) + << " used memory, spent " + << succinctMicros(reclaimDurationUs) + << ", isLocalArbitration: " << isLocalArbitration; + return reclaimedUsedBytes + reclaimedFreeBytes; } void SharedArbitrator::abort( MemoryPool* pool, const std::exception_ptr& error) { + RECORD_METRIC_VALUE(kMetricArbitratorAbortedCount); ++numAborted_; try { pool->abort(error); } catch (const std::exception& e) { - VELOX_MEM_LOG(WARNING) << "Failed to abort memory pool " - << pool->toString(); + VELOX_MEM_LOG(WARNING) << "Failed to abort memory pool " << pool->toString() + << ", error: " << e.what(); } // NOTE: no matter memory pool abort throws or not, it should have been marked // as aborted to prevent any new memory arbitration triggered from the aborted @@ -448,112 +1001,151 @@ void SharedArbitrator::abort( VELOX_CHECK(pool->aborted()); } -uint64_t SharedArbitrator::decrementFreeCapacity(uint64_t bytes) { - std::lock_guard l(mutex_); - return decrementFreeCapacityLocked(bytes); -} - -uint64_t SharedArbitrator::decrementFreeCapacityLocked(uint64_t bytes) { - const uint64_t targetBytes = std::min(freeCapacity_, bytes); - VELOX_CHECK_LE(targetBytes, freeCapacity_); - freeCapacity_ -= targetBytes; - return targetBytes; -} - void SharedArbitrator::incrementFreeCapacity(uint64_t bytes) { - std::lock_guard l(mutex_); + std::lock_guard l(stateLock_); incrementFreeCapacityLocked(bytes); } void SharedArbitrator::incrementFreeCapacityLocked(uint64_t bytes) { - freeCapacity_ += bytes; - if (FOLLY_UNLIKELY(freeCapacity_ > capacity_)) { + incrementFreeReservedCapacityLocked(bytes); + freeNonReservedCapacity_ += bytes; + if (FOLLY_UNLIKELY( + freeNonReservedCapacity_ + freeReservedCapacity_ > capacity_)) { VELOX_FAIL( - "The free capacity {} is larger than the max capacity {}, {}", - succinctBytes(freeCapacity_), + "The free capacity {}/{} is larger than the max capacity {}, {}", + succinctBytes(freeNonReservedCapacity_), + succinctBytes(freeReservedCapacity_), succinctBytes(capacity_), toStringLocked()); } } +void 
SharedArbitrator::incrementFreeReservedCapacityLocked(uint64_t& bytes) { + VELOX_CHECK_LE(freeReservedCapacity_, reservedCapacity_); + const uint64_t freedBytes = + std::min(bytes, reservedCapacity_ - freeReservedCapacity_); + freeReservedCapacity_ += freedBytes; + bytes -= freedBytes; +} + MemoryArbitrator::Stats SharedArbitrator::stats() const { - std::lock_guard l(mutex_); + std::lock_guard l(stateLock_); return statsLocked(); } MemoryArbitrator::Stats SharedArbitrator::statsLocked() const { Stats stats; stats.numRequests = numRequests_; - stats.numSucceeded = numSucceeded_; stats.numAborted = numAborted_; stats.numFailures = numFailures_; - stats.queueTimeUs = queueTimeUs_; + stats.queueTimeUs = waitTimeUs_; stats.arbitrationTimeUs = arbitrationTimeUs_; - stats.numShrunkBytes = numShrunkBytes_; - stats.numReclaimedBytes = numReclaimedBytes_; + stats.numShrunkBytes = reclaimedFreeBytes_; + stats.numReclaimedBytes = reclaimedUsedBytes_; stats.maxCapacityBytes = capacity_; - stats.freeCapacityBytes = freeCapacity_; + stats.freeCapacityBytes = freeNonReservedCapacity_ + freeReservedCapacity_; + stats.freeReservedCapacityBytes = freeReservedCapacity_; stats.reclaimTimeUs = reclaimTimeUs_; stats.numNonReclaimableAttempts = numNonReclaimableAttempts_; + stats.numShrinks = numShrinks_; return stats; } std::string SharedArbitrator::toString() const { - std::lock_guard l(mutex_); + std::lock_guard l(stateLock_); return toStringLocked(); } std::string SharedArbitrator::toStringLocked() const { return fmt::format( - "ARBITRATOR[{} CAPACITY[{}] {}]", + "ARBITRATOR[{} CAPACITY[{}] PENDING[{}] {}]", kind_, succinctBytes(capacity_), + numPending_, statsLocked().toString()); } SharedArbitrator::ScopedArbitration::ScopedArbitration( - MemoryPool* requestor, - SharedArbitrator* arbitrator) - : requestor_(requestor), + SharedArbitrator* arbitrator, + ArbitrationOperation* operation) + : operation_(operation), arbitrator_(arbitrator), - startTime_(std::chrono::steady_clock::now()), - arbitrationCtx_(*requestor_) { + arbitrationCtx_(operation->requestPool), + startTime_(std::chrono::steady_clock::now()) { VELOX_CHECK_NOT_NULL(arbitrator_); - arbitrator_->startArbitration(requestor); - if (arbitrator_->arbitrationStateCheckCb_ != nullptr) { - arbitrator_->arbitrationStateCheckCb_(*requestor); + VELOX_CHECK_NOT_NULL(operation_); + if (arbitrator_->arbitrationStateCheckCb_ != nullptr && + operation_->requestPool != nullptr) { + arbitrator_->arbitrationStateCheckCb_(*operation_->requestPool); } + arbitrator_->startArbitration(operation_); } SharedArbitrator::ScopedArbitration::~ScopedArbitration() { - requestor_->leaveArbitration(); - const auto arbitrationTime = + arbitrator_->finishArbitration(operation_); + + // Report arbitration operation stats. 
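The reserved/non-reserved split is the accounting core of this change: grows drain the non-reserved bucket first and dip into the reserve only up to a pool's minimum entitlement, while frees refill the reserve first. A self-contained sketch with a small worked example (numbers are illustrative):

```
#include <algorithm>
#include <cassert>
#include <cstdint>

// Two-bucket free-capacity sketch mirroring decrementFreeCapacityLocked() /
// incrementFreeCapacityLocked() above.
struct FreeCapacity {
  uint64_t reservedLimit;   // reservedCapacity_
  uint64_t freeReserved;    // freeReservedCapacity_
  uint64_t freeNonReserved; // freeNonReservedCapacity_

  // Take up to 'maxBytes' from the non-reserved bucket; if that falls short
  // of 'minBytes', top up from the reserved bucket.
  uint64_t decrement(uint64_t maxBytes, uint64_t minBytes) {
    uint64_t allocated = std::min(freeNonReserved, maxBytes);
    freeNonReserved -= allocated;
    if (allocated < minBytes) {
      const uint64_t fromReserve = std::min(minBytes - allocated, freeReserved);
      freeReserved -= fromReserve;
      allocated += fromReserve;
    }
    return allocated;
  }

  // Returned bytes refill the reserved bucket first, up to its limit.
  void increment(uint64_t bytes) {
    const uint64_t toReserve = std::min(bytes, reservedLimit - freeReserved);
    freeReserved += toReserve;
    freeNonReserved += bytes - toReserve;
  }
};

int main() {
  FreeCapacity cap{/*reservedLimit=*/64, /*freeReserved=*/64, /*freeNonReserved=*/0};
  // Non-reserved is empty, so only the 16-byte minimum comes from the reserve.
  assert(cap.decrement(/*maxBytes=*/128, /*minBytes=*/16) == 16);
  cap.increment(16); // refills the reserved bucket first
  assert(cap.freeReserved == 64 && cap.freeNonReserved == 0);
}
```

This is what lets the arbitrator guarantee every query its minimal reserved capacity even when the shared free pool is exhausted.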
+ const auto arbitrationTimeUs = std::chrono::duration_cast( - std::chrono::steady_clock::now() - startTime_); - arbitrator_->arbitrationTimeUs_ += arbitrationTime.count(); - arbitrator_->finishArbitration(); + std::chrono::steady_clock::now() - operation_->startTime) + .count(); + RECORD_HISTOGRAM_METRIC_VALUE( + kMetricArbitratorArbitrationTimeMs, arbitrationTimeUs / 1'000); + addThreadLocalRuntimeStat( + kMemoryArbitrationWallNanos, + RuntimeCounter(arbitrationTimeUs * 1'000, RuntimeCounter::Unit::kNanos)); + if (operation_->localArbitrationQueueTimeUs != 0) { + addThreadLocalRuntimeStat( + kLocalArbitrationQueueWallNanos, + RuntimeCounter( + operation_->localArbitrationQueueTimeUs * 1'000, + RuntimeCounter::Unit::kNanos)); + } + if (operation_->localArbitrationLockWaitTimeUs != 0) { + addThreadLocalRuntimeStat( + kLocalArbitrationLockWaitWallNanos, + RuntimeCounter( + operation_->localArbitrationLockWaitTimeUs * 1'000, + RuntimeCounter::Unit::kNanos)); + } + if (operation_->globalArbitrationLockWaitTimeUs != 0) { + addThreadLocalRuntimeStat( + kGlobalArbitrationLockWaitWallNanos, + RuntimeCounter( + operation_->globalArbitrationLockWaitTimeUs * 1'000, + RuntimeCounter::Unit::kNanos)); + } + arbitrator_->arbitrationTimeUs_ += arbitrationTimeUs; + + const uint64_t waitTimeUs = operation_->waitTimeUs(); + if (waitTimeUs != 0) { + RECORD_HISTOGRAM_METRIC_VALUE( + kMetricArbitratorWaitTimeMs, waitTimeUs / 1'000); + arbitrator_->waitTimeUs_ += waitTimeUs; + } } -void SharedArbitrator::startArbitration(MemoryPool* requestor) { - requestor->enterArbitration(); +void SharedArbitrator::startArbitration(ArbitrationOperation* op) { + updateArbitrationRequestStats(); ContinueFuture waitPromise{ContinueFuture::makeEmpty()}; { - std::lock_guard l(mutex_); - ++numRequests_; - if (running_) { - waitPromises_.emplace_back(fmt::format( - "Wait for arbitration, requestor: {}[{}]", - requestor->name(), - requestor->root()->name())); - waitPromise = waitPromises_.back().getSemiFuture(); - } else { - VELOX_CHECK(waitPromises_.empty()); - running_ = true; + std::lock_guard l(stateLock_); + ++numPending_; + if (op->requestPool != nullptr) { + auto it = arbitrationQueues_.find(op->requestPool); + if (it != arbitrationQueues_.end()) { + it->second->waitPromises.emplace_back( + fmt::format("Wait for arbitration {}", op->requestPool->name())); + waitPromise = it->second->waitPromises.back().getSemiFuture(); + } else { + arbitrationQueues_.emplace( + op->requestPool, std::make_unique(op)); + } } } TestValue::adjust( - "facebook::velox::memory::SharedArbitrator::startArbitration", requestor); + "facebook::velox::memory::SharedArbitrator::startArbitration", this); if (waitPromise.valid()) { uint64_t waitTimeUs{0}; @@ -561,20 +1153,29 @@ void SharedArbitrator::startArbitration(MemoryPool* requestor) { MicrosecondTimer timer(&waitTimeUs); waitPromise.wait(); } - queueTimeUs_ += waitTimeUs; + op->localArbitrationQueueTimeUs += waitTimeUs; } } -void SharedArbitrator::finishArbitration() { +void SharedArbitrator::finishArbitration(ArbitrationOperation* op) { ContinuePromise resumePromise{ContinuePromise::makeEmpty()}; { - std::lock_guard l(mutex_); - VELOX_CHECK(running_); - if (!waitPromises_.empty()) { - resumePromise = std::move(waitPromises_.back()); - waitPromises_.pop_back(); - } else { - running_ = false; + std::lock_guard l(stateLock_); + VELOX_CHECK_GT(numPending_, 0); + --numPending_; + if (op->requestPool != nullptr) { + auto it = arbitrationQueues_.find(op->requestPool); + VELOX_CHECK( + it != 
arbitrationQueues_.end(), + "{} not found", + op->requestPool->name()); + auto* runningArbitration = it->second.get(); + if (runningArbitration->waitPromises.empty()) { + arbitrationQueues_.erase(it); + } else { + resumePromise = std::move(runningArbitration->waitPromises.back()); + runningArbitration->waitPromises.pop_back(); + } } } if (resumePromise.valid()) { @@ -582,6 +1183,10 @@ void SharedArbitrator::finishArbitration() { } } +bool SharedArbitrator::isUnderArbitrationLocked(MemoryPool* pool) const { + return arbitrationQueues_.count(pool) != 0; +} + std::string SharedArbitrator::kind() const { return kind_; } @@ -596,4 +1201,16 @@ void SharedArbitrator::registerFactory() { void SharedArbitrator::unregisterFactory() { MemoryArbitrator::unregisterFactory(kind_); } + +void SharedArbitrator::incrementGlobalArbitrationCount() { + RECORD_METRIC_VALUE(kMetricArbitratorGlobalArbitrationCount); + addThreadLocalRuntimeStat( + kGlobalArbitrationCount, RuntimeCounter(1, RuntimeCounter::Unit::kNone)); +} + +void SharedArbitrator::incrementLocalArbitrationCount() { + RECORD_METRIC_VALUE(kMetricArbitratorLocalArbitrationCount); + addThreadLocalRuntimeStat( + kLocalArbitrationCount, RuntimeCounter(1, RuntimeCounter::Unit::kNone)); +} } // namespace facebook::velox::memory diff --git a/velox/common/memory/SharedArbitrator.h b/velox/common/memory/SharedArbitrator.h index b291b2af3bb13..6c784f5373f29 100644 --- a/velox/common/memory/SharedArbitrator.h +++ b/velox/common/memory/SharedArbitrator.h @@ -16,10 +16,14 @@ #pragma once -#include "velox/common/memory/MemoryArbitrator.h" +#include +#include "velox/common/base/Counters.h" +#include "velox/common/base/GTestMacros.h" +#include "velox/common/base/StatsReporter.h" #include "velox/common/future/VeloxPromise.h" #include "velox/common/memory/Memory.h" +#include "velox/common/memory/MemoryArbitrator.h" namespace facebook::velox::memory { @@ -31,30 +35,148 @@ namespace facebook::velox::memory { /// aborting a query. For Prestissimo-on-Spark, we can configure it to /// reclaim from a running query through techniques such as disk-spilling, /// partial aggregation or persistent shuffle data flushes. -class SharedArbitrator : public MemoryArbitrator { +class SharedArbitrator : public memory::MemoryArbitrator { public: - static void registerFactory(); - - static void unregisterFactory(); + struct ExtraConfig { + /// The memory capacity reserved to ensure each running query has minimal + /// capacity of 'memoryPoolReservedCapacity' to run. + static constexpr std::string_view kReservedCapacity{"reserved-capacity"}; + static constexpr std::string_view kDefaultReservedCapacity{"0B"}; + static int64_t getReservedCapacity( + const std::unordered_map& configs); + + /// The initial memory capacity to reserve for a newly created query memory + /// pool. + static constexpr std::string_view kMemoryPoolInitialCapacity{ + "memory-pool-initial-capacity"}; + static constexpr std::string_view kDefaultMemoryPoolInitialCapacity{ + "256MB"}; + static uint64_t getMemoryPoolInitialCapacity( + const std::unordered_map& configs); + + /// The minimal amount of memory capacity reserved for each query to run. + static constexpr std::string_view kMemoryPoolReservedCapacity{ + "memory-pool-reserved-capacity"}; + static constexpr std::string_view kDefaultMemoryPoolReservedCapacity{"0B"}; + static uint64_t getMemoryPoolReservedCapacity( + const std::unordered_map& configs); + + /// The minimal memory capacity to transfer out of or into a memory pool + /// during the memory arbitration. 
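The ExtraConfig getters above all follow the same resolve-then-parse shape: look up the key, fall back to the documented default, and convert a human-readable capacity string into bytes. A hedged sketch of that shape; parseBytes() and getCapacityConfig() below are illustrative stand-ins, not Velox's config utilities:

```
#include <string>
#include <unordered_map>

// Illustrative capacity-string parser ("256MB" -> bytes).
uint64_t parseBytes(const std::string& s) {
  size_t pos = 0;
  const uint64_t value = std::stoull(s, &pos);
  const std::string unit = s.substr(pos);
  if (unit == "GB") return value << 30;
  if (unit == "MB") return value << 20;
  if (unit == "KB") return value << 10;
  return value; // "B" or no suffix
}

uint64_t getCapacityConfig(
    const std::unordered_map<std::string, std::string>& configs,
    const std::string& key,
    const std::string& defaultValue) {
  const auto it = configs.find(key);
  return parseBytes(it == configs.end() ? defaultValue : it->second);
}

// Usage: getCapacityConfig(configs, "memory-pool-initial-capacity", "256MB");
```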
+ static constexpr std::string_view kMemoryPoolTransferCapacity{ + "memory-pool-transfer-capacity"}; + static constexpr std::string_view kDefaultMemoryPoolTransferCapacity{ + "128MB"}; + static uint64_t getMemoryPoolTransferCapacity( + const std::unordered_map& configs); + + /// Specifies the max time to wait for memory reclaim by arbitration. The + /// memory reclaim might fail if the max time has exceeded. This prevents + /// the memory arbitration from getting stuck when the memory reclaim waits + /// for a hanging query task to pause. If it is zero, then there is no + /// timeout. + static constexpr std::string_view kMemoryReclaimMaxWaitTime{ + "memory-reclaim-max-wait-time"}; + static constexpr std::string_view kDefaultMemoryReclaimMaxWaitTime{"0ms"}; + static uint64_t getMemoryReclaimMaxWaitTimeMs( + const std::unordered_map& configs); + + /// When shrinking capacity, the shrink bytes will be adjusted in a way such + /// that AFTER shrink, the stricter (whichever is smaller) of the following + /// conditions is met, in order to better fit the pool's current memory + /// usage: + /// - Free capacity is greater or equal to capacity * + /// 'memoryPoolMinFreeCapacityPct' + /// - Free capacity is greater or equal to 'memoryPoolMinFreeCapacity' + /// + /// NOTE: In the conditions when original requested shrink bytes ends up + /// with more free capacity than above 2 conditions, the adjusted shrink + /// bytes is not respected. + /// + /// NOTE: Capacity shrink adjustment is enabled when both + /// 'memoryPoolMinFreeCapacityPct' and 'memoryPoolMinFreeCapacity' are set. + static constexpr std::string_view kMemoryPoolMinFreeCapacity{ + "memory-pool-min-free-capacity"}; + static constexpr std::string_view kDefaultMemoryPoolMinFreeCapacity{ + "128MB"}; + static uint64_t getMemoryPoolMinFreeCapacity( + const std::unordered_map& configs); + + static constexpr std::string_view kMemoryPoolMinFreeCapacityPct{ + "memory-pool-min-free-capacity-pct"}; + static constexpr double kDefaultMemoryPoolMinFreeCapacityPct{0.25}; + static double getMemoryPoolMinFreeCapacityPct( + const std::unordered_map& configs); + + /// If true, it allows memory arbitrator to reclaim used memory cross query + /// memory pools. + static constexpr std::string_view kGlobalArbitrationEnabled{ + "global-arbitration-enabled"}; + static constexpr bool kDefaultGlobalArbitrationEnabled{false}; + static bool getGlobalArbitrationEnabled( + const std::unordered_map& configs); + + /// When growing capacity, the growth bytes will be adjusted in the + /// following way: + /// - If 2 * current capacity is less than or equal to + /// 'fastExponentialGrowthCapacityLimit', grow through fast path by at + /// least doubling the current capacity, when conditions allow (see below + /// NOTE section). + /// - If 2 * current capacity is greater than + /// 'fastExponentialGrowthCapacityLimit', grow through slow path by + /// growing capacity by at least 'slowCapacityGrowPct' * current capacity + /// if allowed (see below NOTE section). + /// + /// NOTE: If original requested growth bytes is larger than the adjusted + /// growth bytes or adjusted growth bytes reaches max capacity limit, the + /// adjusted growth bytes will not be respected. + /// + /// NOTE: Capacity growth adjust is only enabled if both + /// 'fastExponentialGrowthCapacityLimit' and 'slowCapacityGrowPct' are set, + /// otherwise it is disabled. 
+ static constexpr std::string_view kFastExponentialGrowthCapacityLimit{ + "fast-exponential-growth-capacity-limit"}; + static constexpr std::string_view + kDefaultFastExponentialGrowthCapacityLimit{"512MB"}; + static uint64_t getFastExponentialGrowthCapacityLimitBytes( + const std::unordered_map& configs); + + static constexpr std::string_view kSlowCapacityGrowPct{ + "slow-capacity-grow-pct"}; + static constexpr double kDefaultSlowCapacityGrowPct{0.25}; + static double getSlowCapacityGrowPct( + const std::unordered_map& configs); + + /// If true, do sanity check on the arbitrator state on destruction. + /// + /// TODO: deprecate this flag after all the existing memory leak use cases + /// have been fixed. + static constexpr std::string_view kCheckUsageLeak{"check-usage-leak"}; + static constexpr bool kDefaultCheckUsageLeak{true}; + static bool getCheckUsageLeak( + const std::unordered_map& configs); + }; explicit SharedArbitrator(const Config& config); ~SharedArbitrator() override; - void reserveMemory(MemoryPool* pool, uint64_t /*unused*/) final; + static void registerFactory(); - void releaseMemory(MemoryPool* pool) final; + static void unregisterFactory(); - bool growMemory( - MemoryPool* pool, - const std::vector>& candidatePools, - uint64_t targetBytes) final; + void addPool(const std::shared_ptr& pool) final; - uint64_t shrinkMemory( - const std::vector>& /*unused*/, - uint64_t /*unused*/) override final { - VELOX_NYI("shrinkMemory is not supported by SharedArbitrator"); - } + void removePool(MemoryPool* pool) final; + + bool growCapacity(MemoryPool* pool, uint64_t requestBytes) final; + + uint64_t shrinkCapacity(MemoryPool* pool, uint64_t requestBytes = 0) final; + + uint64_t shrinkCapacity( + uint64_t requestBytes, + bool allowSpill = true, + bool force = false) override final; Stats stats() const final; @@ -62,12 +184,37 @@ class SharedArbitrator : public MemoryArbitrator { std::string toString() const final; - // The candidate memory pool stats used by arbitration. + /// Returns 'freeCapacity' back to the arbitrator for testing. + void testingFreeCapacity(uint64_t freeCapacity); + + uint64_t testingNumRequests() const; + + /// Enables/disables global arbitration accordingly. + void testingSetGlobalArbitration(bool enableGlobalArbitration) { + *const_cast(&globalArbitrationEnabled_) = enableGlobalArbitration; + } + + /// Operator level runtime stats that are reported during a shared arbitration + /// attempt. + static inline const std::string kMemoryArbitrationWallNanos{ + "memoryArbitrationWallNanos"}; + static inline const std::string kGlobalArbitrationCount{ + "globalArbitrationCount"}; + static inline const std::string kLocalArbitrationCount{ + "localArbitrationCount"}; + static inline const std::string kLocalArbitrationQueueWallNanos{ + "localArbitrationQueueWallNanos"}; + static inline const std::string kLocalArbitrationLockWaitWallNanos{ + "localArbitrationLockWaitWallNanos"}; + static inline const std::string kGlobalArbitrationLockWaitWallNanos{ + "globalArbitrationLockWaitWallNanos"}; + + /// The candidate memory pool stats used by arbitration. struct Candidate { - bool reclaimable{false}; - uint64_t reclaimableBytes{0}; - uint64_t freeBytes{0}; - MemoryPool* pool; + std::shared_ptr pool; + int64_t reclaimableBytes{0}; + int64_t freeBytes{0}; + int64_t reservedBytes{0}; std::string toString() const; }; @@ -76,130 +223,339 @@ class SharedArbitrator : public MemoryArbitrator { // The kind string of shared arbitrator. 
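A design note on the Candidate struct above: the arbitrator's registry (see addPool()) holds weak_ptrs so it never extends a pool's lifetime, while each arbitration pins its candidates with shared_ptrs for the duration of the run, so a pool cannot be destroyed mid-arbitration. The pattern in miniature:

```
#include <memory>
#include <unordered_map>
#include <vector>

struct Pool {};

// The registry never keeps a pool alive on its own.
std::unordered_map<Pool*, std::weak_ptr<Pool>> registry;

// A snapshot taken for one arbitration run does keep its candidates alive.
std::vector<std::shared_ptr<Pool>> snapshotCandidates() {
  std::vector<std::shared_ptr<Pool>> out;
  out.reserve(registry.size());
  for (const auto& [raw, weak] : registry) {
    if (auto pinned = weak.lock()) { // skip pools already destroyed
      out.push_back(std::move(pinned));
    }
  }
  return out;
}
```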
inline static const std::string kind_{"SHARED"}; + // Contains the execution state of an arbitration operation. + struct ArbitrationOperation { + MemoryPool* const requestPool; + const uint64_t requestBytes; + + // The adjusted grow bytes based on 'requestBytes'. This 'targetBytes' is a + // best effort target, and hence will not be guaranteed. The adjustment is + // based on 'SharedArbitrator::fastExponentialGrowthCapacityLimit_' + // 'SharedArbitrator::slowCapacityGrowPct_' and + // 'MemoryArbitrator::memoryPoolTransferCapacity_'. + // + // TODO: deprecate 'MemoryArbitrator::memoryPoolTransferCapacity_' once + // exponential growth works well in production. + const std::optional targetBytes; + + // The start time of this arbitration operation. + const std::chrono::steady_clock::time_point startTime; + + // The candidate memory pools. + std::vector candidates; + + // The time that waits in local arbitration queue. + uint64_t localArbitrationQueueTimeUs{0}; + + // The time that waits to acquire the local arbitration lock. + uint64_t localArbitrationLockWaitTimeUs{0}; + + // The time that waits to acquire the global arbitration lock. + uint64_t globalArbitrationLockWaitTimeUs{0}; + + explicit ArbitrationOperation(uint64_t requestBytes) + : ArbitrationOperation(nullptr, requestBytes, std::nullopt) {} + + ArbitrationOperation( + MemoryPool* _requestor, + uint64_t _requestBytes, + std::optional _targetBytes) + : requestPool(_requestor), + requestBytes(_requestBytes), + targetBytes(_targetBytes), + startTime(std::chrono::steady_clock::now()) { + VELOX_CHECK(requestPool == nullptr || requestPool->isRoot()); + } + + uint64_t waitTimeUs() const { + return localArbitrationQueueTimeUs + localArbitrationLockWaitTimeUs + + globalArbitrationLockWaitTimeUs; + } + }; + + // Used to start and finish an arbitration operation initiated from a memory + // pool or memory capacity shrink request sent through shrinkPools() API. class ScopedArbitration { public: - ScopedArbitration(MemoryPool* requestor, SharedArbitrator* arbitrator); + ScopedArbitration(SharedArbitrator* arbitrator, ArbitrationOperation* op); ~ScopedArbitration(); private: - MemoryPool* const requestor_; + ArbitrationOperation* const operation_; SharedArbitrator* const arbitrator_; - const std::chrono::steady_clock::time_point startTime_; const ScopedMemoryArbitrationContext arbitrationCtx_; + const std::chrono::steady_clock::time_point startTime_; + }; + + // The arbitration running queue for arbitration requests from the same query + // pool. + struct ArbitrationQueue { + // Points to the current running arbitration. + ArbitrationOperation* current; + + // The promises of the arbitration requests from the same query pool waiting + // for the serial execution. + std::vector waitPromises; + + explicit ArbitrationQueue(ArbitrationOperation* op) : current(op) { + VELOX_CHECK_NOT_NULL(current); + } }; // Invoked to check if the memory growth will exceed the memory pool's max // capacity limit or the arbitrator's node capacity limit. - bool checkCapacityGrowth(const MemoryPool& pool, uint64_t targetBytes) const; + bool checkCapacityGrowth(ArbitrationOperation* op) const; + + // Invoked to ensure the memory growth request won't exceed the request memory + // pool's max capacity as well as the arbitrator's node capacity. 
If it does,
+  // then we first need to reclaim the used memory from the request memory pool
+  // itself to ensure the memory growth won't exceed the capacity limit, and
+  // then proceed with the memory arbitration process across queries.
+  bool ensureCapacity(ArbitrationOperation* op);
+
+  // Invoked to reclaim the memory from the other query memory pools to grow
+  // the request memory pool's capacity.
+  bool arbitrateMemory(ArbitrationOperation* op);
-  // Invoked to ensure the memory growth request won't exceed the requestor's
-  // max capacity as well as the arbitrator's node capacity. If it does, then we
-  // first need to reclaim the used memory from the requestor itself to ensure
-  // the memory growth won't exceed the capacity limit, and then proceed with
-  // the memory arbitration process. The reclaimed memory capacity returns to
-  // the arbitrator, and let the memory arbitration process to grow the
-  // requestor capacity accordingly.
-  bool ensureCapacity(MemoryPool* requestor, uint64_t targetBytes);
+  // Invoked to start the next memory arbitration request, and it will wait
+  // for the serialized execution if there are running or other waiting
+  // arbitration requests.
+  void startArbitration(ArbitrationOperation* op);
-  // Invoked to capture the candidate memory pools stats for arbitration.
-  static std::vector<Candidate> getCandidateStats(
-      const std::vector<std::shared_ptr<MemoryPool>>& pools);
+  // Invoked by a finished memory arbitration request to kick off the next
+  // arbitration request execution if there are any waiting.
+  void finishArbitration(ArbitrationOperation* op);
+
+  // Invoked to run local arbitration on the request memory pool. It first
+  // ensures the memory growth is within both memory pool and arbitrator
+  // capacity limits. This step might reclaim the used memory from the request
+  // memory pool itself. Then it tries to obtain free capacity from the
+  // arbitrator. At last, it tries to reclaim free memory from itself before
+  // it falls back to the global arbitration. The local arbitration run is
+  // protected by the shared lock of 'arbitrationLock_' and can run in
+  // parallel for different query pools. The free memory reclamation is
+  // protected by the arbitrator 'stateLock_' which is an in-memory fast
+  // operation. The function returns false on failure. Otherwise, it needs to
+  // further check if 'needGlobalArbitration' is true or not. If true, needs
+  // to proceed with the global arbitration run.
+  bool runLocalArbitration(
+      ArbitrationOperation* op,
+      bool& needGlobalArbitration);
+
+  // Invoked to run global arbitration to reclaim free or used memory from the
+  // other queries. The global arbitration run is protected by the exclusive
+  // lock of 'arbitrationLock_' for serial execution mode. The function returns
+  // true on success, false on failure.
+  bool runGlobalArbitration(ArbitrationOperation* op);
+
+  // Gets the min/max memory capacity growth targets for 'op'. The min and max
+  // targets are calculated based on 'memoryPoolReservedCapacity_' requirements
+  // and the pool's max capacity.
+  void getGrowTargets(
+      ArbitrationOperation* op,
+      uint64_t& maxGrowTarget,
+      uint64_t& minGrowTarget);
+
+  // Invoked to get or refresh the candidate memory pools for arbitration. If
+  // 'freeCapacityOnly' is true, then we only get free capacity stats for each
+  // candidate memory pool.
+  void getCandidates(ArbitrationOperation* op, bool freeCapacityOnly = false);
+
+  // Sorts 'candidates' based on reclaimable free capacity in descending order.
+ static void sortCandidatesByReclaimableFreeCapacity( + std::vector& candidates); - void sortCandidatesByReclaimableMemory( - std::vector& candidates) const; + // Sorts 'candidates' based on reclaimable used capacity in descending order. + static void sortCandidatesByReclaimableUsedCapacity( + std::vector& candidates); - void sortCandidatesByFreeCapacity(std::vector& candidates) const; + // Sorts 'candidates' based on actual used memory in descending order. + static void sortCandidatesByUsage(std::vector& candidates); // Finds the candidate with the largest capacity. For 'requestor', the // capacity for comparison including its current capacity and the capacity to // grow. - const Candidate& findCandidateWithLargestCapacity( + static const SharedArbitrator::Candidate& findCandidateWithLargestCapacity( MemoryPool* requestor, uint64_t targetBytes, - const std::vector& candidates) const; - - bool arbitrateMemory( - MemoryPool* requestor, - std::vector& candidates, - uint64_t targetBytes); + const std::vector& candidates); - // Invoked to start next memory arbitration request, and it will wait for the - // serialized execution if there is a running or other waiting arbitration - // requests. - void startArbitration(MemoryPool* requestor); - - // Invoked by a finished memory arbitration request to kick off the next - // arbitration request execution if there are any ones waiting. - void finishArbitration(); - - // Invoked to reclaim free memory capacity from 'candidates' without actually - // freeing used memory. + // Invoked to reclaim free memory capacity from 'candidates' without + // actually freeing used memory. // // NOTE: the function might sort 'candidates' based on each candidate's free // capacity internally. uint64_t reclaimFreeMemoryFromCandidates( - std::vector& candidates, - uint64_t targetBytes); + ArbitrationOperation* op, + uint64_t reclaimTargetBytes, + bool isLocalArbitration); - // Invoked to reclaim used memory capacity from 'candidates'. + // Invoked to reclaim used memory capacity from 'candidates' by spilling. // // NOTE: the function might sort 'candidates' based on each candidate's // reclaimable memory internally. - uint64_t reclaimUsedMemoryFromCandidates( - MemoryPool* requestor, - std::vector& candidates, - uint64_t targetBytes); - - // Invoked to reclaim used memory from 'pool' with specified 'targetBytes'. - // The function returns the actually freed capacity. - uint64_t reclaim(MemoryPool* pool, uint64_t targetBytes) noexcept; + void reclaimUsedMemoryFromCandidatesBySpill( + ArbitrationOperation* op, + uint64_t& freedBytes); + + // Invoked to reclaim used memory capacity from 'candidates' by aborting the + // top memory users' queries. + void reclaimUsedMemoryFromCandidatesByAbort( + ArbitrationOperation* op, + uint64_t& freedBytes); + + // Checks if request pool has been aborted or not. + void checkIfAborted(ArbitrationOperation* op); + + // Checks if the request pool already has enough free capacity for the growth. + // This could happen if there are multiple arbitration operations from the + // same query. When the first served operation succeeds, it might have + // reserved enough capacity for the followup operations. + bool maybeGrowFromSelf(ArbitrationOperation* op); + + // Invoked to grow 'pool' capacity by 'growBytes' and commit used reservation + // by 'reservationBytes'. The function throws if the growth fails. 
+ void + checkedGrow(MemoryPool* pool, uint64_t growBytes, uint64_t reservationBytes); + + // Invoked to reclaim used memory from 'targetPool' with specified + // 'targetBytes'. The function returns the actually freed capacity. + // 'isLocalArbitration' is true when the reclaim attempt is within a local + // arbitration. + uint64_t reclaim( + MemoryPool* targetPool, + uint64_t targetBytes, + bool isLocalArbitration) noexcept; // Invoked to abort memory 'pool'. void abort(MemoryPool* pool, const std::exception_ptr& error); // Invoked to handle the memory arbitration failure to abort the memory pool // with the largest capacity to free up memory. The function returns true on - // success and false if the requestor itself has been selected as the victim. - // We don't abort the requestor itself but just fails the arbitration to let - // the user decide to either proceed with the query or fail it. - bool handleOOM( - MemoryPool* requestor, - uint64_t targetBytes, - std::vector& candidates); - - // Decrement free capacity from the arbitrator with up to 'bytes'. The - // arbitrator might have less free available capacity. The function returns - // the actual decremented free capacity bytes. - uint64_t decrementFreeCapacity(uint64_t bytes); - uint64_t decrementFreeCapacityLocked(uint64_t bytes); + // success and false if the requestor itself has been selected as the + // victim. We don't abort the requestor itself but just fails the + // arbitration to let the user decide to either proceed with the query or + // fail it. + bool handleOOM(ArbitrationOperation* op); + + // Decrements free capacity from the arbitrator with up to + // 'maxBytesToReserve'. The arbitrator might have less free available + // capacity. The function returns the actual decremented free capacity + // bytes. If 'minBytesToReserve' is not zero and there is less than + // 'minBytes' available in non-reserved capacity, then the arbitrator tries + // to decrement up to 'minBytes' from the reserved capacity. + uint64_t decrementFreeCapacity( + uint64_t maxBytesToReserve, + uint64_t minBytesToReserve); + uint64_t decrementFreeCapacityLocked( + uint64_t maxBytesToReserve, + uint64_t minBytesToReserve); // Increment free capacity by 'bytes'. void incrementFreeCapacity(uint64_t bytes); void incrementFreeCapacityLocked(uint64_t bytes); + // Increments the free reserved capacity up to 'bytes' until reaches to the + // reserved capacity limit. 'bytes' is updated accordingly. + void incrementFreeReservedCapacityLocked(uint64_t& bytes); + + void incrementGlobalArbitrationCount(); + void incrementLocalArbitrationCount(); std::string toStringLocked() const; Stats statsLocked() const; - mutable std::mutex mutex_; - uint64_t freeCapacity_{0}; - // Indicates if there is a running arbitration request or not. - bool running_{false}; - - // The promises of the arbitration requests waiting for the serialized - // execution. - std::vector waitPromises_; - - tsan_atomic numRequests_{0}; - std::atomic numSucceeded_{0}; + // Returns the max reclaimable capacity from 'pool' which includes both used + // and free capacities. If 'isSelfReclaim' true, we reclaim memory from the + // request pool itself so that we can bypass the reserved free capacity + // reclaim restriction. + int64_t maxReclaimableCapacity(const MemoryPool& pool, bool isSelfReclaim) + const; + + // Returns the free memory capacity that can be reclaimed from 'pool' by + // shrink. 
+ + // Returns the free memory capacity that can be reclaimed from 'pool' by + // shrink. If 'isSelfReclaim' is true, we reclaim memory from the request pool + // itself so that we can bypass the reserved free capacity reclaim + // restriction. + int64_t reclaimableFreeCapacity(const MemoryPool& pool, bool isSelfReclaim) + const; + + // Returns the used memory capacity that can be reclaimed from 'pool' by + // disk spill. If 'isSelfReclaim' is true, we reclaim memory from the request + // pool itself so that we can bypass the reserved free capacity reclaim + // restriction. + int64_t reclaimableUsedCapacity(const MemoryPool& pool, bool isSelfReclaim) + const; + + // Returns the minimal amount of memory capacity to grow for 'pool' to have + // the reserved capacity as specified by 'memoryPoolReservedCapacity_'. + int64_t minGrowCapacity(const MemoryPool& pool) const; + + // The capacity growth target is set to have a coarser granularity. It can + // help to reduce the number of future grow calls, and hence reduce the + // number of unnecessary memory arbitration requests. + uint64_t getCapacityGrowthTarget( + const MemoryPool& pool, + uint64_t requestBytes) const; + + // The capacity shrink target is adjusted from the requested shrink bytes to give + // the memory pool more headroom free capacity after shrink. It can help to + // reduce the number of future grow calls, and hence reduce the number of + // unnecessary memory arbitration requests. + uint64_t getCapacityShrinkTarget( + const MemoryPool& pool, + uint64_t requestBytes) const; + + // Returns true if 'pool' is under memory arbitration. + bool isUnderArbitrationLocked(MemoryPool* pool) const; + + void updateArbitrationRequestStats(); + + void updateArbitrationFailureStats(); + + const uint64_t reservedCapacity_; + const uint64_t memoryPoolInitialCapacity_; + const uint64_t memoryPoolReservedCapacity_; + const uint64_t memoryPoolTransferCapacity_; + const uint64_t memoryReclaimWaitMs_; + const bool globalArbitrationEnabled_; + const bool checkUsageLeak_; + + const uint64_t fastExponentialGrowthCapacityLimit_; + const double slowCapacityGrowPct_; + const uint64_t memoryPoolMinFreeCapacity_; + const double memoryPoolMinFreeCapacityPct_; + + mutable folly::SharedMutex poolLock_; + std::unordered_map> candidates_; + + // Lock used to protect the arbitrator state. + mutable std::mutex stateLock_; + tsan_atomic freeReservedCapacity_{0}; + tsan_atomic freeNonReservedCapacity_{0}; + + // Contains the arbitration running queues, one per query memory + // pool. + std::unordered_map> + arbitrationQueues_; + + // R/W lock used to control local and global arbitration runs. A local + // arbitration run needs to hold a shared lock while a global run needs to hold + // an exclusive lock. Hence, multiple local arbitration runs from different + // query memory pools can run in parallel, but global ones have to run + // one at a time. + mutable std::shared_mutex arbitrationLock_;
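The reader/writer discipline described for 'arbitrationLock_' maps directly onto std::shared_mutex: local arbitration runs take the lock in shared mode and can proceed concurrently, while a global run takes it exclusively. A minimal standalone sketch of that locking scheme (the run bodies are placeholders):

```
#include <mutex>
#include <shared_mutex>

std::shared_mutex arbitrationLockSketch;

// Many local runs (one per query memory pool) may hold the lock in shared
// mode at the same time.
void runLocalArbitration() {
  std::shared_lock<std::shared_mutex> guard(arbitrationLockSketch);
  // ... reclaim within the requestor's own query memory pool ...
}

// A global run takes the lock exclusively: it waits for in-flight local
// runs to drain and blocks new ones while reclaiming across pools.
void runGlobalArbitration() {
  std::unique_lock<std::shared_mutex> guard(arbitrationLockSketch);
  // ... reclaim across all candidate memory pools ...
}
```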
+ + std::atomic_uint64_t numRequests_{0}; + std::atomic_uint32_t numPending_{0}; tsan_atomic numAborted_{0}; - tsan_atomic numFailures_{0}; - tsan_atomic queueTimeUs_{0}; + std::atomic_uint64_t numFailures_{0}; + std::atomic_uint64_t waitTimeUs_{0}; tsan_atomic arbitrationTimeUs_{0}; - tsan_atomic numShrunkBytes_{0}; - tsan_atomic numReclaimedBytes_{0}; + tsan_atomic reclaimedFreeBytes_{0}; + tsan_atomic reclaimedUsedBytes_{0}; tsan_atomic reclaimTimeUs_{0}; tsan_atomic numNonReclaimableAttempts_{0}; + tsan_atomic numShrinks_{0}; }; } // namespace facebook::velox::memory diff --git a/velox/common/memory/StreamArena.cpp b/velox/common/memory/StreamArena.cpp index 14e5dbc37c11a..1153afbb8cf69 100644 --- a/velox/common/memory/StreamArena.cpp +++ b/velox/common/memory/StreamArena.cpp @@ -20,7 +20,10 @@ namespace facebook::velox { StreamArena::StreamArena(memory::MemoryPool* pool) : pool_(pool) {} -void StreamArena::newRange(int32_t bytes, ByteRange* range) { +void StreamArena::newRange( + int32_t bytes, + ByteRange* /*lastRange*/, + ByteRange* range) { VELOX_CHECK_GT(bytes, 0, "StreamArena::newRange can't be zero length"); const memory::MachinePageCount numPages = memory::AllocationTraits::numPages(bytes); @@ -62,7 +65,10 @@ void StreamArena::newRange(int32_t bytes, ByteRange* range) { } } -void StreamArena::newTinyRange(int32_t bytes, ByteRange* range) { +void StreamArena::newTinyRange( + int32_t bytes, + ByteRange* /*lastRange*/, + ByteRange* range) { VELOX_CHECK_GT(bytes, 0, "StreamArena::newTinyRange can't be zero length"); tinyRanges_.emplace_back(); tinyRanges_.back().resize(bytes); @@ -70,4 +76,14 @@ void StreamArena::newTinyRange(int32_t bytes, ByteRange* range) { range->buffer = reinterpret_cast(tinyRanges_.back().data()); range->size = bytes; } +void StreamArena::clear() { + allocations_.clear(); + pool_->freeNonContiguous(allocation_); + currentRun_ = 0; + currentOffset_ = 0; + largeAllocations_.clear(); + size_ = 0; + tinyRanges_.clear(); +} + } // namespace facebook::velox
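The new StreamArena::clear() exists so an arena (and the stream on top of it) can be recycled between serialization rounds rather than reallocated; the reuse test later in this diff exercises exactly this pattern. A condensed sketch, with the pool, round count, and payload as placeholders:

```
#include <string>
#include <string_view>
#include "velox/common/memory/ByteStream.h"
#include "velox/common/memory/StreamArena.h"

namespace fv = facebook::velox;

// Recycle one arena/stream pair across serialization rounds; mirrors the
// ByteStreamTest::reuse pattern in this diff. 'pool' is a leaf MemoryPool*.
void serializeRounds(fv::memory::MemoryPool* pool) {
  constexpr int kNumRounds = 10; // placeholder
  fv::StreamArena arena(pool);
  fv::ByteOutputStream stream(&arena);
  const std::string payload(10'000, 'x'); // placeholder payload
  for (int round = 0; round < kNumRounds; ++round) {
    arena.clear();          // drop prior ranges; back to post-construction state
    stream.startWrite(128); // begin a fresh write on the recycled arena
    stream.appendStringView(std::string_view(payload));
  }
}
```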
diff --git a/velox/common/memory/StreamArena.h b/velox/common/memory/StreamArena.h index a9e9f71e1954a..d46f00c436f4c 100644 --- a/velox/common/memory/StreamArena.h +++ b/velox/common/memory/StreamArena.h @@ -30,16 +30,28 @@ class StreamArena { virtual ~StreamArena() = default; - /// Sets range to the request 'bytes' of writable memory owned by 'this'. - /// We allocate non-contiguous memory to store range bytes if requested - /// 'bytes' is equal or less than the largest class page size. Otherwise, we - /// allocate from contiguous memory. - virtual void newRange(int32_t bytes, ByteRange* range); + /// Sets range to the request 'bytes' of writable memory owned by + /// 'this'. We allocate non-contiguous memory to store range bytes + /// if requested 'bytes' is equal or less than the largest class + /// page size. Otherwise, we allocate from contiguous + /// memory. 'range' is set to point to the allocated memory. If + /// 'lastRange' is non-nullptr, it is the last range of the stream + /// to which we are adding the new range. 'lastRange' is nullptr if + /// adding the first range to a stream. The memory stays owned by + /// 'this' in all cases. Used by HashStringAllocator when extending + /// a multipart entry. The previously last part has its last 8 bytes + /// moved to the next part and gets a pointer to the next part as + /// its last 8 bytes. When extending, we need to update the entry so + /// that the next pointer is not seen when reading the content and + /// is also not counted in the payload size of the multipart entry. + virtual void newRange(int32_t bytes, ByteRange* lastRange, ByteRange* range); /// sets 'range' to point to a small piece of memory owned by this. These /// always come from the heap. The use case is for headers that may change - /// length based on data properties, not for bulk data. + /// length based on data properties, not for bulk data. See 'newRange' for the + /// meaning of 'lastRange'. - virtual void newTinyRange(int32_t bytes, ByteRange* range); + virtual void + newTinyRange(int32_t bytes, ByteRange* lastRange, ByteRange* range); /// Returns the Total size in bytes held by all Allocations. virtual size_t size() const { @@ -50,6 +62,10 @@ class StreamArena { return pool_; } + /// Restores 'this' to post-construction state. Used in recycling streams for + /// serializers. + virtual void clear(); + private: memory::MemoryPool* const pool_; const memory::MachinePageCount allocationQuantum_{2};
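The 'lastRange' parameter carries the multipart fixup described in the newRange comment: a caller extending a stream passes its current last range so the previous part's trailing 8 bytes can be relocated and replaced by a continuation pointer. A schematic caller under assumed usage (the base StreamArena ignores 'lastRange', as the commented-out parameter above shows; the HashStringAllocator override is the consumer that uses it):

```
#include <vector>
#include "velox/common/memory/StreamArena.h"

namespace fv = facebook::velox;

// Schematic caller, not the HashStringAllocator implementation: grow a
// multipart stream by one range. 'ranges' is the list of parts so far.
void growStream(
    fv::StreamArena& arena,
    std::vector<fv::ByteRange>& ranges,
    int32_t bytes) {
  // Pass the current last range (nullptr for the very first range) so an
  // overriding arena can relocate the previous part's trailing 8 bytes into
  // the new part and store the continuation pointer in their place.
  fv::ByteRange* lastRange = ranges.empty() ? nullptr : &ranges.back();
  fv::ByteRange newRange;
  arena.newRange(bytes, lastRange, &newRange);
  ranges.push_back(newRange);
}
```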
diff --git a/velox/common/memory/tests/AllocationPoolTest.cpp b/velox/common/memory/tests/AllocationPoolTest.cpp index cd116d9de08a2..9713302744e82 100644 --- a/velox/common/memory/tests/AllocationPoolTest.cpp +++ b/velox/common/memory/tests/AllocationPoolTest.cpp @@ -14,25 +14,27 @@ * limitations under the License. */ #include "velox/common/memory/AllocationPool.h" +#include "velox/common/base/tests/GTestUtils.h" #include "velox/common/memory/MallocAllocator.h" #include "velox/common/memory/Memory.h" +#include "velox/common/testutil/TestValue.h" #include #include using namespace facebook::velox; +using namespace facebook::velox::common::testutil; class AllocationPoolTest : public testing::Test { protected: void SetUp() override { - allocator_ = std::make_shared(8L << 30); - manager_ = - std::make_shared(memory::MemoryManagerOptions{ - .capacity = (int64_t)allocator_->capacity(), - .allocator = allocator_.get()}); + manager_ = std::make_shared( + memory::MemoryManagerOptions{.allocatorCapacity = 8L << 30}); root_ = manager_->addRootPool("allocationPoolTestRoot"); pool_ = root_->addLeafChild("leaf"); + + TestValue::enable(); } // Writes a byte at pointer so we see RSS change. @@ -40,7 +42,6 @@ class AllocationPoolTest : public testing::Test { *reinterpret_cast(ptr) = 1; } - std::shared_ptr allocator_; std::shared_ptr manager_; std::shared_ptr root_; std::shared_ptr pool_; @@ -58,17 +59,17 @@ TEST_F(AllocationPoolTest, hugePages) { EXPECT_EQ(1, allocationPool->numRanges()); EXPECT_EQ(allocationPool->testingFreeAddressableBytes(), 64 << 10); allocationPool->newRun(64 << 10); - EXPECT_LE(128 << 10, pool_->currentBytes()); + EXPECT_LE(128 << 10, pool_->usedBytes()); allocationPool->allocateFixed(64 << 10); // Now at end of second 64K range, next will go to huge pages. setByte(allocationPool->allocateFixed(11)); EXPECT_LE((2 << 20) - 11, allocationPool->testingFreeAddressableBytes()); // The first 2MB of the hugepage run are marked reserved. - EXPECT_LE((2048 + 128) << 10, pool_->currentBytes()); + EXPECT_LE((2048 + 128) << 10, pool_->usedBytes()); // The next allocation starts reserves the next 2MB of the mmapped range. setByte(allocationPool->allocateFixed(2 << 20)); - EXPECT_LE((4096 + 128) << 10, pool_->currentBytes()); + EXPECT_LE((4096 + 128) << 10, pool_->usedBytes()); // Allocate the rest. allocationPool->allocateFixed( @@ -96,7 +97,7 @@ TEST_F(AllocationPoolTest, hugePages) { EXPECT_LE( (5UL << 30) + (31 << 20) + (128 << 10), allocationPool->allocatedBytes()); - EXPECT_LE((5UL << 30) + (31 << 20) + (128 << 10), pool_->currentBytes()); + EXPECT_LE((5UL << 30) + (31 << 20) + (128 << 10), pool_->usedBytes()); if (counter++ >= 1) { break; @@ -105,9 +106,51 @@ // Repeat the above after a clear(). allocationPool->clear(); // Should be empty after clear(). - EXPECT_EQ(0, pool_->currentBytes()); + EXPECT_EQ(0, pool_->usedBytes()); } allocationPool.reset(); // Should be empty after destruction. - EXPECT_EQ(0, pool_->currentBytes()); + EXPECT_EQ(0, pool_->usedBytes()); +} + +// This test relies on TestValue, so needs to be run in debug mode. +DEBUG_ONLY_TEST_F(AllocationPoolTest, oomCleanUp) { + // Test that when an OOM happens while growing an allocation in the + // AllocationPool, the AllocationPool is still in a valid state. + auto test = [&](int32_t alignment) { + auto allocationPool = std::make_unique(pool_.get()); + // Ensure we're beyond the huge page threshold. + allocationPool->setHugePageThreshold(32 << 10); + allocationPool->allocateFixed(32 << 10, alignment); + + // Allocate some memory so we have a large allocation. + allocationPool->allocateFixed(1 << 20, alignment); + + { + static const std::string kErrorMessage = "Simulate OOM for testing."; + // Trigger an OOM. + SCOPED_TESTVALUE_SET( + "facebook::velox::memory::MemoryPoolImpl::reserveThreadSafe", + std::function( + [&](memory::MemoryPool* /*unused*/) { + VELOX_FAIL(kErrorMessage); + })); + VELOX_ASSERT_THROW( + allocationPool->allocateFixed( + allocationPool->testingFreeAddressableBytes(), alignment), + kErrorMessage); + } + + // Ensure the last range in the pool is still consistent, e.g. + // currentOffset_ isn't pointing into unallocated memory. + ASSERT_EQ( + allocationPool->rangeAt(allocationPool->numRanges() - 1).size(), + 1 << 20); + }; + + // Test with an alignment of 1. + test(1); + // Test with an alignment of 2 (this goes through a different code path in + // allocateFixed).
+ test(2); } diff --git a/velox/common/memory/tests/AllocationTest.cpp b/velox/common/memory/tests/AllocationTest.cpp index f27618303fa1c..ea6d3e58115f9 100644 --- a/velox/common/memory/tests/AllocationTest.cpp +++ b/velox/common/memory/tests/AllocationTest.cpp @@ -85,4 +85,28 @@ TEST_F(AllocationTest, appendMove) { allocation.clear(); } +TEST_F(AllocationTest, maxPageRunLimit) { + Allocation allocation; + const uint64_t validBufAddrValue = 4096; + uint8_t* validBufAddr = reinterpret_cast(validBufAddrValue); + allocation.append(validBufAddr, Allocation::PageRun::kMaxPagesInRun); + ASSERT_EQ(allocation.numPages(), Allocation::PageRun::kMaxPagesInRun); + ASSERT_EQ(allocation.numRuns(), 1); + + const uint64_t invalidBufAddrValue = 4096 * 1024; + uint8_t* invalidBufAddr = reinterpret_cast(invalidBufAddrValue); + VELOX_ASSERT_THROW( + allocation.append( + invalidBufAddr, Allocation::PageRun::kMaxPagesInRun + 1), + "The number of pages to append 65536 exceeds the PageRun limit 65535"); + VELOX_ASSERT_THROW( + allocation.append( + invalidBufAddr, Allocation::PageRun::kMaxPagesInRun * 2), + "The number of pages to append 131070 exceeds the PageRun limit 65535"); + ASSERT_EQ(allocation.numPages(), Allocation::PageRun::kMaxPagesInRun); + ASSERT_EQ(allocation.numRuns(), 1); + allocation.clear(); +} + } // namespace facebook::velox::memory diff --git a/velox/common/memory/tests/ByteStreamTest.cpp b/velox/common/memory/tests/ByteStreamTest.cpp index 73f0bbb4d3c0b..ec9e43941b325 100644 --- a/velox/common/memory/tests/ByteStreamTest.cpp +++ b/velox/common/memory/tests/ByteStreamTest.cpp @@ -14,8 +14,12 @@ * limitations under the License. */ #include "velox/common/memory/ByteStream.h" -#include "velox/common/memory/MemoryAllocator.h" + +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/common/file/FileInputStream.h" +#include "velox/common/file/FileSystems.h" #include "velox/common/memory/MmapAllocator.h" +#include "velox/exec/tests/utils/TempDirectoryPath.h" #include #include @@ -27,29 +31,26 @@ class ByteStreamTest : public testing::Test { protected: void SetUp() override { constexpr uint64_t kMaxMappedMemory = 64 << 20; - MmapAllocator::Options options; - options.capacity = kMaxMappedMemory; - mmapAllocator_ = std::make_shared(options); - MemoryAllocator::setDefaultInstance(mmapAllocator_.get()); - memoryManager_ = std::make_unique(MemoryManagerOptions{ - .capacity = kMaxMappedMemory, - .allocator = MemoryAllocator::getInstance()}); + MemoryManagerOptions options; + options.useMmapAllocator = true; + options.allocatorCapacity = kMaxMappedMemory; + options.arbitratorCapacity = kMaxMappedMemory; + options.arbitratorReservedCapacity = 0; + memoryManager_ = std::make_unique(options); + mmapAllocator_ = static_cast(memoryManager_->allocator()); pool_ = memoryManager_->addLeafPool("ByteStreamTest"); rng_.seed(124); } - void TearDown() override { - MmapAllocator::testingDestroyInstance(); - MemoryAllocator::setDefaultInstance(nullptr); - } + void TearDown() override {} std::unique_ptr newArena() { return std::make_unique(pool_.get()); } folly::Random::DefaultGenerator rng_; - std::shared_ptr mmapAllocator_; std::unique_ptr memoryManager_; + MmapAllocator* mmapAllocator_; std::shared_ptr pool_; }; @@ -100,80 +101,6 @@ TEST_F(ByteStreamTest, outputStream) { EXPECT_EQ(0, mmapAllocator_->numAllocated()); } -TEST_F(ByteStreamTest, resetInput) { - uint8_t* const kFakeBuffer = reinterpret_cast(this); - std::vector byteRanges; - size_t totalBytes{0}; - size_t lastRangeEnd; - for
(int32_t i = 0; i < 32; ++i) { - byteRanges.push_back(ByteRange{kFakeBuffer, 4096 + i, 0}); - totalBytes += 4096 + i; - } - lastRangeEnd = byteRanges.back().size; - ByteStream byteStream; - ASSERT_EQ(byteStream.size(), 0); - ASSERT_EQ(byteStream.lastRangeEnd(), 0); - byteStream.resetInput(std::move(byteRanges)); - ASSERT_EQ(byteStream.size(), totalBytes); - ASSERT_EQ(byteStream.lastRangeEnd(), lastRangeEnd); -} - -TEST_F(ByteStreamTest, remainingSize) { - const int32_t kSize = 100; - const int32_t kBufferSize = 4096; - std::vector buffers; - std::vector byteRanges; - for (int32_t i = 0; i < kSize; i++) { - buffers.push_back(pool_->allocate(kBufferSize)); - byteRanges.push_back( - ByteRange{reinterpret_cast(buffers.back()), kBufferSize, 0}); - } - ByteStream byteStream; - byteStream.resetInput(std::move(byteRanges)); - const int32_t kReadBytes = 2048; - int32_t remainingSize = kSize * kBufferSize; - uint8_t* tempBuffer = reinterpret_cast(pool_->allocate(kReadBytes)); - while (byteStream.remainingSize() > 0) { - byteStream.readBytes(tempBuffer, kReadBytes); - remainingSize -= kReadBytes; - ASSERT_EQ(remainingSize, byteStream.remainingSize()); - } - ASSERT_EQ(0, byteStream.remainingSize()); - for (int32_t i = 0; i < kSize; i++) { - pool_->free(buffers[i], kBufferSize); - } - pool_->free(tempBuffer, kReadBytes); -} - -TEST_F(ByteStreamTest, toString) { - const int32_t kSize = 10; - const int32_t kBufferSize = 4096; - std::vector buffers; - std::vector byteRanges; - for (int32_t i = 0; i < kSize; i++) { - buffers.push_back(pool_->allocate(kBufferSize)); - byteRanges.push_back( - ByteRange{reinterpret_cast(buffers.back()), kBufferSize, 0}); - } - ByteStream byteStream; - byteStream.resetInput(std::move(byteRanges)); - const int32_t kReadBytes = 2048; - uint8_t* tempBuffer = reinterpret_cast(pool_->allocate(kReadBytes)); - for (int32_t i = 0; i < kSize / 2; i++) { - byteStream.readBytes(tempBuffer, kReadBytes); - } - std::string byteStreamStr = byteStream.toString(); - EXPECT_EQ( - byteStreamStr, - "ByteStream[lastRangeEnd 4096, 10 ranges " - "(position/size) [(4096/4096),(4096/4096),(2048/4096 current)," - "(0/4096),(0/4096),(0/4096),(0/4096),(0/4096),(0/4096),(0/4096)]]"); - for (int32_t i = 0; i < kSize; i++) { - pool_->free(buffers[i], kBufferSize); - } - pool_->free(tempBuffer, kReadBytes); -} - TEST_F(ByteStreamTest, newRangeAllocation) { const int kPageSize = AllocationTraits::kPageSize; struct { @@ -241,7 +168,7 @@ TEST_F(ByteStreamTest, newRangeAllocation) { const auto prevAllocCount = pool_->stats().numAllocs; auto arena = newArena(); - ByteStream byteStream(arena.get()); + ByteOutputStream byteStream(arena.get()); byteStream.startWrite(0); for (int i = 0; i < testData.newRangeSizes.size(); ++i) { const auto newRangeSize = testData.newRangeSizes[i]; @@ -249,8 +176,8 @@ TEST_F(ByteStreamTest, newRangeAllocation) { "iteration {} allocation size {}", i, succinctBytes(testData.newRangeSizes[i]))); - byteStream.appendStringPiece( - folly::StringPiece(std::string(newRangeSize, 'a'))); + std::string value(newRangeSize, 'a'); + byteStream.appendStringView(value); ASSERT_EQ(arena->size(), testData.expectedArenaAllocationSizes[i]); ASSERT_EQ( pool_->stats().numAllocs - prevAllocCount, @@ -265,9 +192,9 @@ TEST_F(ByteStreamTest, newRangeAllocation) { TEST_F(ByteStreamTest, randomRangeAllocationFromMultiStreamsTest) { auto arena = newArena(); const int numByteStreams = 10; - std::vector> byteStreams; + std::vector> byteStreams; for (int i = 0; i < numByteStreams; ++i) { - 
byteStreams.push_back(std::make_unique(arena.get())); + byteStreams.push_back(std::make_unique(arena.get())); byteStreams.back()->startWrite(0); } const int testIterations = 1000; @@ -284,9 +211,365 @@ } break; case 2: { const int size = folly::Random::rand32(rng_) % 8192 + 1; - byteStream->appendStringPiece( - folly::StringPiece(std::string(size, 'a'))); + const std::string value(size, 'a'); + byteStream->appendStringView(value); } break; } } } + +TEST_F(ByteStreamTest, bits) { + std::vector bits; + uint64_t seed = 0x12345689abcdefLLU; + for (auto i = 0; i < 1000; ++i) { + bits.push_back(seed * (i + 1)); + } + auto arena = newArena(); + ByteOutputStream bitStream(arena.get(), true); + bitStream.startWrite(11); + int32_t offset = 0; + // Odd number of sizes. + std::vector bitSizes = {1, 19, 52, 58, 129}; + int32_t counter = 0; + auto totalBits = bits.size() * 64; + while (offset < totalBits) { + // Every second append uses the fast path for aligned source and append only. + auto numBits = std::min( + totalBits - offset, bitSizes[counter % bitSizes.size()]); + if (counter % 2 == 0) { + bitStream.appendBits(bits.data(), offset, offset + numBits); + } else { + uint64_t aligned[10]; + bits::copyBits(bits.data(), offset, aligned, 0, numBits); + bitStream.appendBitsFresh(aligned, 0, numBits); + } + offset += numBits; + ++counter; + } + std::stringstream stringStream; + OStreamOutputStream out(&stringStream); + bitStream.flush(&out); + EXPECT_EQ( + 0, + memcmp( + stringStream.str().data(), + bits.data(), + bits.size() * sizeof(bits[0]))); +} + +TEST_F(ByteStreamTest, appendWindow) { + // A little over 1MB. We must test appends that involve multiple extend() + // calls for one window. + constexpr int32_t kNumWords = 140000; + Scratch scratch; + std::vector words; + uint64_t seed = 0x12345689abcdefLLU; + words.reserve(kNumWords); + for (auto i = 0; i < kNumWords; ++i) { + words.push_back(seed * (i + 1)); + } + auto arena = newArena(); + + ByteOutputStream stream(arena.get()); + int32_t offset = 0; + std::vector sizes = {1, 19, 52, 58, 129}; + int32_t counter = 0; + while (offset < words.size()) { + // There is one large window that spans multiple extend() calls. + auto numWords = std::min( + words.size() - offset, + (counter == 2 ?
130000 : sizes[counter % sizes.size()])); + int32_t bytes = -1; + { + AppendWindow window(stream, scratch); + auto ptr = window.get(numWords); + bytes = arena->pool()->usedBytes(); + memcpy(ptr, words.data() + offset, numWords * sizeof(words[0])); + offset += numWords; + ++counter; + } + // We check that there is no allocation at exit of AppendWindow block. + EXPECT_EQ(arena->pool()->usedBytes(), bytes); + } + std::stringstream stringStream; + OStreamOutputStream out(&stringStream); + stream.flush(&out); + EXPECT_EQ(0, memcmp(stringStream.str().data(), words.data(), words.size())); +} + +TEST_F(ByteStreamTest, byteRange) { + ByteRange range; + range.size = 0; + range.position = 1; + ASSERT_EQ(range.availableBytes(), 0); + range.size = 1; + ASSERT_EQ(range.availableBytes(), 0); + range.size = 2; + ASSERT_EQ(range.availableBytes(), 1); +} + +TEST_F(ByteStreamTest, reuse) { + auto arena = newArena(); + ByteOutputStream stream(arena.get()); + char bytes[10000] = {}; + for (auto i = 0; i < 10; ++i) { + arena->clear(); + stream.startWrite(i * 100); + stream.appendStringView(std::string_view(bytes, sizeof(bytes))); + EXPECT_EQ(sizeof(bytes), stream.size()); + } +} + +class InputByteStreamTest : public ByteStreamTest, + public testing::WithParamInterface { + protected: + static void SetUpTestCase() { + filesystems::registerLocalFileSystem(); + } + + void SetUp() override { + ByteStreamTest::SetUp(); + tempDirPath_ = exec::test::TempDirectoryPath::create(); + fs_ = filesystems::getFileSystem(tempDirPath_->getPath(), nullptr); + } + + std::unique_ptr createStream( + const std::vector& byteRanges, + uint32_t bufferSize = 1024) { + if (GetParam()) { + return std::make_unique(std::move(byteRanges)); + } else { + const auto filePath = + fmt::format("{}/{}", tempDirPath_->getPath(), fileId_++); + auto writeFile = fs_->openFileForWrite(filePath); + for (auto& byteRange : byteRanges) { + writeFile->append(std::string_view( + reinterpret_cast(byteRange.buffer), byteRange.size)); + } + writeFile->close(); + return std::make_unique( + fs_->openFileForRead(filePath), bufferSize, pool_.get()); + } + } + + std::atomic_uint64_t fileId_{0}; + std::shared_ptr tempDirPath_; + std::shared_ptr fs_; +}; + +TEST_P(InputByteStreamTest, inputStream) { + uint8_t kFakeBuffer[8192]; + std::vector byteRanges; + size_t totalBytes{0}; + for (int32_t i = 0; i < 32; ++i) { + byteRanges.push_back(ByteRange{kFakeBuffer, 4096 + i, 0}); + totalBytes += 4096 + i; + } + auto byteStream = createStream(byteRanges); + ASSERT_EQ(byteStream->size(), totalBytes); + ASSERT_FALSE(byteStream->atEnd()); + byteStream->skip(totalBytes); + ASSERT_TRUE(byteStream->atEnd()); + if (GetParam()) { + VELOX_ASSERT_THROW( + byteStream->skip(1), + "(32 vs. 32) Reading past end of BufferInputStream"); + } else { + VELOX_ASSERT_THROW( + byteStream->skip(1), + "(1 vs. 0) Skip past the end of FileInputStream: 131568"); + } + ASSERT_TRUE(byteStream->atEnd()); +} + +TEST_P(InputByteStreamTest, emptyInputStreamError) { + if (GetParam()) { + VELOX_ASSERT_THROW(createStream({}), "Empty BufferInputStream"); + } else { + VELOX_ASSERT_THROW(createStream({}), "(0 vs. 
0) Empty FileInputStream"); + } +} + +TEST_P(InputByteStreamTest, remainingSize) { + const int32_t kSize = 100; + const int32_t kBufferSize = 4096; + std::vector buffers; + std::vector byteRanges; + for (int32_t i = 0; i < kSize; i++) { + buffers.push_back(pool_->allocate(kBufferSize)); + byteRanges.push_back( + ByteRange{reinterpret_cast(buffers.back()), kBufferSize, 0}); + } + auto byteStream = createStream(byteRanges); + const int32_t kReadBytes = 2048; + int32_t remainingSize = kSize * kBufferSize; + ASSERT_EQ(byteStream->remainingSize(), remainingSize); + uint8_t* tempBuffer = reinterpret_cast(pool_->allocate(kReadBytes)); + while (byteStream->remainingSize() > 0) { + byteStream->readBytes(tempBuffer, kReadBytes); + remainingSize -= kReadBytes; + ASSERT_EQ(remainingSize, byteStream->remainingSize()); + } + ASSERT_EQ(byteStream->remainingSize(), 0); + for (int32_t i = 0; i < kSize; i++) { + pool_->free(buffers[i], kBufferSize); + } + pool_->free(tempBuffer, kReadBytes); +} + +TEST_P(InputByteStreamTest, toString) { + const int32_t kSize = 10; + const int32_t kBufferSize = 4096; + std::vector buffers; + std::vector byteRanges; + for (int32_t i = 0; i < kSize; i++) { + buffers.push_back(pool_->allocate(kBufferSize)); + byteRanges.push_back( + ByteRange{reinterpret_cast(buffers.back()), kBufferSize, 0}); + } + auto byteStream = createStream(std::move(byteRanges)); + const int32_t kReadBytes = 2048; + uint8_t* tempBuffer = reinterpret_cast(pool_->allocate(kReadBytes)); + for (int32_t i = 0; i < kSize / 2; i++) { + byteStream->readBytes(tempBuffer, kReadBytes); + } + + if (GetParam()) { + ASSERT_EQ( + byteStream->toString(), + "10 ranges " + "(position/size) [(4096/4096),(4096/4096),(2048/4096 current)," + "(0/4096),(0/4096),(0/4096),(0/4096),(0/4096),(0/4096),(0/4096)]"); + } else { + ASSERT_EQ( + byteStream->toString(), + "file (offset 10.00KB/size 40.00KB) current (position 1.00KB/ size 1.00KB)"); + } + + for (int32_t i = 0; i < kSize; i++) { + pool_->free(buffers[i], kBufferSize); + } + pool_->free(tempBuffer, kReadBytes); +} + +TEST_P(InputByteStreamTest, readBytesNegativeSize) { + constexpr int32_t kBufferSize = 4096; + uint8_t buffer[kBufferSize]; + auto byteStream = + createStream(std::vector{ByteRange{buffer, kBufferSize, 0}}); + ASSERT_EQ(byteStream->remainingSize(), kBufferSize); + uint8_t outputBuffer[kBufferSize]; + VELOX_ASSERT_THROW( + byteStream->readBytes(outputBuffer, -100), + "(-100 vs. 0) Attempting to read negative number of byte"); + ASSERT_EQ(byteStream->remainingSize(), kBufferSize); +} + +TEST_P(InputByteStreamTest, skipNegativeSize) { + constexpr int32_t kBufferSize = 4096; + uint8_t buffer[kBufferSize]; + auto byteStream = std::make_unique( + std::vector{ByteRange{buffer, kBufferSize, 0}}); + ASSERT_EQ(byteStream->remainingSize(), kBufferSize); + VELOX_ASSERT_THROW( + byteStream->skip(-100), + "(-100 vs. 0) Attempting to skip negative number of bytes"); + ASSERT_EQ(byteStream->remainingSize(), kBufferSize); +} + +TEST_P(InputByteStreamTest, nextViewNegativeSize) { + constexpr int32_t kBufferSize = 4096; + uint8_t buffer[kBufferSize]; + auto byteStream = + createStream(std::vector{ByteRange{buffer, kBufferSize, 0}}); + ASSERT_EQ(byteStream->remainingSize(), kBufferSize); + VELOX_ASSERT_THROW( + byteStream->nextView(-100), + "(-100 vs. 
0) Attempting to view negative number of bytes"); + ASSERT_EQ(byteStream->remainingSize(), kBufferSize); +} + +TEST_P(InputByteStreamTest, view) { + SCOPED_TRACE(fmt::format("BufferInputStream: {}", GetParam())); + constexpr int32_t kBufferSize = 1024; + uint8_t buffer[kBufferSize]; + constexpr int32_t kNumRanges = 10; + std::vector fakeRanges; + fakeRanges.reserve(kNumRanges); + for (int i = 0; i < kNumRanges; ++i) { + fakeRanges.push_back(ByteRange{buffer, kBufferSize, 0}); + } + auto byteStream = createStream(fakeRanges); + ASSERT_EQ(byteStream->remainingSize(), kBufferSize * kNumRanges); + ASSERT_EQ(byteStream->nextView(kBufferSize / 2).size(), kBufferSize / 2); + ASSERT_EQ(byteStream->nextView(kBufferSize).size(), kBufferSize / 2); + ASSERT_EQ(byteStream->remainingSize(), kBufferSize * (kNumRanges - 1)); + byteStream->skip(byteStream->remainingSize()); + ASSERT_EQ(byteStream->remainingSize(), 0); + ASSERT_TRUE(byteStream->atEnd()); + ASSERT_EQ(byteStream->nextView(100).size(), 0); +} + +TEST_P(InputByteStreamTest, tellP) { + constexpr int32_t kBufferSize = 4096; + uint8_t buffer[kBufferSize]; + auto byteStream = + createStream(std::vector{ByteRange{buffer, kBufferSize, 0}}); + ASSERT_EQ(byteStream->remainingSize(), kBufferSize); + byteStream->readBytes(buffer, kBufferSize / 2); + ASSERT_EQ(byteStream->remainingSize(), kBufferSize / 2); + ASSERT_EQ(byteStream->tellp(), kBufferSize / 2); + byteStream->skip(kBufferSize / 2); + ASSERT_EQ(byteStream->remainingSize(), 0); + ASSERT_EQ(byteStream->tellp(), kBufferSize); +} + +TEST_P(InputByteStreamTest, skip) { + constexpr int32_t kBufferSize = 4096; + uint8_t buffer[kBufferSize]; + auto byteStream = + createStream(std::vector{ByteRange{buffer, kBufferSize, 0}}); + ASSERT_EQ(byteStream->remainingSize(), kBufferSize); + byteStream->skip(0); + ASSERT_EQ(byteStream->remainingSize(), kBufferSize); + byteStream->skip(1); + ASSERT_EQ(byteStream->remainingSize(), kBufferSize - 1); + if (GetParam()) { + VELOX_ASSERT_THROW( + byteStream->skip(kBufferSize), + "(1 vs. 1) Reading past end of BufferInputStream"); + ASSERT_EQ(byteStream->remainingSize(), 0); + ASSERT_TRUE(byteStream->atEnd()); + } else { + VELOX_ASSERT_THROW( + byteStream->skip(kBufferSize), + "(4096 vs. 4095) Skip past the end of FileInputStream: 4096"); + ASSERT_EQ(byteStream->remainingSize(), kBufferSize - 1); + ASSERT_FALSE(byteStream->atEnd()); + } +} + +TEST_P(InputByteStreamTest, seekp) { + constexpr int32_t kBufferSize = 4096; + uint8_t buffer[kBufferSize]; + auto byteStream = + createStream(std::vector{ByteRange{buffer, kBufferSize, 0}}); + ASSERT_EQ(byteStream->remainingSize(), kBufferSize); + byteStream->seekp(kBufferSize / 2); + ASSERT_EQ(byteStream->remainingSize(), kBufferSize / 2); + byteStream->seekp(kBufferSize / 2); + ASSERT_EQ(byteStream->remainingSize(), kBufferSize / 2); + if (GetParam()) { + byteStream->seekp(kBufferSize / 4); + ASSERT_EQ(byteStream->remainingSize(), kBufferSize / 2 + kBufferSize / 4); + } else { + VELOX_ASSERT_THROW( + byteStream->seekp(kBufferSize / 4), + "(1024 vs. 
2048) Backward seek is not supported by FileInputStream"); + } +} + +VELOX_INSTANTIATE_TEST_SUITE_P( + InputByteStreamTest, + InputByteStreamTest, + testing::ValuesIn({false, true})); diff --git a/velox/common/memory/tests/CMakeLists.txt b/velox/common/memory/tests/CMakeLists.txt index 94821e05f4fd9..8fd55b025f493 100644 --- a/velox/common/memory/tests/CMakeLists.txt +++ b/velox/common/memory/tests/CMakeLists.txt @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. include(GoogleTest) + add_executable( velox_memory_test + AllocationPoolTest.cpp AllocationTest.cpp ByteStreamTest.cpp CompactDoubleListTest.cpp HashStringAllocatorTest.cpp - AllocationPoolTest.cpp MemoryAllocatorTest.cpp MemoryArbitratorTest.cpp MemoryCapExceededTest.cpp @@ -30,33 +31,36 @@ add_executable( target_link_libraries( velox_memory_test - PRIVATE velox_caching - velox_common_base - velox_exception - velox_exec - velox_exec_test_lib - velox_memory - velox_temp_path - velox_test_util - velox_vector_fuzzer - Folly::folly - gflags::gflags - glog::glog - gmock - gtest - gtest_main - re2::re2) + PRIVATE + velox_caching + velox_common_base + velox_exception + velox_exec + velox_exec_test_lib + velox_memory + velox_temp_path + velox_test_util + velox_vector_fuzzer + Folly::folly + fmt::fmt + gflags::gflags + glog::glog + GTest::gmock + GTest::gtest + GTest::gtest_main + re2::re2) gtest_add_tests(velox_memory_test "" AUTO) add_executable(velox_fragmentation_benchmark FragmentationBenchmark.cpp) target_link_libraries( - velox_fragmentation_benchmark PRIVATE velox_memory Folly::folly - gflags::gflags glog::glog) + velox_fragmentation_benchmark + PRIVATE velox_memory Folly::folly gflags::gflags glog::glog) add_executable(velox_concurrent_allocation_benchmark ConcurrentAllocationBenchmark.cpp) -target_link_libraries(velox_concurrent_allocation_benchmark PRIVATE velox_memory - velox_time) +target_link_libraries( + velox_concurrent_allocation_benchmark + PRIVATE velox_memory velox_time) diff --git a/velox/common/memory/tests/ConcurrentAllocationBenchmark.cpp b/velox/common/memory/tests/ConcurrentAllocationBenchmark.cpp index 5d4623744d19b..406004ccece67 100644 --- a/velox/common/memory/tests/ConcurrentAllocationBenchmark.cpp +++ b/velox/common/memory/tests/ConcurrentAllocationBenchmark.cpp @@ -34,6 +34,10 @@ DEFINE_uint32( memory_allocator_type, 0, "The type of memory allocator. 0 is malloc allocator, 1 is mmap allocator"); +DEFINE_uint32( + memory_allocation_type, + 0, + "The type of memory allocation. 
0 is small allocation, 1 is non-contiguous allocation"); DEFINE_uint32( num_runs, 32, @@ -67,10 +71,17 @@ class MemoryOperator { } private: - struct Allocation { + struct SmallAllocation { void* ptr; - explicit Allocation(void* _ptr) : ptr(_ptr) {} + explicit SmallAllocation(void* _ptr) : ptr(_ptr) {} + }; + + struct NonContiguousAllocation { + std::unique_ptr allocation; + + NonContiguousAllocation() + : allocation(std::make_unique()) {} }; bool full() const; @@ -79,18 +90,26 @@ class MemoryOperator { void free(); + int randomAllocationIndex() const; + void cleanup(); + void freeSmallAllocation(SmallAllocation& allocation); + + void freeNonContiguousAllocation(NonContiguousAllocation& allocation); + static inline int32_t poolId_{0}; const uint64_t maxMemory_; const size_t allocationBytes_; + const uint32_t allocationType_{FLAGS_memory_allocation_type}; const uint32_t maxOps_; const std::shared_ptr pool_; - folly::Random::DefaultGenerator rng_; + mutable folly::Random::DefaultGenerator rng_; uint64_t allocatedBytes_{0}; - std::deque allocations_; + std::deque smallAllocations_; + std::deque nonContiguousAllocations_; uint64_t clockCount_{0}; }; @@ -107,29 +126,70 @@ void MemoryOperator::allocate() { } { ClockTimer cpuTimer(clockCount_); - allocations_.emplace_back(pool_->allocate(allocationBytes_)); + if (allocationType_ == 0) { + smallAllocations_.emplace_back(pool_->allocate(allocationBytes_)); + } else { + NonContiguousAllocation nonContiguousAllocation; + pool_->allocateNonContiguous( + memory::AllocationTraits::numPages(allocationBytes_), + *nonContiguousAllocation.allocation); + nonContiguousAllocations_.emplace_back( + std::move(nonContiguousAllocation)); + } } allocatedBytes_ += allocationBytes_; } void MemoryOperator::free() { - const int freeIdx = folly::Random::rand32(rng_) % allocations_.size(); - Allocation freeAllocation = allocations_[freeIdx]; - allocations_[freeIdx] = allocations_.back(); - allocations_.pop_back(); + const int freeIdx = randomAllocationIndex(); + if (allocationType_ == 0) { + SmallAllocation allocation = smallAllocations_[freeIdx]; + smallAllocations_[freeIdx] = smallAllocations_.back(); + smallAllocations_.pop_back(); + + freeSmallAllocation(allocation); + } else { + NonContiguousAllocation allocation = + std::move(nonContiguousAllocations_[freeIdx]); + nonContiguousAllocations_[freeIdx] = + std::move(nonContiguousAllocations_.back()); + nonContiguousAllocations_.pop_back(); + + freeNonContiguousAllocation(allocation); + } +} + +int MemoryOperator::randomAllocationIndex() const { + const int randIdx = folly::Random::rand32(rng_); + if (allocationType_ == 0) { + return randIdx % smallAllocations_.size(); + } + return randIdx % nonContiguousAllocations_.size(); +} + +void MemoryOperator::freeSmallAllocation(SmallAllocation& allocation) { { ClockTimer cpuTimer(clockCount_); - pool_->free(freeAllocation.ptr, allocationBytes_); + pool_->free(allocation.ptr, allocationBytes_); + } + allocatedBytes_ -= allocationBytes_; +} + +void MemoryOperator::freeNonContiguousAllocation( + NonContiguousAllocation& allocation) { + { + ClockTimer cpuTimer(clockCount_); + pool_->freeNonContiguous(*allocation.allocation); } allocatedBytes_ -= allocationBytes_; } void MemoryOperator::cleanup() { - for (const auto& allocation : allocations_) { - { - ClockTimer cpuTimer(clockCount_); - pool_->free(allocation.ptr, allocationBytes_); - } + for (auto& allocation : smallAllocations_) { + freeSmallAllocation(allocation); + } + for (auto& allocation : nonContiguousAllocations_) { 
freeNonContiguousAllocation(allocation); } } @@ -157,15 +217,12 @@ class MemoryAllocationBenchMark { const int64_t maxMemory = options_.maxMemory + (256 << 20); switch (options_.allocatorType) { case Type::kMmap: { - memory::MmapAllocator::Options mmapOptions; - mmapOptions.capacity = maxMemory; - allocator_ = std::make_shared(mmapOptions); manager_ = std::make_shared(MemoryManagerOptions{ - .capacity = maxMemory, .allocator = allocator_.get()}); + .allocatorCapacity = maxMemory, .useMmapAllocator = true}); } break; case Type::kMalloc: manager_ = std::make_shared( - MemoryManagerOptions{.capacity = maxMemory}); + MemoryManagerOptions{.allocatorCapacity = maxMemory}); break; default: VELOX_USER_FAIL( @@ -188,7 +245,6 @@ class MemoryAllocationBenchMark { }; const Options options_; - std::shared_ptr allocator_; std::shared_ptr manager_; std::vector results_; }; diff --git a/velox/common/memory/tests/FragmentationBenchmark.cpp b/velox/common/memory/tests/FragmentationBenchmark.cpp index 455c4a420ead3..eb9124f2235ca 100644 --- a/velox/common/memory/tests/FragmentationBenchmark.cpp +++ b/velox/common/memory/tests/FragmentationBenchmark.cpp @@ -34,19 +34,21 @@ using namespace facebook::velox; using namespace facebook::velox::memory; struct Block { - explicit Block(MemoryAllocator& _allocator) : allocator(_allocator) {} + explicit Block(MemoryAllocator* _allocator) : allocator(_allocator) {} ~Block() { if (data != nullptr) { free(data); } - if (allocation != nullptr) { - allocator.freeNonContiguous(*allocation); + if (allocator != nullptr) { + if (allocation != nullptr) { + allocator->freeNonContiguous(*allocation); + } + allocator->freeContiguous(contiguous); } - allocator.freeContiguous(contiguous); } - MemoryAllocator& allocator; + MemoryAllocator* allocator; size_t size = 0; char* data = nullptr; std::shared_ptr allocation; @@ -91,7 +93,7 @@ class FragmentationTest { } void allocate(size_t size) { - auto block = std::make_unique(*memory_); + auto block = std::make_unique(memory_.get()); block->size = size; if (memory_) { if (size <= 8 << 20) { diff --git a/velox/common/memory/tests/HashStringAllocatorTest.cpp b/velox/common/memory/tests/HashStringAllocatorTest.cpp index 6784eb5e44625..124220d4d57f0 100644 --- a/velox/common/memory/tests/HashStringAllocatorTest.cpp +++ b/velox/common/memory/tests/HashStringAllocatorTest.cpp @@ -36,8 +36,12 @@ struct Multipart { class HashStringAllocatorTest : public testing::Test { protected: + static void SetUpTestCase() { + memory::MemoryManager::initialize({}); + } + void SetUp() override { - pool_ = memory::addDefaultLeafMemoryPool(); + pool_ = memory::memoryManager()->addLeafPool(); allocator_ = std::make_unique(pool_.get()); rng_.seed(1); } @@ -51,7 +55,8 @@ class HashStringAllocatorTest : public testing::Test { void initializeContents(HSA::Header* header) { auto sequence = ++sequence_; - int32_t numWords = header->size() / sizeof(void*); + int32_t numWords = + header->size() / HashStringAllocator::Header::kContinuedPtrSize; void** ptr = reinterpret_cast(header->begin()); ptr[0] = reinterpret_cast(sequence); for (int32_t offset = 1; offset < numWords; offset++) { @@ -95,6 +100,48 @@ class HashStringAllocatorTest : public testing::Test { folly::Random::DefaultGenerator rng_; }; +TEST_F(HashStringAllocatorTest, multipleFree) { + ASSERT_NO_THROW(allocator_->toString()); + + auto h1 = allocate(123); + ASSERT_EQ(h1->toString(), "size: 123"); + + allocator_->free(h1); + // Running free() multiple times on the same memory block should result in an + // error. 
+ VELOX_ASSERT_THROW(allocator_->free(h1), ""); +} + +TEST_F(HashStringAllocatorTest, multipleFreeAndCheckCurrentBytes) { + ASSERT_NO_THROW(allocator_->toString()); + + auto h1 = allocate(123); + auto h2 = allocate(456); + auto h3 = allocate(789); + + ASSERT_EQ(h1->toString(), "size: 123"); + ASSERT_EQ(h2->toString(), "size: 456"); + ASSERT_EQ(h3->toString(), "size: 789"); + + auto allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); + + allocator_->free(h3); + allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); + + allocator_->free(h2); + allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); + + allocator_->free(h1); + allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); + + // After all blocks are freed, the allocated bytes should be 0. + ASSERT_EQ(allocator_->currentBytes(), 0); +} + TEST_F(HashStringAllocatorTest, headerToString) { ASSERT_NO_THROW(allocator_->toString()); @@ -113,10 +160,10 @@ ASSERT_NO_THROW(allocator_->toString()); - ByteStream stream(allocator_.get()); + ByteOutputStream stream(allocator_.get()); auto h4 = allocator_->newWrite(stream).header; std::string data(123'456, 'x'); - stream.appendStringPiece(folly::StringPiece(data.data(), data.size())); + stream.appendStringView(data); allocator_->finishWrite(stream, 0); ASSERT_EQ(h4->toString(), "|multipart| size: 123 [64913, 58436]"); @@ -137,7 +184,8 @@ headers.push_back(allocate((i % 10) * 10)); } EXPECT_FALSE(allocator_->isEmpty()); - allocator_->checkConsistency(); + auto allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); for (int32_t step = 7; step >= 1; --step) { for (auto i = 0; i < headers.size(); i += step) { if (headers[i]) { @@ -145,7 +193,8 @@ headers[i] = nullptr; } } - allocator_->checkConsistency(); + allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); } } EXPECT_TRUE(allocator_->isEmpty()); @@ -164,11 +213,11 @@ } TEST_F(HashStringAllocatorTest, finishWrite) { - ByteStream stream(allocator_.get()); + ByteOutputStream stream(allocator_.get()); auto start = allocator_->newWrite(stream); // Write a short string. - stream.appendStringPiece(folly::StringPiece("abc")); + stream.appendStringView(std::string_view("abc")); auto [firstStart, firstFinish] = allocator_->finishWrite(stream, 0); ASSERT_EQ(start.header, firstStart.header); @@ -177,8 +226,8 @@ // Replace short string with a long string that uses two bytes short of // available space. allocator_->extendWrite(start, stream); - auto longString = std::string(start.header->size() - 2, 'x'); - stream.appendStringPiece(folly::StringPiece(longString)); + std::string longString(start.header->size() - 2, 'x'); + stream.appendStringView(longString); auto [longStart, longFinish] = allocator_->finishWrite(stream, 0); ASSERT_EQ(start.header, longStart.header); @@ -186,7 +235,7 @@ // Append another string after the long string.
allocator_->extendWrite(longFinish, stream); - stream.appendStringPiece(folly::StringPiece("abc")); + stream.appendStringView(std::string_view("abc")); auto [appendStart, appendFinish] = allocator_->finishWrite(stream, 0); ASSERT_NE(appendStart.header, longFinish.header); @@ -196,7 +245,7 @@ TEST_F(HashStringAllocatorTest, finishWrite) { // Replace last string. allocator_->extendWrite(appendStart, stream); - stream.appendStringPiece(folly::StringPiece("abcd")); + stream.appendStringView(std::string_view("abcd")); auto [replaceStart, replaceFinish] = allocator_->finishWrite(stream, 0); ASSERT_EQ(appendStart.header, replaceStart.header); @@ -205,18 +254,19 @@ TEST_F(HashStringAllocatorTest, finishWrite) { replaceStart.offset() + 4); // Read back long and short strings. - HSA::prepareRead(longStart.header, stream); + auto inputStream = HSA::prepareRead(longStart.header); std::string copy; copy.resize(longString.size()); - stream.readBytes(copy.data(), copy.size()); + inputStream->readBytes(copy.data(), copy.size()); ASSERT_EQ(copy, longString); copy.resize(4); - stream.readBytes(copy.data(), 4); + inputStream->readBytes(copy.data(), 4); ASSERT_EQ(copy, "abcd"); - allocator_->checkConsistency(); + auto allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); std::vector sizes = { 50000, 100000, 200000, 1000000, 3000000, 5000000}; @@ -224,15 +274,16 @@ TEST_F(HashStringAllocatorTest, finishWrite) { auto largeString = randomString(size); auto start = allocator_->newWrite(stream); - stream.appendStringPiece(folly::StringPiece(largeString)); + stream.appendStringView(largeString); allocator_->finishWrite(stream, 0); - HSA::prepareRead(start.header, stream); + auto inStream = HSA::prepareRead(start.header); std::string copy; copy.resize(largeString.size()); - stream.readBytes(copy.data(), copy.size()); + inStream->readBytes(copy.data(), copy.size()); ASSERT_EQ(copy, largeString); - allocator_->checkConsistency(); + allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); } } @@ -246,7 +297,7 @@ TEST_F(HashStringAllocatorTest, multipart) { continue; } auto chars = randomString(); - ByteStream stream(allocator_.get()); + ByteOutputStream stream(allocator_.get()); if (data[i].start.header) { if (rand32() % 5) { // 4/5 of cases append to the end. 
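The hunks above switch the write path to ByteOutputStream with newWrite/finishWrite, and the read path to an input stream returned by HSA::prepareRead. The round trip, condensed from the finishWrite test into a standalone helper (a sketch assuming the includes used by HashStringAllocatorTest.cpp):

```
#include <string>
#include <string_view>
#include "velox/common/memory/HashStringAllocator.h"

using HSA = facebook::velox::HashStringAllocator;

// Write a payload into the allocator, then read it back; 'allocator' is a
// live HashStringAllocator, as in the tests above.
std::string writeAndReadBack(HSA* allocator) {
  facebook::velox::ByteOutputStream stream(allocator);
  auto start = allocator->newWrite(stream);             // begin a new entry
  stream.appendStringView(std::string_view("payload"));
  allocator->finishWrite(stream, 0);                    // seal, reserving 0 bytes

  auto input = HSA::prepareRead(start.header);          // now returns an input stream
  std::string copy(7, '\0');
  input->readBytes(copy.data(), copy.size());           // copy == "payload"
  return copy;
}
```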
@@ -263,13 +314,14 @@ TEST_F(HashStringAllocatorTest, multipart) { EXPECT_EQ( data[i].start.header, HSA::headerOf(stream.ranges()[0].buffer)); } - stream.appendStringPiece(folly::StringPiece(chars.data(), chars.size())); + stream.appendStringView(chars); auto reserve = rand32() % 100; data[i].current = allocator_->finishWrite(stream, reserve).second; data[i].reference.insert( data[i].reference.end(), chars.begin(), chars.end()); } - allocator_->checkConsistency(); + auto allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); } for (const auto& d : data) { if (d.start.isSet()) { @@ -279,14 +331,48 @@ TEST_F(HashStringAllocatorTest, multipart) { for (auto& d : data) { if (d.start.isSet()) { checkAndFree(d); - allocator_->checkConsistency(); + auto allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); } } - allocator_->checkConsistency(); + auto allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); +} + +TEST_F(HashStringAllocatorTest, mixedMultipart) { + // Create multi-part allocation with a mix of block allocated from Arena and + // MemoryPool. + + const std::string shortString(25, 'x'); + const std::string extraLongString(5'000, 'y'); + + ByteOutputStream stream(allocator_.get()); + + auto start = allocator_->newWrite(stream); + stream.appendStringView(shortString); + auto current = allocator_->finishWrite(stream, 0); + + allocator_->extendWrite(current.second, stream); + + ByteRange range; + allocator_->newContiguousRange(extraLongString.size(), &range); + stream.setRange(range, 0); + + stream.appendStringView(extraLongString); + current = allocator_->finishWrite(stream, 0); + + allocator_->extendWrite(current.second, stream); + stream.appendStringView(shortString); + allocator_->finishWrite(stream, 0); + + allocator_->free(start.header); + + auto allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); } TEST_F(HashStringAllocatorTest, rewrite) { - ByteStream stream(allocator_.get()); + ByteOutputStream stream(allocator_.get()); auto header = allocator_->allocate(5); EXPECT_EQ(16, header->size()); // Rounds up to kMinAlloc. HSA::Position current = HSA::Position::atOffset(header, 0); @@ -310,10 +396,10 @@ TEST_F(HashStringAllocatorTest, rewrite) { stream.appendOne(67890LL); position = allocator_->finishWrite(stream, 0).second; EXPECT_EQ(3 * sizeof(int64_t), HSA::offset(header, position)); - HSA::prepareRead(header, stream); - EXPECT_EQ(123456789012345LL, stream.read()); - EXPECT_EQ(12345LL, stream.read()); - EXPECT_EQ(67890LL, stream.read()); + auto inStream = HSA::prepareRead(header); + EXPECT_EQ(123456789012345LL, inStream->read()); + EXPECT_EQ(12345LL, inStream->read()); + EXPECT_EQ(67890LL, inStream->read()); } // The stream contains 3 int64_t's. auto end = HSA::seek(header, 3 * sizeof(int64_t)); @@ -363,7 +449,8 @@ TEST_F(HashStringAllocatorTest, stlAllocator) { } } - allocator_->checkConsistency(); + auto allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); // We allow for some overhead for free lists after all is freed. 
EXPECT_LE(allocator_->retainedSize() - allocator_->freeSpace(), 100); @@ -398,10 +485,11 @@ TEST_F(HashStringAllocatorTest, stlAllocatorWithSet) { } } - allocator_->checkConsistency(); + auto allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); // We allow for some overhead for free lists after all is freed. - EXPECT_LE(allocator_->retainedSize() - allocator_->freeSpace(), 180); + EXPECT_LE(allocator_->retainedSize() - allocator_->freeSpace(), 220); } TEST_F(HashStringAllocatorTest, alignedStlAllocatorWithF14Map) { @@ -435,13 +523,33 @@ TEST_F(HashStringAllocatorTest, alignedStlAllocatorWithF14Map) { } } - allocator_->checkConsistency(); + auto allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); // We allow for some overhead for free lists after all is freed. Map tends to // generate more free blocks at the end, so we loosen the upper bound a bit. EXPECT_LE(allocator_->retainedSize() - allocator_->freeSpace(), 130); } +TEST_F(HashStringAllocatorTest, alignedStlAllocatorLargeAllocation) { + const auto allocateSize = 1ULL << 10; + + // Test large allocation + aligned pool. + AlignedStlAllocator alignedAlloc16(allocator_.get()); + int64_t* ptr = alignedAlloc16.allocate(allocateSize); + alignedAlloc16.deallocate(ptr, allocateSize); + auto allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); + + // Test large allocation + un-aligned pool. + ASSERT_LT(allocator_->pool()->alignment(), 128); + AlignedStlAllocator alignedAlloc128(allocator_.get()); + ptr = alignedAlloc128.allocate(allocateSize); + alignedAlloc128.deallocate(ptr, allocateSize); + allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); +} + TEST_F(HashStringAllocatorTest, stlAllocatorOverflow) { StlAllocator alloc(allocator_.get()); VELOX_ASSERT_THROW(alloc.allocate(1ULL << 62), "integer overflow"); @@ -451,60 +559,211 @@ TEST_F(HashStringAllocatorTest, stlAllocatorOverflow) { TEST_F(HashStringAllocatorTest, externalLeak) { constexpr int32_t kSize = HashStringAllocator ::kMaxAlloc * 10; - auto root = - memory::MemoryManager::getInstance().addRootPool("HSALeakTestRoot"); + auto root = memory::memoryManager()->addRootPool("HSALeakTestRoot"); auto pool = root->addLeafChild("HSALeakLeaf"); - auto initialBytes = pool->currentBytes(); + auto initialBytes = pool->usedBytes(); auto allocator = std::make_unique(pool.get()); for (auto i = 0; i < 100; ++i) { allocator->allocate(kSize); } - EXPECT_LE(100 * kSize, pool->currentBytes()); + EXPECT_LE(100 * kSize, pool->usedBytes()); StlAllocator stlAlloc(allocator.get()); for (auto i = 0; i < 100; ++i) { stlAlloc.allocate(kSize); } - EXPECT_LE(200 * kSize, pool->currentBytes()); + EXPECT_LE(200 * kSize, pool->usedBytes()); allocator->clear(); - EXPECT_GE(initialBytes + 1000, pool->currentBytes()); + EXPECT_GE(initialBytes + 1000, pool->usedBytes()); allocator.reset(); - EXPECT_EQ(initialBytes, pool->currentBytes()); + EXPECT_EQ(initialBytes, pool->usedBytes()); } TEST_F(HashStringAllocatorTest, freeLists) { - auto sizes = HashStringAllocator::freeListSizeClasses(); + constexpr int kSize = 100'000; + constexpr int kSmall = 17; + constexpr int kMedium = kSmall + 1; + constexpr int kLarge = 128; std::vector allocations; - // We make alternating allocations of different free list size - // classes. We free the small ones. 
We allocate a larger size from - // the same size class, This reads the free list and moves to a - // larger list. On the second time around, this remembers that the - // smaller fre list dies not have entries of that size. - auto small = sizes[1] + 2; - auto larger = sizes[2] + 2; - for (auto i = 0; i < 100; ++i) { - allocations.push_back(allocator_->allocate(i == 0 ? small + 10 : small)); - allocations.push_back(allocator_->allocate(larger)); + for (int i = 0; i < 2 * kSize; ++i) { + allocations.push_back(allocator_->allocate(i < kSize ? kMedium : kSmall)); + allocations.push_back(allocator_->allocate(kLarge)); } - for (auto i = 0; i < allocations.size(); i += 2) { + // Release medium blocks, then small ones. + for (int i = 0; i < allocations.size(); i += 2) { allocator_->free(allocations[i]); } - allocations[0] = allocator_->allocate(small); - // This comes from head of free list. - EXPECT_EQ(0, allocator_->numFreeListNoFit()); - allocations[2] = allocator_->allocate(small + 10); - // Last in free llist fits. - EXPECT_EQ(98, allocator_->numFreeListNoFit()); - allocations[4] = allocator_->allocate(small + 10); - // Traverse the free list again but no fit. - EXPECT_EQ(2 * 98, allocator_->numFreeListNoFit()); - allocations[6] = allocator_->allocate(small + 10); - // Now we know there is no fit in the list for small, so it is not - // retraversed. - EXPECT_EQ(2 * 98, allocator_->numFreeListNoFit()); + // Make sure we don't traverse the whole small free list while looking for + // medium free blocks. + auto t0 = std::chrono::steady_clock::now(); + for (int i = 0; i < kSize; ++i) { + allocator_->allocate(kSmall + 1); + } + ASSERT_LT(std::chrono::steady_clock::now() - t0, std::chrono::seconds(30)); +} + +TEST_F(HashStringAllocatorTest, strings) { + constexpr uint64_t kMagic1 = 0x133788a07; + constexpr uint64_t kMagic2 = 0xe7ababe11e; + std::vector strings; + std::vector views; + for (auto i = 0; i < 20000; ++i) { + std::string str; + auto freeBytes = allocator_->freeSpace(); + if (freeBytes > 20 && freeBytes < 120) { + // Target the next allocation to take all of the last free block. + str.resize(freeBytes - 15); + } else { + if (i % 11 == 0) { + str.resize((i * kMagic1) % 6001); + } else { + str.resize(24 + (i % 22)); + } + } + for (auto c = 0; c < str.size(); ++c) { + str[c] = ((c + i) % 64) + 32; + } + if (i > 0 && i % 3 == 0) { + auto freeIdx = ((i * kMagic2) % views.size()); + if (!strings[freeIdx].empty()) { + strings[freeIdx].clear(); + allocator_->free(HashStringAllocator::headerOf(views[i].data())); + } + } + strings.push_back(str); + views.push_back(StringView(str.data(), str.size())); + allocator_->copyMultipart(views[i], reinterpret_cast(&views[i]), 0); + if (i % 10 == 0) { + auto allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); + } + } + for (auto i = 0; i < strings.size(); ++i) { + if (strings[i].empty()) { + continue; + } + std::string temp; + ASSERT_TRUE( + StringView(strings[i]) == + HashStringAllocator::contiguousString(views[i], temp)); + } + auto allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); } +TEST_F(HashStringAllocatorTest, sizeAndPosition) { + // We make a stream consisting of multiple non-contiguous ranges + // and verify that it is writable and appendable and that its + // size() always reflects the number of written bytes, excluding + // any overheads. + + // First, we make a free list to make sure things are multipart. 
+ constexpr int32_t kUnitSize = 256; + std::vector pieces; + for (auto i = 0; i < 100; ++i) { + pieces.push_back(allocator_->allocate(kUnitSize + 30)); + } + for (auto i = 0; i < pieces.size(); i += 2) { + allocator_->free(pieces[i]); + } + + // We write each nth character of stream to be n % kUnitSize. + std::string allChars; + allChars.resize(kUnitSize); + for (auto i = 0; i < kUnitSize; ++i) { + allChars[i] = i; + } + + ByteOutputStream stream(allocator_.get()); + auto position = allocator_->newWrite(stream, 20); + // Nothing written yet. + EXPECT_EQ(0, stream.size()); + for (auto i = 0; i < 10; ++i) { + stream.appendStringView(allChars); + // We check that the size reflects the payload size after each write. + EXPECT_EQ((i + 1) * kUnitSize, stream.size()); + } + // We expect a multipart allocation. + EXPECT_TRUE(position.header->isContinued()); + EXPECT_EQ(kUnitSize * 10, stream.tellp()); + + // We check and rewrite different offsets in the stream, taking care not to run past the end. + for (auto start = 90; start < kUnitSize * 9; start += 125) { + stream.seekp(start); + EXPECT_EQ(start, stream.tellp()); + EXPECT_EQ(kUnitSize * 10, stream.size()); + auto input = stream.inputStream(); + input->seekp(start); + EXPECT_EQ(kUnitSize * 10 - start, input->remainingSize()); + for (auto c = 0; c < 10; ++c) { + uint8_t byte = input->readByte(); + EXPECT_EQ(byte, (start + c) % kUnitSize); + } + // Overwrite the bytes just read. + stream.seekp(start); + stream.appendStringView(std::string_view(allChars.data(), 100)); + input = stream.inputStream(); + input->seekp(start); + for (auto c = 0; c < 100; ++c) { + uint8_t byte = input->readByte(); + EXPECT_EQ(byte, c % kUnitSize); + } + } + EXPECT_EQ(kUnitSize * 10, stream.size()); + stream.seekp(kUnitSize * 10 - 100); + stream.appendStringView(allChars); + // The last write extends the size. + EXPECT_EQ(kUnitSize * 11 - 100, stream.size()); +} + +TEST_F(HashStringAllocatorTest, storeStringFast) { + allocator_->allocate(HashStringAllocator::kMinAlloc); + std::string s( + allocator_->freeSpace() + HashStringAllocator::Header::kContinuedPtrSize, + 'x'); + StringView sv(s); + allocator_->copyMultipart(sv, reinterpret_cast(&sv), 0); + ASSERT_NE(sv.data(), s.data()); + ASSERT_EQ(sv, StringView(s)); + auto allocatedBytes = allocator_->checkConsistency(); + ASSERT_EQ(allocatedBytes, allocator_->currentBytes()); +} + +TEST_F(HashStringAllocatorTest, clear) { + allocator_->allocate(HashStringAllocator::kMinAlloc); + allocator_->allocate(HashStringAllocator::kMaxAlloc + 1); + EXPECT_GT(allocator_->retainedSize(), 0); + allocator_->clear(); + EXPECT_EQ(allocator_->retainedSize(), 0); +} + +TEST_F(HashStringAllocatorTest, freezeAndExecute) { + std::string str = "abc"; + StringView view(str.data(), str.size()); + allocator_->copyMultipart(view, reinterpret_cast(&view), 0); + + str.clear(); + + // Freeing memory requires the HashStringAllocator to be mutable. + VELOX_ASSERT_THROW( + allocator_->freezeAndExecute([&]() { + allocator_->free(HashStringAllocator::headerOf(view.data())); + }), + "The HashStringAllocator is immutable."); + + HashStringAllocator::Header* header; + + // Allocating memory requires the HashStringAllocator to be mutable. + VELOX_ASSERT_THROW( + allocator_->freezeAndExecute( + [&]() { header = allocator_->allocate(24); }), + "The HashStringAllocator is immutable."); + + // Simply fetching state should not require the HashStringAllocator to be + // mutable.
+ allocator_->freezeAndExecute([&]() { allocator_->currentBytes(); }); +} } // namespace } // namespace facebook::velox diff --git a/velox/common/memory/tests/MemoryAllocatorTest.cpp b/velox/common/memory/tests/MemoryAllocatorTest.cpp index fb582f1ef8662..3c2f48f0c4cc1 100644 --- a/velox/common/memory/tests/MemoryAllocatorTest.cpp +++ b/velox/common/memory/tests/MemoryAllocatorTest.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ #include "velox/common/memory/MemoryAllocator.h" +#include #include "velox/common/base/tests/GTestUtils.h" #include "velox/common/memory/AllocationPool.h" #include "velox/common/memory/MallocAllocator.h" @@ -21,16 +22,18 @@ #include "velox/common/memory/MmapArena.h" #include "velox/common/testutil/TestValue.h" -#include -#include - +#include #include #include #include -#include +#include #include -DECLARE_int32(velox_memory_pool_mb); +#ifdef linux +#include +#endif // linux + +DECLARE_bool(velox_memory_leak_check_enabled); using namespace facebook::velox::common::testutil; @@ -43,14 +46,15 @@ struct ProcessSize { }; } // namespace -static constexpr uint64_t kCapacityBytes = 256UL * 1024 * 1024; +static constexpr uint64_t kCapacityBytes = 1ULL << 30; static constexpr MachinePageCount kCapacityPages = (kCapacityBytes / AllocationTraits::kPageSize); -class MemoryAllocatorTest : public testing::TestWithParam { +class MemoryAllocatorTest : public testing::TestWithParam { protected: static void SetUpTestCase() { TestValue::enable(); + FLAGS_velox_memory_leak_check_enabled = true; } void SetUp() override { @@ -59,50 +63,55 @@ class MemoryAllocatorTest : public testing::TestWithParam { void setupAllocator() { pool_.reset(); - MemoryAllocator::testingDestroyInstance(); - useMmap_ = GetParam(); + useMmap_ = GetParam() == 0; + enableReservation_ = GetParam() == 2; maxMallocBytes_ = 3072; if (useMmap_) { - MmapAllocator::Options options; - options.capacity = kCapacityBytes; + MemoryManagerOptions options; + options.useMmapAllocator = true; + options.allocatorCapacity = kCapacityBytes; + options.arbitratorCapacity = kCapacityBytes; + options.arbitratorReservedCapacity = 128 << 20; + options.memoryPoolReservedCapacity = 1 << 20; options.smallAllocationReservePct = 4; options.maxMallocBytes = maxMallocBytes_; - allocator_ = std::make_shared(options); - auto mmapAllocator = std::dynamic_pointer_cast(allocator_); + memoryManager_ = std::make_unique(options); ASSERT_EQ( - AllocationTraits::numPages(mmapAllocator->capacity()), + AllocationTraits::numPages(memoryManager_->allocator()->capacity()), bits::roundUp( kCapacityBytes * (100 - options.smallAllocationReservePct) / 100 / AllocationTraits::kPageSize, - 64 * mmapAllocator->sizeClasses().back())); - MemoryAllocator::setDefaultInstance(allocator_.get()); + 64 * memoryManager_->allocator()->sizeClasses().back())); } else { - allocator_ = std::make_shared(kCapacityBytes); - MemoryAllocator::setDefaultInstance(allocator_.get()); + MemoryManagerOptions options; + options.allocatorCapacity = kCapacityBytes; + options.arbitratorCapacity = kCapacityBytes; + options.arbitratorReservedCapacity = 128 << 20; + options.memoryPoolReservedCapacity = 1 << 20; + if (!enableReservation_) { + options.allocationSizeThresholdWithReservation = 0; + } + memoryManager_ = std::make_unique(options); } - instance_ = MemoryAllocator::getInstance(); - memoryManager_ = std::make_unique(MemoryManagerOptions{ - .capacity = (int64_t)instance_->capacity(), .allocator = instance_}); + instance_ = memoryManager_->allocator(); pool_ = 
memoryManager_->addLeafPool("allocatorTest"); if (useMmap_) { ASSERT_EQ(instance_->kind(), MemoryAllocator::Kind::kMmap); ASSERT_EQ( instance_->toString(), - "Memory Allocator[MMAP capacity 64.00KB allocated pages 0 mapped pages 0 external mapped pages 0\n[size 1: 0(0MB) allocated 0 mapped]\n[size 2: 0(0MB) allocated 0 mapped]\n[size 4: 0(0MB) allocated 0 mapped]\n[size 8: 0(0MB) allocated 0 mapped]\n[size 16: 0(0MB) allocated 0 mapped]\n[size 32: 0(0MB) allocated 0 mapped]\n[size 64: 0(0MB) allocated 0 mapped]\n[size 128: 0(0MB) allocated 0 mapped]\n[size 256: 0(0MB) allocated 0 mapped]\n]"); + "Memory Allocator[MMAP total capacity 1.00GB free capacity 1.00GB allocated pages 0 mapped pages 0 external mapped pages 0\n[size 1: 0(0MB) allocated 0 mapped]\n[size 2: 0(0MB) allocated 0 mapped]\n[size 4: 0(0MB) allocated 0 mapped]\n[size 8: 0(0MB) allocated 0 mapped]\n[size 16: 0(0MB) allocated 0 mapped]\n[size 32: 0(0MB) allocated 0 mapped]\n[size 64: 0(0MB) allocated 0 mapped]\n[size 128: 0(0MB) allocated 0 mapped]\n[size 256: 0(0MB) allocated 0 mapped]\n]"); } else { ASSERT_EQ(instance_->kind(), MemoryAllocator::Kind::kMalloc); ASSERT_EQ( instance_->toString(), - "Memory Allocator[MALLOC capacity 256.00MB allocated bytes 0 allocated pages 0 mapped pages 0]"); + "Memory Allocator[MALLOC capacity 1.00GB allocated bytes 0 allocated pages 0 mapped pages 0]"); } ASSERT_EQ( MemoryAllocator::kindString(static_cast(100)), "UNKNOWN: 100"); } - void TearDown() override { - MemoryAllocator::testingDestroyInstance(); - } + void TearDown() override {} bool allocate(int32_t numPages, Allocation& result) { try { @@ -110,7 +119,7 @@ class MemoryAllocatorTest : public testing::TestWithParam { EXPECT_TRUE(result.empty()); return false; } - } catch (const VeloxException& e) { + } catch (const VeloxException&) { EXPECT_TRUE(result.empty()); return false; } @@ -338,7 +347,7 @@ class MemoryAllocatorTest : public testing::TestWithParam { bool allocateContiguous( int numPages, - Allocation* FOLLY_NULLABLE collateral, + Allocation* collateral, ContiguousAllocation& allocation) { bool success = instance_->allocateContiguous(numPages, collateral, allocation); @@ -404,8 +413,8 @@ class MemoryAllocatorTest : public testing::TestWithParam { } bool useMmap_; + bool enableReservation_; int32_t maxMallocBytes_; - std::shared_ptr allocator_; MemoryAllocator* instance_; std::unique_ptr memoryManager_; std::shared_ptr pool_; @@ -621,7 +630,42 @@ TEST_P(MemoryAllocatorTest, allocationClass2) { allocation->clear(); } +TEST_P(MemoryAllocatorTest, stats) { + const std::vector& sizes = instance_->sizeClasses(); + MachinePageCount capacity = kCapacityPages; + for (auto i = 0; i < sizes.size(); ++i) { + std::unique_ptr allocation = std::make_unique(); + auto size = sizes[i]; + ASSERT_TRUE(allocate(size, *allocation)); + ASSERT_GT(instance_->numAllocated(), 0); + instance_->freeNonContiguous(*allocation); + auto stats = instance_->stats(); + ASSERT_EQ(0, stats.sizes[i].clocks()); + ASSERT_EQ(stats.sizes[i].totalBytes, 0); + ASSERT_EQ(stats.sizes[i].numAllocations, 0); + } + + gflags::FlagSaver flagSaver; + FLAGS_velox_time_allocations = true; + for (auto i = 0; i < sizes.size(); ++i) { + std::unique_ptr allocation = std::make_unique(); + auto size = sizes[i]; + ASSERT_TRUE(allocate(size, *allocation)); + ASSERT_GT(instance_->numAllocated(), 0); + instance_->freeNonContiguous(*allocation); + auto stats = instance_->stats(); + ASSERT_LT(0, stats.sizes[i].clocks()); + ASSERT_GE(stats.sizes[i].totalBytes, size * 
AllocationTraits::kPageSize); + ASSERT_GE(stats.sizes[i].numAllocations, 1); + } +} + TEST_P(MemoryAllocatorTest, singleAllocation) { + if (!useMmap_ && enableReservation_) { + return; + } + gflags::FlagSaver flagSaver; + FLAGS_velox_time_allocations = true; const std::vector& sizes = instance_->sizeClasses(); MachinePageCount capacity = kCapacityPages; for (auto i = 0; i < sizes.size(); ++i) { @@ -683,6 +727,9 @@ TEST_P(MemoryAllocatorTest, increasingSize) { } TEST_P(MemoryAllocatorTest, increasingSizeWithThreads) { + if (!useMmap_ && enableReservation_) { + return; + } const int32_t numThreads = 20; std::vector>> allocations; allocations.reserve(numThreads); @@ -727,7 +774,7 @@ TEST_P(MemoryAllocatorTest, externalAdvise) { } constexpr int32_t kSmallSize = 16; constexpr int32_t kLargeSize = 32 * kSmallSize + 1; - auto instance = dynamic_cast(MemoryAllocator::getInstance()); + auto instance = dynamic_cast(instance_); std::vector> allocations; auto numAllocs = kCapacityPages / kSmallSize; allocations.reserve(numAllocs); @@ -776,8 +823,8 @@ TEST_P(MemoryAllocatorTest, nonContiguousFailure) { std::string debugString() const { return fmt::format( "numOldPages:{}, numNewPages:{}, injectedFailure:{}", - numOldPages, - numNewPages, + static_cast(numOldPages), + static_cast(numNewPages), injectedFailure); } } testSettings[] = {// Cap failure injection. @@ -839,28 +886,49 @@ TEST_P(MemoryAllocatorTest, nonContiguousFailure) { Allocation::PageRun::kMaxPagesInRun, MemoryAllocator::InjectedFailure::kMadvise}, {200, 100, MemoryAllocator::InjectedFailure::kMadvise}}; + std::unordered_map + expectedErrorMsg = { + {MemoryAllocator::InjectedFailure::kAllocate, + "Malloc failed to allocate"}, + {MemoryAllocator::InjectedFailure::kCap, + "Exceeded memory allocator limit"}}; + if (useMmap_) { + expectedErrorMsg = { + {MemoryAllocator::InjectedFailure::kCap, + "Exceeded memory allocator limit"}, + {MemoryAllocator::InjectedFailure::kMadvise, + "Could not advise away enough"}, + {MemoryAllocator::InjectedFailure::kAllocate, + "Failed allocation in size class"}}; + } + // Some error messages are only set when a reservationCB is provided + auto dummyReservationCB = [](int64_t /*bytes*/, bool /*preAllocation*/) {}; for (const auto& testData : testSettings) { SCOPED_TRACE( fmt::format("{}, useMmap:{}", testData.debugString(), useMmap_)); - if ((testData.injectedFailure != - MemoryAllocator::InjectedFailure::kAllocate) && + if ((testData.injectedFailure == + MemoryAllocator::InjectedFailure::kMadvise) && !useMmap_) { - // Non-Allocate failure injection only applies for MmapAllocator. + // Madvise failure injection only applies for MmapAllocator. 
continue; } setupAllocator(); Allocation allocation; if (testData.numOldPages > 0) { - allocator_->allocateNonContiguous(testData.numOldPages, allocation); + instance_->allocateNonContiguous(testData.numOldPages, allocation); } ASSERT_GE(allocation.numPages(), testData.numOldPages); - allocator_->testingSetFailureInjection(testData.injectedFailure, true); - ASSERT_FALSE( - allocator_->allocateNonContiguous(testData.numNewPages, allocation)); - ASSERT_EQ(allocator_->numAllocated(), 0); - allocator_->testingClearFailureInjection(); + instance_->testingSetFailureInjection(testData.injectedFailure, true); + ASSERT_FALSE(instance_->allocateNonContiguous( + testData.numNewPages, allocation, dummyReservationCB)); + auto failureMsg = instance_->getAndClearFailureMessage(); + EXPECT_THAT( + failureMsg, + testing::HasSubstr(expectedErrorMsg[testData.injectedFailure])); + ASSERT_EQ(instance_->numAllocated(), 0); + instance_->testingClearFailureInjection(); } - ASSERT_TRUE(allocator_->checkConsistency()); + ASSERT_TRUE(instance_->checkConsistency()); } TEST_P(MemoryAllocatorTest, allocContiguous) { @@ -914,7 +982,7 @@ TEST_P(MemoryAllocatorTest, allocContiguous) { } else { ASSERT_EQ(instance_->numMapped(), testData.newContiguousPages); } - auto mappedAllocator = dynamic_cast(allocator_.get()); + auto mappedAllocator = dynamic_cast(instance_); ASSERT_EQ( mappedAllocator->numExternalMapped(), testData.newContiguousPages); } else { @@ -930,7 +998,7 @@ TEST_P(MemoryAllocatorTest, allocContiguous) { } else { ASSERT_EQ(instance_->numMapped(), 0); } - auto mappedAllocator = dynamic_cast(allocator_.get()); + auto mappedAllocator = dynamic_cast(instance_); ASSERT_EQ(mappedAllocator->numExternalMapped(), 0); } else { ASSERT_EQ(instance_->numMapped(), 0); @@ -976,9 +1044,16 @@ TEST_P(MemoryAllocatorTest, allocContiguousFail) { {100, 0, 100, MemoryAllocator::InjectedFailure::kMadvise}, {200, 0, 100, MemoryAllocator::InjectedFailure::kMadvise}, {100, 0, 200, MemoryAllocator::InjectedFailure::kMadvise}}; + + std::unordered_map + expectedErrorMsg = { + {MemoryAllocator::InjectedFailure::kCap, + "Exceeded memory allocator limit"}, + {MemoryAllocator::InjectedFailure::kMmap, "Mmap failed with"}, + {MemoryAllocator::InjectedFailure::kMadvise, + "Could not advise away enough"}}; for (const auto& testData : testSettings) { - if ((testData.injectedFailure != - MemoryAllocator::InjectedFailure::kAllocate) && + if ((testData.injectedFailure != MemoryAllocator::InjectedFailure::kCap) && !useMmap_) { continue; } @@ -1003,13 +1078,17 @@ TEST_P(MemoryAllocatorTest, allocContiguousFail) { ASSERT_FALSE(instance_->allocateContiguous( testData.newContiguousPages, &allocation, contiguousAllocation)); + auto failureMsg = instance_->getAndClearFailureMessage(); + EXPECT_THAT( + failureMsg, + testing::HasSubstr(expectedErrorMsg[testData.injectedFailure])); ASSERT_EQ(instance_->numAllocated(), 0); if (useMmap_) { // Mmap allocator doesn't free mapped pages count for the old // non-contiguous allocation. 
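The comment above is the key to the mapped-page assertions that follow: under `MmapAllocator`, freeing returns pages to the free list but leaves them mmapped for reuse. A minimal sketch of that accounting, assuming a `MemoryAllocator*` named `instance` and using only calls from this diff:

```
// Freed pages leave the allocated count but stay mapped until advised away.
Allocation allocation;
instance->allocateNonContiguous(64, allocation);
const auto mappedAfterAlloc = instance->numMapped();
instance->freeNonContiguous(allocation);
EXPECT_EQ(instance->numAllocated(), 0);
EXPECT_EQ(instance->numMapped(), mappedAfterAlloc);  // Unchanged on free.
```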
ASSERT_EQ(instance_->numMapped(), testData.nonContiguousPages); - auto mappedAllocator = dynamic_cast(allocator_.get()); + auto mappedAllocator = dynamic_cast(instance_); ASSERT_EQ(mappedAllocator->numExternalMapped(), 0); } else { ASSERT_EQ(instance_->numMapped(), 0); @@ -1047,7 +1126,19 @@ TEST_P(MemoryAllocatorTest, allocContiguousGrow) { EXPECT_TRUE(instance_->allocateContiguous( kInitialLarge, nullptr, large, nullptr, kCapacityPages)); EXPECT_FALSE(instance_->growContiguous(kMinGrow, large)); + auto failureMsg = instance_->getAndClearFailureMessage(); + auto expected = "Exceeded memory allocator limit"; + EXPECT_THAT(failureMsg, testing::HasSubstr(expected)); freeSmall(kMinGrow); + if (useMmap_) { + // Also test mmap failure path + instance_->testingSetFailureInjection( + MemoryAllocator::InjectedFailure::kMmap, false); + EXPECT_FALSE(instance_->growContiguous(kMinGrow, large)); + failureMsg = instance_->getAndClearFailureMessage(); + expected = "Could not advise away enough"; + EXPECT_THAT(failureMsg, testing::HasSubstr(expected)); + } EXPECT_TRUE(instance_->growContiguous(kMinGrow, large)); EXPECT_EQ(instance_->numAllocated(), kCapacityPages); freeSmall(4 * kMinGrow); @@ -1304,16 +1395,18 @@ TEST_P(MemoryAllocatorTest, StlMemoryAllocator) { } } -TEST_P(MemoryAllocatorTest, badNonContiguousAllocation) { +TEST_P(MemoryAllocatorTest, nonContiguousAllocationBounds) { // Set the num of pages to allocate exceeds one PageRun limit. constexpr MachinePageCount kNumPages = Allocation::PageRun::kMaxPagesInRun + 1; std::unique_ptr allocation(new Allocation()); - ASSERT_THROW( - instance_->allocateNonContiguous(kNumPages, *allocation), - VeloxRuntimeError); + ASSERT_TRUE(instance_->allocateNonContiguous(kNumPages, *allocation)); + instance_->freeNonContiguous(*allocation); ASSERT_TRUE(instance_->allocateNonContiguous(kNumPages - 1, *allocation)); instance_->freeNonContiguous(*allocation); + ASSERT_TRUE(instance_->allocateNonContiguous( + Allocation::PageRun::kMaxPagesInRun * 2, *allocation)); + instance_->freeNonContiguous(*allocation); } TEST_P(MemoryAllocatorTest, contiguousAllocation) { @@ -1354,7 +1447,7 @@ TEST_P(MemoryAllocatorTest, contiguousAllocation) { ASSERT_EQ(movedAllocation.pool(), pool_.get()); *allocation = std::move(movedAllocation); ASSERT_TRUE(!allocation->empty()); // NOLINT - ASSERT_TRUE(movedAllocation.empty()); + ASSERT_TRUE(movedAllocation.empty()); // NOLINT ASSERT_EQ(allocation->pool(), pool_.get()); } ASSERT_THROW(allocation->setPool(pool_.get()), VeloxRuntimeError); @@ -1379,7 +1472,13 @@ TEST_P(MemoryAllocatorTest, allocatorCapacity) { EXPECT_NE(nullptr, preExistingBuf); EXPECT_EQ(nullptr, instance_->allocateBytes(allocationBytes)); + EXPECT_THAT( + instance_->getAndClearFailureMessage(), + testing::HasSubstr("Exceeded memory allocator limit")); EXPECT_EQ(nullptr, instance_->allocateZeroFilled(allocationBytes)); + EXPECT_THAT( + instance_->getAndClearFailureMessage(), + testing::HasSubstr("Exceeded memory allocator limit")); Allocation small; if (allocationBytes <= Allocation::PageRun::kMaxPagesInRun) { EXPECT_FALSE(instance_->allocateNonContiguous(allocationBytes, small)); @@ -1398,10 +1497,8 @@ TEST_P(MemoryAllocatorTest, allocatorCapacity) { TEST_P(MemoryAllocatorTest, allocatorCapacityWithThreads) { std::atomic numOps{0}; const int64_t numMaxOps = 100000; - // We need large enough (at least close to capacity) allocations to breach the - // capacity limit in this test. 
- EXPECT_GT(Allocation::PageRun::kMaxPagesInRun, kCapacityPages / 4 * 3); - const int64_t nonContAllocPages = Allocation::PageRun::kMaxPagesInRun; + const int64_t nonContAllocPages = + Allocation::PageRun::kMaxPagesInRun / 256 * 256; std::function nonContiguousReserveFail = [&, this]() { while (numOps < numMaxOps) { @@ -1512,11 +1609,6 @@ TEST_P(MemoryAllocatorTest, allocatorCapacityWithThreads) { EXPECT_EQ(instance_->numAllocated(), 0); } -VELOX_INSTANTIATE_TEST_SUITE_P( - MemoryAllocatorTests, - MemoryAllocatorTest, - testing::ValuesIn({false, true})); - class MmapArenaTest : public testing::Test { public: // 32 MB arena space @@ -1808,4 +1900,92 @@ TEST_F(MmapArenaTest, managedMmapArenasFreeError) { ASSERT_ANY_THROW(managedArenas->free(alloc2, kArenaCapacityBytes)); } } + +TEST_P(MemoryAllocatorTest, unmap) { + const int smallAllocationSize = 1024; + const int largeAllocationSize = 8192; + const int numAllocations = 10; + std::vector smallBuffers; + std::vector largeBuffers; + for (int i = 0; i < numAllocations; ++i) { + smallBuffers.push_back(instance_->allocateBytes(smallAllocationSize)); + largeBuffers.push_back(instance_->allocateBytes(largeAllocationSize)); + } + const auto numAllocated = instance_->numAllocated(); + if (useMmap_) { + ASSERT_EQ(instance_->numMapped(), numAllocated); + } else { + ASSERT_EQ(instance_->numAllocated(), 0); + ASSERT_EQ(instance_->numMapped(), 0); + } + // Nothing can be unmapped. + ASSERT_EQ(instance_->unmap(numAllocated), 0); + for (const auto& smallBuffer : smallBuffers) { + instance_->freeBytes(smallBuffer, smallAllocationSize); + } + for (const auto& largeBuffer : largeBuffers) { + instance_->freeBytes(largeBuffer, largeAllocationSize); + } + ASSERT_EQ(instance_->numAllocated(), 0); + if (useMmap_) { + ASSERT_EQ(instance_->numMapped(), numAllocated); + ASSERT_EQ(instance_->unmap(numAllocated / 2), numAllocated / 2); + ASSERT_GT(instance_->unmap(numAllocated), 0); + ASSERT_EQ(instance_->numMapped(), 0); + } else { + ASSERT_EQ(instance_->numMapped(), 0); + ASSERT_EQ(instance_->unmap(numAllocated), 0); + } +} + +VELOX_INSTANTIATE_TEST_SUITE_P( + MemoryAllocatorTestSuite, + MemoryAllocatorTest, + testing::ValuesIn({0, 1, 2})); + +class MmapConfigTest : public testing::Test { + public: + protected: + void setupAllocator() { + constexpr int64_t kCapacityBytes = 900LL << 20; // 900MB. + MemoryManagerOptions options; + options.useMmapAllocator = true; + options.allocatorCapacity = kCapacityBytes; + options.largestSizeClassPages = 4096; + options.arbitratorCapacity = kCapacityBytes; + options.arbitratorReservedCapacity = 128 << 20; + options.memoryPoolReservedCapacity = 1 << 20; + options.smallAllocationReservePct = 4; + options.maxMallocBytes = 3 * 1024; + memoryManager_ = std::make_unique(options); + allocator_ = memoryManager_->allocator(); + ASSERT_EQ( + AllocationTraits::numPages(memoryManager_->allocator()->capacity()), + bits::roundUp( + kCapacityBytes * (100 - options.smallAllocationReservePct) / 100 / + AllocationTraits::kPageSize, + 64 * allocator_->sizeClasses().back())); + } + + std::unique_ptr memoryManager_; + MemoryAllocator* allocator_; +}; + +TEST_F(MmapConfigTest, sizeClasses) { + setupAllocator(); + Allocation result; + ASSERT_TRUE( + allocator_->allocateNonContiguous(2 * 4096 - 1, result, nullptr, 0)); + auto g = folly::makeGuard([&]() { allocator_->freeNonContiguous(result); }); + // Check that the allocation has one page of each size class, largest to + // smallest. 
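The run counts asserted below are pure arithmetic: 2 * 4096 - 1 = 8191 pages is exactly the sum of one run per size class from 4096 pages down to 1, i.e. 13 runs. A quick standalone check of that decomposition:

```
#include <cassert>
int main() {
  int pages = 0;
  int runs = 0;
  for (int run = 4096; run >= 1; run /= 2) {
    pages += run;  // 4096 + 2048 + ... + 1 = 8191.
    ++runs;
  }
  assert(pages == 2 * 4096 - 1);
  assert(runs == 13);
  return 0;
}
```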
+ EXPECT_EQ(4096 * 2 - 1, result.numPages()); + EXPECT_EQ(13, result.numRuns()); + int32_t runPages = 4096; + for (auto i = 0; i < result.numRuns(); ++i) { + EXPECT_EQ(runPages, result.runAt(i).numPages()); + runPages = runPages / 2; + } +} + } // namespace facebook::velox::memory diff --git a/velox/common/memory/tests/MemoryArbitratorTest.cpp b/velox/common/memory/tests/MemoryArbitratorTest.cpp index 9913ed7298c9d..848ab33f79603 100644 --- a/velox/common/memory/tests/MemoryArbitratorTest.cpp +++ b/velox/common/memory/tests/MemoryArbitratorTest.cpp @@ -32,7 +32,20 @@ constexpr int64_t GB = 1024L * MB; namespace facebook::velox::memory { -class MemoryArbitrationTest : public testing::Test {}; +class MemoryArbitrationTest : public testing::Test { + protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + + void SetUp() { + SharedArbitrator::registerFactory(); + } + + void TearDown() { + SharedArbitrator::unregisterFactory(); + } +}; TEST_F(MemoryArbitrationTest, stats) { MemoryArbitrator::Stats stats; @@ -43,11 +56,24 @@ TEST_F(MemoryArbitrationTest, stats) { stats.arbitrationTimeUs = 1020; stats.numShrunkBytes = 100'000'000; stats.numReclaimedBytes = 10'000; + stats.freeReservedCapacityBytes = 1000; + stats.freeCapacityBytes = 2000; stats.reclaimTimeUs = 1'000; stats.numNonReclaimableAttempts = 5; ASSERT_EQ( stats.toString(), - "STATS[numRequests 2 numSucceeded 0 numAborted 3 numFailures 100 numNonReclaimableAttempts 5 queueTime 230.00ms arbitrationTime 1.02ms reclaimTime 1.00ms shrunkMemory 95.37MB reclaimedMemory 9.77KB maxCapacity 0B freeCapacity 0B]"); + "STATS[numRequests 2 numAborted 3 numFailures 100 " + "numNonReclaimableAttempts 5 numShrinks 0 " + "queueTime 230.00ms arbitrationTime 1.02ms reclaimTime 1.00ms " + "shrunkMemory 95.37MB reclaimedMemory 9.77KB " + "maxCapacity 0B freeCapacity 1.95KB freeReservedCapacity 1000B]"); + ASSERT_EQ( + fmt::format("{}", stats), + "STATS[numRequests 2 numAborted 3 numFailures 100 " + "numNonReclaimableAttempts 5 numShrinks 0 " + "queueTime 230.00ms arbitrationTime 1.02ms reclaimTime 1.00ms " + "shrunkMemory 95.37MB reclaimedMemory 9.77KB " + "maxCapacity 0B freeCapacity 1.95KB freeReservedCapacity 1000B]"); } TEST_F(MemoryArbitrationTest, create) { @@ -58,7 +84,7 @@ TEST_F(MemoryArbitrationTest, create) { }; for (const auto& kind : kinds) { MemoryArbitrator::Config config; - config.capacity = 1 * GB; + config.capacity = 8 * GB; config.kind = kind; if (kind.empty()) { auto arbitrator = MemoryArbitrator::create(config); @@ -75,7 +101,7 @@ TEST_F(MemoryArbitrationTest, create) { TEST_F(MemoryArbitrationTest, createWithDefaultConf) { MemoryArbitrator::Config config; - config.capacity = 1 * GB; + config.capacity = 8 * GB; const auto& arbitrator = MemoryArbitrator::create(config); ASSERT_EQ(arbitrator->kind(), "NOOP"); } @@ -83,11 +109,11 @@ TEST_F(MemoryArbitrationTest, createWithDefaultConf) { TEST_F(MemoryArbitrationTest, queryMemoryCapacity) { { // Reserved memory is not enforced when no arbitrator is provided. 
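For orientation, the capacity knobs this diff threads through the arbitration tests are collected below. The field names are the ones used in the diff; the values are illustrative only. The reserved slice is honored only when an arbitrator kind is configured, which is the contrast the test below draws.

```
MemoryManagerOptions options;
options.allocatorCapacity = 8L << 20;           // Backing allocator budget.
options.arbitratorCapacity = 4L << 20;          // Capacity under arbitration.
options.arbitratorReservedCapacity = 2L << 20;  // Held back for pool minimums.
options.memoryPoolInitCapacity = 1 << 20;       // Granted to a pool at creation.
options.memoryPoolReservedCapacity = 1 << 20;   // Per-pool reserved floor.
// options.arbitratorKind = "SHARED";           // Enables enforcement.
MemoryManager manager(options);
```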
- auto allocator = std::make_shared(8L << 20); - MemoryManager manager{ - {.capacity = (int64_t)allocator->capacity(), - .queryMemoryCapacity = 4L << 20, - .allocator = allocator.get()}}; + MemoryManagerOptions options; + options.allocatorCapacity = 8L << 20; + options.arbitratorCapacity = 4L << 20; + options.arbitratorReservedCapacity = 2L << 20; + MemoryManager manager(options); auto rootPool = manager.addRootPool("root-1", 8L << 20); auto leafPool = rootPool->addLeafChild("leaf-1.0"); void* buffer; @@ -98,32 +124,175 @@ TEST_F(MemoryArbitrationTest, queryMemoryCapacity) { } { // Reserved memory is enforced when SharedMemoryArbitrator is used. - SharedArbitrator::registerFactory(); - auto allocator = std::make_shared(8L << 20); - MemoryManager manager{ - {.capacity = (int64_t)allocator->capacity(), - .queryMemoryCapacity = 4L << 20, - .allocator = allocator.get(), - .arbitratorKind = "SHARED"}}; + MemoryManagerOptions options; + options.allocatorCapacity = 16L << 20; + options.arbitratorCapacity = 6L << 20; + options.arbitratorReservedCapacity = 2L << 20; + options.arbitratorKind = "SHARED"; + options.memoryPoolInitCapacity = 1 << 20; + options.memoryPoolReservedCapacity = 1 << 20; + MemoryManager manager(options); auto rootPool = manager.addRootPool("root-1", 8L << 20, MemoryReclaimer::create()); + ASSERT_EQ(rootPool->capacity(), 1 << 20); + ASSERT_TRUE(manager.arbitrator()->growCapacity(rootPool.get(), 1 << 20)); + ASSERT_EQ(rootPool->capacity(), 1 << 20); + ASSERT_FALSE(manager.arbitrator()->growCapacity(rootPool.get(), 6 << 20)); + ASSERT_EQ(rootPool->capacity(), 1 << 20); + ASSERT_TRUE(manager.arbitrator()->growCapacity(rootPool.get(), 2 << 20)); + ASSERT_EQ(rootPool->capacity(), 4 << 20); + ASSERT_EQ(manager.arbitrator()->stats().freeCapacityBytes, 2 << 20); + ASSERT_EQ(manager.arbitrator()->stats().freeReservedCapacityBytes, 2 << 20); + auto leafPool = rootPool->addLeafChild("leaf-1.0"); - void* buffer; VELOX_ASSERT_THROW( - buffer = leafPool->allocate(7L << 20), - "Exceeded memory pool cap of 4.00MB"); - ASSERT_NO_THROW(buffer = leafPool->allocate(4L << 20)); - leafPool->free(buffer, 4L << 20); - SharedArbitrator::unregisterFactory(); + leafPool->allocate(7L << 20), + "Exceeded memory pool capacity after attempt to grow capacity through " + "arbitration. 
Requestor pool name 'leaf-1.0', request size 7.00MB, " + "memory pool capacity 4.00MB, memory pool max capacity 8.00MB"); + ASSERT_EQ(manager.arbitrator()->shrinkCapacity(rootPool.get(), 0), 1 << 20); + ASSERT_EQ(manager.arbitrator()->shrinkCapacity(leafPool.get(), 0), 0); + ASSERT_EQ(manager.arbitrator()->shrinkCapacity(leafPool.get(), 1), 0); + ASSERT_EQ(manager.arbitrator()->shrinkCapacity(rootPool.get(), 1), 0); + ASSERT_EQ(rootPool->capacity(), 3 << 20); + static_cast(rootPool.get())->testingSetReservation(0); + ASSERT_EQ( + manager.arbitrator()->shrinkCapacity(leafPool.get(), 1 << 20), 1 << 20); + ASSERT_EQ( + manager.arbitrator()->shrinkCapacity(rootPool.get(), 1 << 20), 1 << 20); + ASSERT_EQ(rootPool->capacity(), 1 << 20); + ASSERT_EQ(leafPool->capacity(), 1 << 20); + ASSERT_EQ(manager.arbitrator()->shrinkCapacity(leafPool.get(), 0), 1 << 20); + ASSERT_EQ(rootPool->capacity(), 0); + ASSERT_EQ(leafPool->capacity(), 0); + } +} + +TEST_F(MemoryArbitrationTest, memoryPoolCapacityOnCreation) { + struct { + uint64_t freeNonReservedCapacity; + uint64_t freeReservedCapacity; + uint64_t poolMaxCapacity; + uint64_t poolInitCapacity; + uint64_t poolReservedCapacity; + uint64_t expectedPoolCapacityOnCreation; + + std::string debugString() const { + return fmt::format( + "freeNonReservedCapacity {} freeReservedCapacity {} poolMaxCapacity {} poolInitCapacity {} poolReservedCapacity {} expectedPoolCapacityOnCreation {}", + freeNonReservedCapacity, + freeReservedCapacity, + poolMaxCapacity, + poolInitCapacity, + poolReservedCapacity, + expectedPoolCapacityOnCreation); + } + } testSettings[] = { + {1 << 20, 3 << 20, kMaxMemory, 3 << 20, 1 << 20, 1 << 20}, + {1 << 20, 3 << 20, kMaxMemory, 1 << 20, 1 << 20, 1 << 20}, + {1 << 20, 3 << 20, kMaxMemory, 8 << 20, 1 << 20, 1 << 20}, + {0 << 20, 3 << 20, kMaxMemory, 1 << 20, 1 << 20, 1 << 20}, + {0 << 20, 3 << 20, kMaxMemory, 2 << 20, 1 << 20, 1 << 20}, + {0 << 20, 3 << 20, kMaxMemory, 8 << 20, 1 << 20, 1 << 20}, + {0 << 20, 3 << 20, kMaxMemory, 8 << 20, 0 << 20, 0}, + {1 << 20, 3 << 20, kMaxMemory, 3 << 20, 2 << 20, 2 << 20}, + {1 << 20, 3 << 20, kMaxMemory, 3 << 20, 3 << 20, 3 << 20}, + {1 << 20, 3 << 20, kMaxMemory, 3 << 20, 4 << 20, 4 << 20}, + {1 << 20, 3 << 20, kMaxMemory, 3 << 20, 5 << 20, 4 << 20}, + {1 << 20, 3 << 20, 3 << 20, 3 << 20, 5 << 20, 4 << 20}, + {1 << 20, 3 << 20, 3 << 20, 3 << 20, 1 << 20, 1 << 20}, + {1 << 20, 3 << 20, 3 << 20, 3 << 20, 2 << 20, 2 << 20}, + {1 << 20, 3 << 20, 3 << 20, 1 << 20, 2 << 20, 2 << 20}}; + + for (const auto& testData : testSettings) { + SCOPED_TRACE(testData.debugString()); + + MemoryManagerOptions options; + options.arbitratorKind = "SHARED"; + options.arbitratorReservedCapacity = testData.freeReservedCapacity; + options.arbitratorCapacity = + testData.freeReservedCapacity + testData.freeNonReservedCapacity; + options.allocatorCapacity = options.arbitratorCapacity; + options.memoryPoolInitCapacity = testData.poolInitCapacity; + options.memoryPoolReservedCapacity = testData.poolReservedCapacity; + + MemoryManager manager(options); + auto rootPool = manager.addRootPool("root-1", kMaxMemory); + ASSERT_EQ(rootPool->capacity(), testData.expectedPoolCapacityOnCreation); } } +TEST_F(MemoryArbitrationTest, reservedCapacityFreeByPoolRelease) { + MemoryManagerOptions options; + options.arbitratorKind = "SHARED"; + options.arbitratorReservedCapacity = 4 << 20; + options.arbitratorCapacity = 9 << 20; + options.allocatorCapacity = options.arbitratorCapacity; + options.memoryPoolInitCapacity = 3 << 20; + 
options.memoryPoolReservedCapacity = 1 << 20; + + MemoryManager manager(options); + auto* arbitrator = manager.arbitrator(); + auto pool1 = manager.addRootPool("root-1", kMaxMemory); + ASSERT_EQ(pool1->capacity(), 3 << 20); + ASSERT_EQ(arbitrator->stats().freeReservedCapacityBytes, 4 << 20); + ASSERT_EQ(arbitrator->stats().freeCapacityBytes, 6 << 20); + + auto pool2 = manager.addRootPool("root-2", kMaxMemory); + ASSERT_EQ(pool2->capacity(), 2 << 20); + ASSERT_EQ(arbitrator->stats().freeReservedCapacityBytes, 4 << 20); + ASSERT_EQ(arbitrator->stats().freeCapacityBytes, 4 << 20); + + auto pool3 = manager.addRootPool("root-3", kMaxMemory); + ASSERT_EQ(pool3->capacity(), 1 << 20); + ASSERT_EQ(arbitrator->stats().freeReservedCapacityBytes, 3 << 20); + ASSERT_EQ(arbitrator->stats().freeCapacityBytes, 3 << 20); + + auto pool4 = manager.addRootPool("root-4", kMaxMemory); + ASSERT_EQ(pool4->capacity(), 1 << 20); + ASSERT_EQ(arbitrator->stats().freeReservedCapacityBytes, 2 << 20); + ASSERT_EQ(arbitrator->stats().freeCapacityBytes, 2 << 20); + + auto pool5 = manager.addRootPool("root-5", kMaxMemory); + ASSERT_EQ(pool4->capacity(), 1 << 20); + ASSERT_EQ(arbitrator->stats().freeReservedCapacityBytes, 1 << 20); + ASSERT_EQ(arbitrator->stats().freeCapacityBytes, 1 << 20); + + auto pool6 = manager.addRootPool("root-6", kMaxMemory); + ASSERT_EQ(pool4->capacity(), 1 << 20); + ASSERT_EQ(arbitrator->stats().freeReservedCapacityBytes, 0 << 20); + ASSERT_EQ(arbitrator->stats().freeCapacityBytes, 0 << 20); + + auto pool7 = manager.addRootPool("root-7", kMaxMemory); + ASSERT_EQ(pool7->capacity(), 0); + ASSERT_EQ(arbitrator->stats().freeReservedCapacityBytes, 0 << 20); + ASSERT_EQ(arbitrator->stats().freeCapacityBytes, 0 << 20); + + pool7.reset(); + ASSERT_EQ(arbitrator->stats().freeReservedCapacityBytes, 0 << 20); + ASSERT_EQ(arbitrator->stats().freeCapacityBytes, 0 << 20); + + pool6.reset(); + ASSERT_EQ(arbitrator->stats().freeReservedCapacityBytes, 1 << 20); + ASSERT_EQ(arbitrator->stats().freeCapacityBytes, 1 << 20); + + pool1.reset(); + ASSERT_EQ(arbitrator->stats().freeReservedCapacityBytes, 4 << 20); + ASSERT_EQ(arbitrator->stats().freeCapacityBytes, 4 << 20); + + pool2.reset(); + ASSERT_EQ(arbitrator->stats().freeReservedCapacityBytes, 4 << 20); + ASSERT_EQ(arbitrator->stats().freeCapacityBytes, 6 << 20); +} + TEST_F(MemoryArbitrationTest, arbitratorStats) { const MemoryArbitrator::Stats emptyStats; ASSERT_TRUE(emptyStats.empty()); - const MemoryArbitrator::Stats anchorStats(5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5); + const MemoryArbitrator::Stats anchorStats( + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5); ASSERT_FALSE(anchorStats.empty()); - const MemoryArbitrator::Stats largeStats(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8); + const MemoryArbitrator::Stats largeStats( + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8); ASSERT_FALSE(largeStats.empty()); ASSERT_TRUE(!(anchorStats == largeStats)); ASSERT_TRUE(anchorStats != largeStats); @@ -132,9 +301,11 @@ TEST_F(MemoryArbitrationTest, arbitratorStats) { ASSERT_TRUE(anchorStats <= largeStats); ASSERT_TRUE(!(anchorStats >= largeStats)); const auto delta = largeStats - anchorStats; - ASSERT_EQ(delta, MemoryArbitrator::Stats(3, 3, 3, 3, 3, 3, 3, 3, 8, 8, 3, 3)); + ASSERT_EQ( + delta, MemoryArbitrator::Stats(3, 3, 3, 3, 3, 3, 3, 3, 8, 8, 8, 3, 3, 3)); - const MemoryArbitrator::Stats smallStats(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + const MemoryArbitrator::Stats smallStats( + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); ASSERT_TRUE(!(anchorStats == smallStats)); 
ASSERT_TRUE(anchorStats != smallStats); ASSERT_TRUE(!(anchorStats < smallStats)); @@ -143,13 +314,13 @@ TEST_F(MemoryArbitrationTest, arbitratorStats) { ASSERT_TRUE(anchorStats >= smallStats); const MemoryArbitrator::Stats invalidStats( - 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 2); + 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 2, 8, 2); ASSERT_TRUE(!(anchorStats == invalidStats)); ASSERT_TRUE(anchorStats != invalidStats); - ASSERT_THROW(anchorStats < invalidStats, VeloxException); - ASSERT_THROW(anchorStats > invalidStats, VeloxException); - ASSERT_THROW(anchorStats <= invalidStats, VeloxException); - ASSERT_THROW(anchorStats >= invalidStats, VeloxException); + VELOX_ASSERT_THROW(anchorStats < invalidStats, ""); + VELOX_ASSERT_THROW(anchorStats > invalidStats, ""); + VELOX_ASSERT_THROW(anchorStats <= invalidStats, ""); + VELOX_ASSERT_THROW(anchorStats >= invalidStats, ""); } namespace { @@ -159,32 +330,27 @@ class FakeTestArbitrator : public MemoryArbitrator { : MemoryArbitrator( {.kind = config.kind, .capacity = config.capacity, - .memoryPoolInitCapacity = config.memoryPoolInitCapacity, - .memoryPoolTransferCapacity = config.memoryPoolTransferCapacity}) { - } + .extraConfigs = config.extraConfigs}) {} - void reserveMemory(MemoryPool* pool, uint64_t bytes) override { - VELOX_NYI(); + std::string kind() const override { + return "USER"; } - void releaseMemory(MemoryPool* pool) override { - VELOX_NYI(); - } + void addPool(const std::shared_ptr& /*unused*/) override {} - std::string kind() const override { - return "USER"; + void removePool(MemoryPool* /*unused*/) override {} + + bool growCapacity(MemoryPool* /*unused*/, uint64_t /*unused*/) override { + VELOX_NYI(); } - bool growMemory( - MemoryPool* pool, - const std::vector>& candidatePools, - uint64_t targetBytes) override { + uint64_t shrinkCapacity(uint64_t /*unused*/, bool /*unused*/, bool /*unused*/) + override { VELOX_NYI(); } - uint64_t shrinkMemory( - const std::vector>& pools, - uint64_t targetBytes) override { + uint64_t shrinkCapacity(MemoryPool* /*unused*/, uint64_t /*unused*/) + override { VELOX_NYI(); } @@ -203,6 +369,7 @@ class MemoryArbitratorFactoryTest : public testing::Test { protected: static void SetUpTestCase() { MemoryArbitrator::registerFactory(kind_, factory_); + memory::MemoryManager::testingSetInstance({}); } static void TearDownTestCase() { @@ -220,7 +387,7 @@ class MemoryArbitratorFactoryTest : public testing::Test { TEST_F(MemoryArbitratorFactoryTest, register) { VELOX_ASSERT_THROW( MemoryArbitrator::registerFactory(kind_, factory_), - "Arbitrator factory for kind USER already registered"); + "Arbitrator factory for kind USER is already registered"); } TEST_F(MemoryArbitratorFactoryTest, create) { @@ -232,6 +399,10 @@ TEST_F(MemoryArbitratorFactoryTest, create) { class MemoryReclaimerTest : public testing::Test { protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + MemoryReclaimerTest() { const auto seed = std::chrono::system_clock::now().time_since_epoch().count(); @@ -279,7 +450,7 @@ TEST_F(MemoryReclaimerTest, common) { for (const auto& testData : testSettings) { SCOPED_TRACE(testData.debugString()); std::vector> pools; - auto pool = defaultMemoryManager().addRootPool( + auto pool = memory::memoryManager()->addRootPool( "shrinkAPIs", kMaxMemory, memory::MemoryReclaimer::create()); pools.push_back(pool); @@ -312,12 +483,13 @@ TEST_F(MemoryReclaimerTest, common) { } } for (auto& pool : pools) { - uint64_t reclaimableBytes; - ASSERT_FALSE(pool->reclaimableBytes(reclaimableBytes)); - 
ASSERT_EQ(reclaimableBytes, 0); - ASSERT_EQ(pool->reclaim(0, stats_), 0); - ASSERT_EQ(pool->reclaim(100, stats_), 0); - ASSERT_EQ(pool->reclaim(kMaxMemory, stats_), 0); + ASSERT_FALSE(pool->reclaimableBytes().has_value()); + ASSERT_EQ(pool->reclaim(0, 0, stats_), 0); + ASSERT_EQ(stats_, MemoryReclaimer::Stats{}); + ASSERT_EQ(pool->reclaim(100, 0, stats_), 0); + ASSERT_EQ(stats_, MemoryReclaimer::Stats{}); + ASSERT_EQ(pool->reclaim(kMaxMemory, 0, stats_), 0); + ASSERT_EQ(stats_, MemoryReclaimer::Stats{}); } ASSERT_EQ(stats_, MemoryReclaimer::Stats{}); for (const auto& allocation : allocations) { @@ -328,16 +500,37 @@ TEST_F(MemoryReclaimerTest, common) { class MockLeafMemoryReclaimer : public MemoryReclaimer { public: - explicit MockLeafMemoryReclaimer(std::atomic& totalUsedBytes) - : totalUsedBytes_(totalUsedBytes) {} + explicit MockLeafMemoryReclaimer( + std::atomic& totalUsedBytes, + bool reclaimable = true, + bool* underArbitration = nullptr) + : reclaimable_(reclaimable), + underArbitration_(underArbitration), + totalUsedBytes_(totalUsedBytes) {} ~MockLeafMemoryReclaimer() override { VELOX_CHECK(allocations_.empty()); } + virtual void enterArbitration() override { + if (underArbitration_ != nullptr) { + *underArbitration_ = true; + } + } + + virtual void leaveArbitration() noexcept override { + if (underArbitration_ != nullptr) { + *underArbitration_ = false; + } + } + bool reclaimableBytes(const MemoryPool& pool, uint64_t& bytes) const override { VELOX_CHECK_EQ(pool.name(), pool_->name()); + if (!reclaimable_) { + bytes = 0; + return false; + } bytes = reclaimableBytes(); return true; } @@ -345,7 +538,12 @@ class MockLeafMemoryReclaimer : public MemoryReclaimer { uint64_t reclaim( MemoryPool* /*unused*/, uint64_t targetBytes, + uint64_t /*unused*/, Stats& stats) noexcept override { + ++reclaimCount_; + if (!reclaimable_) { + return 0; + } std::lock_guard l(mu_); uint64_t reclaimedBytes{0}; while (!allocations_.empty() && @@ -354,6 +552,7 @@ class MockLeafMemoryReclaimer : public MemoryReclaimer { reclaimedBytes += allocations_.front().size; allocations_.pop_front(); } + stats.reclaimedBytes += reclaimedBytes; return reclaimedBytes; } @@ -369,6 +568,18 @@ class MockLeafMemoryReclaimer : public MemoryReclaimer { totalUsedBytes_ += size; } + int reclaimCount() const { + return reclaimCount_; + } + + void freeAll() { + std::lock_guard l(mu_); + while (!allocations_.empty()) { + free(allocations_.front()); + allocations_.pop_front(); + } + } + private: struct Allocation { void* buffer; @@ -378,10 +589,11 @@ class MockLeafMemoryReclaimer : public MemoryReclaimer { void free(const Allocation& allocation) { pool_->free(allocation.buffer, allocation.size); totalUsedBytes_ -= allocation.size; - VELOX_CHECK_GE(totalUsedBytes_, 0); + VELOX_CHECK_GE(static_cast(totalUsedBytes_), 0); } uint64_t reclaimableBytes() const { + VELOX_CHECK(reclaimable_); uint64_t sumBytes{0}; std::lock_guard l(mu_); for (const auto& allocation : allocations_) { @@ -390,7 +602,10 @@ class MockLeafMemoryReclaimer : public MemoryReclaimer { return sumBytes; } - std::atomic& totalUsedBytes_; + const bool reclaimable_{true}; + bool* const underArbitration_{nullptr}; + std::atomic_uint64_t& totalUsedBytes_; + std::atomic_int reclaimCount_{0}; mutable std::mutex mu_; MemoryPool* pool_{nullptr}; std::deque allocations_; @@ -402,7 +617,7 @@ TEST_F(MemoryReclaimerTest, mockReclaim) { const int numAllocationsPerLeaf = 10; const int allocBytes = 10; std::atomic totalUsedBytes{0}; - auto root = defaultMemoryManager().addRootPool( + 
auto root = memory::memoryManager()->addRootPool( "mockReclaim", kMaxMemory, MemoryReclaimer::create()); std::vector> childPools; for (int i = 0; i < numChildren; ++i) { @@ -427,26 +642,29 @@ TEST_F(MemoryReclaimerTest, mockReclaim) { ASSERT_EQ( numGrandchildren * numChildren * numAllocationsPerLeaf * allocBytes, totalUsedBytes); - uint64_t reclaimableBytes; - ASSERT_TRUE(root->reclaimableBytes(reclaimableBytes)); - ASSERT_EQ(reclaimableBytes, totalUsedBytes); + ASSERT_EQ(root->reclaimableBytes().value(), totalUsedBytes); const int numReclaims = 5; const int numBytesToReclaim = allocBytes * 3; for (int iter = 0; iter < numReclaims; ++iter) { - const auto reclaimedBytes = root->reclaim(numBytesToReclaim, stats_); + const auto reclaimedBytes = root->reclaim(numBytesToReclaim, 0, stats_); ASSERT_EQ(reclaimedBytes, numBytesToReclaim); - ASSERT_TRUE(root->reclaimableBytes(reclaimableBytes)); - ASSERT_EQ(reclaimableBytes, totalUsedBytes); + ASSERT_EQ(reclaimedBytes, stats_.reclaimedBytes); + ASSERT_EQ(root->reclaimableBytes().value(), totalUsedBytes); + stats_.reset(); } - ASSERT_TRUE(root->reclaimableBytes(reclaimableBytes)); - ASSERT_EQ(totalUsedBytes, reclaimableBytes); - ASSERT_EQ(root->reclaim(allocBytes + 1, stats_), 2 * allocBytes); - ASSERT_EQ(root->reclaim(allocBytes - 1, stats_), allocBytes); + + ASSERT_EQ(totalUsedBytes, root->reclaimableBytes().value()); + ASSERT_EQ(root->reclaim(allocBytes + 1, 0, stats_), 2 * allocBytes); + ASSERT_EQ(root->reclaim(allocBytes - 1, 0, stats_), allocBytes); + ASSERT_EQ(3 * allocBytes, stats_.reclaimedBytes); + const uint64_t expectedReclaimedBytes = totalUsedBytes; - ASSERT_EQ(root->reclaim(0, stats_), expectedReclaimedBytes); + ASSERT_EQ(root->reclaim(0, 0, stats_), expectedReclaimedBytes); + ASSERT_EQ(3 * allocBytes + expectedReclaimedBytes, stats_.reclaimedBytes); ASSERT_EQ(totalUsedBytes, 0); - ASSERT_TRUE(root->reclaimableBytes(reclaimableBytes)); - ASSERT_EQ(reclaimableBytes, 0); + ASSERT_EQ(root->reclaimableBytes().value(), 0); + + stats_.reset(); ASSERT_EQ(stats_, MemoryReclaimer::Stats{}); } @@ -455,7 +673,7 @@ TEST_F(MemoryReclaimerTest, mockReclaimMoreThanAvailable) { const int numAllocationsPerLeaf = 10; const int allocBytes = 100; std::atomic totalUsedBytes{0}; - auto root = defaultMemoryManager().addRootPool( + auto root = memory::memoryManager()->addRootPool( "mockReclaimMoreThanAvailable", kMaxMemory, MemoryReclaimer::create()); std::vector> childPools; for (int i = 0; i < numChildren; ++i) { @@ -473,18 +691,86 @@ TEST_F(MemoryReclaimerTest, mockReclaimMoreThanAvailable) { } } ASSERT_EQ(numChildren * numAllocationsPerLeaf * allocBytes, totalUsedBytes); - uint64_t reclaimableBytes; - ASSERT_TRUE(root->reclaimableBytes(reclaimableBytes)); + uint64_t reclaimableBytes = root->reclaimableBytes().value(); ASSERT_EQ(reclaimableBytes, totalUsedBytes); const uint64_t expectedReclaimedBytes = totalUsedBytes; ASSERT_EQ( - root->reclaim(totalUsedBytes + 100, stats_), expectedReclaimedBytes); + root->reclaim(totalUsedBytes + 100, 0, stats_), expectedReclaimedBytes); + ASSERT_EQ(expectedReclaimedBytes, stats_.reclaimedBytes); ASSERT_EQ(totalUsedBytes, 0); - ASSERT_TRUE(root->reclaimableBytes(reclaimableBytes)); + reclaimableBytes = root->reclaimableBytes().value(); ASSERT_EQ(reclaimableBytes, 0); + stats_.reset(); ASSERT_EQ(stats_, MemoryReclaimer::Stats{}); } +TEST_F(MemoryReclaimerTest, scopedReclaimedBytesRecorder) { + auto root = memory::memoryManager()->addRootPool( + "memoryReclaimRecorder", kMaxMemory, MemoryReclaimer::create()); + auto 
childPool = root->addLeafChild("memoryReclaimRecorder", true); + ASSERT_EQ(childPool->reservedBytes(), 0); + int64_t reclaimedBytes{0}; + { ScopedReclaimedBytesRecorder recorder(childPool.get(), &reclaimedBytes); } + ASSERT_EQ(reclaimedBytes, 0); + + void* buffer = childPool->allocate(1 << 20); + ASSERT_EQ(childPool->reservedBytes(), 1 << 20); + { ScopedReclaimedBytesRecorder recorder(childPool.get(), &reclaimedBytes); } + ASSERT_EQ(reclaimedBytes, 0); + + reclaimedBytes = 0; + { + ScopedReclaimedBytesRecorder recorder(childPool.get(), &reclaimedBytes); + childPool->free(buffer, 1 << 20); + } + ASSERT_EQ(reclaimedBytes, 1 << 20); + + childPool->maybeReserve(1 << 20); + buffer = childPool->allocate(1 << 20); + ASSERT_EQ(childPool->reservedBytes(), 8 << 20); + reclaimedBytes = 0; + { + ScopedReclaimedBytesRecorder recorder(childPool.get(), &reclaimedBytes); + childPool->free(buffer, 1 << 20); + } + ASSERT_EQ(reclaimedBytes, 0); + + { + ScopedReclaimedBytesRecorder recorder(childPool.get(), &reclaimedBytes); + childPool->release(); + } + ASSERT_EQ(reclaimedBytes, 8 << 20); + + // Bad state. + reclaimedBytes = 100; + VELOX_ASSERT_THROW( + ScopedReclaimedBytesRecorder(childPool.get(), &reclaimedBytes), + "(100 vs. 0)"); + + reclaimedBytes = 0; + buffer = childPool->allocate(1 << 20); + { + const std::string throwMsg("throw"); + try { + ScopedReclaimedBytesRecorder recorder(childPool.get(), &reclaimedBytes); + childPool->free(buffer, 1 << 20); + VELOX_FAIL(throwMsg); + } catch (const VeloxRuntimeError& ex) { + ASSERT_EQ(ex.message(), throwMsg); + } + } + ASSERT_EQ(reclaimedBytes, 0); + + // Negative reclaim. + reclaimedBytes = 0; + { + ScopedReclaimedBytesRecorder recorder(childPool.get(), &reclaimedBytes); + buffer = childPool->allocate(1 << 20); + } + ASSERT_EQ(reclaimedBytes, -(1 << 20)); + childPool->free(buffer, 1 << 20); +} + TEST_F(MemoryReclaimerTest, orderedReclaim) { // Set 1MB unit to avoid memory pool quantized reservation effect. const int allocUnitBytes = 1L << 20; @@ -493,7 +779,7 @@ TEST_F(MemoryReclaimerTest, orderedReclaim) { const std::vector initAllocUnitsVec = {10, 11, 8, 16, 5}; ASSERT_EQ(initAllocUnitsVec.size(), numChildren); std::atomic totalUsedBytes{0}; - auto root = defaultMemoryManager().addRootPool( + auto root = memory::memoryManager()->addRootPool( "orderedReclaim", kMaxMemory, MemoryReclaimer::create()); int totalAllocUnits{0}; std::vector> childPools; @@ -537,8 +823,9 @@ TEST_F(MemoryReclaimerTest, orderedReclaim) { // child. // So expected reclaimable allocation units are {10, 11, 8, *14*, 5} ASSERT_EQ( - root->reclaimer()->reclaim(root.get(), 2 * allocUnitBytes, stats_), + root->reclaimer()->reclaim(root.get(), 2 * allocUnitBytes, 0, stats_), 2 * allocUnitBytes); + ASSERT_EQ(2 * allocUnitBytes, stats_.reclaimedBytes); totalAllocUnits -= 2; verify({10, 11, 8, 14, 5}); @@ -546,8 +833,9 @@ TEST_F(MemoryReclaimerTest, orderedReclaim) { // child. // So expected reclaimable allocation units are {10, 11, 8, *12*, 5} ASSERT_EQ( - root->reclaimer()->reclaim(root.get(), 2 * allocUnitBytes, stats_), + root->reclaimer()->reclaim(root.get(), 2 * allocUnitBytes, 0, stats_), 2 * allocUnitBytes); + ASSERT_EQ(4 * allocUnitBytes, stats_.reclaimedBytes); totalAllocUnits -= 2; verify({10, 11, 8, 12, 5}); @@ -555,8 +843,9 @@ TEST_F(MemoryReclaimerTest, orderedReclaim) { // child. 
// So expected reclaimable allocation units are {10, 11, 8, *4*, 5} ASSERT_EQ( - root->reclaimer()->reclaim(root.get(), 8 * allocUnitBytes, stats_), + root->reclaimer()->reclaim(root.get(), 8 * allocUnitBytes, 0, stats_), 8 * allocUnitBytes); + ASSERT_EQ(12 * allocUnitBytes, stats_.reclaimedBytes); totalAllocUnits -= 8; verify({10, 11, 8, 4, 5}); @@ -564,7 +853,7 @@ TEST_F(MemoryReclaimerTest, orderedReclaim) { // child. // So expected reclaimable allocation gunits are {10, *9*, 8, 4, 5} ASSERT_EQ( - root->reclaimer()->reclaim(root.get(), 2 * allocUnitBytes, stats_), + root->reclaimer()->reclaim(root.get(), 2 * allocUnitBytes, 0, stats_), 2 * allocUnitBytes); totalAllocUnits -= 2; verify({10, 9, 8, 4, 5}); @@ -573,7 +862,7 @@ TEST_F(MemoryReclaimerTest, orderedReclaim) { // child. // So expected reclaimable allocation units are {*7*, 9, 8, 4, 5} ASSERT_EQ( - root->reclaimer()->reclaim(root.get(), 3 * allocUnitBytes, stats_), + root->reclaimer()->reclaim(root.get(), 3 * allocUnitBytes, 0, stats_), 3 * allocUnitBytes); totalAllocUnits -= 3; verify({7, 9, 8, 4, 5}); @@ -582,7 +871,7 @@ TEST_F(MemoryReclaimerTest, orderedReclaim) { // child and two from 2nd child. // So expected reclaimable allocation units are {7, *0*, *6*, 4, 5} ASSERT_EQ( - root->reclaimer()->reclaim(root.get(), 11 * allocUnitBytes, stats_), + root->reclaimer()->reclaim(root.get(), 11 * allocUnitBytes, 0, stats_), 11 * allocUnitBytes); totalAllocUnits -= 11; verify({7, 0, 6, 4, 5}); @@ -591,7 +880,7 @@ TEST_F(MemoryReclaimerTest, orderedReclaim) { // child and three from 2nd child. // So expected reclaimable allocation units are {*0*, 0, *3*, 4, 5} ASSERT_EQ( - root->reclaimer()->reclaim(root.get(), 10 * allocUnitBytes, stats_), + root->reclaimer()->reclaim(root.get(), 10 * allocUnitBytes, 0, stats_), 10 * allocUnitBytes); totalAllocUnits -= 10; verify({0, 0, 3, 4, 5}); @@ -600,7 +889,7 @@ TEST_F(MemoryReclaimerTest, orderedReclaim) { // child and 4 from 4th child and 1 from 2nd. // So expected reclaimable allocation units are {0, 0, 2, *0*, *0*} ASSERT_EQ( - root->reclaimer()->reclaim(root.get(), 10 * allocUnitBytes, stats_), + root->reclaimer()->reclaim(root.get(), 10 * allocUnitBytes, 0, stats_), 10 * allocUnitBytes); totalAllocUnits -= 10; verify({0, 0, 2, 0, 0}); @@ -609,15 +898,88 @@ TEST_F(MemoryReclaimerTest, orderedReclaim) { // cleared. ASSERT_EQ( root->reclaimer()->reclaim( - root.get(), totalAllocUnits * allocUnitBytes, stats_), + root.get(), totalAllocUnits * allocUnitBytes, 0, stats_), totalAllocUnits * allocUnitBytes); totalAllocUnits = 0; verify({0, 0, 0, 0, 0}); + stats_.reset(); ASSERT_EQ(stats_, MemoryReclaimer::Stats{}); } +TEST_F(MemoryReclaimerTest, skipNonReclaimableChild) { + // Set 1MB unit to avoid memory pool quantized reservation effect. + const int allocUnitBytes = 1L << 20; + const int numChildren = 5; + // The initial allocation units per each child pool. 
+  const std::vector<int> initAllocUnitsVec = {10, 11, 8, 16, 5};
+  const std::vector<bool> reclaimableChildVec = {
+      true, false, true, false, true};
+  ASSERT_EQ(initAllocUnitsVec.size(), numChildren);
+  std::atomic_uint64_t totalUsedBytes{0};
+  auto root = memory::memoryManager()->addRootPool(
+      "orderedReclaim", kMaxMemory, MemoryReclaimer::create());
+  int totalAllocUnits{0};
+  std::vector<std::shared_ptr<MemoryPool>> childPools;
+  for (int i = 0; i < numChildren; ++i) {
+    auto childPool = root->addLeafChild(
+        std::to_string(i),
+        true,
+        std::make_unique<MockLeafMemoryReclaimer>(
+            totalUsedBytes, reclaimableChildVec[i]));
+    childPools.push_back(childPool);
+    auto* reclaimer =
+        static_cast<MockLeafMemoryReclaimer*>(childPool->reclaimer());
+    reclaimer->setPool(childPool.get());
+
+    const auto initAllocUnit = initAllocUnitsVec[i];
+    totalAllocUnits += initAllocUnit;
+    for (int j = 0; j < initAllocUnit; ++j) {
+      void* buffer = childPool->allocate(allocUnitBytes);
+      reclaimer->addAllocation(buffer, allocUnitBytes);
+    }
+  }
+
+  uint64_t reclaimableBytes{0};
+  // 'expectedReclaimableUnits' is the expected allocation unit count per
+  // child pool after each round of memory reclaim. We expect the memory
+  // reclaimer to always reclaim from the child with the most memory usage.
+  auto verify = [&](const std::vector<int>& expectedReclaimableUnits) {
+    uint64_t expectedTotalReclaimableBytes{0};
+    for (int i = 0; i < numChildren; ++i) {
+      auto* reclaimer =
+          static_cast<MockLeafMemoryReclaimer*>(childPools[i]->reclaimer());
+      reclaimer->reclaimableBytes(*childPools[i], reclaimableBytes);
+      ASSERT_EQ(reclaimableBytes, expectedReclaimableUnits[i] * allocUnitBytes)
+          << " " << i;
+      expectedTotalReclaimableBytes += reclaimableBytes;
+    }
+    root->reclaimer()->reclaimableBytes(*root, reclaimableBytes);
+    ASSERT_EQ(reclaimableBytes, expectedTotalReclaimableBytes);
+  };
+  const std::vector<int> expectedReclaimableUnits{10, 0, 8, 0, 5};
+  // No reclaim so far, so just expect the initial allocation unit
+  // distribution.
+  verify(expectedReclaimableUnits);
+
+  // Tries to reclaim all units and expects reclamation only from reclaimable
+  // child pools.
+ ASSERT_EQ( + root->reclaimer()->reclaim( + root.get(), totalAllocUnits * allocUnitBytes, 0, stats_), + 23 * allocUnitBytes); + for (int i = 0; i < numChildren; ++i) { + auto* reclaimer = + static_cast(childPools[i]->reclaimer()); + if (reclaimableChildVec[i]) { + ASSERT_EQ(reclaimer->reclaimCount(), 1); + } else { + ASSERT_EQ(reclaimer->reclaimCount(), 0); + } + reclaimer->freeAll(); + } +} + TEST_F(MemoryReclaimerTest, arbitrationContext) { - auto root = defaultMemoryManager().addRootPool( + auto root = memory::memoryManager()->addRootPool( "arbitrationContext", kMaxMemory, MemoryReclaimer::create()); ASSERT_FALSE(isSpillMemoryPool(root.get())); ASSERT_TRUE(isSpillMemoryPool(spillMemoryPool())); @@ -627,29 +989,29 @@ TEST_F(MemoryReclaimerTest, arbitrationContext) { ASSERT_FALSE(isSpillMemoryPool(leafChild2.get())); ASSERT_TRUE(memoryArbitrationContext() == nullptr); { - ScopedMemoryArbitrationContext arbitrationContext(*leafChild1); + ScopedMemoryArbitrationContext arbitrationContext(leafChild1.get()); ASSERT_TRUE(memoryArbitrationContext() != nullptr); - ASSERT_EQ(&memoryArbitrationContext()->requestor, leafChild1.get()); + ASSERT_EQ(memoryArbitrationContext()->requestor, leafChild1.get()); } ASSERT_TRUE(memoryArbitrationContext() == nullptr); { - ScopedMemoryArbitrationContext arbitrationContext(*leafChild2); + ScopedMemoryArbitrationContext arbitrationContext(leafChild2.get()); ASSERT_TRUE(memoryArbitrationContext() != nullptr); - ASSERT_EQ(&memoryArbitrationContext()->requestor, leafChild2.get()); + ASSERT_EQ(memoryArbitrationContext()->requestor, leafChild2.get()); } ASSERT_TRUE(memoryArbitrationContext() == nullptr); std::thread nonAbitrationThread([&]() { ASSERT_TRUE(memoryArbitrationContext() == nullptr); { - ScopedMemoryArbitrationContext arbitrationContext(*leafChild1); + ScopedMemoryArbitrationContext arbitrationContext(leafChild1.get()); ASSERT_TRUE(memoryArbitrationContext() != nullptr); - ASSERT_EQ(&memoryArbitrationContext()->requestor, leafChild1.get()); + ASSERT_EQ(memoryArbitrationContext()->requestor, leafChild1.get()); } ASSERT_TRUE(memoryArbitrationContext() == nullptr); { - ScopedMemoryArbitrationContext arbitrationContext(*leafChild2); + ScopedMemoryArbitrationContext arbitrationContext(leafChild2.get()); ASSERT_TRUE(memoryArbitrationContext() != nullptr); - ASSERT_EQ(&memoryArbitrationContext()->requestor, leafChild2.get()); + ASSERT_EQ(memoryArbitrationContext()->requestor, leafChild2.get()); } ASSERT_TRUE(memoryArbitrationContext() == nullptr); }); @@ -657,8 +1019,44 @@ TEST_F(MemoryReclaimerTest, arbitrationContext) { ASSERT_TRUE(memoryArbitrationContext() == nullptr); } +TEST_F(MemoryReclaimerTest, scopedMemoryPoolArbitrationCtx) { + auto root = memory::memoryManager()->addRootPool( + "scopedArbitration", kMaxMemory, MemoryReclaimer::create()); + std::atomic totalUsedBytes{0}; + bool underArbitration{false}; + auto leafChild = root->addLeafChild( + "scopedArbitration", + true, + std::make_unique( + totalUsedBytes, true, &underArbitration)); + ASSERT_FALSE(underArbitration); + { + ScopedMemoryPoolArbitrationCtx arbitrationCtx(leafChild.get()); + ASSERT_TRUE(memoryArbitrationContext() == nullptr); + ASSERT_TRUE(underArbitration); + } + ASSERT_FALSE(underArbitration); + ASSERT_TRUE(memoryArbitrationContext() == nullptr); + + std::thread abitrationThread([&]() { + ASSERT_TRUE(memoryArbitrationContext() == nullptr); + { + ScopedMemoryPoolArbitrationCtx arbitrationCtx(leafChild.get()); + ASSERT_TRUE(memoryArbitrationContext() == nullptr); + 
ASSERT_TRUE(underArbitration); + } + ASSERT_FALSE(underArbitration); + ASSERT_TRUE(memoryArbitrationContext() == nullptr); + }); + abitrationThread.join(); + + ASSERT_FALSE(underArbitration); + + ASSERT_TRUE(memoryArbitrationContext() == nullptr); +} + TEST_F(MemoryReclaimerTest, concurrentRandomMockReclaims) { - auto root = defaultMemoryManager().addRootPool( + auto root = memory::memoryManager()->addRootPool( "concurrentRandomMockReclaims", kMaxMemory, MemoryReclaimer::create()); std::atomic totalUsedBytes{0}; @@ -681,6 +1079,7 @@ TEST_F(MemoryReclaimerTest, concurrentRandomMockReclaims) { } const int32_t kNumReclaims = 100; + uint64_t totalReclaimedBytes = 0; std::thread reclaimerThread([&]() { for (int i = 0; i < kNumReclaims; ++i) { const uint64_t oldUsedBytes = totalUsedBytes; @@ -696,7 +1095,8 @@ TEST_F(MemoryReclaimerTest, concurrentRandomMockReclaims) { bytesToReclaim = 0; } } - const auto reclaimedBytes = root->reclaim(bytesToReclaim, stats_); + const auto reclaimedBytes = root->reclaim(bytesToReclaim, 0, stats_); + totalReclaimedBytes += reclaimedBytes; if (reclaimedBytes < bytesToReclaim) { ASSERT_GT(bytesToReclaim, oldUsedBytes); } @@ -730,15 +1130,16 @@ TEST_F(MemoryReclaimerTest, concurrentRandomMockReclaims) { } reclaimerThread.join(); - uint64_t reclaimableBytes; - ASSERT_TRUE(root->reclaimableBytes(reclaimableBytes)); + uint64_t reclaimableBytes = root->reclaimableBytes().value(); ASSERT_EQ(reclaimableBytes, totalUsedBytes); - root->reclaim(0, stats_); + root->reclaim(0, 0, stats_); + ASSERT_EQ(totalReclaimedBytes + reclaimableBytes, stats_.reclaimedBytes); - ASSERT_TRUE(root->reclaimableBytes(reclaimableBytes)); + reclaimableBytes = root->reclaimableBytes().value(); ASSERT_EQ(reclaimableBytes, 0); ASSERT_EQ(totalUsedBytes, 0); + stats_.reset(); ASSERT_EQ(stats_, MemoryReclaimer::Stats{}); } @@ -756,5 +1157,10 @@ TEST_F(MemoryArbitrationTest, reclaimerStats) { ASSERT_NE(stats1, stats2); stats1.reset(); ASSERT_EQ(stats1, stats2); + MemoryReclaimer::Stats sum{1, 1, 1, 1}; + MemoryReclaimer::Stats toAdd{1, 1, 1, 1}; + sum += toAdd; + const MemoryReclaimer::Stats expected{2, 2, 2, 2}; + ASSERT_EQ(sum, expected); } } // namespace facebook::velox::memory diff --git a/velox/common/memory/tests/MemoryCapExceededTest.cpp b/velox/common/memory/tests/MemoryCapExceededTest.cpp index 49cbadc9f1647..b334d16a4c054 100644 --- a/velox/common/memory/tests/MemoryCapExceededTest.cpp +++ b/velox/common/memory/tests/MemoryCapExceededTest.cpp @@ -68,14 +68,15 @@ TEST_P(MemoryCapExceededTest, singleDriver) { // We look for these lines separately, since their order can change (not sure // why). std::vector expectedTexts = { - "Exceeded memory pool cap of 5.00MB with max 5.00MB when requesting " - "2.00MB, memory manager cap is UNLIMITED, requestor " - "'op.2.0.0.Aggregation' with current usage 3.70MB"}; + "Exceeded memory pool capacity after attempt to grow capacity through " + "arbitration. 
+      "arbitration. Requestor pool name 'op.2.0.0.Aggregation', request size "
+      "2.00MB, memory pool capacity 5.00MB, memory pool max capacity 5.00MB, "
+      "memory manager capacity 8.00GB, current usage 3.70MB"};
   std::vector<std::string> expectedDetailedTexts = {
-      "node.1 usage 1.00MB peak 1.00MB",
-      "op.1.0.0.FilterProject usage 12.00KB peak 12.00KB",
-      "node.2 usage 4.00MB peak 4.00MB",
-      "op.2.0.0.Aggregation usage 3.70MB peak 3.70MB",
+      "node.1 usage 12.00KB reserved 1.00MB peak 1.00MB",
+      "op.1.0.0.FilterProject usage 12.00KB reserved 1.00MB peak 12.00KB",
+      "node.2 usage 3.70MB reserved 4.00MB peak 4.00MB",
+      "op.2.0.0.Aggregation usage 3.70MB reserved 4.00MB peak 3.70MB",
       "Top 2 leaf memory pool usages:"};
 
   std::vector<RowVectorPtr> data;
@@ -95,10 +96,9 @@
       .singleAggregation({"c0"}, {"sum(p1)"})
       .orderBy({"c0"}, false)
       .planNode();
-  auto queryCtx = std::make_shared<core::QueryCtx>(executor_.get());
-  queryCtx->testingOverrideMemoryPool(
-      memory::defaultMemoryManager().addRootPool(
-          queryCtx->queryId(), kMaxBytes));
+  auto queryCtx = core::QueryCtx::create(executor_.get());
+  queryCtx->testingOverrideMemoryPool(memory::memoryManager()->addRootPool(
+      queryCtx->queryId(), kMaxBytes, exec::MemoryReclaimer::create()));
   CursorParameters params;
   params.planNode = plan;
   params.queryCtx = queryCtx;
@@ -154,10 +154,9 @@ TEST_P(MemoryCapExceededTest, multipleDrivers) {
           .values(data, true)
           .singleAggregation({"c0"}, {"sum(c1)"})
           .planNode();
-  auto queryCtx = std::make_shared<core::QueryCtx>(executor_.get());
-  queryCtx->testingOverrideMemoryPool(
-      memory::defaultMemoryManager().addRootPool(
-          queryCtx->queryId(), kMaxBytes));
+  auto queryCtx = core::QueryCtx::create(executor_.get());
+  queryCtx->testingOverrideMemoryPool(memory::memoryManager()->addRootPool(
+      queryCtx->queryId(), kMaxBytes, exec::MemoryReclaimer::create()));
 
   const int32_t numDrivers = 10;
   CursorParameters params;
@@ -184,29 +183,31 @@
 TEST_P(MemoryCapExceededTest, allocatorCapacityExceededError) {
   // Executes a plan with no memory pool capacity limit but very small memory
   // manager's limit.
-  std::vector<std::pair<
-      std::shared_ptr<memory::MemoryAllocator>,
-      std::vector<std::string>>>
-      allocatorExpectations;
-  allocatorExpectations.push_back(std::pair{
-      std::make_shared<memory::MallocAllocator>(64LL << 20),
-      std::vector<std::string>{
-          "allocateContiguous failed with .* pages",
-          "max capacity 128.00MB unlimited capacity used .* available .*",
-          ".* reservation .used .*MB, reserved .*MB, min 0B. counters",
-          "allocs .*, frees .*, reserves .*, releases .*, collisions .*"}});
-  const memory::MmapAllocator::Options options = {.capacity = 64LL << 20};
-  allocatorExpectations.push_back(std::pair{
-      std::make_shared<memory::MmapAllocator>(options),
-      std::vector<std::string>{
-          "allocateContiguous failed with .* pages",
-          "max capacity 128.00MB unlimited capacity used .* available .*",
-          ".* reservation .used .*MB, reserved .*MB, min .*B. counters",
-          ".*, frees .*, reserves .*, releases .*, collisions .*"}});
-  for (auto& allocExp : allocatorExpectations) {
+  struct {
+    int64_t allocatorCapacity;
+    bool useMmap;
+    std::vector<std::string> expectedErrorMessages;
+  } testSettings[] = {
+      {64LL << 20,
+       false,
+       std::vector<std::string>{
+           "allocateContiguous failed with .* pages",
+           "max capacity 128.00MB unlimited capacity used .* available .*",
+           ".* reservation .used .*MB, reserved .*MB, min 0B. counters",
+           "allocs .*, frees .*, reserves .*, releases .*, collisions .*"}},
+      {64LL << 20,
+       true,
+       std::vector<std::string>{
+           "allocateContiguous failed with .* pages",
+           "max capacity 128.00MB unlimited capacity used .* available .*",
+           ".* reservation .used .*MB, reserved .*MB, min .*B. counters",
+           ".*, frees .*, reserves .*, releases .*, collisions .*"}}};
+  for (const auto& testData : testSettings) {
     memory::MemoryManager manager(
-        {.capacity = (int64_t)allocExp.first->capacity(),
-         .allocator = allocExp.first.get()});
+        {.allocatorCapacity = (int64_t)testData.allocatorCapacity,
+         .useMmapAllocator = testData.useMmap,
+         .arbitratorCapacity = (int64_t)testData.allocatorCapacity,
+         .arbitratorReservedCapacity = 0});
     vector_size_t size = 1'024;
     // This limit ensures that only the Aggregation Operator fails.
@@ -229,7 +230,7 @@ TEST_P(MemoryCapExceededTest, allocatorCapacityExceededError) {
             .singleAggregation({"c0"}, {"sum(p1)"})
             .orderBy({"c0"}, false)
             .planNode();
-    auto queryCtx = std::make_shared<core::QueryCtx>(executor_.get());
+    auto queryCtx = core::QueryCtx::create(executor_.get());
     queryCtx->testingOverrideMemoryPool(
         manager.addRootPool(queryCtx->queryId(), kMaxBytes));
     CursorParameters params;
@@ -241,7 +242,7 @@ TEST_P(MemoryCapExceededTest, allocatorCapacityExceededError) {
       FAIL() << "Expected a MEM_CAP_EXCEEDED RuntimeException.";
     } catch (const VeloxException& e) {
       const auto errorMessage = e.message();
-      for (const auto& expectedText : allocExp.second) {
+      for (const auto& expectedText : testData.expectedErrorMessages) {
         ASSERT_TRUE(someLineMatches(errorMessage, expectedText))
             << "Expected error message to contain '" << expectedText
             << "', but received '" << errorMessage << "'.";
diff --git a/velox/common/memory/tests/MemoryManagerTest.cpp b/velox/common/memory/tests/MemoryManagerTest.cpp
index bdeb39b7eb3dc..84bf4a97ea63a 100644
--- a/velox/common/memory/tests/MemoryManagerTest.cpp
+++ b/velox/common/memory/tests/MemoryManagerTest.cpp
@@ -14,24 +14,26 @@
  * limitations under the License.
 */
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include
+#include
 
 #include "velox/common/base/VeloxException.h"
 #include "velox/common/base/tests/GTestUtils.h"
 #include "velox/common/memory/MallocAllocator.h"
 #include "velox/common/memory/Memory.h"
+#include "velox/common/memory/SharedArbitrator.h"
 
 DECLARE_int32(velox_memory_num_shared_leaf_pools);
 DECLARE_bool(velox_enable_memory_usage_track_in_default_memory_pool);
 
 using namespace ::testing;
 
-namespace facebook {
-namespace velox {
-namespace memory {
+namespace facebook::velox::memory {
 
 namespace {
-constexpr folly::StringPiece kDefaultRootName{"__default_root__"};
+constexpr folly::StringPiece kSysRootName{"__sys_root__"};
 
 MemoryManager& toMemoryManager(MemoryManager& manager) {
   return *static_cast<MemoryManager*>(&manager);
 }
@@ -41,18 +43,18 @@ MemoryManager& toMemoryManager(MemoryManager& manager) {
 class MemoryManagerTest : public testing::Test {
  protected:
   static void SetUpTestCase() {
-    MemoryArbitrator::registerAllFactories();
+    SharedArbitrator::registerFactory();
   }
 
   inline static const std::string arbitratorKind_{"SHARED"};
 };
 
-TEST_F(MemoryManagerTest, Ctor) {
+TEST_F(MemoryManagerTest, ctor) {
   const auto kSharedPoolCount = FLAGS_velox_memory_num_shared_leaf_pools;
   {
     MemoryManager manager{};
-    ASSERT_EQ(manager.numPools(), 0);
-    ASSERT_EQ(manager.capacity(), MemoryAllocator::kDefaultCapacityBytes);
+    ASSERT_EQ(manager.numPools(), 2);
+    ASSERT_EQ(manager.capacity(), kMaxMemory);
     ASSERT_EQ(0, manager.getTotalBytes());
     ASSERT_EQ(manager.alignment(), MemoryAllocator::kMaxAlignment);
     ASSERT_EQ(manager.testingDefaultRoot().alignment(), manager.alignment());
@@ -62,32 +64,36 @@ TEST_F(MemoryManagerTest, ctor) {
   }
   {
     const auto kCapacity = 8L * 1024 * 1024;
-    auto allocator = std::make_shared<MallocAllocator>(kCapacity);
     MemoryManager manager{
-        {.capacity = kCapacity, .allocator = allocator.get()}};
+        {.allocatorCapacity = kCapacity,
+         .arbitratorCapacity = kCapacity,
+         .arbitratorReservedCapacity = 0}};
     ASSERT_EQ(kCapacity, manager.capacity());
-    ASSERT_EQ(manager.numPools(), 0);
+    ASSERT_EQ(manager.numPools(), 2);
     ASSERT_EQ(manager.testingDefaultRoot().alignment(), manager.alignment());
   }
   {
     const auto kCapacity = 8L * 1024 * 1024;
-    auto allocator = std::make_shared<MallocAllocator>(kCapacity);
     MemoryManager manager{
-        {.alignment = 0, .capacity = kCapacity, .allocator = allocator.get()}};
+        {.alignment = 0,
+         .allocatorCapacity = kCapacity,
+         .arbitratorCapacity = kCapacity,
+         .arbitratorReservedCapacity = 0}};
 
     ASSERT_EQ(manager.alignment(), MemoryAllocator::kMinAlignment);
     ASSERT_EQ(manager.testingDefaultRoot().alignment(), manager.alignment());
     // TODO: replace with root pool memory tracker quota check.
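    // Editor's note: MemoryManagerOptions in this diff no longer accepts an
    // external allocator/capacity pair; capacity is split across
    // allocatorCapacity and arbitratorCapacity instead. A minimal sketch of
    // the new construction (the 8MB figure is illustrative):
    //
    //   MemoryManager manager{
    //       {.allocatorCapacity = 8L << 20,
    //        .arbitratorCapacity = 8L << 20,
    //        .arbitratorReservedCapacity = 0}};
    //   // manager.capacity() == 8L << 20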
-    ASSERT_EQ(kSharedPoolCount, manager.testingDefaultRoot().getChildCount());
+    ASSERT_EQ(
+        kSharedPoolCount + 2, manager.testingDefaultRoot().getChildCount());
     ASSERT_EQ(kCapacity, manager.capacity());
     ASSERT_EQ(0, manager.getTotalBytes());
   }
   {
     MemoryManagerOptions options;
     const auto kCapacity = 4L << 30;
-    auto allocator = std::make_shared<MallocAllocator>(kCapacity);
-    options.capacity = kCapacity;
-    options.allocator = allocator.get();
+    options.allocatorCapacity = kCapacity;
+    options.arbitratorCapacity = kCapacity;
+    options.arbitratorReservedCapacity = 0;
     std::string arbitratorKind = "SHARED";
     options.arbitratorKind = arbitratorKind;
     MemoryManager manager{options};
@@ -96,50 +102,47 @@ TEST_F(MemoryManagerTest, ctor) {
     ASSERT_EQ(arbitrator->stats().maxCapacityBytes, kCapacity);
     ASSERT_EQ(
         manager.toString(),
-        "Memory Manager[capacity 4.00GB alignment 64B usedBytes 0B number of pools 0\nList of root pools:\n\t__default_root__\nMemory Allocator[MALLOC capacity 4.00GB allocated bytes 0 allocated pages 0 mapped pages 0]\nARBITRATOR[SHARED CAPACITY[4.00GB] STATS[numRequests 0 numSucceeded 0 numAborted 0 numFailures 0 numNonReclaimableAttempts 0 queueTime 0us arbitrationTime 0us reclaimTime 0us shrunkMemory 0B reclaimedMemory 0B maxCapacity 4.00GB freeCapacity 4.00GB]]]");
-  }
-  {
-    // Test construction failure due to inconsistent allocator capacity setting.
-    MemoryManagerOptions options;
-    const auto kCapacity = 8L * 1024 * 1024;
-    options.capacity = kCapacity;
-    auto allocator = std::make_shared<MallocAllocator>(kCapacity + 1);
-    options.allocator = allocator.get();
-    VELOX_ASSERT_THROW(
-        MemoryManager(options),
-        "MemoryAllocator capacity 8388609 must be the same as MemoryManager capacity 8388608");
+        "Memory Manager[capacity 4.00GB alignment 64B usedBytes 0B number of "
+        "pools 2\nList of root pools:\n\t__sys_root__\n"
+        "Memory Allocator[MALLOC capacity 4.00GB allocated bytes 0 "
+        "allocated pages 0 mapped pages 0]\n"
+        "ARBITRATOR[SHARED CAPACITY[4.00GB] PENDING[0] "
+        "STATS[numRequests 0 numAborted 0 numFailures 0 "
+        "numNonReclaimableAttempts 0 numShrinks 0 queueTime 0us "
+        "arbitrationTime 0us reclaimTime 0us shrunkMemory 0B "
+        "reclaimedMemory 0B maxCapacity 4.00GB freeCapacity 4.00GB freeReservedCapacity 0B]]]");
   }
 }
 
 namespace {
 class FakeTestArbitrator : public MemoryArbitrator {
  public:
-  explicit FakeTestArbitrator(const Config& config)
+  explicit FakeTestArbitrator(
+      const Config& config,
+      bool injectAddPoolFailure = false)
       : MemoryArbitrator(
             {.kind = config.kind,
              .capacity = config.capacity,
-             .memoryPoolInitCapacity = config.memoryPoolInitCapacity,
-             .memoryPoolTransferCapacity = config.memoryPoolTransferCapacity}) {
-  }
+             .extraConfigs = config.extraConfigs}),
+        injectAddPoolFailure_(injectAddPoolFailure) {}
 
-  void reserveMemory(MemoryPool* pool, uint64_t bytes) override {
-    VELOX_NYI();
+  void addPool(const std::shared_ptr<MemoryPool>& /*unused*/) override {
+    VELOX_CHECK(!injectAddPoolFailure_, "Failed to add pool");
   }
 
-  void releaseMemory(MemoryPool* pool) override {
+  void removePool(MemoryPool* /*unused*/) override {}
+
+  bool growCapacity(MemoryPool* /*unused*/, uint64_t /*unused*/) override {
     VELOX_NYI();
   }
 
-  bool growMemory(
-      MemoryPool* pool,
-      const std::vector<std::shared_ptr<MemoryPool>>& candidatePools,
-      uint64_t targetBytes) override {
+  uint64_t shrinkCapacity(uint64_t /*unused*/, bool /*unused*/, bool /*unused*/)
+      override {
    VELOX_NYI();
  }
 
-  uint64_t shrinkMemory(
-      const std::vector<std::shared_ptr<MemoryPool>>& pools,
-      uint64_t targetBytes) override {
+  uint64_t shrinkCapacity(MemoryPool* /*unused*/, uint64_t /*unused*/)
+      override {
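+    // Editor's note: this fake tracks the MemoryArbitrator interface change
+    // in this diff: reserveMemory()/releaseMemory()/growMemory()/
+    // shrinkMemory() are replaced by addPool()/removePool()/growCapacity()/
+    // shrinkCapacity(). Only addPool()/removePool() get real bodies here;
+    // the capacity hooks keep VELOX_NYI() because these tests never route an
+    // arbitration request through the fake.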
+    VELOX_NYI();
   }
 
@@ -154,6 +157,9 @@ class FakeTestArbitrator : public MemoryArbitrator {
   std::string kind() const override {
     return "FAKE";
   }
+
+ private:
+  const bool injectAddPoolFailure_{false};
 };
 } // namespace
 
@@ -168,12 +174,27 @@ TEST_F(MemoryManagerTest, createWithCustomArbitrator) {
       [&] { MemoryArbitrator::unregisterFactory(kindString); });
   MemoryManagerOptions options;
   options.arbitratorKind = kindString;
-  options.capacity = 8L << 20;
-  options.queryMemoryCapacity = 256L << 20;
-  auto allocator = std::make_shared<MallocAllocator>(options.capacity);
-  options.allocator = allocator.get();
+  options.allocatorCapacity = 8L << 20;
+  options.arbitratorCapacity = 256L << 20;
+  MemoryManager manager{options};
+  ASSERT_EQ(manager.arbitrator()->capacity(), options.allocatorCapacity);
+  ASSERT_EQ(manager.allocator()->capacity(), options.allocatorCapacity);
+}
+
+TEST_F(MemoryManagerTest, addPoolFailure) {
+  const std::string kindString = "FAKE";
+  MemoryArbitrator::Factory factory =
+      [](const MemoryArbitrator::Config& config) {
+        return std::make_unique<FakeTestArbitrator>(
+            config, /*injectAddPoolFailure*/ true);
+      };
+  MemoryArbitrator::registerFactory(kindString, factory);
+  auto guard = folly::makeGuard(
+      [&] { MemoryArbitrator::unregisterFactory(kindString); });
+  MemoryManagerOptions options;
+  options.arbitratorKind = kindString;
   MemoryManager manager{options};
-  ASSERT_EQ(manager.arbitrator()->capacity(), options.capacity);
+  VELOX_ASSERT_THROW(manager.addRootPool(), "Failed to add pool");
 }
 
 TEST_F(MemoryManagerTest, addPool) {
@@ -206,14 +227,11 @@ TEST_F(MemoryManagerTest, addPool) {
 TEST_F(MemoryManagerTest, addPoolWithArbitrator) {
   MemoryManagerOptions options;
   const auto kCapacity = 32L << 30;
-  auto allocator = std::make_shared<MallocAllocator>(kCapacity);
-  options.allocator = allocator.get();
-  options.capacity = kCapacity;
+  options.allocatorCapacity = kCapacity;
   options.arbitratorKind = arbitratorKind_;
   // The arbitrator capacity will be overridden by the memory manager's
   // capacity.
-  options.capacity = options.capacity;
-  const uint64_t initialPoolCapacity = options.capacity / 32;
+  const uint64_t initialPoolCapacity = options.allocatorCapacity / 32;
   options.memoryPoolInitCapacity = initialPoolCapacity;
   MemoryManager manager{options};
@@ -248,13 +266,14 @@ TEST_F(MemoryManagerTest, addPoolWithArbitrator) {
   ASSERT_EQ(aggregationPool->capacity(), initialPoolCapacity);
 }
 
+// TODO: remove this test when deprecatedDefaultMemoryManager is removed.
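// Editor's note: a minimal sketch of the process-wide setup these tests
// migrate to (initializeMemoryManager()/memoryManager() are the new entry
// points exercised in this diff; the options and pool names are
// illustrative):
//
//   initializeMemoryManager({});
//   auto* manager = memoryManager();
//   auto root = manager->addRootPool("demo_root", 4L << 30);
//   auto leaf = root->addLeafChild("demo_leaf");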
TEST_F(MemoryManagerTest, defaultMemoryManager) { - auto& managerA = toMemoryManager(defaultMemoryManager()); - auto& managerB = toMemoryManager(defaultMemoryManager()); - const auto kSharedPoolCount = FLAGS_velox_memory_num_shared_leaf_pools; - ASSERT_EQ(managerA.numPools(), 0); + auto& managerA = toMemoryManager(deprecatedDefaultMemoryManager()); + auto& managerB = toMemoryManager(deprecatedDefaultMemoryManager()); + const auto kSharedPoolCount = FLAGS_velox_memory_num_shared_leaf_pools + 2; + ASSERT_EQ(managerA.numPools(), 2); ASSERT_EQ(managerA.testingDefaultRoot().getChildCount(), kSharedPoolCount); - ASSERT_EQ(managerB.numPools(), 0); + ASSERT_EQ(managerB.numPools(), 2); ASSERT_EQ(managerB.testingDefaultRoot().getChildCount(), kSharedPoolCount); auto child1 = managerA.addLeafPool("child_1"); @@ -265,53 +284,71 @@ TEST_F(MemoryManagerTest, defaultMemoryManager) { kSharedPoolCount + 2, managerA.testingDefaultRoot().getChildCount()); EXPECT_EQ( kSharedPoolCount + 2, managerB.testingDefaultRoot().getChildCount()); - ASSERT_EQ(managerA.numPools(), 2); - ASSERT_EQ(managerB.numPools(), 2); + ASSERT_EQ(managerA.numPools(), 4); + ASSERT_EQ(managerB.numPools(), 4); auto pool = managerB.addRootPool(); - ASSERT_EQ(managerA.numPools(), 3); - ASSERT_EQ(managerB.numPools(), 3); + ASSERT_EQ(managerA.numPools(), 5); + ASSERT_EQ(managerB.numPools(), 5); ASSERT_EQ( managerA.toString(), - "Memory Manager[capacity UNLIMITED alignment 64B usedBytes 0B number of pools 3\nList of root pools:\n\t__default_root__\n\tdefault_root_0\nMemory Allocator[MALLOC capacity UNLIMITED allocated bytes 0 allocated pages 0 mapped pages 0]\nARBIRTATOR[NOOP CAPACITY[UNLIMITED]]]"); + "Memory Manager[capacity UNLIMITED alignment 64B usedBytes 0B number of pools 5\nList of root pools:\n\t__sys_root__\n\tdefault_root_0\n\trefcount 2\nMemory Allocator[MALLOC capacity UNLIMITED allocated bytes 0 allocated pages 0 mapped pages 0]\nARBIRTATOR[NOOP CAPACITY[UNLIMITED]]]"); ASSERT_EQ( managerB.toString(), - "Memory Manager[capacity UNLIMITED alignment 64B usedBytes 0B number of pools 3\nList of root pools:\n\t__default_root__\n\tdefault_root_0\nMemory Allocator[MALLOC capacity UNLIMITED allocated bytes 0 allocated pages 0 mapped pages 0]\nARBIRTATOR[NOOP CAPACITY[UNLIMITED]]]"); + "Memory Manager[capacity UNLIMITED alignment 64B usedBytes 0B number of pools 5\nList of root pools:\n\t__sys_root__\n\tdefault_root_0\n\trefcount 2\nMemory Allocator[MALLOC capacity UNLIMITED allocated bytes 0 allocated pages 0 mapped pages 0]\nARBIRTATOR[NOOP CAPACITY[UNLIMITED]]]"); child1.reset(); EXPECT_EQ( kSharedPoolCount + 1, managerA.testingDefaultRoot().getChildCount()); child2.reset(); EXPECT_EQ(kSharedPoolCount, managerB.testingDefaultRoot().getChildCount()); - ASSERT_EQ(managerA.numPools(), 1); - ASSERT_EQ(managerB.numPools(), 1); + ASSERT_EQ(managerA.numPools(), 3); + ASSERT_EQ(managerB.numPools(), 3); pool.reset(); - ASSERT_EQ(managerA.numPools(), 0); - ASSERT_EQ(managerB.numPools(), 0); + ASSERT_EQ(managerA.numPools(), 2); + ASSERT_EQ(managerB.numPools(), 2); ASSERT_EQ( managerA.toString(), - "Memory Manager[capacity UNLIMITED alignment 64B usedBytes 0B number of pools 0\nList of root pools:\n\t__default_root__\nMemory Allocator[MALLOC capacity UNLIMITED allocated bytes 0 allocated pages 0 mapped pages 0]\nARBIRTATOR[NOOP CAPACITY[UNLIMITED]]]"); + "Memory Manager[capacity UNLIMITED alignment 64B usedBytes 0B number of pools 2\nList of root pools:\n\t__sys_root__\nMemory Allocator[MALLOC capacity UNLIMITED allocated bytes 0 allocated pages 0 
mapped pages 0]\nARBIRTATOR[NOOP CAPACITY[UNLIMITED]]]");
   ASSERT_EQ(
       managerB.toString(),
-      "Memory Manager[capacity UNLIMITED alignment 64B usedBytes 0B number of pools 0\nList of root pools:\n\t__default_root__\nMemory Allocator[MALLOC capacity UNLIMITED allocated bytes 0 allocated pages 0 mapped pages 0]\nARBIRTATOR[NOOP CAPACITY[UNLIMITED]]]");
+      "Memory Manager[capacity UNLIMITED alignment 64B usedBytes 0B number of pools 2\nList of root pools:\n\t__sys_root__\nMemory Allocator[MALLOC capacity UNLIMITED allocated bytes 0 allocated pages 0 mapped pages 0]\nARBIRTATOR[NOOP CAPACITY[UNLIMITED]]]");
+  const std::string detailedManagerStr = managerA.toString(true);
+  ASSERT_THAT(
+      detailedManagerStr,
+      testing::HasSubstr(
+          "Memory Manager[capacity UNLIMITED alignment 64B usedBytes 0B number of pools 2\nList of root pools:\n__sys_root__ usage 0B reserved 0B peak 0B\n"));
+  ASSERT_THAT(
+      detailedManagerStr,
+      testing::HasSubstr("__sys_spilling__ usage 0B reserved 0B peak 0B\n"));
+  ASSERT_THAT(
+      detailedManagerStr,
+      testing::HasSubstr("__sys_tracing__ usage 0B reserved 0B peak 0B\n"));
+  for (int i = 0; i < 32; ++i) {
+    ASSERT_THAT(
+        managerA.toString(true),
+        testing::HasSubstr(fmt::format(
+            "__sys_shared_leaf__{} usage 0B reserved 0B peak 0B\n", i)));
+  }
 }
 
+// TODO: remove this test when deprecatedAddDefaultLeafMemoryPool is removed.
 TEST(MemoryHeaderTest, addDefaultLeafMemoryPool) {
-  auto& manager = toMemoryManager(defaultMemoryManager());
-  const auto kSharedPoolCount = FLAGS_velox_memory_num_shared_leaf_pools;
+  auto& manager = toMemoryManager(deprecatedDefaultMemoryManager());
+  const auto kSharedPoolCount = FLAGS_velox_memory_num_shared_leaf_pools + 2;
   ASSERT_EQ(manager.testingDefaultRoot().getChildCount(), kSharedPoolCount);
   {
-    auto poolA = addDefaultLeafMemoryPool();
+    auto poolA = deprecatedAddDefaultLeafMemoryPool();
     ASSERT_EQ(poolA->kind(), MemoryPool::Kind::kLeaf);
-    auto poolB = addDefaultLeafMemoryPool();
+    auto poolB = deprecatedAddDefaultLeafMemoryPool();
     ASSERT_EQ(poolB->kind(), MemoryPool::Kind::kLeaf);
     EXPECT_EQ(
         kSharedPoolCount + 2, manager.testingDefaultRoot().getChildCount());
     {
-      auto poolC = addDefaultLeafMemoryPool();
+      auto poolC = deprecatedAddDefaultLeafMemoryPool();
       ASSERT_EQ(poolC->kind(), MemoryPool::Kind::kLeaf);
       EXPECT_EQ(
           kSharedPoolCount + 3, manager.testingDefaultRoot().getChildCount());
       {
-        auto poolD = addDefaultLeafMemoryPool();
+        auto poolD = deprecatedAddDefaultLeafMemoryPool();
         ASSERT_EQ(poolD->kind(), MemoryPool::Kind::kLeaf);
         EXPECT_EQ(
             kSharedPoolCount + 4, manager.testingDefaultRoot().getChildCount());
@@ -324,7 +361,7 @@ TEST(MemoryHeaderTest, addDefaultLeafMemoryPool) {
   }
   EXPECT_EQ(kSharedPoolCount, manager.testingDefaultRoot().getChildCount());
 
-  auto namedPool = addDefaultLeafMemoryPool("namedPool");
+  auto namedPool = deprecatedAddDefaultLeafMemoryPool("namedPool");
   ASSERT_EQ(namedPool->name(), "namedPool");
 }
 
@@ -351,7 +388,7 @@ TEST_F(MemoryManagerTest, memoryPoolManagement) {
   MemoryManagerOptions options;
   options.alignment = alignment;
   MemoryManager manager{options};
-  ASSERT_EQ(manager.numPools(), 0);
+  ASSERT_EQ(manager.numPools(), 2);
   const int numPools = 100;
   std::vector<std::shared_ptr<MemoryPool>> userRootPools;
   std::vector<std::shared_ptr<MemoryPool>> userLeafPools;
@@ -376,30 +413,38 @@
   ASSERT_FALSE(rootUnamedPool->name().empty());
   ASSERT_EQ(rootUnamedPool->kind(), MemoryPool::Kind::kAggregate);
   ASSERT_EQ(rootUnamedPool->parent(), nullptr);
-  ASSERT_EQ(manager.numPools(), numPools + 2);
+  ASSERT_EQ(manager.numPools(), 1 + numPools + 2 + 1);
   userLeafPools.clear();
   leafUnamedPool.reset();
-  ASSERT_EQ(manager.numPools(), numPools / 2 + 1);
+  ASSERT_EQ(manager.numPools(), 1 + numPools / 2 + 1 + 1);
   userRootPools.clear();
-  ASSERT_EQ(manager.numPools(), 1);
+  ASSERT_EQ(manager.numPools(), 1 + 2);
   rootUnamedPool.reset();
-  ASSERT_EQ(manager.numPools(), 0);
+  ASSERT_EQ(manager.numPools(), 2);
 }
 
 // TODO: when run sequentially, e.g. `buck run dwio/memory/...`, this has side
 // effects for other tests using process singleton memory manager. Might need to
 // use folly::Singleton for isolation by tag.
 TEST_F(MemoryManagerTest, globalMemoryManager) {
-  auto& manager = MemoryManager::getInstance();
-  auto& managerII = MemoryManager::getInstance();
-  const auto kSharedPoolCount = FLAGS_velox_memory_num_shared_leaf_pools;
+  initializeMemoryManager({});
+  auto* globalManager = memoryManager();
+  ASSERT_TRUE(globalManager != nullptr);
+  VELOX_ASSERT_THROW(initializeMemoryManager({}), "");
+  ASSERT_EQ(memoryManager(), globalManager);
+  MemoryManager::testingSetInstance({});
+  auto* manager = memoryManager();
+  ASSERT_NE(manager, globalManager);
+  ASSERT_EQ(manager, memoryManager());
+  auto* managerII = memoryManager();
+  const auto kSharedPoolCount = FLAGS_velox_memory_num_shared_leaf_pools + 2;
   {
-    auto& rootI = manager.testingDefaultRoot();
+    auto& rootI = manager->testingDefaultRoot();
     const std::string childIName("some_child");
     auto childI = rootI.addLeafChild(childIName);
     ASSERT_EQ(rootI.getChildCount(), kSharedPoolCount + 1);
-    auto& rootII = managerII.testingDefaultRoot();
+    auto& rootII = managerII->testingDefaultRoot();
     ASSERT_EQ(kSharedPoolCount + 1, rootII.getChildCount());
     std::vector<MemoryPool*> pools{};
     rootII.visitChildren([&pools](MemoryPool* child) {
@@ -415,42 +460,20 @@
     }
     ASSERT_EQ(matchedCount, 1);
 
-    auto childII = manager.addLeafPool("another_child");
+    auto childII = manager->addLeafPool("another_child");
     ASSERT_EQ(childII->kind(), MemoryPool::Kind::kLeaf);
     ASSERT_EQ(rootI.getChildCount(), kSharedPoolCount + 2);
-    ASSERT_EQ(childII->parent()->name(), kDefaultRootName.str());
+    ASSERT_EQ(childII->parent()->name(), kSysRootName.str());
     childII.reset();
     ASSERT_EQ(rootI.getChildCount(), kSharedPoolCount + 1);
     ASSERT_EQ(rootII.getChildCount(), kSharedPoolCount + 1);
-    auto userRootChild = manager.addRootPool("rootChild");
+    auto userRootChild = manager->addRootPool("rootChild");
     ASSERT_EQ(userRootChild->kind(), MemoryPool::Kind::kAggregate);
     ASSERT_EQ(rootI.getChildCount(), kSharedPoolCount + 1);
     ASSERT_EQ(rootII.getChildCount(), kSharedPoolCount + 1);
-    ASSERT_EQ(manager.numPools(), 2);
+    ASSERT_EQ(manager->numPools(), 2 + 2);
   }
-  ASSERT_EQ(manager.numPools(), 0);
-  {
-    auto& manager = MemoryManager::getInstance();
-    auto& defaultManager = defaultMemoryManager();
-    ASSERT_EQ(&manager, &defaultManager);
-    auto pool = addDefaultLeafMemoryPool();
-    ASSERT_EQ(pool->kind(), MemoryPool::Kind::kLeaf);
-    ASSERT_EQ(pool->parent()->name(), kDefaultRootName.str());
-    ASSERT_EQ(manager.numPools(), 1);
-    ASSERT_EQ(
-        manager.testingDefaultRoot().getChildCount(), kSharedPoolCount + 1);
-    pool.reset();
-    ASSERT_EQ(manager.testingDefaultRoot().getChildCount(), kSharedPoolCount);
-  }
-  ASSERT_EQ(manager.numPools(), 0);
-}
-
-TEST_F(MemoryManagerTest, GlobalMemoryManagerQuota) {
-  auto& manager = MemoryManager::getInstance();
-  MemoryManager::getInstance({.alignment = 32});
-
-  auto& coercedManager = MemoryManager::getInstance({.alignment = 64});
-  ASSERT_EQ(manager.alignment(), coercedManager.alignment());
+  ASSERT_EQ(manager->numPools(), 2);
 }
 
 TEST_F(MemoryManagerTest, alignmentOptionCheck) {
@@ -548,9 +571,9 @@ TEST_F(MemoryManagerTest, concurrentPoolAccess) {
   }
   stopCheck = true;
   checkThread.join();
-  ASSERT_EQ(manager.numPools(), pools.size());
+  ASSERT_EQ(manager.numPools(), pools.size() + 2);
   pools.clear();
-  ASSERT_EQ(manager.numPools(), 0);
+  ASSERT_EQ(manager.numPools(), 2);
 }
 
 TEST_F(MemoryManagerTest, quotaEnforcement) {
@@ -576,17 +599,16 @@ TEST_F(MemoryManagerTest, quotaEnforcement) {
       {2 << 20, 0, 768, true}};
 
   for (const auto& testData : testSettings) {
-    auto allocator =
-        std::make_shared<MallocAllocator>(testData.memoryQuotaBytes);
-    MemoryAllocator::setDefaultInstance(allocator.get());
     SCOPED_TRACE(testData.debugString());
-    std::vector<bool> contiguousAllocations = {false, true};
-    for (const auto& contiguousAlloc : contiguousAllocations) {
+    const std::vector<bool> contiguousAllocations = {false, true};
+    for (const auto contiguousAlloc : contiguousAllocations) {
      SCOPED_TRACE(fmt::format("contiguousAlloc {}", contiguousAlloc));
      const int alignment = 32;
      MemoryManagerOptions options;
      options.alignment = alignment;
-      options.capacity = testData.memoryQuotaBytes;
+      options.allocatorCapacity = testData.memoryQuotaBytes;
+      options.arbitratorCapacity = testData.memoryQuotaBytes;
+      options.arbitratorReservedCapacity = 0;
      MemoryManager manager{options};
      auto pool = manager.addLeafPool("quotaEnforcement");
      void* smallBuffer{nullptr};
@@ -626,17 +648,56 @@
   }
 }
 
-TEST_F(MemoryManagerTest, testCheckUsageLeak) {
-  FLAGS_velox_memory_leak_check_enabled = true;
-  auto& manager = MemoryManager::getInstance(
-      memory::MemoryManagerOptions{.checkUsageLeak = false});
+TEST_F(MemoryManagerTest, disableMemoryPoolTracking) {
+  const std::string kSharedKind{"SHARED"};
+  const std::string kNoopKind{""};
+  MemoryManagerOptions options;
+  options.disableMemoryPoolTracking = true;
+  options.allocatorCapacity = 64LL << 20;
+  options.arbitratorCapacity = 64LL << 20;
+  std::vector<std::string> arbitratorKinds{kNoopKind, kSharedKind};
+  for (auto arbitratorKind : arbitratorKinds) {
+    options.arbitratorKind = arbitratorKind;
+    MemoryManager manager{options};
+    auto root0 = manager.addRootPool("root_0", 35LL << 20);
+    auto leaf0 = root0->addLeafChild("leaf_0");
 
-  auto rootPool = manager.addRootPool("duplicateRootPool", kMaxMemory);
-  auto leafPool = manager.addLeafPool("duplicateLeafPool", true);
-  ASSERT_FALSE(rootPool->testingCheckUsageLeak());
-  ASSERT_FALSE(leafPool->testingCheckUsageLeak());
-}
+    // Not throwing since there is no duplicate check.
+    auto root0Dup = manager.addRootPool("root_0", 35LL << 20);
+
+    // 1TB capacity is allowed since there is no capacity check.
+ auto root1 = manager.addRootPool("root_1", 1LL << 40); + auto leaf1 = root1->addLeafChild("leaf_1"); -} // namespace memory -} // namespace velox -} // namespace facebook + ASSERT_EQ(root0->capacity(), 35LL << 20); + if (arbitratorKind == kSharedKind) { + ASSERT_EQ(root0Dup->capacity(), 29LL << 20); + ASSERT_EQ(root1->capacity(), 0); + } else { + ASSERT_EQ(root0Dup->capacity(), 35LL << 20); + ASSERT_EQ(root1->capacity(), 1LL << 40); + } + + ASSERT_EQ(manager.capacity(), 64LL << 20); + ASSERT_EQ(manager.shrinkPools(), 0); + // Default 1 system pool with 1 leaf child + ASSERT_EQ(manager.numPools(), 2); + + VELOX_ASSERT_THROW( + leaf0->allocate(38LL << 20), "Exceeded memory pool capacity"); + if (arbitratorKind == kSharedKind) { + VELOX_ASSERT_THROW( + leaf1->allocate(256LL << 20), "Exceeded memory pool capacity"); + } else { + VELOX_ASSERT_THROW( + leaf1->allocate(256LL << 20), "Exceeded memory allocator limit"); + } + + ASSERT_NO_THROW(leaf0.reset()); + ASSERT_NO_THROW(leaf1.reset()); + ASSERT_NO_THROW(root0.reset()); + ASSERT_NO_THROW(root0Dup.reset()); + ASSERT_NO_THROW(root1.reset()); + } +} +} // namespace facebook::velox::memory diff --git a/velox/common/memory/tests/MemoryPoolBenchmark.cpp b/velox/common/memory/tests/MemoryPoolBenchmark.cpp index ae13d673841e5..82296fb7e74fa 100644 --- a/velox/common/memory/tests/MemoryPoolBenchmark.cpp +++ b/velox/common/memory/tests/MemoryPoolBenchmark.cpp @@ -239,7 +239,7 @@ BENCHMARK(FlatSticks, iters) { } int main(int argc, char* argv[]) { - folly::init(&argc, &argv); + folly::Init init{&argc, &argv}; folly::runBenchmarks(); return 0; } diff --git a/velox/common/memory/tests/MemoryPoolTest.cpp b/velox/common/memory/tests/MemoryPoolTest.cpp index 2b9c45a64bd27..fdaa7611d506e 100644 --- a/velox/common/memory/tests/MemoryPoolTest.cpp +++ b/velox/common/memory/tests/MemoryPoolTest.cpp @@ -14,6 +14,7 @@ * limitations under the License. 
 */
 
+#include
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
@@ -22,7 +23,9 @@
 #include "velox/common/caching/SsdCache.h"
 #include "velox/common/memory/MallocAllocator.h"
 #include "velox/common/memory/Memory.h"
+#include "velox/common/memory/MemoryPool.h"
 #include "velox/common/memory/MmapAllocator.h"
+#include "velox/common/memory/SharedArbitrator.h"
 #include "velox/common/testutil/TestValue.h"
 
 DECLARE_bool(velox_memory_leak_check_enabled);
@@ -37,9 +40,7 @@
 constexpr int64_t KB = 1024L;
 constexpr int64_t MB = 1024L * KB;
 constexpr int64_t GB = 1024L * MB;
 
-namespace facebook {
-namespace velox {
-namespace memory {
+namespace facebook::velox::memory {
 
 struct TestParam {
   bool useMmap;
@@ -73,7 +74,7 @@ class MemoryPoolTest : public testing::TestWithParam<TestParam> {
  protected:
   static constexpr uint64_t kDefaultCapacity = 8 * GB; // 8GB
   static void SetUpTestCase() {
-    MemoryArbitrator::registerAllFactories();
+    SharedArbitrator::registerFactory();
     FLAGS_velox_memory_leak_check_enabled = true;
     TestValue::enable();
   }
@@ -94,37 +95,22 @@ class MemoryPoolTest : public testing::TestWithParam<TestParam> {
   }
 
   void setupMemory(
-      MemoryManagerOptions options = {.capacity = kDefaultCapacity}) {
-    if (useMmap_) {
-      MmapAllocator::Options opts{(uint64_t)options.capacity};
-      allocator_ = std::make_shared<MmapAllocator>(opts);
-      if (useCache_) {
-        cache_ = AsyncDataCache::create(allocator_.get());
-        MemoryAllocator::setDefaultInstance(allocator_.get());
-      } else {
-        MemoryAllocator::setDefaultInstance(allocator_.get());
-      }
-    } else {
-      allocator_ = std::make_shared<MallocAllocator>(options.capacity);
-      if (useCache_) {
-        cache_ = AsyncDataCache::create(allocator_.get());
-        MemoryAllocator::setDefaultInstance(allocator_.get());
-      } else {
-        MemoryAllocator::setDefaultInstance(allocator_.get());
-      }
-    }
-    options.allocator = allocator_.get();
-    // MemoryAllocator, depending on implementation, might round up capacity.
-    options.capacity = allocator_->capacity();
+      MemoryManagerOptions options = {
+          .debugEnabled = true,
+          .allocatorCapacity = kDefaultCapacity,
+          .arbitratorCapacity = kDefaultCapacity,
+          .arbitratorReservedCapacity = 1LL << 30}) {
+    options.useMmapAllocator = useMmap_;
     manager_ = std::make_shared<MemoryManager>(options);
+    if (useCache_) {
+      cache_ = AsyncDataCache::create(manager_->allocator());
+    }
   }
 
   void TearDown() override {
     if (useCache_) {
       cache_->shutdown();
     }
-    allocator_->testingClearFailureInjection();
-    MmapAllocator::setDefaultInstance(nullptr);
   }
 
   void reset() {
@@ -139,7 +125,7 @@ class MemoryPoolTest : public testing::TestWithParam<TestParam> {
   void abortPool(MemoryPool* pool) {
     try {
       VELOX_FAIL("Manual MemoryPool Abortion");
-    } catch (const VeloxException& error) {
+    } catch (const VeloxException&) {
       pool->abort(std::current_exception());
     }
   }
@@ -149,27 +135,31 @@ class MemoryPoolTest : public testing::TestWithParam<TestParam> {
   const bool useCache_;
   const bool isLeafThreadSafe_;
   folly::Random::DefaultGenerator rng_;
-  std::shared_ptr<MemoryAllocator> allocator_;
   std::shared_ptr<MemoryManager> manager_;
   std::shared_ptr<AsyncDataCache> cache_;
   MemoryReclaimer::Stats stats_;
 };
 
-TEST_P(MemoryPoolTest, Ctor) {
+TEST_P(MemoryPoolTest, ctor) {
   constexpr uint16_t kAlignment = 64;
-  setupMemory({.alignment = 64, .capacity = kDefaultCapacity});
+  setupMemory({.alignment = 64, .allocatorCapacity = kDefaultCapacity});
   MemoryManager& manager = *getMemoryManager();
   const int64_t capacity = 4 * GB;
   auto root = manager.addRootPool("Ctor", 4 * GB);
   ASSERT_EQ(root->kind(), MemoryPool::Kind::kAggregate);
-  ASSERT_EQ(root->currentBytes(), 0);
+  ASSERT_EQ(root->usedBytes(), 0);
+  ASSERT_EQ(root->reservedBytes(), 0);
   ASSERT_EQ(root->parent(), nullptr);
   ASSERT_EQ(root->root(), root.get());
   ASSERT_EQ(root->capacity(), capacity);
+  VELOX_ASSERT_THROW(
+      static_cast<MemoryPoolImpl*>(root.get())
+          ->setDestructionCallback([](MemoryPool*) {}),
+      "");
   {
     auto fakeRoot = std::make_shared<MemoryPoolImpl>(
-        &manager, "fake_root", MemoryPool::Kind::kAggregate, nullptr);
+        &manager, "fake_root", MemoryPool::Kind::kAggregate, nullptr, nullptr);
     // We can't construct an aggregate memory pool with non-thread safe.
     ASSERT_ANY_THROW(std::make_shared<MemoryPoolImpl>(
         &manager,
@@ -177,13 +167,12 @@
         "fake_root",
         MemoryPool::Kind::kAggregate,
         nullptr,
         nullptr,
-        nullptr,
         MemoryPool::Options{.threadSafe = false}));
     ASSERT_EQ("fake_root", fakeRoot->name());
     ASSERT_EQ(
         static_cast<MemoryPoolImpl*>(root.get())->testingAllocator(),
         fakeRoot->testingAllocator());
-    ASSERT_EQ(0, fakeRoot->currentBytes());
+    ASSERT_EQ(0, fakeRoot->usedBytes());
     ASSERT_EQ(fakeRoot->parent(), nullptr);
   }
   {
@@ -191,18 +180,26 @@
     ASSERT_EQ(child->parent(), root.get());
     ASSERT_EQ(child->root(), root.get());
     ASSERT_EQ(child->capacity(), capacity);
+    VELOX_ASSERT_THROW(
+        static_cast<MemoryPoolImpl*>(child.get())
+            ->setDestructionCallback([](MemoryPool*) {}),
+        "");
     auto& favoriteChild = dynamic_cast<MemoryPoolImpl&>(*child);
     ASSERT_EQ("child", favoriteChild.name());
     ASSERT_EQ(
         static_cast<MemoryPoolImpl*>(root.get())->testingAllocator(),
         favoriteChild.testingAllocator());
-    ASSERT_EQ(favoriteChild.currentBytes(), 0);
+    ASSERT_EQ(favoriteChild.usedBytes(), 0);
   }
   {
     auto aggregateChild = root->addAggregateChild("aggregateChild");
     ASSERT_EQ(aggregateChild->parent(), root.get());
     ASSERT_EQ(aggregateChild->root(), root.get());
     ASSERT_EQ(aggregateChild->capacity(), capacity);
+    VELOX_ASSERT_THROW(
+        static_cast<MemoryPoolImpl*>(aggregateChild.get())
+            ->setDestructionCallback([](MemoryPool*) {}),
+        "");
     auto grandChild = aggregateChild->addLeafChild("child", isLeafThreadSafe_);
     ASSERT_EQ(grandChild->parent(), aggregateChild.get());
     ASSERT_EQ(grandChild->root(), root.get());
@@ -214,7 +211,7 @@
       "Memory pool rootWithZeroMaxCapacity max capacity can't be zero");
 }
 
-TEST_P(MemoryPoolTest, AddChild) {
+TEST_P(MemoryPoolTest, addChild) {
   MemoryManager& manager = *getMemoryManager();
   auto root = manager.addRootPool("root");
   ASSERT_EQ(root->parent(), nullptr);
@@ -231,13 +228,36 @@
   });
   ASSERT_THAT(
       nodes, UnorderedElementsAreArray({childOne.get(), childTwo.get()}));
+  // Child pool name collision.
   ASSERT_THROW(root->addAggregateChild("child_one"), VeloxRuntimeError);
   ASSERT_EQ(root->getChildCount(), 2);
+
+  constexpr int64_t kChunkSize{128};
+  void* buff = childOne->allocate(kChunkSize);
+  // Add child when 'reservedBytes != 0', in which case 'usedBytes()' will call
+  // 'visitChildren()'.
+  VELOX_ASSERT_THROW(root->addAggregateChild("child_one"), "");
+  childOne->free(buff, kChunkSize);
+  ASSERT_EQ(root->getChildCount(), 2);
+
   childOne.reset();
   ASSERT_EQ(root->getChildCount(), 1);
   childOne = root->addLeafChild("child_one", isLeafThreadSafe_);
   ASSERT_EQ(root->getChildCount(), 2);
+  ASSERT_EQ(root->treeMemoryUsage(), "root usage 0B reserved 0B peak 1.00MB\n");
+  ASSERT_EQ(
+      root->treeMemoryUsage(true), "root usage 0B reserved 0B peak 1.00MB\n");
+  const std::string treeUsageWithEmptyPool = root->treeMemoryUsage(false);
+  ASSERT_THAT(
+      treeUsageWithEmptyPool,
+      testing::HasSubstr("root usage 0B reserved 0B peak 1.00MB\n"));
+  ASSERT_THAT(
+      treeUsageWithEmptyPool,
+      testing::HasSubstr("child_one usage 0B reserved 0B peak 0B\n"));
+  ASSERT_THAT(
+      treeUsageWithEmptyPool,
+      testing::HasSubstr("child_two usage 0B reserved 0B peak 0B\n"));
 }
 
 TEST_P(MemoryPoolTest, dropChild) {
@@ -298,13 +318,12 @@ MachinePageCount numPagesNeeded(
 }
 
 void testMmapMemoryAllocation(
-    MmapAllocator* mmapAllocator,
+    int64_t capacity,
     MachinePageCount allocPages,
     size_t allocCount,
     bool threadSafe) {
   MemoryManager manager{
-      {.capacity = (int64_t)(mmapAllocator->capacity()),
-       .allocator = mmapAllocator}};
+      {.allocatorCapacity = capacity, .useMmapAllocator = true}};
   const auto kPageSize = 4 * KB;
 
   auto root = manager.addRootPool();
@@ -313,6 +332,7 @@ void testMmapMemoryAllocation(
   std::vector<void*> allocations;
   uint64_t totalPageAllocated = 0;
   uint64_t totalPageMapped = 0;
+  auto* mmapAllocator = static_cast<MmapAllocator*>(manager.allocator());
   const auto pageIncrement = numPagesNeeded(mmapAllocator, allocPages);
   const auto isSizeClassAlloc =
       allocPages <= mmapAllocator->sizeClasses().back();
@@ -347,51 +367,68 @@
   }
 }
 
-TEST_P(MemoryPoolTest, SmallMmapMemoryAllocation) {
-  MmapAllocator::Options options;
-  options.capacity = 8 * GB;
-  auto mmapAllocator = std::make_shared<MmapAllocator>(options);
-  testMmapMemoryAllocation(mmapAllocator.get(), 6, 100, isLeafThreadSafe_);
+TEST_P(MemoryPoolTest, smallMmapMemoryAllocation) {
+  testMmapMemoryAllocation(8 * GB, 6, 100, isLeafThreadSafe_);
 }
 
-TEST_P(MemoryPoolTest, BigMmapMemoryAllocation) {
-  MmapAllocator::Options options;
-  options.capacity = 8 * GB;
-  auto mmapAllocator = std::make_shared<MmapAllocator>(options);
-  testMmapMemoryAllocation(
-      mmapAllocator.get(),
-      mmapAllocator->sizeClasses().back() + 56,
-      20,
-      isLeafThreadSafe_);
+TEST_P(MemoryPoolTest, bigMmapMemoryAllocation) {
+  testMmapMemoryAllocation(8 * GB, 256 + 56, 20, isLeafThreadSafe_);
 }
 
-// Mainly tests how it updates the memory usage in Memorypool->
-TEST_P(MemoryPoolTest, AllocTest) {
+// Mainly tests how it updates the memory usage in memory pool.
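// Editor's note: the currentBytes() -> usedBytes() rename throughout this
// file comes with a separate reservedBytes() accessor. As the usedBytes test
// below demonstrates, a small allocation holds a full 1MB reservation
// quantum (sizes taken from that test):
//
//   void* buf = child->allocate(128);
//   // child->usedBytes()     == 128      -- bytes actually handed out
//   // child->reservedBytes() == 1 << 20  -- reservation rounded up to 1MB
//   child->free(buf, 128);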
+TEST_P(MemoryPoolTest, allocTest) { auto manager = getMemoryManager(); auto root = manager->addRootPool(); - auto child = root->addLeafChild("elastic_quota", isLeafThreadSafe_); + auto child = root->addLeafChild("allocTest", isLeafThreadSafe_); const int64_t kChunkSize{32L * MB}; void* oneChunk = child->allocate(kChunkSize); ASSERT_EQ(reinterpret_cast(oneChunk) % child->alignment(), 0); - ASSERT_EQ(kChunkSize, child->currentBytes()); + ASSERT_EQ(kChunkSize, child->usedBytes()); ASSERT_EQ(kChunkSize, child->stats().peakBytes); void* threeChunks = child->allocate(3 * kChunkSize); - ASSERT_EQ(4 * kChunkSize, child->currentBytes()); + ASSERT_EQ(4 * kChunkSize, child->usedBytes()); ASSERT_EQ(4 * kChunkSize, child->stats().peakBytes); child->free(threeChunks, 3 * kChunkSize); - ASSERT_EQ(kChunkSize, child->currentBytes()); + ASSERT_EQ(kChunkSize, child->usedBytes()); ASSERT_EQ(4 * kChunkSize, child->stats().peakBytes); child->free(oneChunk, kChunkSize); - ASSERT_EQ(0, child->currentBytes()); + ASSERT_EQ(0, child->usedBytes()); ASSERT_EQ(4 * kChunkSize, child->stats().peakBytes); } +TEST_P(MemoryPoolTest, usedBytes) { + auto manager = getMemoryManager(); + auto root = manager->addRootPool(); + + auto child1 = root->addLeafChild("usedBytes1", isLeafThreadSafe_); + auto child2 = root->addLeafChild("usedBytes2", isLeafThreadSafe_); + + const int64_t kChunkSize{128}; + + void* buf1 = child1->allocate(kChunkSize); + void* buf2 = child2->allocate(kChunkSize); + ASSERT_EQ(child1->reservedBytes(), 1 << 20); + ASSERT_EQ(child1->usedBytes(), kChunkSize); + ASSERT_EQ(child2->reservedBytes(), 1 << 20); + ASSERT_EQ(child2->usedBytes(), kChunkSize); + + ASSERT_EQ(root->reservedBytes(), 2 << 20); + ASSERT_EQ(root->usedBytes(), 2 * kChunkSize); + + child1->free(buf1, kChunkSize); + ASSERT_EQ(root->reservedBytes(), 1 << 20); + ASSERT_EQ(root->usedBytes(), kChunkSize); + child2->free(buf2, kChunkSize); + ASSERT_EQ(root->reservedBytes(), 0); + ASSERT_EQ(root->usedBytes(), 0); +} + TEST_P(MemoryPoolTest, DISABLED_memoryLeakCheck) { gflags::FlagSaver flagSaver; testing::FLAGS_gtest_death_test_style = "fast"; @@ -401,29 +438,145 @@ TEST_P(MemoryPoolTest, DISABLED_memoryLeakCheck) { auto child = root->addLeafChild("elastic_quota", isLeafThreadSafe_); const int64_t kChunkSize{32L * MB}; void* oneChunk = child->allocate(kChunkSize); - FLAGS_velox_memory_leak_check_enabled = true; ASSERT_DEATH(child.reset(), ""); child->free(oneChunk, kChunkSize); } -TEST_P(MemoryPoolTest, DISABLED_growBeyondMaxCapacity) { - gflags::FlagSaver flagSaver; - testing::FLAGS_gtest_death_test_style = "fast"; +TEST_P(MemoryPoolTest, growFailures) { auto manager = getMemoryManager(); + // Grow beyond limit. 
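  // Editor's note: grow() changes shape in this diff, from grow(bytes)
  // returning the grown size (with death tests on misuse) to
  // grow(capacityGrowth, reservationBytes) returning bool: it now throws for
  // pools with unlimited capacity, and returns false when the capacity
  // growth or the new reservation cannot be satisfied, e.g.:
  //
  //   ASSERT_TRUE(poolWithLimit->grow(capacity / 2, 0));
  //   ASSERT_FALSE(poolWithLimit->grow(capacity, 0));  // beyond max capacity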
{ auto poolWithoutLimit = manager->addRootPool("poolWithoutLimit"); ASSERT_EQ(poolWithoutLimit->capacity(), kMaxMemory); - ASSERT_DEATH( - poolWithoutLimit->grow(1), "Can't grow with unlimited capacity"); + ASSERT_EQ(poolWithoutLimit->usedBytes(), 0); + ASSERT_EQ(poolWithoutLimit->reservedBytes(), 0); + VELOX_ASSERT_THROW( + poolWithoutLimit->grow(1, 0), "Can't grow with unlimited capacity"); + ASSERT_EQ(poolWithoutLimit->usedBytes(), 0); + ASSERT_EQ(poolWithoutLimit->reservedBytes(), 0); + VELOX_ASSERT_THROW( + poolWithoutLimit->grow(1, 1'000), "Can't grow with unlimited capacity"); + ASSERT_EQ(poolWithoutLimit->usedBytes(), 0); } { const int64_t capacity = 4 * GB; auto poolWithLimit = manager->addRootPool("poolWithLimit", capacity); ASSERT_EQ(poolWithLimit->capacity(), capacity); - ASSERT_EQ(poolWithLimit->shrink(poolWithLimit->currentBytes()), capacity); - ASSERT_EQ(poolWithLimit->grow(capacity / 2), capacity / 2); - ASSERT_DEATH( - poolWithLimit->grow(capacity), "Can't grow beyond the max capacity"); + ASSERT_EQ(poolWithLimit->usedBytes(), 0); + ASSERT_EQ(poolWithLimit->reservedBytes(), 0); + ASSERT_EQ(poolWithLimit->shrink(poolWithLimit->reservedBytes()), capacity); + ASSERT_EQ(poolWithLimit->usedBytes(), 0); + ASSERT_EQ(poolWithLimit->reservedBytes(), 0); + ASSERT_TRUE(poolWithLimit->grow(capacity / 2, 0)); + ASSERT_EQ(poolWithLimit->reservedBytes(), 0); + ASSERT_EQ(poolWithLimit->usedBytes(), 0); + ASSERT_FALSE(poolWithLimit->grow(capacity, 0)); + ASSERT_EQ(poolWithLimit->reservedBytes(), 0); + ASSERT_EQ(poolWithLimit->usedBytes(), 0); + ASSERT_EQ(poolWithLimit->capacity(), capacity / 2); + ASSERT_FALSE(poolWithLimit->grow(capacity, 1'000)); + ASSERT_EQ(poolWithLimit->reservedBytes(), 0); + ASSERT_EQ(poolWithLimit->usedBytes(), 0); + } + + // Insufficient capacity for new reservation. 
+  {
+    const int64_t capacity = 4 * GB;
+    auto poolWithLimit = manager->addRootPool("poolWithLimit", capacity);
+    ASSERT_EQ(poolWithLimit->capacity(), capacity);
+    ASSERT_EQ(poolWithLimit->usedBytes(), 0);
+    ASSERT_EQ(poolWithLimit->reservedBytes(), 0);
+    ASSERT_EQ(poolWithLimit->shrink(poolWithLimit->capacity()), capacity);
+    ASSERT_EQ(poolWithLimit->usedBytes(), 0);
+    ASSERT_EQ(poolWithLimit->reservedBytes(), 0);
+    ASSERT_EQ(poolWithLimit->capacity(), 0);
+
+    ASSERT_FALSE(poolWithLimit->grow(capacity / 2, capacity));
+    ASSERT_EQ(poolWithLimit->reservedBytes(), 0);
+    ASSERT_EQ(poolWithLimit->usedBytes(), 0);
+    ASSERT_EQ(poolWithLimit->capacity(), 0);
+
+    ASSERT_FALSE(poolWithLimit->grow(0, capacity));
+    ASSERT_EQ(poolWithLimit->reservedBytes(), 0);
+    ASSERT_EQ(poolWithLimit->usedBytes(), 0);
+    ASSERT_EQ(poolWithLimit->capacity(), 0);
+    ASSERT_EQ(poolWithLimit->reservedBytes(), 0);
+    ASSERT_EQ(poolWithLimit->usedBytes(), 0);
+  }
+}
+
+TEST_P(MemoryPoolTest, grow) {
+  auto manager = getMemoryManager();
+  const int64_t capacity = 4 * GB;
+  auto root = manager->addRootPool("grow", capacity);
+  root->shrink(capacity / 2);
+  ASSERT_EQ(root->capacity(), capacity / 2);
+
+  auto leaf = root->addLeafChild("leafPool");
+  void* buf = leaf->allocate(1 * MB);
+  ASSERT_EQ(root->capacity(), capacity / 2);
+  ASSERT_EQ(root->reservedBytes(), 1 * MB);
+
+  ASSERT_TRUE(root->grow(0, 2 * MB));
+  ASSERT_EQ(root->reservedBytes(), 3 * MB);
+  ASSERT_EQ(root->capacity(), capacity / 2);
+
+  ASSERT_TRUE(root->grow(0, 4 * MB));
+  ASSERT_EQ(root->reservedBytes(), 7 * MB);
+  ASSERT_EQ(root->capacity(), capacity / 2);
+
+  ASSERT_TRUE(root->grow(1 * MB, 2 * MB));
+  ASSERT_EQ(root->reservedBytes(), 9 * MB);
+  ASSERT_EQ(root->capacity(), capacity / 2 + 1 * MB);
+
+  ASSERT_TRUE(root->grow(6 * MB, 4 * MB));
+  ASSERT_EQ(root->reservedBytes(), 13 * MB);
+  ASSERT_EQ(root->capacity(), capacity / 2 + 7 * MB);
+
+  static_cast<MemoryPoolImpl*>(root.get())->testingSetReservation(1 * MB);
+  leaf->free(buf, 1 * MB);
+}
+
+TEST_P(MemoryPoolTest, releasableMemory) {
+  struct TestParam {
+    int64_t usedBytes;
+    int64_t reservedBytes;
+  };
+  std::vector<TestParam> testParams{
+      {2345, 98760},
+      {1, 1024},
+      {4096, 4096},
+      {1 * MB, 16 * MB},
+      {6 * MB, 7 * MB},
+      {123 * MB, 200 * MB},
+      {100 * MB, 50 * MB}};
+  auto root = getMemoryManager()->addRootPool("releasableMemory", 4 * GB);
+  for (auto i = 0; i < testParams.size() - 1; i++) {
+    auto leaf0 = root->addLeafChild("leafPool-0");
+    leaf0->maybeReserve(testParams[i].reservedBytes);
+    void* buffer0 = leaf0->allocate(testParams[i].usedBytes);
+    const auto reservedBytes0 = leaf0->reservedBytes();
+    const auto releasableBytes0 = leaf0->releasableReservation();
+
+    auto leaf1 = root->addLeafChild("leafPool-1");
+    leaf1->maybeReserve(testParams[i + 1].reservedBytes);
+    void* buffer1 = leaf1->allocate(testParams[i + 1].usedBytes);
+    const auto reservedBytes1 = leaf1->reservedBytes();
+    const auto releasableBytes1 = leaf1->releasableReservation();
+
+    const auto releasableBytesRoot = root->releasableReservation();
+    const auto reservedBytesRoot = root->reservedBytes();
+    ASSERT_EQ(releasableBytesRoot, releasableBytes0 + releasableBytes1);
+
+    leaf0->release();
+    ASSERT_EQ(reservedBytes0 - leaf0->reservedBytes(), releasableBytes0);
+    ASSERT_EQ(reservedBytesRoot - root->reservedBytes(), releasableBytes0);
+    leaf1->release();
+    ASSERT_EQ(reservedBytes1 - leaf1->reservedBytes(), releasableBytes1);
+    ASSERT_EQ(reservedBytesRoot - root->reservedBytes(), releasableBytesRoot);
+
+    leaf0->free(buffer0, testParams[i].usedBytes);
+
leaf1->free(buffer1, testParams[i + 1].usedBytes); } } @@ -436,17 +589,16 @@ TEST_P(MemoryPoolTest, ReallocTestSameSize) { const int64_t kChunkSize{32L * MB}; // Realloc the same size. - void* oneChunk = pool->allocate(kChunkSize); - ASSERT_EQ(kChunkSize, pool->currentBytes()); + ASSERT_EQ(kChunkSize, pool->usedBytes()); ASSERT_EQ(kChunkSize, pool->stats().peakBytes); void* anotherChunk = pool->reallocate(oneChunk, kChunkSize, kChunkSize); - ASSERT_EQ(kChunkSize, pool->currentBytes()); + ASSERT_EQ(kChunkSize, pool->usedBytes()); ASSERT_EQ(2 * kChunkSize, pool->stats().peakBytes); pool->free(anotherChunk, kChunkSize); - ASSERT_EQ(0, pool->currentBytes()); + ASSERT_EQ(0, pool->usedBytes()); ASSERT_EQ(2 * kChunkSize, pool->stats().peakBytes); } @@ -459,15 +611,15 @@ TEST_P(MemoryPoolTest, ReallocTestHigher) { const int64_t kChunkSize{32L * MB}; // Realloc higher. void* oneChunk = pool->allocate(kChunkSize); - EXPECT_EQ(kChunkSize, pool->currentBytes()); + EXPECT_EQ(kChunkSize, pool->usedBytes()); EXPECT_EQ(kChunkSize, pool->stats().peakBytes); void* threeChunks = pool->reallocate(oneChunk, kChunkSize, 3 * kChunkSize); - EXPECT_EQ(3 * kChunkSize, pool->currentBytes()); + EXPECT_EQ(3 * kChunkSize, pool->usedBytes()); EXPECT_EQ(4 * kChunkSize, pool->stats().peakBytes); pool->free(threeChunks, 3 * kChunkSize); - EXPECT_EQ(0, pool->currentBytes()); + EXPECT_EQ(0, pool->usedBytes()); EXPECT_EQ(4 * kChunkSize, pool->stats().peakBytes); } @@ -479,15 +631,15 @@ TEST_P(MemoryPoolTest, ReallocTestLower) { const int64_t kChunkSize{32L * MB}; // Realloc lower. void* threeChunks = pool->allocate(3 * kChunkSize); - EXPECT_EQ(3 * kChunkSize, pool->currentBytes()); + EXPECT_EQ(3 * kChunkSize, pool->usedBytes()); EXPECT_EQ(3 * kChunkSize, pool->stats().peakBytes); void* oneChunk = pool->reallocate(threeChunks, 3 * kChunkSize, kChunkSize); - EXPECT_EQ(kChunkSize, pool->currentBytes()); + EXPECT_EQ(kChunkSize, pool->usedBytes()); EXPECT_EQ(4 * kChunkSize, pool->stats().peakBytes); pool->free(oneChunk, kChunkSize); - EXPECT_EQ(0, pool->currentBytes()); + EXPECT_EQ(0, pool->usedBytes()); EXPECT_EQ(4 * kChunkSize, pool->stats().peakBytes); } @@ -517,7 +669,7 @@ TEST_P(MemoryPoolTest, allocateZeroFilled) { for (int32_t i = 0; i < allocationPtrs.size(); ++i) { pool->free(allocationPtrs[i], allocationSizes[i]); } - ASSERT_EQ(0, pool->currentBytes()); + ASSERT_EQ(0, pool->usedBytes()); } TEST_P(MemoryPoolTest, alignmentCheck) { @@ -528,7 +680,8 @@ TEST_P(MemoryPoolTest, alignmentCheck) { MemoryAllocator::kMaxAlignment}; for (const auto& alignment : alignments) { SCOPED_TRACE(fmt::format("alignment:{}", alignment)); - setupMemory({.alignment = alignment, .capacity = kDefaultCapacity}); + setupMemory( + {.alignment = alignment, .allocatorCapacity = kDefaultCapacity}); auto manager = getMemoryManager(); auto pool = manager->addLeafPool("alignmentCheck"); ASSERT_EQ( @@ -543,20 +696,23 @@ TEST_P(MemoryPoolTest, alignmentCheck) { } pool->free(ptr, bytesToAlloc); } - ASSERT_EQ(0, pool->currentBytes()); + ASSERT_EQ(0, pool->usedBytes()); } } -TEST_P(MemoryPoolTest, MemoryCapExceptions) { +TEST_P(MemoryPoolTest, memoryCapExceptions) { const uint64_t kMaxCap = 128L * MB; - setupMemory({.capacity = kMaxCap}); + setupMemory( + {.allocatorCapacity = kMaxCap, + .arbitratorCapacity = kMaxCap, + .arbitratorReservedCapacity = kMaxCap / 2}); auto manager = getMemoryManager(); // Capping memory pool. 
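  // Editor's note: the expected error below changes because a capped pool now
  // attempts arbitration before failing; the message reports request size,
  // pool capacity/max capacity, memory manager capacity, and current usage.
  // A hedged sketch of triggering it (names and sizes mirror the test below):
  //
  //   auto root = manager->addRootPool("MemoryCapExceptions", 128L * MB);
  //   auto pool = root->addLeafChild("static_quota");
  //   VELOX_ASSERT_THROW(
  //       pool->allocate(129L * MB), "Exceeded memory pool capacity");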
  {
    auto root = manager->addRootPool("MemoryCapExceptions", kMaxCap);
    auto pool = root->addLeafChild("static_quota", isLeafThreadSafe_);
    {
-      ASSERT_EQ(0, pool->currentBytes());
+      ASSERT_EQ(0, pool->usedBytes());
      try {
        pool->allocate(129L * MB);
      } catch (const velox::VeloxRuntimeError& ex) {
        ASSERT_EQ(error_source::kErrorSourceRuntime.c_str(), ex.errorSource());
        ASSERT_EQ(error_code::kMemCapExceeded.c_str(), ex.errorCode());
        ASSERT_TRUE(ex.isRetriable());
        ASSERT_EQ(
-            "Exceeded memory pool cap of 128.00MB with max 128.00MB when "
-            "requesting 136.00MB, memory manager cap is 128.00MB, requestor "
-            "'static_quota' with current usage 0B\nMemoryCapExceptions usage "
-            "0B peak 0B\n",
+            "Exceeded memory pool capacity after attempt to grow capacity "
+            "through arbitration. Requestor pool name 'static_quota', request "
+            "size 136.00MB, memory pool capacity 128.00MB, memory pool max "
+            "capacity 128.00MB, memory manager capacity 128.00MB, current "
+            "usage 0B\nMemoryCapExceptions usage 0B reserved 0B peak 0B\n",
            ex.message());
      }
    }
  }
 
@@ -576,38 +733,83 @@
  // Capping allocator.
  {
    auto root =
-        manager->addRootPool("MemoryCapExceptions", 2 * allocator_->capacity());
+        manager->addRootPool("MemoryCapExceptions", 2 * manager->capacity());
    auto pool = root->addLeafChild("static_quota", isLeafThreadSafe_);
    {
-      ASSERT_EQ(0, pool->currentBytes());
+      ASSERT_EQ(0, pool->usedBytes());
      try {
-        pool->allocate(allocator_->capacity() + 1);
+        pool->allocate(manager->capacity() + 1);
      } catch (const velox::VeloxRuntimeError& ex) {
        ASSERT_EQ(error_source::kErrorSourceRuntime.c_str(), ex.errorSource());
        ASSERT_EQ(error_code::kMemAllocError.c_str(), ex.errorCode());
        ASSERT_TRUE(ex.isRetriable());
        if (useMmap_) {
-          ASSERT_EQ(
-              fmt::format(
-                  "allocate failed with 128.00MB from Memory Pool["
-                  "static_quota LEAF root[MemoryCapExceptions] "
-                  "parent[MemoryCapExceptions] MMAP track-usage {}]",
-                  isLeafThreadSafe_ ? "thread-safe" : "non-thread-safe"),
-              ex.message());
+          if (useCache_) {
+            ASSERT_EQ(
+                fmt::format(
+                    "allocate failed with 128.00MB from Memory Pool["
+                    "static_quota LEAF root[MemoryCapExceptions] "
+                    "parent[MemoryCapExceptions] MMAP track-usage {}] Failed to"
+                    " evict from cache state: AsyncDataCache:\nCache size: 0B "
+                    "tinySize: 0B large size: 0B\nCache entries: 0 read pins: "
+                    "0 write pins: 0 pinned shared: 0B pinned exclusive: 0B\n "
+                    "num write wait: 0 empty entries: 0\nCache access miss: 0 "
+                    "hit: 0 hit bytes: 0B eviction: 0 savable eviction: 0 eviction checks: 0 "
+                    "aged out: 0 stales: 0\nPrefetch entries: 0 bytes: 0B\nAlloc Megaclocks 0\n"
+                    "Allocated pages: 0 cached pages: 0\n",
+                    isLeafThreadSafe_ ? "thread-safe" : "non-thread-safe"),
+                ex.message());
+          } else {
+            ASSERT_EQ(
+                fmt::format(
+                    "allocate failed with 128.00MB from Memory Pool["
+                    "static_quota LEAF root[MemoryCapExceptions] "
+                    "parent[MemoryCapExceptions] MMAP track-usage {}] "
+                    "Exceeded memory allocator limit when allocating 32769 "
+                    "new pages for total allocation of 32769 pages, the memory"
+                    " allocator capacity is 32768 pages",
+                    isLeafThreadSafe_ ? "thread-safe" : "non-thread-safe"),
+                ex.message());
+          }
        } else {
-          ASSERT_EQ(
-              fmt::format(
-                  "allocate failed with 128.00MB from Memory Pool"
-                  "[static_quota LEAF root[MemoryCapExceptions] "
-                  "parent[MemoryCapExceptions] MALLOC track-usage {}]"
-                  "",
-                  isLeafThreadSafe_ ? "thread-safe" : "non-thread-safe"),
-              ex.message());
+          if (useCache_) {
+            ASSERT_EQ(
+                fmt::format(
+                    "allocate failed with 128.00MB from Memory Pool"
+                    "[static_quota LEAF root[MemoryCapExceptions] "
+                    "parent[MemoryCapExceptions] MALLOC track-usage {}]"
+                    ""
+                    " Failed to evict from cache state: AsyncDataCache:\nCache "
+                    "size: 0B tinySize: 0B large size: 0B\nCache entries: 0 "
+                    "read pins: 0 write pins: 0 pinned shared: 0B pinned "
+                    "exclusive: 0B\n num write wait: 0 empty entries: 0\nCache "
+                    "access miss: 0 hit: 0 hit bytes: 0B eviction: 0 savable eviction: 0 eviction "
+                    "checks: 0 aged out: 0 stales: 0\nPrefetch entries: 0 bytes: 0B\nAlloc Megaclocks"
+                    " 0\nAllocated pages: 0 cached pages: 0\n",
+                    isLeafThreadSafe_ ? "thread-safe" : "non-thread-safe"),
+                ex.message());
+          } else {
+            ASSERT_EQ(
+                fmt::format(
+                    "allocate failed with 128.00MB from Memory Pool"
+                    "[static_quota LEAF root[MemoryCapExceptions] "
+                    "parent[MemoryCapExceptions] MALLOC track-usage {}]"
+                    ""
+                    " Failed to allocateBytes 128.00MB: Exceeded memory "
+                    "allocator limit of 128.00MB",
+                    isLeafThreadSafe_ ? "thread-safe" : "non-thread-safe"),
+                ex.message());
+          }
        }
      }
    }
@@ -616,42 +818,44 @@
 TEST(MemoryPoolTest, GetAlignment) {
   {
-    auto allocator = std::make_shared<MallocAllocator>(
-        MemoryAllocator::kDefaultCapacityBytes);
+    MemoryManagerOptions options;
+    options.allocatorCapacity = kMaxMemory;
    EXPECT_EQ(
        MemoryAllocator::kMaxAlignment,
-        MemoryManager{{.allocator = allocator.get()}}
-            .addRootPool()
-            ->alignment());
+        MemoryManager{options}.addRootPool()->alignment());
   }
   {
-    auto allocator = std::make_shared<MallocAllocator>(
-        MemoryAllocator::kDefaultCapacityBytes);
-    MemoryManager manager{{.alignment = 64, .allocator = allocator.get()}};
+    MemoryManagerOptions options;
+    options.allocatorCapacity = kMaxMemory;
+    options.alignment = 64;
+    MemoryManager manager{options};
    EXPECT_EQ(64, manager.addRootPool()->alignment());
   }
 }
 
 TEST_P(MemoryPoolTest, MemoryManagerGlobalCap) {
-  setupMemory({.capacity = 32L * MB});
+  setupMemory(
+      {.allocatorCapacity = 32L * MB,
+       .arbitratorCapacity = 32L * MB,
+       .arbitratorReservedCapacity = 16L * MB});
   auto manager = getMemoryManager();
-  const auto kAllocCap = allocator_->capacity();
+  const auto kAllocCap = manager->capacity();
   auto root = manager->addRootPool();
   auto pool = root->addAggregateChild("unbounded");
   auto child = pool->addLeafChild("unbounded", isLeafThreadSafe_);
   void* oneChunk = child->allocate(kAllocCap);
-  ASSERT_EQ(root->currentBytes(), kAllocCap);
+  ASSERT_EQ(root->reservedBytes(), kAllocCap);
   EXPECT_THROW(child->allocate(kAllocCap), velox::VeloxRuntimeError);
-  ASSERT_EQ(root->currentBytes(), kAllocCap);
+  ASSERT_EQ(root->reservedBytes(), kAllocCap);
   EXPECT_THROW(
       child->reallocate(oneChunk, kAllocCap, 2 * kAllocCap),
       velox::VeloxRuntimeError);
-  ASSERT_EQ(root->currentBytes(), kAllocCap);
+  ASSERT_EQ(root->reservedBytes(), kAllocCap);
   child->free(oneChunk, kAllocCap);
 }
 
 // Tests how child updates itself and its parent's memory usage
-// and what it returns for currentBytes()/getMaxBytes and
+// and what it returns for reservedBytes()/getMaxBytes and
 // with memoryUsageTracker.
 TEST_P(MemoryPoolTest, childUsageTest) {
   MemoryManager& manager = *getMemoryManager();
@@ -659,12 +863,12 @@
   auto root = manager.addRootPool();
   auto pool = root->addAggregateChild("main_pool");
 
   auto verifyUsage = [](std::vector<std::shared_ptr<MemoryPool>>& tree,
-                        std::vector<int64_t> currentBytes,
+                        std::vector<int64_t> usedBytes,
                         std::vector<int64_t> maxBytes) {
     ASSERT_TRUE(
-        tree.size() == currentBytes.size() && tree.size() == maxBytes.size());
+        tree.size() == usedBytes.size() && tree.size() == maxBytes.size());
     for (unsigned i = 0, e = tree.size(); i != e; ++i) {
-      EXPECT_EQ(tree[i]->currentBytes(), currentBytes[i]) << i;
+      EXPECT_EQ(tree[i]->usedBytes(), usedBytes[i]) << i;
       EXPECT_EQ(tree[i]->stats().peakBytes, maxBytes[i]) << i;
     }
   };
@@ -696,19 +900,17 @@
   void* p3Chunk0 = tree[3]->allocate(16);
   verifyUsage(
-      tree,
-      {1048576, 1048576, 0, 64, 0, 0, 0},
-      {1048576, 1048576, 0, 64, 0, 0, 0});
+      tree, {64, 64, 0, 64, 0, 0, 0}, {1048576, 1048576, 0, 64, 0, 0, 0});
   void* p5Chunk0 = tree[5]->allocate(64);
   verifyUsage(
       tree,
-      {2097152, 1048576, 1048576, 64, 0, 64, 0},
+      {128, 64, 64, 64, 0, 64, 0},
       {2097152, 1048576, 1048576, 64, 0, 64, 0});
   tree[3]->free(p3Chunk0, 16);
   verifyUsage(
       tree,
-      {1048576, 0, 1048576, 0, 0, 64, 0},
+      {64, 0, 64, 0, 0, 64, 0},
       {2097152, 1048576, 1048576, 64, 0, 64, 0});
 
   tree[5]->free(p5Chunk0, 64);
@@ -718,12 +920,12 @@
   // Release all memory pool->
   tree.clear();
 
-  std::vector<int64_t> expectedCurrentBytes({0, 0, 0, 0, 0, 0, 0});
+  std::vector<int64_t> expectedReservedBytes({0, 0, 0, 0, 0, 0, 0});
   std::vector<int64_t> expectedMaxBytes({128, 64, 64, 64, 0, 64, 0});
 
   // Verify the stats still holds the correct stats.
   for (unsigned i = 0, e = tree.size(); i != e; ++i) {
-    ASSERT_GE(tree[i]->currentBytes(), expectedCurrentBytes[i]);
+    ASSERT_GE(tree[i]->reservedBytes(), expectedReservedBytes[i]);
     ASSERT_GE(tree[i]->stats().peakBytes, expectedMaxBytes[i]);
   }
 }
@@ -764,8 +966,7 @@ TEST_P(MemoryPoolTest, allocatorOverflow) {
 TEST_P(MemoryPoolTest, contiguousAllocate) {
   auto manager = getMemoryManager();
   auto pool = manager->addLeafPool("contiguousAllocate");
-  const auto largestSizeClass =
-      MemoryAllocator::getInstance()->largestSizeClass();
+  const auto largestSizeClass = manager->allocator()->largestSizeClass();
   struct {
     MachinePageCount numAllocPages;
     std::string debugString() const {
@@ -823,7 +1024,7 @@
     const MachinePageCount minSizeClass = folly::Random().oneIn(4) ?
0 : std::min( - MemoryAllocator::getInstance()->largestSizeClass(), + manager->allocator()->largestSizeClass(), folly::Random().rand32() % kMaxAllocationPages); pool->allocateContiguous(pagesToAllocate, allocation); numAllocatedPages += allocation.numPages(); @@ -846,9 +1047,13 @@ TEST_P(MemoryPoolTest, contiguousAllocate) { } TEST_P(MemoryPoolTest, contiguousAllocateExceedLimit) { - setupMemory({.capacity = (int64_t)(AllocationTraits::pageBytes(1 << 10))}); + const auto memCapacity = (int64_t)(AllocationTraits::pageBytes(1 << 10)); + setupMemory( + {.allocatorCapacity = memCapacity, + .arbitratorCapacity = memCapacity, + .arbitratorReservedCapacity = memCapacity / 2}); auto manager = getMemoryManager(); - const auto kMemoryCapBytes = allocator_->capacity(); + const auto kMemoryCapBytes = manager->capacity(); const auto kMaxNumPages = AllocationTraits::numPages(kMemoryCapBytes); auto root = manager->addRootPool("contiguousAllocateExceedLimit", kMemoryCapBytes); @@ -876,7 +1081,7 @@ TEST_P(MemoryPoolTest, badContiguousAllocation) { TEST_P(MemoryPoolTest, nonContiguousAllocate) { auto manager = getMemoryManager(); auto pool = manager->addLeafPool("nonContiguousAllocate"); - const auto& sizeClasses = MemoryAllocator::getInstance()->sizeClasses(); + const auto& sizeClasses = manager->allocator()->sizeClasses(); for (const auto& sizeClass : sizeClasses) { SCOPED_TRACE(fmt::format("sizeClass:{}", sizeClass)); struct { @@ -917,8 +1122,7 @@ TEST_P(MemoryPoolTest, nonContiguousAllocate) { testData.numAllocPages, allocation, std::min( - testData.minSizeClass, - MemoryAllocator::getInstance()->largestSizeClass())); + testData.minSizeClass, manager->allocator()->largestSizeClass())); ASSERT_FALSE(allocation.empty()); ASSERT_EQ(allocation.pool(), pool.get()); ASSERT_GT(allocation.numRuns(), 0); @@ -946,7 +1150,7 @@ TEST_P(MemoryPoolTest, nonContiguousAllocate) { const MachinePageCount minSizeClass = folly::Random().oneIn(4) ? 0 : std::min( - MemoryAllocator::getInstance()->largestSizeClass(), + manager->allocator()->largestSizeClass(), folly::Random().rand32() % kMaxAllocationPages); pool->allocateNonContiguous(pagesToAllocate, allocation, minSizeClass); numAllocatedPages += allocation.numPages(); @@ -960,10 +1164,15 @@ TEST_P(MemoryPoolTest, nonContiguousAllocate) { } TEST_P(MemoryPoolTest, allocationFailStats) { - setupMemory({.capacity = 16 * KB}); + setupMemory( + {.allocatorCapacity = 16 * KB, + .allocationSizeThresholdWithReservation = false, + .arbitratorCapacity = 16 * KB, + .arbitratorReservedCapacity = 16 * KB, + .memoryPoolReservedCapacity = 16 * KB}); auto manager = getMemoryManager(); auto pool = manager->addLeafPool("allocationFailStats"); - auto allocatorCapacity = allocator_->capacity(); + auto allocatorCapacity = manager->capacity(); EXPECT_THROW(pool->allocate(allocatorCapacity + 1), VeloxException); EXPECT_EQ(1, pool->stats().numAllocs); @@ -1091,8 +1300,8 @@ TEST_P(MemoryPoolTest, persistentNonContiguousAllocateFailure) { std::string debugString() const { return fmt::format( "numOldPages:{}, numNewPages:{}, injectedFailure:{}", - numOldPages, - numNewPages, + static_cast(numOldPages), + static_cast(numNewPages), injectedFailure); } } testSettings[] = {// Cap failure injection. 
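Editorial aside between hunks: the recurring migration in this file replaces a caller-constructed `MallocAllocator`/`MmapAllocator` handed to the manager by raw pointer with an allocator that the manager owns and sizes through `MemoryManagerOptions`. A minimal sketch of the resulting construction pattern, using only option fields and pool calls that appear in the surrounding hunks (the capacity value, function name, and pool names are illustrative, not part of the diff):

```cpp
#include "velox/common/memory/Memory.h"

using namespace facebook::velox::memory;

// Sketch: manager-owned allocator sized via options, instead of a
// caller-owned MemoryAllocator passed in by pointer.
void managerConstructionSketch() {
  MemoryManagerOptions options;
  options.allocatorCapacity = 512 << 20; // Illustrative 512MB budget.
  options.arbitratorKind = "SHARED"; // As in the arbitration tests below.
  MemoryManager manager{options};

  auto root = manager.addRootPool("sketchRoot");
  auto leaf = root->addLeafChild("sketchLeaf");
  void* buffer = leaf->allocate(1024);
  leaf->free(buffer, 1024);
}
```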
@@ -1177,11 +1386,12 @@ TEST_P(MemoryPoolTest, persistentNonContiguousAllocateFailure) { pool->allocateNonContiguous(testData.numOldPages, allocation); } ASSERT_GE(allocation.numPages(), testData.numOldPages); - allocator_->testingSetFailureInjection(testData.injectedFailure, true); + manager->allocator()->testingSetFailureInjection( + testData.injectedFailure, true); ASSERT_THROW( pool->allocateNonContiguous(testData.numNewPages, allocation), VeloxRuntimeError); - allocator_->testingClearFailureInjection(); + manager->allocator()->testingClearFailureInjection(); } } @@ -1189,73 +1399,92 @@ TEST_P(MemoryPoolTest, transientNonContiguousAllocateFailure) { struct { MachinePageCount numOldPages; MachinePageCount numNewPages; + MachinePageCount expectedAllocatedPages; MemoryAllocator::InjectedFailure injectedFailure; std::string debugString() const { return fmt::format( - "numOldPages:{}, numNewPages:{}, injectedFailure:{}", + "numOldPages:{}, numNewPages:{}, expectedAllocatedPages:{}, injectedFailure:{}", numOldPages, numNewPages, + expectedAllocatedPages, injectedFailure); } - } testSettings[] = {// Cap failure injection. - {0, 100, MemoryAllocator::InjectedFailure::kCap}, - {0, - Allocation::PageRun::kMaxPagesInRun / 2, - MemoryAllocator::InjectedFailure::kCap}, - {0, - Allocation::PageRun::kMaxPagesInRun, - MemoryAllocator::InjectedFailure::kCap}, - {100, 100, MemoryAllocator::InjectedFailure::kCap}, - {Allocation::PageRun::kMaxPagesInRun / 2, - Allocation::PageRun::kMaxPagesInRun / 2, - MemoryAllocator::InjectedFailure::kCap}, - {Allocation::PageRun::kMaxPagesInRun, - Allocation::PageRun::kMaxPagesInRun, - MemoryAllocator::InjectedFailure::kCap}, - {200, 100, MemoryAllocator::InjectedFailure::kCap}, - {Allocation::PageRun::kMaxPagesInRun / 2 + 100, - Allocation::PageRun::kMaxPagesInRun / 2, - MemoryAllocator::InjectedFailure::kCap}, - {Allocation::PageRun::kMaxPagesInRun, - Allocation::PageRun::kMaxPagesInRun - 1, - MemoryAllocator::InjectedFailure::kCap}, - {Allocation::PageRun::kMaxPagesInRun, - Allocation::PageRun::kMaxPagesInRun / 2, - MemoryAllocator::InjectedFailure::kCap}, - // Allocate failure injection. - {0, 100, MemoryAllocator::InjectedFailure::kAllocate}, - {0, - Allocation::PageRun::kMaxPagesInRun / 2, - MemoryAllocator::InjectedFailure::kAllocate}, - {0, - Allocation::PageRun::kMaxPagesInRun, - MemoryAllocator::InjectedFailure::kCap}, - {100, 100, MemoryAllocator::InjectedFailure::kAllocate}, - {Allocation::PageRun::kMaxPagesInRun / 2, - Allocation::PageRun::kMaxPagesInRun / 2, - MemoryAllocator::InjectedFailure::kAllocate}, - {Allocation::PageRun::kMaxPagesInRun, - Allocation::PageRun::kMaxPagesInRun, - MemoryAllocator::InjectedFailure::kAllocate}, - {200, 100, MemoryAllocator::InjectedFailure::kAllocate}, - {Allocation::PageRun::kMaxPagesInRun / 2 + 100, - Allocation::PageRun::kMaxPagesInRun / 2, - MemoryAllocator::InjectedFailure::kAllocate}, - {Allocation::PageRun::kMaxPagesInRun, - Allocation::PageRun::kMaxPagesInRun - 1, - MemoryAllocator::InjectedFailure::kAllocate}, - {Allocation::PageRun::kMaxPagesInRun, - Allocation::PageRun::kMaxPagesInRun / 2, - MemoryAllocator::InjectedFailure::kAllocate}, - // Madvise failure injection. 
- {0, 100, MemoryAllocator::InjectedFailure::kMadvise}, - {0, - Allocation::PageRun::kMaxPagesInRun / 2, - MemoryAllocator::InjectedFailure::kMadvise}, - {0, - Allocation::PageRun::kMaxPagesInRun, - MemoryAllocator::InjectedFailure::kMadvise}, - {200, 100, MemoryAllocator::InjectedFailure::kMadvise}}; + } testSettings[] = { + // Cap failure injection. + {0, 100, 100, MemoryAllocator::InjectedFailure::kCap}, + {0, + Allocation::PageRun::kMaxPagesInRun / 2, + Allocation::PageRun::kMaxPagesInRun / 2, + MemoryAllocator::InjectedFailure::kCap}, + {0, + Allocation::PageRun::kMaxPagesInRun, + Allocation::PageRun::kMaxPagesInRun + 1, + MemoryAllocator::InjectedFailure::kCap}, + {100, 100, 100, MemoryAllocator::InjectedFailure::kCap}, + {Allocation::PageRun::kMaxPagesInRun / 2, + Allocation::PageRun::kMaxPagesInRun / 2, + Allocation::PageRun::kMaxPagesInRun / 2, + MemoryAllocator::InjectedFailure::kCap}, + {Allocation::PageRun::kMaxPagesInRun, + Allocation::PageRun::kMaxPagesInRun, + Allocation::PageRun::kMaxPagesInRun + 1, + MemoryAllocator::InjectedFailure::kCap}, + {200, 100, 100, MemoryAllocator::InjectedFailure::kCap}, + {Allocation::PageRun::kMaxPagesInRun / 2 + 100, + Allocation::PageRun::kMaxPagesInRun / 2, + Allocation::PageRun::kMaxPagesInRun / 2, + MemoryAllocator::InjectedFailure::kCap}, + {Allocation::PageRun::kMaxPagesInRun, + Allocation::PageRun::kMaxPagesInRun - 1, + Allocation::PageRun::kMaxPagesInRun + 1, + MemoryAllocator::InjectedFailure::kCap}, + {Allocation::PageRun::kMaxPagesInRun, + Allocation::PageRun::kMaxPagesInRun / 2, + Allocation::PageRun::kMaxPagesInRun / 2, + MemoryAllocator::InjectedFailure::kCap}, + // Allocate failure injection. + {0, 100, 100, MemoryAllocator::InjectedFailure::kAllocate}, + {0, + Allocation::PageRun::kMaxPagesInRun / 2, + Allocation::PageRun::kMaxPagesInRun / 2, + MemoryAllocator::InjectedFailure::kAllocate}, + {0, + Allocation::PageRun::kMaxPagesInRun, + Allocation::PageRun::kMaxPagesInRun + 1, + MemoryAllocator::InjectedFailure::kCap}, + {100, 100, 100, MemoryAllocator::InjectedFailure::kAllocate}, + {Allocation::PageRun::kMaxPagesInRun / 2, + Allocation::PageRun::kMaxPagesInRun / 2, + Allocation::PageRun::kMaxPagesInRun / 2, + MemoryAllocator::InjectedFailure::kAllocate}, + {Allocation::PageRun::kMaxPagesInRun, + Allocation::PageRun::kMaxPagesInRun, + Allocation::PageRun::kMaxPagesInRun + 1, + MemoryAllocator::InjectedFailure::kAllocate}, + {200, 100, 100, MemoryAllocator::InjectedFailure::kAllocate}, + {Allocation::PageRun::kMaxPagesInRun / 2 + 100, + Allocation::PageRun::kMaxPagesInRun / 2, + Allocation::PageRun::kMaxPagesInRun / 2, + MemoryAllocator::InjectedFailure::kAllocate}, + {Allocation::PageRun::kMaxPagesInRun, + Allocation::PageRun::kMaxPagesInRun - 1, + Allocation::PageRun::kMaxPagesInRun + 1, + MemoryAllocator::InjectedFailure::kAllocate}, + {Allocation::PageRun::kMaxPagesInRun, + Allocation::PageRun::kMaxPagesInRun / 2, + Allocation::PageRun::kMaxPagesInRun / 2, + MemoryAllocator::InjectedFailure::kAllocate}, + // Madvise failure injection. 
+ {0, 100, 100, MemoryAllocator::InjectedFailure::kMadvise}, + {0, + Allocation::PageRun::kMaxPagesInRun / 2, + Allocation::PageRun::kMaxPagesInRun / 2, + MemoryAllocator::InjectedFailure::kMadvise}, + {0, + Allocation::PageRun::kMaxPagesInRun, + Allocation::PageRun::kMaxPagesInRun + 1, + MemoryAllocator::InjectedFailure::kMadvise}, + {200, 100, 100, MemoryAllocator::InjectedFailure::kMadvise}}; for (const auto& testData : testSettings) { SCOPED_TRACE(fmt::format( "{}, useMmap:{}, useCache:{}", @@ -1278,16 +1507,16 @@ TEST_P(MemoryPoolTest, transientNonContiguousAllocateFailure) { pool->allocateNonContiguous(testData.numOldPages, allocation); } ASSERT_GE(allocation.numPages(), testData.numOldPages); - allocator_->testingSetFailureInjection(testData.injectedFailure); + manager->allocator()->testingSetFailureInjection(testData.injectedFailure); if (useCache_) { pool->allocateNonContiguous(testData.numNewPages, allocation); - ASSERT_EQ(allocation.numPages(), testData.numNewPages); + ASSERT_EQ(allocation.numPages(), testData.expectedAllocatedPages); } else { ASSERT_THROW( pool->allocateNonContiguous(testData.numNewPages, allocation), VeloxRuntimeError); } - allocator_->testingClearFailureInjection(); + manager->allocator()->testingClearFailureInjection(); } } @@ -1435,7 +1664,8 @@ TEST_P(MemoryPoolTest, persistentContiguousAllocateFailure) { if (testData.numOldPages > 0) { pool->allocateContiguous(testData.numOldPages, allocation); } - allocator_->testingSetFailureInjection(testData.injectedFailure, true); + manager->allocator()->testingSetFailureInjection( + testData.injectedFailure, true); ASSERT_EQ(allocation.numPages(), testData.numOldPages); if ((testData.numOldPages >= testData.numNewPages) && testData.injectedFailure != MemoryAllocator::InjectedFailure::kMmap) { @@ -1446,7 +1676,7 @@ TEST_P(MemoryPoolTest, persistentContiguousAllocateFailure) { pool->allocateContiguous(testData.numNewPages, allocation), VeloxRuntimeError); } - allocator_->testingClearFailureInjection(); + manager->allocator()->testingClearFailureInjection(); } } @@ -1556,7 +1786,7 @@ TEST_P(MemoryPoolTest, transientContiguousAllocateFailure) { if (testData.numOldPages > 0) { pool->allocateContiguous(testData.numOldPages, allocation); } - allocator_->testingSetFailureInjection(testData.injectedFailure); + manager->allocator()->testingSetFailureInjection(testData.injectedFailure); ASSERT_EQ(allocation.numPages(), testData.numOldPages); // NOTE: AsyncDataCache will retry on the transient memory allocation // failures from the underlying allocator. 
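The `expectedAllocatedPages` column added above encodes the subtlety these hunks test: with the cache enabled, a transient injected failure is retried and the allocation succeeds, sometimes with more pages than requested (a full page run plus one). The recurring arm/exercise/disarm pattern looks roughly like the sketch below, assuming the fixture members (`useCache_`, `getMemoryManager()`) from the surrounding tests; the pool name is illustrative:

```cpp
// Sketch of the transient failure-injection pattern used by these tests.
auto manager = getMemoryManager();
auto pool = manager->addLeafPool("failureInjectionSketch");

// Arm a one-shot (non-persistent) allocation failure in the allocator.
manager->allocator()->testingSetFailureInjection(
    MemoryAllocator::InjectedFailure::kAllocate);

Allocation allocation;
if (useCache_) {
  // AsyncDataCache retries transient failures, so the call succeeds.
  pool->allocateNonContiguous(100, allocation);
  pool->freeNonContiguous(allocation);
} else {
  // Without the cache, the injected failure surfaces to the caller.
  ASSERT_THROW(
      pool->allocateNonContiguous(100, allocation), VeloxRuntimeError);
}

// Always disarm so later allocations in the test are unaffected.
manager->allocator()->testingClearFailureInjection();
```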
@@ -1570,14 +1800,17 @@ TEST_P(MemoryPoolTest, transientContiguousAllocateFailure) { pool->allocateContiguous(testData.numNewPages, allocation), VeloxRuntimeError); } - allocator_->testingClearFailureInjection(); + manager->allocator()->testingClearFailureInjection(); } } TEST_P(MemoryPoolTest, contiguousAllocateExceedMemoryPoolLimit) { const MachinePageCount kMaxNumPages = 1 << 10; const auto kMemoryCapBytes = kMaxNumPages * AllocationTraits::kPageSize; - setupMemory({.capacity = 1 << 30}); + setupMemory( + {.allocatorCapacity = 1 << 30, + .arbitratorCapacity = 1 << 30, + .arbitratorReservedCapacity = 128 * MB}); auto manager = getMemoryManager(); auto root = manager->addRootPool("contiguousAllocateExceedLimit", kMemoryCapBytes); @@ -1654,7 +1887,8 @@ TEST_P(MemoryPoolTest, persistentContiguousGrowAllocateFailure) { testData.numInitialPages, allocation, testData.numGrowPages + testData.numInitialPages); - allocator_->testingSetFailureInjection(testData.injectedFailure, true); + manager->allocator()->testingSetFailureInjection( + testData.injectedFailure, true); ASSERT_EQ(allocation.numPages(), testData.numInitialPages); ASSERT_EQ( allocation.maxSize(), @@ -1664,7 +1898,7 @@ TEST_P(MemoryPoolTest, persistentContiguousGrowAllocateFailure) { pool->growContiguous(testData.numGrowPages, allocation), testData.expectedErrorMessage); ASSERT_EQ(allocation.numPages(), testData.numInitialPages); - allocator_->testingClearFailureInjection(); + manager->allocator()->testingClearFailureInjection(); } } @@ -1709,7 +1943,8 @@ TEST_P(MemoryPoolTest, transientContiguousGrowAllocateFailure) { testData.numInitialPages, allocation, testData.numGrowPages + testData.numInitialPages); - allocator_->testingSetFailureInjection(testData.injectedFailure, false); + manager->allocator()->testingSetFailureInjection( + testData.injectedFailure, false); ASSERT_EQ(allocation.numPages(), testData.numInitialPages); ASSERT_EQ( allocation.maxSize(), @@ -1732,14 +1967,17 @@ TEST_P(MemoryPoolTest, transientContiguousGrowAllocateFailure) { VeloxRuntimeError); ASSERT_EQ(allocation.numPages(), testData.numInitialPages); } - allocator_->testingClearFailureInjection(); + manager->allocator()->testingClearFailureInjection(); } } TEST_P(MemoryPoolTest, contiguousAllocateGrowExceedMemoryPoolLimit) { const MachinePageCount kMaxNumPages = 1 << 10; const auto kMemoryCapBytes = kMaxNumPages * AllocationTraits::kPageSize; - setupMemory({.capacity = 1 << 30}); + setupMemory( + {.allocatorCapacity = 1 << 30, + .arbitratorCapacity = 1 << 30, + .arbitratorReservedCapacity = 128 * MB}); auto manager = getMemoryManager(); auto root = manager->addRootPool( "contiguousAllocateGrowExceedMemoryPoolLimit", kMemoryCapBytes); @@ -1757,9 +1995,9 @@ TEST_P(MemoryPoolTest, contiguousAllocateGrowExceedMemoryPoolLimit) { ASSERT_EQ(allocation.numPages(), kMaxNumPages / 2); } -TEST_P(MemoryPoolTest, badNonContiguousAllocation) { +TEST_P(MemoryPoolTest, nonContiguousAllocationBounds) { auto manager = getMemoryManager(); - auto pool = manager->addLeafPool("badNonContiguousAllocation"); + auto pool = manager->addLeafPool("nonContiguousAllocationBounds"); Allocation allocation; // Bad zero page allocation size. ASSERT_THROW(pool->allocateNonContiguous(0, allocation), VeloxRuntimeError); @@ -1767,23 +2005,25 @@ TEST_P(MemoryPoolTest, badNonContiguousAllocation) { // Set the num of pages to allocate exceeds one PageRun limit. 
constexpr MachinePageCount kNumPages =
       Allocation::PageRun::kMaxPagesInRun + 1;
-  ASSERT_THROW(
-      pool->allocateNonContiguous(kNumPages, allocation), VeloxRuntimeError);
+  pool->allocateNonContiguous(kNumPages, allocation);
+  ASSERT_GE(allocation.numPages(), kNumPages);
+  pool->freeNonContiguous(allocation);
   pool->allocateNonContiguous(kNumPages - 1, allocation);
   ASSERT_GE(allocation.numPages(), kNumPages - 1);
   pool->freeNonContiguous(allocation);
+  pool->allocateNonContiguous(
+      Allocation::PageRun::kMaxPagesInRun * 2, allocation);
+  ASSERT_GE(allocation.numPages(), Allocation::PageRun::kMaxPagesInRun * 2);
+  pool->freeNonContiguous(allocation);
 }
 
 TEST_P(MemoryPoolTest, nonContiguousAllocateExceedLimit) {
   const int64_t kMemoryCapBytes = AllocationTraits::pageBytes(1 << 10);
-  std::shared_ptr<MemoryAllocator> allocator;
-  if (useMmap_) {
-    MmapAllocator::Options mmapOpt{.capacity = (uint64_t)kMemoryCapBytes};
-    allocator = std::make_shared<MmapAllocator>(mmapOpt);
-  } else {
-    allocator = std::make_shared<MallocAllocator>(kMemoryCapBytes);
-  }
-  setupMemory({.capacity = kMemoryCapBytes, .allocator = allocator.get()});
+  setupMemory(
+      {.allocatorCapacity = kMemoryCapBytes,
+       .useMmapAllocator = useMmap_,
+       .arbitratorCapacity = kMemoryCapBytes,
+       .arbitratorReservedCapacity = kMemoryCapBytes / 2});
   auto manager = getMemoryManager();
   const MachinePageCount kMaxNumPages =
       AllocationTraits::numPages(kMemoryCapBytes);
@@ -1808,14 +2048,14 @@ TEST_P(MemoryPoolTest, nonContiguousAllocateError) {
   auto manager = getMemoryManager();
   auto pool = manager->addLeafPool("nonContiguousAllocateError");
-  allocator_->testingSetFailureInjection(
+  manager->allocator()->testingSetFailureInjection(
       MemoryAllocator::InjectedFailure::kAllocate, true);
   constexpr MachinePageCount kAllocSize = 8;
   std::unique_ptr<Allocation> allocation(new Allocation());
   ASSERT_THROW(
       pool->allocateNonContiguous(kAllocSize, *allocation), VeloxRuntimeError);
   ASSERT_TRUE(allocation->empty());
-  allocator_->testingClearFailureInjection();
+  manager->allocator()->testingClearFailureInjection();
   pool->allocateNonContiguous(kAllocSize, *allocation);
   pool->freeNonContiguous(*allocation);
   ASSERT_TRUE(allocation->empty());
@@ -1852,7 +2092,7 @@ TEST_P(MemoryPoolTest, mmapAllocatorCapAllocationError) {
     auto pool = root->addLeafChild(
         "mmapAllocatorCapAllocationError", isLeafThreadSafe_);
 
-    allocator_->testingSetFailureInjection(
+    manager->allocator()->testingSetFailureInjection(
         MemoryAllocator::InjectedFailure::kCap,
         testData.persistentErrorInjection);
     // Async data cache will retry transient memory allocation failure.
@@ -1863,7 +2103,7 @@ TEST_P(MemoryPoolTest, mmapAllocatorCapAllocationError) {
     } else {
       ASSERT_THROW(pool->allocate(testData.allocateBytes), VeloxRuntimeError);
     }
-    allocator_->testingClearFailureInjection();
+    manager->allocator()->testingClearFailureInjection();
   }
 }
@@ -1901,7 +2141,7 @@ TEST_P(MemoryPoolTest, mmapAllocatorCapAllocationZeroFilledError) {
     auto pool = root->addLeafChild(
         "mmapAllocatorCapAllocationZeroFilledError", isLeafThreadSafe_);
 
-    allocator_->testingSetFailureInjection(
+    manager->allocator()->testingSetFailureInjection(
        MemoryAllocator::InjectedFailure::kCap,
         testData.persistentErrorInjection);
     // Async data cache will retry transient memory allocation failure.
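The rename from `badNonContiguousAllocation` to `nonContiguousAllocationBounds` above reflects a behavior change: a request larger than `Allocation::PageRun::kMaxPagesInRun` used to throw and is now satisfied by spreading the pages over several runs. A minimal sketch of the new contract, reusing only calls that appear in the surrounding hunks; the pool name is illustrative, and the multi-run assertion is an inference from the page-run limit rather than an assertion the diff itself makes:

```cpp
// Sketch: oversized non-contiguous requests now span multiple page runs.
auto manager = getMemoryManager();
auto pool = manager->addLeafPool("multiRunSketch");

Allocation allocation;
const MachinePageCount kPages = Allocation::PageRun::kMaxPagesInRun * 2;
pool->allocateNonContiguous(kPages, allocation);
// No throw: the request is split across runs, so at least the requested
// page count is reached, presumably backed by more than one run.
ASSERT_GE(allocation.numPages(), kPages);
ASSERT_GT(allocation.numRuns(), 1);
pool->freeNonContiguous(allocation);
```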
@@ -1915,7 +2155,7 @@ TEST_P(MemoryPoolTest, mmapAllocatorCapAllocationZeroFilledError) { pool->allocateZeroFilled(testData.numEntries, testData.sizeEach), VeloxRuntimeError); } - allocator_->testingClearFailureInjection(); + manager->allocator()->testingClearFailureInjection(); } } @@ -1950,7 +2190,7 @@ TEST_P(MemoryPoolTest, mmapAllocatorCapReallocateError) { auto pool = root->addLeafChild( "mmapAllocatorCapReallocateError", isLeafThreadSafe_); - allocator_->testingSetFailureInjection( + manager->allocator()->testingSetFailureInjection( MemoryAllocator::InjectedFailure::kCap, testData.persistentErrorInjection); // Async data cache will retry transient memory allocation failure. @@ -1963,7 +2203,7 @@ TEST_P(MemoryPoolTest, mmapAllocatorCapReallocateError) { pool->reallocate(nullptr, 0, testData.allocateBytes), VeloxRuntimeError); } - allocator_->testingClearFailureInjection(); + manager->allocator()->testingClearFailureInjection(); } } @@ -2144,17 +2384,17 @@ TEST_P(MemoryPoolTest, concurrentUpdateToDifferentPools) { ASSERT_EQ(root->availableReservation(), 0); for (auto& child : childPools) { - ASSERT_EQ(child->currentBytes(), 0); + ASSERT_EQ(child->usedBytes(), 0); child->release(); ASSERT_EQ(child->reservedBytes(), 0); ASSERT_EQ(child->availableReservation(), 0); - ASSERT_EQ(child->currentBytes(), 0); + ASSERT_EQ(child->usedBytes(), 0); ASSERT_LE(child->stats().peakBytes, child->stats().cumulativeBytes); } ASSERT_LE(root->stats().peakBytes, root->stats().cumulativeBytes); childPools.clear(); ASSERT_LE(root->stats().peakBytes, root->stats().cumulativeBytes); - ASSERT_EQ(root->stats().currentBytes, 0); + ASSERT_EQ(root->stats().usedBytes, 0); } TEST_P(MemoryPoolTest, concurrentUpdatesToTheSamePool) { @@ -2194,17 +2434,17 @@ TEST_P(MemoryPoolTest, concurrentUpdatesToTheSamePool) { ASSERT_EQ(root->availableReservation(), 0); for (auto& child : childPools) { - ASSERT_EQ(child->currentBytes(), 0); + ASSERT_EQ(child->usedBytes(), 0); child->release(); ASSERT_EQ(child->reservedBytes(), 0); ASSERT_EQ(child->availableReservation(), 0); - ASSERT_EQ(child->currentBytes(), 0); + ASSERT_EQ(child->usedBytes(), 0); ASSERT_LE(child->stats().peakBytes, child->stats().cumulativeBytes); } ASSERT_LE(root->stats().peakBytes, root->stats().cumulativeBytes); childPools.clear(); ASSERT_LE(root->stats().peakBytes, root->stats().cumulativeBytes); - ASSERT_EQ(root->stats().currentBytes, 0); + ASSERT_EQ(root->stats().usedBytes, 0); } TEST_P(MemoryPoolTest, concurrentUpdateToSharedPools) { @@ -2216,7 +2456,7 @@ TEST_P(MemoryPoolTest, concurrentUpdateToSharedPools) { folly::Random::DefaultGenerator rng; rng.seed(1234); - const int32_t kNumOpsPerThread = 1'000; + const int32_t kNumOpsPerThread = 200; std::vector threads; threads.reserve(kNumThreads); for (size_t i = 0; i < kNumThreads; ++i) { @@ -2234,7 +2474,7 @@ TEST_P(MemoryPoolTest, concurrentUpdateToSharedPools) { } for (auto pool : manager.testingSharedLeafPools()) { - EXPECT_EQ(pool->currentBytes(), 0); + EXPECT_EQ(pool->usedBytes(), 0); } } @@ -2307,9 +2547,9 @@ TEST_P(MemoryPoolTest, concurrentPoolStructureAccess) { } TEST(MemoryPoolTest, visitChildren) { - auto allocator = - std::make_shared(MemoryAllocator::kDefaultCapacityBytes); - MemoryManager manager{{.allocator = allocator.get()}}; + MemoryManagerOptions options; + options.allocatorCapacity = kMaxMemory; + MemoryManager manager{options}; auto root = manager.addRootPool("root"); const int numChildren = 10; @@ -2355,13 +2595,13 @@ TEST(MemoryPoolTest, debugMode) { uint64_t size) { for (const auto& pair : 
records) { EXPECT_EQ(pair.second.size, size); - EXPECT_FALSE(pair.second.callStack.empty()); } }; - auto allocator = - std::make_shared(MemoryAllocator::kDefaultCapacityBytes); - MemoryManager manager{{.debugEnabled = true, .allocator = allocator.get()}}; + MemoryManagerOptions options; + options.allocatorCapacity = kMaxMemory; + options.debugEnabled = true; + MemoryManager manager{options}; auto pool = manager.addRootPool("root")->addLeafChild("child"); const auto& allocRecords = std::dynamic_pointer_cast(pool) ->testingDebugAllocRecords(); @@ -2410,11 +2650,8 @@ TEST(MemoryPoolTest, debugModeWithFilter) { const std::vector kAllocSizes = {128, 8 * KB, 2 * MB}; const std::vector debugEnabledSet{true, false}; for (const auto& debugEnabled : debugEnabledSet) { - auto allocator = std::make_shared(kMaxMemory); MemoryManager manager{ - {.capacity = kMaxMemory, - .debugEnabled = debugEnabled, - .allocator = allocator.get()}}; + {.debugEnabled = debugEnabled, .allocatorCapacity = kMaxMemory}}; // leaf child created from MemoryPool, not match filter MemoryPoolImpl::setDebugPoolNameRegex("NO-MATCH"); @@ -2543,7 +2780,7 @@ TEST_P(MemoryPoolTest, shrinkAndGrowAPIs) { ASSERT_EQ(aggregationPool->freeBytes(), capacity); } if (capacity == 0) { - ASSERT_ANY_THROW(leafPool->allocate(allocationSize)); + VELOX_ASSERT_THROW(leafPool->allocate(allocationSize), ""); ASSERT_EQ(leafPool->shrink(0), 0); ASSERT_EQ(leafPool->shrink(allocationSize), 0); continue; @@ -2553,15 +2790,15 @@ TEST_P(MemoryPoolTest, shrinkAndGrowAPIs) { ASSERT_EQ(rootPool->freeBytes(), 0); ASSERT_EQ(leafPool->freeBytes(), 0); ASSERT_EQ(aggregationPool->freeBytes(), 0); - ASSERT_ANY_THROW(leafPool->shrink(0)); - ASSERT_ANY_THROW(leafPool->shrink(allocationSize)); - ASSERT_ANY_THROW(leafPool->shrink(kMaxMemory)); - ASSERT_ANY_THROW(aggregationPool->shrink(0)); - ASSERT_ANY_THROW(aggregationPool->shrink(allocationSize)); - ASSERT_ANY_THROW(aggregationPool->shrink(kMaxMemory)); - ASSERT_ANY_THROW(rootPool->shrink(0)); - ASSERT_ANY_THROW(rootPool->shrink(allocationSize)); - ASSERT_ANY_THROW(rootPool->shrink(kMaxMemory)); + VELOX_ASSERT_THROW(leafPool->shrink(0), ""); + VELOX_ASSERT_THROW(leafPool->shrink(allocationSize), ""); + VELOX_ASSERT_THROW(leafPool->shrink(kMaxMemory), ""); + VELOX_ASSERT_THROW(aggregationPool->shrink(0), ""); + VELOX_ASSERT_THROW(aggregationPool->shrink(allocationSize), ""); + VELOX_ASSERT_THROW(aggregationPool->shrink(kMaxMemory), ""); + VELOX_ASSERT_THROW(rootPool->shrink(0), ""); + VELOX_ASSERT_THROW(rootPool->shrink(allocationSize), ""); + VELOX_ASSERT_THROW(rootPool->shrink(kMaxMemory), ""); leafPool->free(buffer, allocationSize); continue; } @@ -2627,11 +2864,14 @@ TEST_P(MemoryPoolTest, shrinkAndGrowAPIs) { for (int i = 0; i < step; ++i) { const int expectedCapacity = (i + 1) * allocationSize; if (i % 3 == 0) { - ASSERT_EQ(leafPool->grow(allocationSize), expectedCapacity); + ASSERT_TRUE(leafPool->grow(allocationSize, 0)); + ASSERT_EQ(leafPool->capacity(), expectedCapacity); } else if (i % 3 == 1) { - ASSERT_EQ(aggregationPool->grow(allocationSize), expectedCapacity); + ASSERT_TRUE(aggregationPool->grow(allocationSize, 0)); + ASSERT_EQ(leafPool->capacity(), expectedCapacity); } else { - ASSERT_EQ(rootPool->grow(allocationSize), expectedCapacity); + ASSERT_TRUE(rootPool->grow(allocationSize, 0)); + ASSERT_EQ(leafPool->capacity(), expectedCapacity); } ASSERT_EQ(leafPool->capacity(), expectedCapacity); ASSERT_EQ(aggregationPool->capacity(), expectedCapacity); @@ -2776,12 +3016,10 @@ TEST_P(MemoryPoolTest, 
reclaimAPIsWithDefaultReclaimer) { } } for (auto& pool : pools) { - uint64_t reclaimableBytes{100}; - ASSERT_FALSE(pool->reclaimableBytes(reclaimableBytes)); - ASSERT_EQ(reclaimableBytes, 0); - ASSERT_EQ(pool->reclaim(0, stats_), 0); - ASSERT_EQ(pool->reclaim(100, stats_), 0); - ASSERT_EQ(pool->reclaim(kMaxMemory, stats_), 0); + ASSERT_FALSE(pool->reclaimableBytes().has_value()); + ASSERT_EQ(pool->reclaim(0, 0, stats_), 0); + ASSERT_EQ(pool->reclaim(100, 0, stats_), 0); + ASSERT_EQ(pool->reclaim(kMaxMemory, 0, stats_), 0); } for (const auto& allocation : allocations) { allocation.pool->free(allocation.buffer, allocation.size); @@ -2824,7 +3062,7 @@ TEST_P(MemoryPoolTest, statsAndToString) { void* buf1 = leafChild1->allocate(bufferSize); ASSERT_EQ( leafChild1->stats().toString(), - "currentBytes:1.00KB peakBytes:1.00KB cumulativeBytes:1.00KB numAllocs:1 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0"); + "usedBytes:1.00KB reservedBytes:1.00MB peakBytes:1.00KB cumulativeBytes:1.00KB numAllocs:1 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0 numCapacityGrowths:0"); ASSERT_EQ( leafChild1->toString(), fmt::format( @@ -2833,7 +3071,7 @@ TEST_P(MemoryPoolTest, statsAndToString) { isLeafThreadSafe_ ? "thread-safe" : "non-thread-safe")); ASSERT_EQ( leafChild2->stats().toString(), - "currentBytes:0B peakBytes:0B cumulativeBytes:0B numAllocs:0 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0"); + "usedBytes:0B reservedBytes:0B peakBytes:0B cumulativeBytes:0B numAllocs:0 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0 numCapacityGrowths:0"); ASSERT_EQ( leafChild1->toString(), fmt::format( @@ -2842,36 +3080,36 @@ TEST_P(MemoryPoolTest, statsAndToString) { isLeafThreadSafe_ ? 
"thread-safe" : "non-thread-safe")); ASSERT_EQ( aggregateChild->stats().toString(), - "currentBytes:0B peakBytes:0B cumulativeBytes:0B numAllocs:0 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0"); + "usedBytes:0B reservedBytes:0B peakBytes:0B cumulativeBytes:0B numAllocs:0 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0 numCapacityGrowths:0"); ASSERT_EQ( root->stats().toString(), - "currentBytes:1.00MB peakBytes:1.00MB cumulativeBytes:1.00MB numAllocs:0 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0"); + "usedBytes:1.00KB reservedBytes:1.00MB peakBytes:1.00MB cumulativeBytes:1.00MB numAllocs:0 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0 numCapacityGrowths:0"); void* buf2 = leafChild2->allocate(bufferSize); ASSERT_EQ( leafChild1->stats().toString(), - "currentBytes:1.00KB peakBytes:1.00KB cumulativeBytes:1.00KB numAllocs:1 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0"); + "usedBytes:1.00KB reservedBytes:1.00MB peakBytes:1.00KB cumulativeBytes:1.00KB numAllocs:1 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0 numCapacityGrowths:0"); ASSERT_EQ( leafChild2->stats().toString(), - "currentBytes:1.00KB peakBytes:1.00KB cumulativeBytes:1.00KB numAllocs:1 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0"); + "usedBytes:1.00KB reservedBytes:1.00MB peakBytes:1.00KB cumulativeBytes:1.00KB numAllocs:1 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0 numCapacityGrowths:0"); ASSERT_EQ( aggregateChild->stats().toString(), - "currentBytes:1.00MB peakBytes:1.00MB cumulativeBytes:1.00MB numAllocs:0 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0"); + "usedBytes:1.00KB reservedBytes:1.00MB peakBytes:1.00MB cumulativeBytes:1.00MB numAllocs:0 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0 numCapacityGrowths:0"); ASSERT_EQ( root->stats().toString(), - "currentBytes:2.00MB peakBytes:2.00MB cumulativeBytes:2.00MB numAllocs:0 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0"); + "usedBytes:2.00KB reservedBytes:2.00MB peakBytes:2.00MB cumulativeBytes:2.00MB numAllocs:0 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0 numCapacityGrowths:0"); leafChild1->free(buf1, bufferSize); ASSERT_EQ( leafChild1->stats().toString(), - "currentBytes:0B peakBytes:1.00KB cumulativeBytes:1.00KB numAllocs:1 numFrees:1 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0"); + "usedBytes:0B reservedBytes:0B peakBytes:1.00KB cumulativeBytes:1.00KB numAllocs:1 numFrees:1 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0 numCapacityGrowths:0"); ASSERT_EQ( leafChild2->stats().toString(), - "currentBytes:1.00KB peakBytes:1.00KB cumulativeBytes:1.00KB numAllocs:1 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0"); + "usedBytes:1.00KB reservedBytes:1.00MB peakBytes:1.00KB cumulativeBytes:1.00KB numAllocs:1 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0 numCapacityGrowths:0"); ASSERT_EQ( aggregateChild->stats().toString(), - "currentBytes:1.00MB peakBytes:1.00MB cumulativeBytes:1.00MB numAllocs:0 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0"); + "usedBytes:1.00KB reservedBytes:1.00MB 
peakBytes:1.00MB cumulativeBytes:1.00MB numAllocs:0 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0 numCapacityGrowths:0");
   ASSERT_EQ(
       root->stats().toString(),
-      "currentBytes:1.00MB peakBytes:2.00MB cumulativeBytes:2.00MB numAllocs:0 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0");
+      "usedBytes:1.00KB reservedBytes:1.00MB peakBytes:2.00MB cumulativeBytes:2.00MB numAllocs:0 numFrees:0 numReserves:0 numReleases:0 numShrinks:0 numReclaims:0 numCollisions:0 numCapacityGrowths:0");
   leafChild2->free(buf2, bufferSize);
   std::vector<void*> bufs;
   for (int i = 0; i < 10; ++i) {
@@ -2884,14 +3122,16 @@ TEST_P(MemoryPoolTest, statsAndToString) {
   ASSERT_EQ(root->stats().peakBytes, 2097152);
   ASSERT_EQ(root->peakBytes(), 2097152);
   ASSERT_EQ(root->stats().cumulativeBytes, 3145728);
-  ASSERT_EQ(root->stats().currentBytes, 1048576);
+  ASSERT_EQ(root->stats().usedBytes, 10240);
+  ASSERT_EQ(root->stats().numCapacityGrowths, 0);
   ASSERT_EQ(leafChild1->stats().numAllocs, 11);
   ASSERT_EQ(leafChild1->stats().numFrees, 1);
-  ASSERT_EQ(leafChild1->stats().currentBytes, 10240);
+  ASSERT_EQ(leafChild1->stats().usedBytes, 10240);
   ASSERT_EQ(leafChild1->stats().peakBytes, 10240);
   ASSERT_EQ(leafChild1->stats().cumulativeBytes, 11264);
   ASSERT_EQ(leafChild1->stats().numReserves, 0);
   ASSERT_EQ(leafChild1->stats().numReleases, 0);
+  ASSERT_EQ(leafChild1->stats().numCapacityGrowths, 0);
   for (auto* buf : bufs) {
     leafChild1->free(buf, bufferSize);
   }
@@ -2902,32 +3142,36 @@ TEST_P(MemoryPoolTest, statsAndToString) {
   ASSERT_EQ(root->stats().peakBytes, 2097152);
   ASSERT_EQ(root->peakBytes(), 2097152);
   ASSERT_EQ(root->stats().cumulativeBytes, 3145728);
-  ASSERT_EQ(root->stats().currentBytes, 0);
+  ASSERT_EQ(root->stats().usedBytes, 0);
+  ASSERT_EQ(root->stats().numCapacityGrowths, 0);
   ASSERT_EQ(leafChild1->stats().numAllocs, 11);
   ASSERT_EQ(leafChild1->stats().numFrees, 11);
-  ASSERT_EQ(leafChild1->stats().currentBytes, 0);
+  ASSERT_EQ(leafChild1->stats().usedBytes, 0);
   ASSERT_EQ(leafChild1->stats().peakBytes, 10240);
   ASSERT_EQ(leafChild1->stats().cumulativeBytes, 11264);
   ASSERT_EQ(leafChild1->stats().numReserves, 0);
   ASSERT_EQ(leafChild1->stats().numReleases, 0);
+  ASSERT_EQ(leafChild1->stats().numCapacityGrowths, 0);
   leafChild1->maybeReserve(bufferSize);
   ASSERT_EQ(leafChild1->stats().numAllocs, 11);
   ASSERT_EQ(leafChild1->stats().numFrees, 11);
-  ASSERT_EQ(leafChild1->stats().currentBytes, 0);
+  ASSERT_EQ(leafChild1->stats().usedBytes, 0);
   ASSERT_EQ(leafChild1->stats().peakBytes, 10240);
   ASSERT_EQ(leafChild1->peakBytes(), 10240);
   ASSERT_EQ(leafChild1->stats().cumulativeBytes, 11264);
   ASSERT_EQ(leafChild1->stats().numReserves, 1);
   ASSERT_EQ(leafChild1->stats().numReleases, 0);
+  ASSERT_EQ(leafChild1->stats().numCapacityGrowths, 0);
   leafChild1->release();
   ASSERT_EQ(leafChild1->stats().numAllocs, 11);
   ASSERT_EQ(leafChild1->stats().numFrees, 11);
-  ASSERT_EQ(leafChild1->stats().currentBytes, 0);
+  ASSERT_EQ(leafChild1->stats().usedBytes, 0);
   ASSERT_EQ(leafChild1->stats().cumulativeBytes, 11264);
   ASSERT_EQ(leafChild1->stats().peakBytes, 10240);
   ASSERT_EQ(leafChild1->peakBytes(), 10240);
   ASSERT_EQ(leafChild1->stats().numReserves, 1);
   ASSERT_EQ(leafChild1->stats().numReleases, 1);
+  ASSERT_EQ(leafChild1->stats().numCapacityGrowths, 0);
 }
 
 struct Buffer {
@@ -2937,7 +3181,13 @@ struct Buffer {
 
 TEST_P(MemoryPoolTest, memoryUsageUpdateCheck) {
   constexpr int64_t kMaxSize = 1 << 30; // 1GB
-  setupMemory({.capacity = kMaxSize});
+
setupMemory( + {.allocatorCapacity = kMaxSize, + .allocationSizeThresholdWithReservation = false, + .arbitratorCapacity = kMaxSize, + .arbitratorReservedCapacity = 128 << 20}); + auto manager = getMemoryManager(); auto root = manager->addRootPool("memoryUsageUpdate", kMaxSize); @@ -2946,96 +3196,109 @@ TEST_P(MemoryPoolTest, memoryUsageUpdateCheck) { ASSERT_THROW(child1->allocate(2 * kMaxSize), VeloxRuntimeError); - ASSERT_EQ(root->stats().currentBytes, 0); + ASSERT_EQ(root->stats().usedBytes, 0); + ASSERT_EQ(root->stats().reservedBytes, 0); ASSERT_EQ(root->stats().cumulativeBytes, 0); ASSERT_EQ(root->reservedBytes(), 0); std::vector buffers; buffers.emplace_back(Buffer{child1->allocate(1000), 1000}); // The memory pool do alignment internally. - ASSERT_EQ(child1->stats().currentBytes, 1024); - ASSERT_EQ(child1->currentBytes(), 1024); + ASSERT_EQ(child1->stats().usedBytes, 1024); + ASSERT_EQ(root->stats().reservedBytes, MB); + ASSERT_EQ(child1->usedBytes(), 1024); ASSERT_EQ(child1->reservedBytes(), MB); + ASSERT_EQ(child1->stats().reservedBytes, MB); ASSERT_EQ(child1->stats().cumulativeBytes, 1024); - ASSERT_EQ(root->currentBytes(), MB); + ASSERT_EQ(root->reservedBytes(), MB); ASSERT_EQ(root->stats().cumulativeBytes, MB); ASSERT_EQ(MB - 1024, child1->availableReservation()); buffers.emplace_back(Buffer{child1->allocate(1000), 1000}); - ASSERT_EQ(child1->stats().currentBytes, 2048); - ASSERT_EQ(child1->currentBytes(), 2048); - ASSERT_EQ(root->currentBytes(), MB); - ASSERT_EQ(root->stats().currentBytes, MB); + ASSERT_EQ(child1->stats().usedBytes, 2048); + ASSERT_EQ(child1->usedBytes(), 2048); + ASSERT_EQ(child1->stats().reservedBytes, MB); + ASSERT_EQ(root->reservedBytes(), MB); + ASSERT_EQ(root->stats().usedBytes, 2048); ASSERT_EQ(root->stats().cumulativeBytes, MB); + ASSERT_EQ(root->stats().reservedBytes, MB); buffers.emplace_back(Buffer{child1->allocate(MB), MB}); - ASSERT_EQ(child1->stats().currentBytes, 2048 + MB); - ASSERT_EQ(child1->currentBytes(), 2048 + MB); - ASSERT_EQ(root->currentBytes(), 2 * MB); - ASSERT_EQ(root->stats().currentBytes, 2 * MB); + ASSERT_EQ(child1->stats().usedBytes, 2048 + MB); + ASSERT_EQ(child1->stats().reservedBytes, 2 * MB); + ASSERT_EQ(child1->usedBytes(), 2048 + MB); + ASSERT_EQ(root->reservedBytes(), 2 * MB); + ASSERT_EQ(root->stats().usedBytes, 2048 + MB); ASSERT_EQ(root->stats().cumulativeBytes, 2 * MB); + ASSERT_EQ(root->stats().reservedBytes, 2 * MB); buffers.emplace_back(Buffer{child1->allocate(100 * MB), 100 * MB}); - ASSERT_EQ(child1->currentBytes(), 2048 + 101 * MB); - ASSERT_EQ(child1->stats().currentBytes, 2048 + 101 * MB); + ASSERT_EQ(child1->usedBytes(), 2048 + 101 * MB); + ASSERT_EQ(child1->stats().usedBytes, 2048 + 101 * MB); + ASSERT_EQ(child1->stats().reservedBytes, 104 * MB); ASSERT_EQ(child1->reservedBytes(), 104 * MB); ASSERT_EQ( child1->availableReservation(), - child1->reservedBytes() - child1->currentBytes()); + child1->reservedBytes() - child1->usedBytes()); // Larger sizes round up to next 8MB. 
- ASSERT_EQ(root->currentBytes(), 104 * MB); - ASSERT_EQ(root->stats().currentBytes, 104 * MB); + ASSERT_EQ(root->reservedBytes(), 104 * MB); + ASSERT_EQ(root->stats().usedBytes, 2048 + 101 * MB); ASSERT_EQ(root->stats().cumulativeBytes, 104 * MB); + ASSERT_EQ(root->stats().reservedBytes, 104 * MB); ASSERT_EQ(root->availableReservation(), 0); child1->free(buffers[0].data, buffers[0].length); - ASSERT_EQ(child1->currentBytes(), 1024 + 101 * MB); - ASSERT_EQ(child1->stats().currentBytes, 1024 + 101 * MB); + ASSERT_EQ(child1->usedBytes(), 1024 + 101 * MB); + ASSERT_EQ(child1->stats().usedBytes, 1024 + 101 * MB); ASSERT_EQ(child1->stats().cumulativeBytes, 2048 + 101 * MB); ASSERT_EQ(child1->reservedBytes(), 104 * MB); ASSERT_EQ( child1->availableReservation(), - child1->reservedBytes() - child1->currentBytes()); - ASSERT_EQ(root->currentBytes(), 104 * MB); - ASSERT_EQ(root->stats().currentBytes, 104 * MB); + child1->reservedBytes() - child1->usedBytes()); + ASSERT_EQ(root->reservedBytes(), 104 * MB); + ASSERT_EQ(root->stats().usedBytes, 1024 + 101 * MB); ASSERT_EQ(root->stats().cumulativeBytes, 104 * MB); + ASSERT_EQ(root->stats().reservedBytes, 104 * MB); ASSERT_EQ(root->availableReservation(), 0); child1->free(buffers[2].data, buffers[2].length); - ASSERT_EQ(child1->currentBytes(), 1024 + 100 * MB); - ASSERT_EQ(child1->stats().currentBytes, 1024 + 100 * MB); + ASSERT_EQ(child1->usedBytes(), 1024 + 100 * MB); + ASSERT_EQ(child1->stats().usedBytes, 1024 + 100 * MB); ASSERT_EQ(child1->stats().cumulativeBytes, 2048 + 101 * MB); ASSERT_EQ(child1->reservedBytes(), 104 * MB); ASSERT_EQ( child1->availableReservation(), - child1->reservedBytes() - child1->currentBytes()); - ASSERT_EQ(root->currentBytes(), 104 * MB); - ASSERT_EQ(root->stats().currentBytes, 104 * MB); + child1->reservedBytes() - child1->usedBytes()); + ASSERT_EQ(root->reservedBytes(), 104 * MB); + ASSERT_EQ(root->stats().usedBytes, 1024 + 100 * MB); ASSERT_EQ(root->stats().cumulativeBytes, 104 * MB); + ASSERT_EQ(root->stats().reservedBytes, 104 * MB); ASSERT_EQ(root->availableReservation(), 0); child1->free(buffers[3].data, buffers[3].length); - ASSERT_EQ(child1->currentBytes(), 1024); - ASSERT_EQ(child1->stats().currentBytes, 1024); + ASSERT_EQ(child1->usedBytes(), 1024); + ASSERT_EQ(child1->stats().usedBytes, 1024); ASSERT_EQ(child1->stats().cumulativeBytes, 2048 + 101 * MB); ASSERT_EQ(child1->reservedBytes(), MB); ASSERT_EQ( child1->availableReservation(), - child1->reservedBytes() - child1->currentBytes()); - ASSERT_EQ(root->currentBytes(), MB); - ASSERT_EQ(root->stats().currentBytes, MB); + child1->reservedBytes() - child1->usedBytes()); + ASSERT_EQ(root->reservedBytes(), MB); + ASSERT_EQ(root->stats().usedBytes, 1024); ASSERT_EQ(root->stats().cumulativeBytes, 104 * MB); + ASSERT_EQ(root->stats().reservedBytes, MB); ASSERT_EQ(root->availableReservation(), 0); child1->free(buffers[1].data, buffers[1].length); - ASSERT_EQ(child1->currentBytes(), 0); - ASSERT_EQ(child1->stats().currentBytes, 0); + ASSERT_EQ(child1->usedBytes(), 0); + ASSERT_EQ(child1->stats().usedBytes, 0); ASSERT_EQ(child1->stats().cumulativeBytes, 2048 + 101 * MB); ASSERT_EQ(child1->reservedBytes(), 0); ASSERT_EQ(child1->availableReservation(), 0); - ASSERT_EQ(root->currentBytes(), 0); - ASSERT_EQ(root->stats().currentBytes, 0); + ASSERT_EQ(root->reservedBytes(), 0); + ASSERT_EQ(root->stats().usedBytes, 0); ASSERT_EQ(root->stats().cumulativeBytes, 104 * MB); + ASSERT_EQ(root->stats().reservedBytes, 0); ASSERT_EQ(root->availableReservation(), 0); 
ASSERT_EQ(root->stats().numAllocs, 0); @@ -3057,7 +3320,10 @@ TEST_P(MemoryPoolTest, memoryUsageUpdateCheck) { TEST_P(MemoryPoolTest, maybeReserve) { constexpr int64_t kMaxSize = 1 << 30; // 1GB - setupMemory({.capacity = kMaxSize}); + setupMemory( + {.allocatorCapacity = kMaxSize, + .arbitratorCapacity = kMaxSize, + .arbitratorReservedCapacity = kMaxSize / 8}); auto manager = getMemoryManager(); auto root = manager->addRootPool("reserve", kMaxSize); @@ -3068,67 +3334,79 @@ TEST_P(MemoryPoolTest, maybeReserve) { child->maybeReserve(100 * MB); // The reservation child shows up as a reservation on the child and as an // allocation on the parent. - ASSERT_EQ(child->currentBytes(), 0); - ASSERT_EQ(child->stats().currentBytes, 0); + ASSERT_EQ(child->usedBytes(), 0); + ASSERT_EQ(child->stats().usedBytes, 0); ASSERT_EQ(child->stats().cumulativeBytes, 0); + ASSERT_EQ(child->stats().reservedBytes, 104 * MB); ASSERT_EQ(child->availableReservation(), 104 * MB); - ASSERT_EQ(root->currentBytes(), 104 * MB); - ASSERT_EQ(root->stats().currentBytes, 104 * MB); + ASSERT_EQ(root->reservedBytes(), 104 * MB); + ASSERT_EQ(root->stats().usedBytes, 0); + ASSERT_EQ(root->stats().reservedBytes, 104 * MB); ASSERT_EQ(root->availableReservation(), 0); std::vector buffers; buffers.emplace_back(Buffer{child->allocate(60 * MB), 60 * MB}); - ASSERT_EQ(child->currentBytes(), 60 * MB); - ASSERT_EQ(child->stats().currentBytes, 60 * MB); + ASSERT_EQ(child->usedBytes(), 60 * MB); + ASSERT_EQ(child->stats().usedBytes, 60 * MB); ASSERT_EQ(child->reservedBytes(), 104 * MB); + ASSERT_EQ(child->stats().reservedBytes, 104 * MB); ASSERT_EQ( child->availableReservation(), - child->reservedBytes() - child->currentBytes()); - ASSERT_EQ(root->currentBytes(), 104 * MB); + child->reservedBytes() - child->usedBytes()); + ASSERT_EQ(root->reservedBytes(), 104 * MB); + ASSERT_EQ(root->stats().reservedBytes, 104 * MB); ASSERT_EQ(root->availableReservation(), 0); buffers.emplace_back(Buffer{child->allocate(70 * MB), 70 * MB}); - ASSERT_EQ(child->currentBytes(), 130 * MB); - ASSERT_EQ(child->stats().currentBytes, 130 * MB); + ASSERT_EQ(child->usedBytes(), 130 * MB); + ASSERT_EQ(child->stats().usedBytes, 130 * MB); ASSERT_EQ(child->reservedBytes(), 136 * MB); + ASSERT_EQ(child->stats().reservedBytes, 136 * MB); ASSERT_EQ( child->availableReservation(), - child->reservedBytes() - child->currentBytes()); + child->reservedBytes() - child->usedBytes()); // Extended and rounded up the reservation to then next 8MB. - ASSERT_EQ(root->currentBytes(), 136 * MB); + ASSERT_EQ(root->reservedBytes(), 136 * MB); + ASSERT_EQ(root->stats().reservedBytes, 136 * MB); ASSERT_EQ(root->availableReservation(), 0); child->free(buffers[0].data, buffers[0].length); - ASSERT_EQ(child->currentBytes(), 70 * MB); - ASSERT_EQ(child->stats().currentBytes, 70 * MB); + ASSERT_EQ(child->usedBytes(), 70 * MB); + ASSERT_EQ(child->stats().usedBytes, 70 * MB); // Extended and rounded up the reservation to then next 8MB. ASSERT_EQ(child->reservedBytes(), 104 * MB); + ASSERT_EQ(child->stats().reservedBytes, 104 * MB); ASSERT_EQ( child->availableReservation(), - child->reservedBytes() - child->currentBytes()); - ASSERT_EQ(root->currentBytes(), 104 * MB); + child->reservedBytes() - child->usedBytes()); + ASSERT_EQ(root->reservedBytes(), 104 * MB); + ASSERT_EQ(root->stats().reservedBytes, 104 * MB); ASSERT_EQ(root->availableReservation(), 0); child->free(buffers[1].data, buffers[1].length); // The reservation goes down to the explicitly made reservation. 
- ASSERT_EQ(child->currentBytes(), 0); - ASSERT_EQ(child->stats().currentBytes, 0); + ASSERT_EQ(child->usedBytes(), 0); + ASSERT_EQ(child->stats().usedBytes, 0); ASSERT_EQ(child->reservedBytes(), 104 * MB); + ASSERT_EQ(child->stats().reservedBytes, 104 * MB); ASSERT_EQ( child->availableReservation(), - child->reservedBytes() - child->currentBytes()); - ASSERT_EQ(root->currentBytes(), 104 * MB); + child->reservedBytes() - child->usedBytes()); + ASSERT_EQ(root->reservedBytes(), 104 * MB); + ASSERT_EQ(root->stats().reservedBytes, 104 * MB); ASSERT_EQ(root->availableReservation(), 0); child->release(); - ASSERT_EQ(child->currentBytes(), 0); - ASSERT_EQ(child->stats().currentBytes, 0); + ASSERT_EQ(child->usedBytes(), 0); + ASSERT_EQ(child->stats().usedBytes, 0); ASSERT_EQ(child->reservedBytes(), 0); + ASSERT_EQ(child->stats().reservedBytes, 0); ASSERT_EQ(child->availableReservation(), 0); - ASSERT_EQ(root->currentBytes(), 0); + ASSERT_EQ(root->reservedBytes(), 0); ASSERT_EQ(root->availableReservation(), 0); + ASSERT_EQ(root->stats().reservedBytes, 0); ASSERT_EQ(root->stats().numAllocs, 0); ASSERT_EQ(root->stats().numFrees, 0); @@ -3149,7 +3427,11 @@ TEST_P(MemoryPoolTest, maybeReserve) { TEST_P(MemoryPoolTest, maybeReserveFailWithAbort) { constexpr int64_t kMaxSize = 1 * GB; // 1GB - setupMemory({.capacity = kMaxSize, .arbitratorKind = "SHARED"}); + setupMemory( + {.allocatorCapacity = kMaxSize, + .arbitratorCapacity = kMaxSize, + .arbitratorReservedCapacity = kMaxSize / 8, + .arbitratorKind = "SHARED"}); MemoryManager& manager = *getMemoryManager(); auto root = manager.addRootPool( "maybeReserveFailWithAbort", kMaxSize, MemoryReclaimer::create()); @@ -3161,39 +3443,17 @@ TEST_P(MemoryPoolTest, maybeReserveFailWithAbort) { ASSERT_TRUE(child->aborted()); ASSERT_TRUE(root->aborted()); VELOX_ASSERT_THROW( - child->maybeReserve(2 * kMaxSize), "This memory pool has been aborted."); -} - -// Model implementation of a GrowCallback. -bool grow(int64_t size, int64_t hardLimit, MemoryPool& pool) { - static std::mutex mutex; - // The calls from different threads on the same tracker must be serialized. - std::lock_guard l(mutex); - // The total includes the allocation that exceeded the limit. This function's - // job is to raise the limit to >= current + size. - auto current = pool.reservedBytes(); - auto limit = pool.capacity(); - if (current + size <= limit) { - // No need to increase. It could be another thread already - // increased the cap far enough while this thread was waiting to - // enter the lock_guard. - return true; - } - if (current + size > hardLimit) { - // The caller will revert the allocation that called this and signal an - // error. - return false; - } - // We set the new limit to be the requested size. 
-  static_cast<MemoryPoolImpl*>(&pool)->testingSetCapacity(current + size);
-  return true;
+      child->maybeReserve(2 * kMaxSize), "Manual MemoryPool Abortion");
 }
 
 DEBUG_ONLY_TEST_P(MemoryPoolTest, raceBetweenFreeAndFailedAllocation) {
   if (!isLeafThreadSafe_) {
     return;
   }
-  setupMemory({.capacity = 1 * GB});
+  setupMemory(
+      {.allocatorCapacity = 1 * GB,
+       .arbitratorCapacity = 1 * GB,
+       .arbitratorReservedCapacity = 128 * MB});
   auto manager = getMemoryManager();
   auto root = manager->addRootPool("grow", 64 * MB);
   auto child = root->addLeafChild("grow", isLeafThreadSafe_);
@@ -3206,19 +3466,20 @@ DEBUG_ONLY_TEST_P(MemoryPoolTest, raceBetweenFreeAndFailedAllocation) {
       std::function<void(MemoryPool*)>([&](MemoryPool* /*unused*/) {
         ++reservationAttempts;
         // On the first reservation attempt for the second buffer allocation,
-        // trigger to free the first allocated buffer which will cause the first
-        // reservation attempt fails. The quantized reservation size of the
-        // first attempt is 16MB which requires 20MB after the first buffer
-        // free.
+        // trigger the free of the first allocated buffer, which causes the
+        // first reservation attempt to fail. The quantized reservation size
+        // of the first attempt is 16MB, which requires 20MB after the first
+        // buffer free.
         if (reservationAttempts == 1) {
           // Inject to free the first allocated buffer while the reservation
          // attempt is in progress.
           child->free(buffer1, 17 * MB);
           return;
         }
         // On the second reservation attempt for the second buffer allocation,
-        // reduce the memory pool's capacity to trigger the memory pool capacity
-        // exceeded exception error which might leave unused reservation bytes
-        // but zero used reservation if we don't do the cleanup properly.
+        // reduce the memory pool's capacity to trigger the memory pool
+        // capacity exceeded error, which might leave unused reservation
+        // bytes but zero used reservation if we don't do the cleanup
+        // properly.
         if (reservationAttempts == 2) {
           static_cast<MemoryPoolImpl*>(root.get())->testingSetCapacity(16 * MB);
           return;
         }
@@ -3312,7 +3573,9 @@ TEST_P(MemoryPoolTest, abortAPI) {
   }
   ASSERT_TRUE(rootPool->aborted());
   {
-    abortPool(rootPool.get());
+    VELOX_ASSERT_THROW(
+        abortPool(rootPool.get()),
+        "Trying to set another abort error on an already aborted pool.");
     ASSERT_TRUE(rootPool->aborted());
   }
   ASSERT_TRUE(rootPool->aborted());
@@ -3374,7 +3637,9 @@ TEST_P(MemoryPoolTest, abortAPI) {
           "aggregateAbortAPI", MemoryReclaimer::create());
   ASSERT_TRUE(aggregatePool->aborted());
   {
-    abortPool(aggregatePool.get());
+    VELOX_ASSERT_THROW(
+        abortPool(aggregatePool.get()),
+        "Trying to set another abort error on an already aborted pool.");
     ASSERT_TRUE(aggregatePool->aborted());
     ASSERT_TRUE(leafPool->aborted());
     ASSERT_TRUE(rootPool->aborted());
   }
   ASSERT_TRUE(leafPool->aborted());
   ASSERT_TRUE(rootPool->aborted());
   {
-    abortPool(rootPool.get());
+    VELOX_ASSERT_THROW(
+        abortPool(rootPool.get()),
+        "Trying to set another abort error on an already aborted pool.");
     ASSERT_TRUE(aggregatePool->aborted());
     ASSERT_TRUE(leafPool->aborted());
     ASSERT_TRUE(rootPool->aborted());
   }
@@ -3526,7 +3793,7 @@ TEST_P(MemoryPoolTest, abort) {
   // Allocate some buffer from leaf.
   void* buf1 = leafPool->allocate(128);
-  ASSERT_EQ(leafPool->currentBytes(), 128);
+  ASSERT_EQ(leafPool->usedBytes(), 128);
   // Abort the pool.
   ContinueFuture future;
@@ -3554,20 +3821,71 @@ TEST_P(MemoryPoolTest, abort) {
     { VELOX_ASSERT_THROW(leafPool->allocate(capacity * 2), ""); }
     // Allocate without triggering a memory reservation increment.
void* buf2 = leafPool->allocate(128);
-      ASSERT_EQ(leafPool->currentBytes(), 256);
+      ASSERT_EQ(leafPool->usedBytes(), 256);
       leafPool->free(buf1, 128);
       leafPool->free(buf2, 128);
-      ASSERT_EQ(leafPool->currentBytes(), 0);
+      ASSERT_EQ(leafPool->usedBytes(), 0);
       ASSERT_EQ(leafPool->capacity(), capacity);
     }
   }
 }
 
+TEST_P(MemoryPoolTest, overuseUnderArbitration) {
+  constexpr int64_t kMaxSize = 128 * MB;
+  setupMemory(
+      {.allocatorCapacity = kMaxSize,
+       .arbitratorCapacity = kMaxSize,
+       .arbitratorReservedCapacity = 4 * MB,
+       .arbitratorKind = "SHARED"});
+  MemoryManager& manager = *getMemoryManager();
+  auto root = manager.addRootPool(
+      "overuseUnderArbitration", kMaxSize, MemoryReclaimer::create());
+  auto child = root->addLeafChild("overuseUnderArbitration");
+  // maybeReserve returns false if reservation fails.
+  ASSERT_FALSE(child->maybeReserve(2 * kMaxSize));
+  ASSERT_EQ(child->usedBytes(), 0);
+  ASSERT_EQ(child->reservedBytes(), 0);
+  ScopedMemoryArbitrationContext scopedMemoryArbitration(child.get());
+  ASSERT_TRUE(underMemoryArbitration());
+  ASSERT_TRUE(child->maybeReserve(2 * kMaxSize));
+  ASSERT_EQ(child->usedBytes(), 0);
+  ASSERT_EQ(child->reservedBytes(), 2 * kMaxSize);
+  child->release();
+  ASSERT_EQ(child->usedBytes(), 0);
+  ASSERT_EQ(child->reservedBytes(), 0);
+}
+
+TEST_P(MemoryPoolTest, allocationWithCoveredCollateral) {
+  // Verify that the memory pool's reservation is correctly updated when an
+  // allocation call is attempted with collateral that covers the allocation
+  // (that is, the collateral is larger than the requested allocation).
+  auto manager = getMemoryManager();
+  auto root = manager->addRootPool("root", kMaxMemory, nullptr);
+  ASSERT_TRUE(root->trackUsage());
+  auto pool =
+      root->addLeafChild("allocationWithCoveredCollateral", isLeafThreadSafe_);
+  ASSERT_TRUE(pool->trackUsage());
+  // Check non-contiguous allocation.
+  ASSERT_EQ(pool->reservedBytes(), 0);
+  Allocation allocation;
+  pool->allocateNonContiguous(100, allocation);
+  auto prevReservedBytes = pool->usedBytes();
+  pool->allocateNonContiguous(50, allocation);
+  ASSERT_LT(pool->usedBytes(), prevReservedBytes);
+  pool->freeNonContiguous(allocation);
+
+  // Check contiguous allocation.
+ ContiguousAllocation contiguousAllocation; + pool->allocateContiguous(100, contiguousAllocation); + prevReservedBytes = pool->usedBytes(); + pool->allocateContiguous(50, contiguousAllocation); + ASSERT_LT(pool->usedBytes(), prevReservedBytes); + pool->freeContiguous(contiguousAllocation); +} + VELOX_INSTANTIATE_TEST_SUITE_P( MemoryPoolTestSuite, MemoryPoolTest, testing::ValuesIn(MemoryPoolTest::getTestParams())); -} // namespace memory -} // namespace velox -} // namespace facebook +} // namespace facebook::velox::memory diff --git a/velox/common/memory/tests/MockSharedArbitratorTest.cpp b/velox/common/memory/tests/MockSharedArbitratorTest.cpp index 8f092cc64e204..8860a46562d5f 100644 --- a/velox/common/memory/tests/MockSharedArbitratorTest.cpp +++ b/velox/common/memory/tests/MockSharedArbitratorTest.cpp @@ -16,8 +16,10 @@ #include +#include #include #include +#include #include "folly/experimental/EventCount.h" #include "folly/futures/Barrier.h" #include "velox/common/base/tests/GTestUtils.h" @@ -26,6 +28,7 @@ #include "velox/common/memory/MemoryArbitrator.h" #include "velox/common/memory/SharedArbitrator.h" #include "velox/common/testutil/TestValue.h" +#include "velox/exec/OperatorUtils.h" #include "velox/exec/tests/utils/PlanBuilder.h" #include "velox/exec/tests/utils/TempDirectoryPath.h" @@ -39,12 +42,36 @@ using namespace facebook::velox::exec::test; namespace facebook::velox::memory { namespace { + +// Class to write runtime stats in the tests to the stats container. +class TestRuntimeStatWriter : public BaseRuntimeStatWriter { + public: + explicit TestRuntimeStatWriter( + std::unordered_map& stats) + : stats_{stats} {} + + void addRuntimeStat(const std::string& name, const RuntimeCounter& value) + override { + addOperatorRuntimeStats(name, value, stats_); + } + + private: + std::unordered_map& stats_; +}; + constexpr int64_t KB = 1024L; constexpr int64_t MB = 1024L * KB; constexpr uint64_t kMemoryCapacity = 512 * MB; +constexpr uint64_t kReservedMemoryCapacity = 128 * MB; constexpr uint64_t kMemoryPoolInitCapacity = 16 * MB; -constexpr uint64_t kMemoryPoolTransferCapacity = 8 * MB; +// TODO(jtan6): Remove after complete transfer capacity deprecation +constexpr uint64_t kMemoryPoolTransferCapacity = 0; +constexpr uint64_t kMemoryPoolReservedCapacity = 8 * MB; +constexpr uint64_t kFastExponentialGrowthCapacityLimit = 32 * MB; +constexpr double kSlowCapacityGrowPct = 0.25; +constexpr uint64_t kMemoryPoolMinFreeCapacity = 8 * MB; +constexpr double kMemoryPoolMinFreeCapacityPct = 0.25; class MemoryReclaimer; class MockMemoryOperator; @@ -161,8 +188,11 @@ class MockMemoryOperator { return op_->reclaimableBytes(pool, reclaimableBytes); } - uint64_t reclaim(MemoryPool* pool, uint64_t targetBytes, Stats& stats) - override { + uint64_t reclaim( + MemoryPool* pool, + uint64_t targetBytes, + uint64_t /*unused*/, + Stats& stats) override { ++numReclaims_; if (!reclaimable_) { return 0; @@ -171,7 +201,9 @@ class MockMemoryOperator { reclaimInjectCb_(pool, targetBytes); } reclaimTargetBytes_.push_back(targetBytes); - return op_->reclaim(pool, targetBytes); + auto reclaimBytes = op_->reclaim(pool, targetBytes); + stats.reclaimedBytes += reclaimBytes; + return reclaimBytes; } void enterArbitration() override { @@ -236,27 +268,25 @@ class MockMemoryOperator { void free(void* buffer) { size_t size; - { - std::lock_guard l(mu_); - VELOX_CHECK_EQ(allocations_.count(buffer), 1); - size = allocations_[buffer]; - totalBytes_ -= size; - allocations_.erase(buffer); - } + std::lock_guard l(mu_); + 
VELOX_CHECK_EQ(allocations_.count(buffer), 1); + size = allocations_[buffer]; + totalBytes_ -= size; + allocations_.erase(buffer); pool_->free(buffer, size); } void freeAll() { - std::unordered_map allocations; + std::unordered_map allocationsToFree; { std::lock_guard l(mu_); for (auto entry : allocations_) { totalBytes_ -= entry.second; } - allocations.swap(allocations_); VELOX_CHECK_EQ(totalBytes_, 0); + allocationsToFree.swap(allocations_); } - for (auto entry : allocations) { + for (auto entry : allocationsToFree) { pool_->free(entry.first, entry.second); } } @@ -308,24 +338,22 @@ class MockMemoryOperator { for (const auto& allocation : allocationsToFree) { pool_->free(allocation.buffer, allocation.size); } - return pool_->shrink(targetBytes); + return bytesReclaimed; } void abort(MemoryPool* pool) { - std::vector allocationsToFree; + std::unordered_map allocationsToFree; { std::lock_guard l(mu_); VELOX_CHECK_NOT_NULL(pool_); VELOX_CHECK_EQ(pool->name(), pool_->name()); - for (auto allocEntry : allocations_) { - allocationsToFree.push_back( - Allocation{allocEntry.first, allocEntry.second}); - totalBytes_ -= allocEntry.second; + for (const auto& allocation : allocations_) { + totalBytes_ -= allocation.second; } - allocations_.clear(); + allocationsToFree.swap(allocations_); } - for (const auto& allocation : allocationsToFree) { - pool_->free(allocation.buffer, allocation.size); + for (auto entry : allocationsToFree) { + pool_->free(entry.first, entry.second); } } @@ -397,26 +425,32 @@ class MockSharedArbitrationTest : public testing::Test { } void setupMemory( - int64_t memoryCapacity = 0, - uint64_t memoryPoolInitCapacity = kMaxMemory, - uint64_t memoryPoolTransferCapacity = 0, - std::function arbitrationStateCheckCb = nullptr) { - if (memoryPoolInitCapacity == kMaxMemory) { - memoryPoolInitCapacity = kMemoryPoolInitCapacity; - } - if (memoryPoolTransferCapacity == 0) { - memoryPoolTransferCapacity = kMemoryPoolTransferCapacity; - } - memoryCapacity = (memoryCapacity != 0) ? 
        memoryCapacity : kMemoryCapacity;
-    allocator_ = std::make_shared<MallocAllocator>(memoryCapacity);
+      int64_t memoryCapacity = kMemoryCapacity,
+      int64_t reservedMemoryCapacity = kReservedMemoryCapacity,
+      uint64_t memoryPoolInitCapacity = kMemoryPoolInitCapacity,
+      uint64_t memoryPoolReserveCapacity = kMemoryPoolReservedCapacity,
+      uint64_t memoryPoolTransferCapacity = kMemoryPoolTransferCapacity,
+      uint64_t fastExponentialGrowthCapacityLimit =
+          kFastExponentialGrowthCapacityLimit,
+      double slowCapacityGrowPct = kSlowCapacityGrowPct,
+      uint64_t memoryPoolMinFreeCapacity = kMemoryPoolMinFreeCapacity,
+      double memoryPoolMinFreeCapacityPct = kMemoryPoolMinFreeCapacityPct,
+      std::function<void(MemoryPool&)> arbitrationStateCheckCb = nullptr,
+      bool globalArbitrationEnabled = true) {
     MemoryManagerOptions options;
-    options.allocator = allocator_.get();
-    options.capacity = allocator_->capacity();
+    options.allocatorCapacity = memoryCapacity;
+    options.arbitratorReservedCapacity = reservedMemoryCapacity;
     std::string arbitratorKind = "SHARED";
     options.arbitratorKind = arbitratorKind;
-    options.capacity = options.capacity;
     options.memoryPoolInitCapacity = memoryPoolInitCapacity;
+    options.memoryPoolReservedCapacity = memoryPoolReserveCapacity;
     options.memoryPoolTransferCapacity = memoryPoolTransferCapacity;
+    options.fastExponentialGrowthCapacityLimit =
+        fastExponentialGrowthCapacityLimit;
+    options.slowCapacityGrowPct = slowCapacityGrowPct;
+    options.memoryPoolMinFreeCapacity = memoryPoolMinFreeCapacity;
+    options.memoryPoolMinFreeCapacityPct = memoryPoolMinFreeCapacityPct;
+    options.globalArbitrationEnabled = globalArbitrationEnabled;
     options.arbitrationStateCheckCb = std::move(arbitrationStateCheckCb);
     options.checkUsageLeak = true;
     manager_ = std::make_unique<MemoryManager>(options);
@@ -444,10 +478,11 @@ class MockSharedArbitrationTest : public testing::Test {
     tasks_.clear();
   }

-  std::shared_ptr<MemoryAllocator> allocator_;
   std::unique_ptr<MemoryManager> manager_;
   SharedArbitrator* arbitrator_;
   std::vector<std::shared_ptr<MockTask>> tasks_;
+  std::unique_ptr<folly::CPUThreadPoolExecutor> executor_ =
+      std::make_unique<folly::CPUThreadPoolExecutor>(4);
 };

 MockMemoryOperator* MockSharedArbitrationTest::addMemoryOp(
@@ -469,20 +504,18 @@ void verifyArbitratorStats(
     const MemoryArbitrator::Stats& stats,
     uint64_t maxCapacityBytes,
     uint64_t freeCapacityBytes = 0,
+    uint64_t freeReservedCapacityBytes = 0,
     uint64_t numRequests = 0,
-    uint64_t numSucceeded = 0,
     uint64_t numFailures = 0,
     uint64_t numReclaimedBytes = 0,
     uint64_t numShrunkBytes = 0,
-    uint64_t arbitrationTimeUs = 0,
-    uint64_t queueTimeUs = 0) {
+    uint64_t arbitrationTimeUs = 0) {
   ASSERT_EQ(stats.numRequests, numRequests);
-  ASSERT_EQ(stats.numSucceeded, numSucceeded);
   ASSERT_EQ(stats.numFailures, numFailures);
   ASSERT_EQ(stats.numReclaimedBytes, numReclaimedBytes);
   ASSERT_EQ(stats.numShrunkBytes, numShrunkBytes);
   ASSERT_GE(stats.arbitrationTimeUs, arbitrationTimeUs);
-  ASSERT_GE(stats.queueTimeUs, queueTimeUs);
+  ASSERT_EQ(stats.freeReservedCapacityBytes, freeReservedCapacityBytes);
   ASSERT_EQ(stats.freeCapacityBytes, freeCapacityBytes);
   ASSERT_EQ(stats.maxCapacityBytes, maxCapacityBytes);
 }
@@ -500,23 +533,128 @@ void verifyReclaimerStats(
   }
 }

+TEST_F(MockSharedArbitrationTest, extraConfigs) {
+  // Testing default values.
+  std::unordered_map<std::string, std::string> emptyConfigs;
+  ASSERT_EQ(
+      SharedArbitrator::ExtraConfig::getReservedCapacity(emptyConfigs), 0);
+  ASSERT_EQ(
+      SharedArbitrator::ExtraConfig::getMemoryPoolReservedCapacity(
+          emptyConfigs),
+      0);
+  ASSERT_EQ(
+      SharedArbitrator::ExtraConfig::getMemoryPoolInitialCapacity(emptyConfigs),
+      256 << 20);
+  ASSERT_EQ(
SharedArbitrator::ExtraConfig::getMemoryPoolTransferCapacity( + emptyConfigs), + 128 << 20); + ASSERT_EQ( + SharedArbitrator::ExtraConfig::getMemoryReclaimMaxWaitTimeMs( + emptyConfigs), + 0); + ASSERT_EQ( + SharedArbitrator::ExtraConfig::getGlobalArbitrationEnabled(emptyConfigs), + SharedArbitrator::ExtraConfig::kDefaultGlobalArbitrationEnabled); + ASSERT_EQ( + SharedArbitrator::ExtraConfig::getCheckUsageLeak(emptyConfigs), + SharedArbitrator::ExtraConfig::kDefaultCheckUsageLeak); + + // Testing custom values + std::unordered_map configs; + configs[std::string(SharedArbitrator::ExtraConfig::kReservedCapacity)] = + "100B"; + configs[std::string( + SharedArbitrator::ExtraConfig::kMemoryPoolInitialCapacity)] = "512MB"; + configs[std::string( + SharedArbitrator::ExtraConfig::kMemoryPoolReservedCapacity)] = "200B"; + configs[std::string( + SharedArbitrator::ExtraConfig::kMemoryPoolTransferCapacity)] = "256MB"; + configs[std::string( + SharedArbitrator::ExtraConfig::kMemoryReclaimMaxWaitTime)] = "5000ms"; + configs[std::string( + SharedArbitrator::ExtraConfig::kGlobalArbitrationEnabled)] = "true"; + configs[std::string(SharedArbitrator::ExtraConfig::kCheckUsageLeak)] = + "false"; + ASSERT_EQ(SharedArbitrator::ExtraConfig::getReservedCapacity(configs), 100); + ASSERT_EQ( + SharedArbitrator::ExtraConfig::getMemoryPoolInitialCapacity(configs), + 512 << 20); + ASSERT_EQ( + SharedArbitrator::ExtraConfig::getMemoryPoolReservedCapacity(configs), + 200); + ASSERT_EQ( + SharedArbitrator::ExtraConfig::getMemoryPoolTransferCapacity(configs), + 256 << 20); + ASSERT_EQ( + SharedArbitrator::ExtraConfig::getMemoryReclaimMaxWaitTimeMs(configs), + 5000); + ASSERT_TRUE( + SharedArbitrator::ExtraConfig::getGlobalArbitrationEnabled(configs)); + ASSERT_FALSE(SharedArbitrator::ExtraConfig::getCheckUsageLeak(configs)); + + // Testing invalid values + configs[std::string(SharedArbitrator::ExtraConfig::kReservedCapacity)] = + "invalid"; + configs[std::string( + SharedArbitrator::ExtraConfig::kMemoryPoolInitialCapacity)] = "invalid"; + configs[std::string( + SharedArbitrator::ExtraConfig::kMemoryPoolReservedCapacity)] = "invalid"; + configs[std::string( + SharedArbitrator::ExtraConfig::kMemoryPoolTransferCapacity)] = "invalid"; + configs[std::string( + SharedArbitrator::ExtraConfig::kMemoryReclaimMaxWaitTime)] = "invalid"; + configs[std::string( + SharedArbitrator::ExtraConfig::kGlobalArbitrationEnabled)] = "invalid"; + configs[std::string(SharedArbitrator::ExtraConfig::kCheckUsageLeak)] = + "invalid"; + VELOX_ASSERT_THROW( + SharedArbitrator::ExtraConfig::getReservedCapacity(configs), + "Invalid capacity string 'invalid'"); + VELOX_ASSERT_THROW( + SharedArbitrator::ExtraConfig::getMemoryPoolInitialCapacity(configs), + "Invalid capacity string 'invalid'"); + VELOX_ASSERT_THROW( + SharedArbitrator::ExtraConfig::getMemoryPoolReservedCapacity(configs), + "Invalid capacity string 'invalid'"); + VELOX_ASSERT_THROW( + SharedArbitrator::ExtraConfig::getMemoryPoolTransferCapacity(configs), + "Invalid capacity string 'invalid'"); + VELOX_ASSERT_THROW( + SharedArbitrator::ExtraConfig::getMemoryReclaimMaxWaitTimeMs(configs), + "Invalid duration 'invalid'"); + VELOX_ASSERT_THROW( + SharedArbitrator::ExtraConfig::getGlobalArbitrationEnabled(configs), + "Failed while parsing SharedArbitrator configs"); + VELOX_ASSERT_THROW( + SharedArbitrator::ExtraConfig::getCheckUsageLeak(configs), + "Failed while parsing SharedArbitrator configs"); +} + TEST_F(MockSharedArbitrationTest, constructor) { + const int reservedCapacity = 
      arbitrator_->stats().freeReservedCapacityBytes;
+  const int nonReservedCapacity =
+      arbitrator_->stats().freeCapacityBytes - reservedCapacity;
   std::vector<std::shared_ptr<MockTask>> tasks;
+  int remainingFreeCapacity = arbitrator_->stats().freeCapacityBytes;
   for (int i = 0; i <= kMemoryCapacity / kMemoryPoolInitCapacity; ++i) {
     auto task = addTask(kMemoryCapacity);
     ASSERT_NE(task->pool()->reclaimer(), nullptr);
-    if (i < kMemoryCapacity / kMemoryPoolInitCapacity) {
+    if (i < nonReservedCapacity / kMemoryPoolInitCapacity) {
       ASSERT_EQ(task->capacity(), kMemoryPoolInitCapacity);
     } else {
-      ASSERT_EQ(task->capacity(), 0);
+      ASSERT_EQ(task->capacity(), kMemoryPoolReservedCapacity) << i;
     }
+    remainingFreeCapacity -= task->capacity();
     tasks.push_back(std::move(task));
   }
   auto stats = arbitrator_->stats();
-  verifyArbitratorStats(stats, kMemoryCapacity);
+  ASSERT_EQ(remainingFreeCapacity, stats.freeCapacityBytes);
+  ASSERT_EQ(remainingFreeCapacity, stats.freeReservedCapacityBytes);
+  verifyArbitratorStats(
+      stats, kMemoryCapacity, remainingFreeCapacity, remainingFreeCapacity);
   tasks.clear();
   stats = arbitrator_->stats();
-  verifyArbitratorStats(stats, kMemoryCapacity, kMemoryCapacity);
+  verifyArbitratorStats(
+      stats, kMemoryCapacity, kMemoryCapacity, reservedCapacity);
 }

 TEST_F(MockSharedArbitrationTest, arbitrationStateCheck) {
@@ -524,11 +662,11 @@ TEST_F(MockSharedArbitrationTest, arbitrationStateCheck) {
   const int minPoolCapacity = 32 * MB;
   std::atomic<int> checkCount{0};
   MemoryArbitrationStateCheckCB checkCountCb = [&](MemoryPool& pool) {
-    const std::string re("MockTask.*");
-    ASSERT_TRUE(RE2::FullMatch(pool.name(), re));
+    const std::string re("RootPool.*");
+    ASSERT_TRUE(RE2::FullMatch(pool.name(), re)) << pool.name();
     ++checkCount;
   };
-  setupMemory(memCapacity, 0, 0, checkCountCb);
+  setupMemory(memCapacity, 0, 0, 0, 0, 0, 0, 0, 0, checkCountCb);

   const int numTasks{5};
   std::vector<std::shared_ptr<MockTask>> tasks;
@@ -553,31 +691,62 @@
   MemoryArbitrationStateCheckCB badCheckCb = [&](MemoryPool& /*unused*/) {
     VELOX_FAIL("bad check");
   };
-  setupMemory(memCapacity, 0, 0, badCheckCb);
+  setupMemory(memCapacity, 0, 0, 0, 0, 0, 0, 0, 0, badCheckCb);
   std::shared_ptr<MockTask> task = addTask(kMemoryCapacity);
   ASSERT_EQ(task->capacity(), 0);
   MockMemoryOperator* memOp = task->addMemoryOp();
   VELOX_ASSERT_THROW(memOp->allocate(128), "bad check");
 }

+TEST_F(MockSharedArbitrationTest, asyncArbitrationWork) {
+  const int memoryCapacity = 512 * MB;
+  const int poolCapacity = 256 * MB;
+  setupMemory(memoryCapacity, 0, poolCapacity, 0);
+
+  std::atomic_int reclaimedCount{0};
+  std::shared_ptr<MockTask> task = addTask(poolCapacity);
+  MockMemoryOperator* memoryOp =
+      addMemoryOp(task, true, [&](MemoryPool* pool, uint64_t /*unused*/) {
+        struct Result {
+          bool succeeded{true};
+
+          explicit Result(bool _succeeded) : succeeded(_succeeded) {}
+        };
+        auto asyncReclaimTask = createAsyncMemoryReclaimTask<Result>([&]() {
+          memoryOp->allocate(poolCapacity);
+          return std::make_unique<Result>(true);
+        });
+        executor_->add([&]() { asyncReclaimTask->prepare(); });
+        std::this_thread::sleep_for(std::chrono::seconds(1)); // NOLINT
+        const auto result = asyncReclaimTask->move();
+        ASSERT_TRUE(result->succeeded);
+        memoryOp->freeAll();
+        ++reclaimedCount;
+      });
+  memoryOp->allocate(poolCapacity);
+  memoryOp->allocate(poolCapacity);
+  ASSERT_EQ(reclaimedCount, 1);
+}
+
 TEST_F(MockSharedArbitrationTest, arbitrationFailsTask) {
-  auto nonReclaimTask = addTask(384 * MB);
-  auto nonReclaimOp = nonReclaimTask->addMemoryOp(false);
-  auto buf = nonReclaimOp->allocate(384 * MB);
+  auto nonReclaimTask = addTask(328 * MB);
+  auto* nonReclaimOp = nonReclaimTask->addMemoryOp(false);
+  auto* buf = nonReclaimOp->allocate(320 * MB);
-  // growTask is (192 + 128) = 320MB which is less than nonReclaimTask 384MB.
-  // This makes sure nonReclaimTask gets picked as the victim during handleOOM()
-  auto growTask = addTask(192 * MB);
-  auto growOp = growTask->addMemoryOp(false);
-  auto bufGrow = growOp->allocate(128 * MB);
-  EXPECT_NO_THROW(manager_->growPool(growOp->pool(), 64 * MB));
+  // growTask's usage is (64 + 128) = 192MB, which is less than
+  // nonReclaimTask's 320MB. This makes sure nonReclaimTask gets picked as
+  // the victim during handleOOM().
+  auto growTask = addTask(328 * MB);
+  auto* growOp = growTask->addMemoryOp(false);
+  auto* bufGrow1 = growOp->allocate(64 * MB);
+  auto* bufGrow2 = growOp->allocate(128 * MB);
   ASSERT_NE(nonReclaimTask->error(), nullptr);
   try {
     std::rethrow_exception(nonReclaimTask->error());
   } catch (const VeloxRuntimeError& e) {
     ASSERT_EQ(velox::error_code::kMemAborted, e.errorCode());
     ASSERT_TRUE(
-        std::string(e.what()).find("usage 384.00MB peak 384.00MB") !=
+        std::string(e.what()).find("aborted when requestor") !=
        std::string::npos);
   } catch (...) {
     FAIL();
@@ -586,89 +755,1278 @@
   growOp->freeAll();
 }

-TEST_F(MockSharedArbitrationTest, shrinkMemory) {
-  std::vector<std::shared_ptr<MemoryPool>> pools;
-  ASSERT_THROW(arbitrator_->shrinkMemory(pools, 128), VeloxException);
-}
+TEST_F(MockSharedArbitrationTest, shrinkPools) {
+  const int64_t memoryCapacity = 32 << 20;
+  const int64_t reservedMemoryCapacity = 8 << 20;
+  const uint64_t memoryPoolInitCapacity = 8 << 20;
+  const uint64_t memoryPoolReserveCapacity = 2 << 20;
+  setupMemory(
+      memoryCapacity,
+      reservedMemoryCapacity,
+      memoryPoolInitCapacity,
+      memoryPoolReserveCapacity);
+
+  struct TestTask {
+    uint64_t capacity{0};
+    bool reclaimable{false};
+    uint64_t allocateBytes{0};
+
+    uint64_t expectedInitialCapacity{0};
+    bool expectedAbortAfterShrink{false};
-TEST_F(MockSharedArbitrationTest, singlePoolGrowWithoutArbitration) {
-  auto* memOp = addMemoryOp();
-  const int allocateSize = 1 * MB;
-  while (memOp->capacity() < kMemoryCapacity) {
-    memOp->allocate(allocateSize);
+    std::string debugString() const {
+      return fmt::format(
+          "capacity: {}, reclaimable: {}, allocateBytes: {}, expectedInitialCapacity: {}, expectedAbortAfterShrink: {}",
+          succinctBytes(capacity),
+          reclaimable,
+          succinctBytes(allocateBytes),
+          succinctBytes(expectedInitialCapacity),
+          expectedAbortAfterShrink);
+    }
+  };
+
+  struct {
+    std::vector<TestTask> testTasks;
+    uint64_t targetBytes;
+    uint64_t expectedFreedBytes;
+    uint64_t expectedFreeCapacity;
+    uint64_t expectedReservedFreeCapacity;
+    bool allowSpill;
+    bool allowAbort;
+
+    std::string debugString() const {
+      std::stringstream tasksOss;
+      for (const auto& testTask : testTasks) {
+        tasksOss << "[";
+        tasksOss << testTask.debugString();
+        tasksOss << "], \n";
+      }
+      return fmt::format(
+          "testTasks: \n[{}], \ntargetBytes: {}, \nexpectedFreedBytes: {}, "
+          "\nexpectedFreeCapacity: {}, \nexpectedReservedFreeCapacity: {}, \n"
+          "allowSpill: {}, \nallowAbort: {}",
+          tasksOss.str(),
+          succinctBytes(targetBytes),
+          succinctBytes(expectedFreedBytes),
+          succinctBytes(expectedFreeCapacity),
+          succinctBytes(expectedReservedFreeCapacity),
+          allowSpill,
+          allowAbort);
+    }
+  } testSettings[] = {
+      {{{memoryPoolInitCapacity,
+         false,
+         memoryPoolInitCapacity,
+         memoryPoolInitCapacity,
+         false},
+        {memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, false},
+        {memoryPoolInitCapacity, false, 0,
memoryPoolInitCapacity, false}, + {memoryPoolInitCapacity, + true, + memoryPoolReserveCapacity, + memoryPoolReserveCapacity, + false}}, + 0, + 0, + 6 << 20, + 6 << 20, + true, + false}, + + {{{memoryPoolInitCapacity, + true, + memoryPoolInitCapacity, + memoryPoolInitCapacity, + false}, + {memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, false}, + {memoryPoolInitCapacity, false, 0, memoryPoolInitCapacity, false}, + {memoryPoolInitCapacity, + true, + memoryPoolReserveCapacity, + memoryPoolReserveCapacity, + false}}, + 0, + 8 << 20, + 14 << 20, + reservedMemoryCapacity, + true, + false}, + + {{{memoryPoolInitCapacity, + true, + memoryPoolInitCapacity, + memoryPoolInitCapacity, + false}, + {memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, false}, + {memoryPoolInitCapacity, false, 0, memoryPoolInitCapacity, false}, + {memoryPoolInitCapacity, + true, + memoryPoolReserveCapacity, + memoryPoolReserveCapacity, + false}}, + 0, + 0, + 6 << 20, + 6 << 20, + false, + false}, + + {{{memoryPoolInitCapacity, + false, + memoryPoolInitCapacity, + memoryPoolInitCapacity, + false}, + {memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, false}, + {memoryPoolInitCapacity, false, 0, memoryPoolInitCapacity, false}, + {memoryPoolInitCapacity, + true, + memoryPoolReserveCapacity, + memoryPoolReserveCapacity, + false}}, + 0, + 0, + 6 << 20, + 6 << 20, + true, + false}, + + {{{memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, true}, + {memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, true}, + {memoryPoolInitCapacity, false, 0, memoryPoolInitCapacity, true}, + {memoryPoolInitCapacity, true, 0, memoryPoolReserveCapacity, true}}, + 0, + 26 << 20, + memoryCapacity, + reservedMemoryCapacity, + false, + true}, + + {{{memoryPoolInitCapacity, + true, + memoryPoolInitCapacity, + memoryPoolInitCapacity, + false}, + {memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, true}, + {memoryPoolInitCapacity, false, 0, memoryPoolInitCapacity, true}, + {memoryPoolInitCapacity, + true, + memoryPoolReserveCapacity, + memoryPoolReserveCapacity, + true}}, + 0, + 26 << 20, + memoryCapacity, + reservedMemoryCapacity, + true, + true}, + + {{{memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, false}, + {memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, false}, + {memoryPoolInitCapacity, false, 0, memoryPoolInitCapacity, false}, + {memoryPoolInitCapacity, true, 0, memoryPoolReserveCapacity, false}}, + 16 << 20, + 0, + 6 << 20, + 6 << 20, + false, + false}, + + {{{memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, false}, + {memoryPoolInitCapacity, false, 1 << 10, memoryPoolInitCapacity, true}, + {memoryPoolInitCapacity, false, 1 << 10, memoryPoolInitCapacity, true}, + {memoryPoolInitCapacity, true, 0, memoryPoolReserveCapacity, false}}, + 16 << 20, + 16 << 20, + 22 << 20, + reservedMemoryCapacity, + true, + true}, + + {{{memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, false}, + {memoryPoolInitCapacity, false, 1 << 10, memoryPoolInitCapacity, true}, + {memoryPoolInitCapacity, false, 1 << 10, memoryPoolInitCapacity, true}, + {memoryPoolInitCapacity, true, 0, memoryPoolReserveCapacity, false}}, + 14 << 20, + 16 << 20, + 22 << 20, + reservedMemoryCapacity, + true, + true}, + + {{{memoryPoolInitCapacity, + true, + memoryPoolInitCapacity, + memoryPoolInitCapacity, + false}, + {memoryPoolInitCapacity, + true, + memoryPoolInitCapacity, + memoryPoolInitCapacity, + false}, + {memoryPoolInitCapacity, + false, + memoryPoolInitCapacity, + memoryPoolInitCapacity, + false}, + 
        {memoryPoolInitCapacity,
+         true,
+         memoryPoolReserveCapacity,
+         memoryPoolReserveCapacity,
+         false}},
+       12 << 20,
+       12 << 20,
+       18 << 20,
+       reservedMemoryCapacity,
+       true,
+       false},
+
+      {{{memoryPoolInitCapacity,
+         true,
+         memoryPoolInitCapacity,
+         memoryPoolInitCapacity,
+         true},
+        {memoryPoolInitCapacity,
+         true,
+         memoryPoolInitCapacity,
+         memoryPoolInitCapacity,
+         true},
+        {memoryPoolInitCapacity,
+         false,
+         memoryPoolInitCapacity,
+         memoryPoolInitCapacity,
+         true},
+        {memoryPoolInitCapacity,
+         true,
+         memoryPoolReserveCapacity,
+         memoryPoolReserveCapacity,
+         false}},
+       24 << 20,
+       24 << 20,
+       30 << 20,
+       reservedMemoryCapacity,
+       false,
+       true},
+
+      {{{memoryPoolInitCapacity,
+         false,
+         memoryPoolInitCapacity,
+         memoryPoolInitCapacity,
+         false},
+        {memoryPoolInitCapacity,
+         false,
+         memoryPoolInitCapacity,
+         memoryPoolInitCapacity,
+         false},
+        {memoryPoolInitCapacity,
+         false,
+         memoryPoolInitCapacity,
+         memoryPoolInitCapacity,
+         false},
+        {memoryPoolInitCapacity,
+         false,
+         memoryPoolReserveCapacity,
+         memoryPoolReserveCapacity,
+         false}},
+       14 << 20,
+       0,
+       6 << 20,
+       6 << 20,
+       true,
+       false},
+
+      {{{memoryPoolInitCapacity,
+         false,
+         memoryPoolInitCapacity,
+         memoryPoolInitCapacity,
+         false},
+        {memoryPoolInitCapacity,
+         false,
+         memoryPoolInitCapacity,
+         memoryPoolInitCapacity,
+         false},
+        {memoryPoolInitCapacity,
+         false,
+         memoryPoolInitCapacity,
+         memoryPoolInitCapacity,
+         false},
+        {memoryPoolInitCapacity,
+         true,
+         memoryPoolReserveCapacity,
+         memoryPoolReserveCapacity,
+         false}},
+       14 << 20,
+       0,
+       6 << 20,
+       6 << 20,
+       true,
+       false}};
+
+  struct MockTaskContainer {
+    std::shared_ptr<MockTask> task;
+    MockMemoryOperator* op;
+    TestTask testTask;
+  };
+
+  std::function<void(MockTask*, bool)> checkTaskException =
+      [](MockTask* task, bool expectedAbort) {
+        if (!expectedAbort) {
+          ASSERT_EQ(task->error(), nullptr);
+          return;
+        }
+        ASSERT_NE(task->error(), nullptr);
+        VELOX_ASSERT_THROW(
+            std::rethrow_exception(task->error()),
+            "Memory pool aborted to reclaim used memory, current usage");
+      };
+
+  for (const auto& testData : testSettings) {
+    SCOPED_TRACE(testData.debugString());
+
+    std::vector<MockTaskContainer> taskContainers;
+    for (const auto& testTask : testData.testTasks) {
+      auto task = addTask(testTask.capacity);
+      auto* op = addMemoryOp(task, testTask.reclaimable);
+      ASSERT_EQ(op->capacity(), testTask.expectedInitialCapacity);
+      if (testTask.allocateBytes != 0) {
+        op->allocate(testTask.allocateBytes);
+      }
+      ASSERT_LE(op->capacity(), testTask.capacity);
+      taskContainers.push_back({task, op, testTask});
+    }
+
+    ASSERT_EQ(
+        manager_->shrinkPools(
+            testData.targetBytes, testData.allowSpill, testData.allowAbort),
+        testData.expectedFreedBytes);
+
+    for (const auto& taskContainer : taskContainers) {
+      checkTaskException(
+          taskContainer.task.get(),
+          taskContainer.testTask.expectedAbortAfterShrink);
+    }
+
+    uint64_t totalCapacity{0};
+    for (const auto& taskContainer : taskContainers) {
+      totalCapacity += taskContainer.task->capacity();
+    }
+    ASSERT_EQ(
+        arbitrator_->stats().freeCapacityBytes, testData.expectedFreeCapacity);
+    ASSERT_EQ(
+        arbitrator_->stats().freeReservedCapacityBytes,
+        testData.expectedReservedFreeCapacity);
+    ASSERT_EQ(
+        totalCapacity + arbitrator_->stats().freeCapacityBytes,
+        arbitrator_->capacity());
+  }
+}

-  verifyArbitratorStats(
-      arbitrator_->stats(),
-      kMemoryCapacity,
+// This test verifies that local arbitration runs from the same query have to
+// wait for each other and run in serial execution mode (see the sketch
+// below).
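The DEBUG_ONLY tests that follow all coordinate their threads with the same `folly::EventCount` handshake: one side flips an atomic flag and notifies, the other blocks in `await()` until the predicate holds. A minimal, self-contained sketch of that pattern (editorial illustration, not part of this patch):

```
#include <folly/experimental/EventCount.h>

#include <atomic>
#include <thread>

int main() {
  folly::EventCount gate;
  std::atomic_bool released{false};

  std::thread waiter([&]() {
    // Blocks until the predicate returns true; the predicate is re-checked
    // internally, so spurious wakeups are handled.
    gate.await([&]() { return released.load(); });
    // ... runs only after the coordinator releases the gate ...
  });

  released = true;  // Publish the state change first,
  gate.notifyAll(); // then wake all waiters.
  waiter.join();
  return 0;
}
```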
+DEBUG_ONLY_TEST_F(
+    MockSharedArbitrationTest,
+    localArbitrationRunsFromSameQuery) {
+  const int64_t memoryCapacity = 512 << 20;
+  const uint64_t memoryPoolInitCapacity = memoryCapacity / 4;
+  setupMemory(memoryCapacity, 0, memoryPoolInitCapacity, 0);
+  auto runTask = addTask(memoryCapacity);
+  auto* runPool = runTask->addMemoryOp(true);
+  auto* waitPool = runTask->addMemoryOp(true);
+
+  std::atomic_bool allocationWaitFlag{true};
+  folly::EventCount allocationWait;
+  std::atomic_bool localArbitrationWaitFlag{true};
+  folly::EventCount localArbitrationWait;
+  SCOPED_TESTVALUE_SET(
+      "facebook::velox::memory::SharedArbitrator::runLocalArbitration",
+      std::function<void(const SharedArbitrator*)>(
+          ([&](const SharedArbitrator* /*unused*/) {
+            if (!allocationWaitFlag.exchange(false)) {
+              return;
+            }
+            allocationWait.notifyAll();
+            localArbitrationWait.await(
+                [&]() { return !localArbitrationWaitFlag.load(); });
+          })));
+
+  std::atomic_int allocationCount{0};
+  auto runThread = std::thread([&]() {
+    std::unordered_map<std::string, RuntimeMetric> runtimeStats;
+    auto statsWriter = std::make_unique<TestRuntimeStatWriter>(runtimeStats);
+    setThreadLocalRunTimeStatWriter(statsWriter.get());
+    runPool->allocate(memoryCapacity / 2);
+    ASSERT_EQ(
+        runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1);
+    ASSERT_GT(
+        runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0);
+    ASSERT_EQ(
+        runtimeStats[SharedArbitrator::kLocalArbitrationQueueWallNanos].count,
+        0);
+    ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 0);
+    ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0);
+    ASSERT_EQ(
+        runtimeStats[SharedArbitrator::kGlobalArbitrationLockWaitWallNanos]
+            .count,
+        0);
+    ++allocationCount;
+  });
+
+  auto waitThread = std::thread([&]() {
+    allocationWait.await([&]() { return !allocationWaitFlag.load(); });
+    std::unordered_map<std::string, RuntimeMetric> runtimeStats;
+    auto statsWriter = std::make_unique<TestRuntimeStatWriter>(runtimeStats);
+    setThreadLocalRunTimeStatWriter(statsWriter.get());
+    waitPool->allocate(memoryCapacity / 2);
+    ASSERT_EQ(
+        runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1);
+    ASSERT_GT(
+        runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0);
+    ASSERT_EQ(
+        runtimeStats[SharedArbitrator::kLocalArbitrationQueueWallNanos].count,
+        1);
+    ASSERT_GT(
+        runtimeStats[SharedArbitrator::kLocalArbitrationQueueWallNanos].sum, 0);
+    ASSERT_EQ(
+        runtimeStats[SharedArbitrator::kLocalArbitrationLockWaitWallNanos]
+            .count,
+        1);
+    ASSERT_GT(
+        runtimeStats[SharedArbitrator::kLocalArbitrationLockWaitWallNanos].sum,
+        0);
+    ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 0);
+    ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 1);
+    ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].sum, 1);
+    ASSERT_EQ(
+        runtimeStats[SharedArbitrator::kGlobalArbitrationLockWaitWallNanos]
+            .count,
+        0);
+    ++allocationCount;
+  });
+
+  allocationWait.await([&]() { return !allocationWaitFlag.load(); });
+  std::this_thread::sleep_for(std::chrono::seconds(2)); // NOLINT
+  ASSERT_EQ(allocationCount, 0);
+
+  localArbitrationWaitFlag = false;
+  localArbitrationWait.notifyAll();
+
+  runThread.join();
+  waitThread.join();
+  ASSERT_EQ(allocationCount, 2);
+}
+
+// This test verifies that local arbitration runs from different queries don't
+// have to block waiting for each other.
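The forced interleavings in these tests come from Velox's TestValue hooks (the `SCOPED_TESTVALUE_SET` calls above). A rough sketch of the mechanism with a made-up injection point name; TestValue callbacks only fire in builds where the facility is compiled in and `TestValue::enable()` has been called:

```
#include "velox/common/testutil/TestValue.h"

using facebook::velox::common::testutil::TestValue;

struct Arbitrator {
  void run() {
    // The code under test announces an interception point; this is a no-op
    // unless TestValue has been enabled.
    TestValue::adjust("example::Arbitrator::run", this);
    // ... real work ...
  }
};

void testForcedInterleaving() {
  TestValue::enable();
  Arbitrator arbitrator;
  // While this scope is live, every Arbitrator::run() invokes the callback,
  // letting the test park the calling thread at exactly that point.
  SCOPED_TESTVALUE_SET(
      "example::Arbitrator::run",
      std::function<void(Arbitrator*)>([](Arbitrator* /*unused*/) {
        // Signal readiness or block here to force a specific interleaving.
      }));
  arbitrator.run();
}
```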
+DEBUG_ONLY_TEST_F( + MockSharedArbitrationTest, + localArbitrationRunsFromDifferentQueries) { + const int64_t memoryCapacity = 512 << 20; + const uint64_t memoryPoolInitCapacity = memoryCapacity / 4; + setupMemory( + memoryCapacity, + 0, + memoryPoolInitCapacity, + 0, + 0, + kFastExponentialGrowthCapacityLimit, + kSlowCapacityGrowPct, 0, - (kMemoryCapacity - kMemoryPoolInitCapacity) / kMemoryPoolTransferCapacity, - (kMemoryCapacity - kMemoryPoolInitCapacity) / - kMemoryPoolTransferCapacity); + 0); + auto runTask = addTask(memoryCapacity); + auto* runPool = runTask->addMemoryOp(true); + auto waitTask = addTask(memoryCapacity); + auto* waitPool = waitTask->addMemoryOp(true); + + std::atomic_bool allocationWaitFlag{true}; + folly::EventCount allocationWait; + std::atomic_bool localArbitrationWaitFlag{true}; + folly::EventCount localArbitrationWait; + SCOPED_TESTVALUE_SET( + "facebook::velox::memory::SharedArbitrator::runLocalArbitration", + std::function( + ([&](const SharedArbitrator* /*unused*/) { + if (!allocationWaitFlag.exchange(false)) { + return; + } + allocationWait.notifyAll(); + localArbitrationWait.await( + [&]() { return !localArbitrationWaitFlag.load(); }); + }))); - verifyReclaimerStats( - memOp->reclaimer()->stats(), + std::atomic_int allocationCount{0}; + auto runThread = std::thread([&]() { + std::unordered_map runtimeStats; + auto statsWriter = std::make_unique(runtimeStats); + setThreadLocalRunTimeStatWriter(statsWriter.get()); + runPool->allocate(memoryCapacity / 2); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); + ASSERT_GT( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kLocalArbitrationQueueWallNanos].count, + 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationLockWaitWallNanos] + .count, + 0); + ++allocationCount; + }); + + auto waitThread = std::thread([&]() { + allocationWait.await([&]() { return !allocationWaitFlag.load(); }); + std::unordered_map runtimeStats; + auto statsWriter = std::make_unique(runtimeStats); + setThreadLocalRunTimeStatWriter(statsWriter.get()); + waitPool->allocate(memoryCapacity / 2); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); + ASSERT_GT( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kLocalArbitrationQueueWallNanos].count, + 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationLockWaitWallNanos] + .count, + 0); + ++allocationCount; + }); + + allocationWait.await([&]() { return !allocationWaitFlag.load(); }); + waitThread.join(); + ASSERT_EQ(allocationCount, 1); + + localArbitrationWaitFlag = false; + localArbitrationWait.notifyAll(); + + runThread.join(); + ASSERT_EQ(allocationCount, 2); +} + +// This test verifies local arbitration runs can run in parallel with free +// memory reclamation. 
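One more recurring piece before the next test: the `runtimeStats` assertions in these bodies work because each thread installs its own stat writer. A sketch of that plumbing inside a test body, reusing the `TestRuntimeStatWriter` defined at the top of this file (the stat name here is invented):

```
std::unordered_map<std::string, RuntimeMetric> runtimeStats;
TestRuntimeStatWriter statsWriter{runtimeStats};
// From here on, stats reported on this thread (e.g. by the arbitrator via
// addThreadLocalRuntimeStat()) accumulate into `runtimeStats`.
setThreadLocalRunTimeStatWriter(&statsWriter);

addThreadLocalRuntimeStat(
    "exampleWallNanos", RuntimeCounter(123, RuntimeCounter::Unit::kNanos));
ASSERT_EQ(runtimeStats["exampleWallNanos"].count, 1);
ASSERT_EQ(runtimeStats["exampleWallNanos"].sum, 123);

// Detach the writer before it goes out of scope.
setThreadLocalRunTimeStatWriter(nullptr);
```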
+DEBUG_ONLY_TEST_F( + MockSharedArbitrationTest, + localArbitrationRunsWithFreeMemoryReclamation) { + const int64_t memoryCapacity = 512 << 20; + const uint64_t memoryPoolInitCapacity = memoryCapacity / 4; + setupMemory( + memoryCapacity, + 0, + memoryPoolInitCapacity, 0, - (kMemoryCapacity - kMemoryPoolInitCapacity) / - kMemoryPoolTransferCapacity); + 0, + kFastExponentialGrowthCapacityLimit, + kSlowCapacityGrowPct, + 0, + 0); + auto runTask = addTask(memoryCapacity); + auto* runPool = runTask->addMemoryOp(true); + auto waitTask = addTask(memoryCapacity); + auto* waitPool = waitTask->addMemoryOp(true); + auto reclaimedTask = addTask(memoryCapacity); + auto* reclaimedPool = reclaimedTask->addMemoryOp(true); + reclaimedPool->allocate(memoryCapacity / 4); + reclaimedPool->allocate(memoryCapacity / 4); + reclaimedPool->freeAll(); + + std::atomic_bool allocationWaitFlag{true}; + folly::EventCount allocationWait; + std::atomic_bool localArbitrationWaitFlag{true}; + folly::EventCount localArbitrationWait; + std::atomic_int allocationCount{0}; + SCOPED_TESTVALUE_SET( + "facebook::velox::memory::SharedArbitrator::runLocalArbitration", + std::function( + ([&](const SharedArbitrator* /*unused*/) { + if (!allocationWaitFlag.exchange(false)) { + return; + } + allocationWait.notifyAll(); + while (allocationCount != 1) { + std::this_thread::sleep_for( + std::chrono::milliseconds(200)); // NOLINT + } + }))); - clearTasks(); - verifyArbitratorStats( - arbitrator_->stats(), - kMemoryCapacity, - kMemoryCapacity, - (kMemoryCapacity - kMemoryPoolInitCapacity) / kMemoryPoolTransferCapacity, - (kMemoryCapacity - kMemoryPoolInitCapacity) / - kMemoryPoolTransferCapacity); + auto runThread = std::thread([&]() { + std::unordered_map runtimeStats; + auto statsWriter = std::make_unique(runtimeStats); + setThreadLocalRunTimeStatWriter(statsWriter.get()); + runPool->allocate(memoryCapacity / 2); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); + ASSERT_GT( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kLocalArbitrationQueueWallNanos].count, + 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationLockWaitWallNanos] + .count, + 0); + ++allocationCount; + }); + + auto waitThread = std::thread([&]() { + allocationWait.await([&]() { return !allocationWaitFlag.load(); }); + std::unordered_map runtimeStats; + auto statsWriter = std::make_unique(runtimeStats); + setThreadLocalRunTimeStatWriter(statsWriter.get()); + waitPool->allocate(memoryCapacity / 2); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); + ASSERT_GT( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kLocalArbitrationQueueWallNanos].count, + 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kLocalArbitrationLockWaitWallNanos] + .count, + 1); + ASSERT_GT( + runtimeStats[SharedArbitrator::kLocalArbitrationLockWaitWallNanos].sum, + 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationLockWaitWallNanos] + .count, + 0); + ++allocationCount; + }); + + allocationWait.await([&]() { return !allocationWaitFlag.load(); }); + 
waitThread.join(); + ASSERT_EQ(allocationCount, 1); + runThread.join(); + ASSERT_EQ(allocationCount, 2); +} + +// This test verifies local arbitration run can't reclaim free memory from +// memory pool which is also under memory arbitration. +DEBUG_ONLY_TEST_F( + MockSharedArbitrationTest, + localArbitrationRunFreeMemoryReclamationCheck) { + const int64_t memoryCapacity = 512 << 20; + const uint64_t memoryPoolInitCapacity = memoryCapacity / 4; + setupMemory(memoryCapacity, 0, memoryPoolInitCapacity, 0); + auto runTask = addTask(memoryCapacity); + auto* runPool = runTask->addMemoryOp(true); + runPool->allocate(memoryCapacity / 4); + runPool->allocate(memoryCapacity / 4); + auto waitTask = addTask(memoryCapacity); + auto* waitPool = waitTask->addMemoryOp(true); + waitPool->allocate(memoryCapacity / 4); + + std::atomic_bool allocationWaitFlag{true}; + folly::EventCount allocationWait; + std::atomic_bool localArbitrationWaitFlag{true}; + folly::EventCount localArbitrationWait; + std::atomic_int allocationCount{0}; + SCOPED_TESTVALUE_SET( + "facebook::velox::memory::SharedArbitrator::runLocalArbitration", + std::function( + ([&](const SharedArbitrator* /*unused*/) { + if (!allocationWaitFlag.exchange(false)) { + return; + } + allocationWait.notifyAll(); + + localArbitrationWait.await( + [&]() { return !localArbitrationWaitFlag.load(); }); + }))); + + auto runThread = std::thread([&]() { + std::unordered_map runtimeStats; + auto statsWriter = std::make_unique(runtimeStats); + setThreadLocalRunTimeStatWriter(statsWriter.get()); + runPool->allocate(memoryCapacity / 4); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); + ASSERT_GT( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kLocalArbitrationQueueWallNanos].count, + 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationLockWaitWallNanos] + .count, + 0); + ++allocationCount; + }); + + auto waitThread = std::thread([&]() { + allocationWait.await([&]() { return !allocationWaitFlag.load(); }); + std::unordered_map runtimeStats; + auto statsWriter = std::make_unique(runtimeStats); + setThreadLocalRunTimeStatWriter(statsWriter.get()); + waitPool->allocate(memoryCapacity / 2); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); + ASSERT_GT( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kLocalArbitrationQueueWallNanos].count, + 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 1); + ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].sum, 1); + ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationLockWaitWallNanos] + .count, + 1); + ASSERT_GT( + runtimeStats[SharedArbitrator::kGlobalArbitrationLockWaitWallNanos].sum, + 0); + ++allocationCount; + }); + + allocationWait.await([&]() { return !allocationWaitFlag.load(); }); + std::this_thread::sleep_for(std::chrono::seconds(2)); // NOLINT + ASSERT_EQ(allocationCount, 0); + + localArbitrationWaitFlag = false; + localArbitrationWait.notifyAll(); + + runThread.join(); + waitThread.join(); + ASSERT_EQ(allocationCount, 2); + ASSERT_EQ(runTask->capacity(), memoryCapacity / 4); + ASSERT_EQ(waitTask->capacity(), 
memoryCapacity / 4 + memoryCapacity / 2); +} + +DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, multipleGlobalRuns) { + const int64_t memoryCapacity = 512 << 20; + const uint64_t memoryPoolInitCapacity = memoryCapacity / 2; + setupMemory(memoryCapacity, 0, memoryPoolInitCapacity, 0); + auto runTask = addTask(memoryCapacity); + auto* runPool = runTask->addMemoryOp(true); + runPool->allocate(memoryCapacity / 2); + auto waitTask = addTask(memoryCapacity); + auto* waitPool = waitTask->addMemoryOp(true); + waitPool->allocate(memoryCapacity / 2); + + std::atomic_bool allocationWaitFlag{true}; + folly::EventCount allocationWait; + + std::atomic_bool globalArbitrationWaitFlag{true}; + folly::EventCount globalArbitrationWait; + SCOPED_TESTVALUE_SET( + "facebook::velox::memory::SharedArbitrator::runGlobalArbitration", + std::function( + ([&](const SharedArbitrator* /*unused*/) { + if (!allocationWaitFlag.exchange(false)) { + return; + } + allocationWait.notifyAll(); + globalArbitrationWait.await( + [&]() { return !globalArbitrationWaitFlag.load(); }); + }))); + + std::atomic_int allocations{0}; + auto waitThread = std::thread([&]() { + allocationWait.await([&]() { return !allocationWaitFlag.load(); }); + std::unordered_map runtimeStats; + auto statsWriter = std::make_unique(runtimeStats); + setThreadLocalRunTimeStatWriter(statsWriter.get()); + waitPool->allocate(memoryCapacity / 2); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); + ASSERT_GT( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kLocalArbitrationQueueWallNanos].count, + 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 1); + ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].sum, 1); + ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationLockWaitWallNanos] + .count, + 1); + ASSERT_GT( + runtimeStats[SharedArbitrator::kGlobalArbitrationLockWaitWallNanos].sum, + 0); + ++allocations; + }); + + auto runThread = std::thread([&]() { + std::unordered_map runtimeStats; + auto statsWriter = std::make_unique(runtimeStats); + setThreadLocalRunTimeStatWriter(statsWriter.get()); + runPool->allocate(memoryCapacity / 2); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); + ASSERT_GT( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kLocalArbitrationQueueWallNanos].count, + 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 1); + ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].sum, 1); + ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); + ++allocations; + }); + + allocationWait.await([&]() { return !allocationWaitFlag.load(); }); + std::this_thread::sleep_for(std::chrono::seconds(2)); // NOLINT + ASSERT_EQ(allocations, 0); + + globalArbitrationWaitFlag = false; + globalArbitrationWait.notifyAll(); + + runThread.join(); + waitThread.join(); + ASSERT_EQ(allocations, 2); + ASSERT_EQ(runTask->capacity(), memoryCapacity / 2); + ASSERT_EQ(waitTask->capacity(), memoryCapacity / 2); +} + +DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, globalArbitrationEnableCheck) { + for (bool globalArbitrationEnabled : {false, true}) { + SCOPED_TRACE( + fmt::format("globalArbitrationEnabled: {}", globalArbitrationEnabled)); + const int64_t memoryCapacity = 512 << 20; + const 
uint64_t memoryPoolInitCapacity = memoryCapacity / 2; + setupMemory( + memoryCapacity, + 0, + memoryPoolInitCapacity, + 0, + 0, + kFastExponentialGrowthCapacityLimit, + kSlowCapacityGrowPct, + kMemoryPoolMinFreeCapacity, + kMemoryPoolMinFreeCapacityPct, + nullptr, + globalArbitrationEnabled); + + auto reclaimedTask = addTask(memoryCapacity); + auto* reclaimedPool = reclaimedTask->addMemoryOp(true); + reclaimedPool->allocate(memoryCapacity / 2); + auto requestTask = addTask(memoryCapacity); + auto* requestPool = requestTask->addMemoryOp(false); + requestPool->allocate(memoryCapacity / 2); + if (globalArbitrationEnabled) { + requestPool->allocate(memoryCapacity / 2); + } else { + VELOX_ASSERT_THROW( + requestPool->allocate(memoryCapacity / 2), + "Exceeded memory pool cap"); + } + } +} + +// This test verifies when a global arbitration is running, the local +// arbitration run has to wait for the current running global arbitration run +// to complete. +DEBUG_ONLY_TEST_F( + MockSharedArbitrationTest, + localArbitrationWaitForGlobalArbitration) { + const int64_t memoryCapacity = 512 << 20; + const uint64_t memoryPoolInitCapacity = memoryCapacity / 2; + setupMemory( + memoryCapacity, + 0, + memoryPoolInitCapacity, + 0, + 0, + kFastExponentialGrowthCapacityLimit, + kSlowCapacityGrowPct, + 0, + 0); + auto runTask = addTask(memoryCapacity); + auto* runPool = runTask->addMemoryOp(true); + runPool->allocate(memoryCapacity / 2); + auto waitTask = addTask(memoryCapacity); + auto* waitPool = waitTask->addMemoryOp(true); + waitPool->allocate(memoryCapacity / 4); + + std::atomic_bool allocationWaitFlag{true}; + folly::EventCount allocationWait; + + std::atomic_bool globalArbitrationWaitFlag{true}; + folly::EventCount globalArbitrationWait; + SCOPED_TESTVALUE_SET( + "facebook::velox::memory::SharedArbitrator::runGlobalArbitration", + std::function( + ([&](const SharedArbitrator* /*unused*/) { + if (!allocationWaitFlag.exchange(false)) { + return; + } + allocationWait.notifyAll(); + globalArbitrationWait.await( + [&]() { return !globalArbitrationWaitFlag.load(); }); + }))); + + std::atomic_int allocations{0}; + auto waitThread = std::thread([&]() { + allocationWait.await([&]() { return !allocationWaitFlag.load(); }); + std::unordered_map runtimeStats; + auto statsWriter = std::make_unique(runtimeStats); + setThreadLocalRunTimeStatWriter(statsWriter.get()); + waitPool->allocate(memoryCapacity / 4); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); + ASSERT_GT( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kLocalArbitrationQueueWallNanos].count, + 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationLockWaitWallNanos] + .count, + 0); + ++allocations; + }); + + auto runThread = std::thread([&]() { + std::unordered_map runtimeStats; + auto statsWriter = std::make_unique(runtimeStats); + setThreadLocalRunTimeStatWriter(statsWriter.get()); + runPool->allocate(memoryCapacity / 2); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); + ASSERT_GT( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kLocalArbitrationQueueWallNanos].count, + 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 1); + 
  ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].sum, 1);
+    ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0);
+    ++allocations;
+  });
+
+  allocationWait.await([&]() { return !allocationWaitFlag.load(); });
+  std::this_thread::sleep_for(std::chrono::seconds(2)); // NOLINT
+  ASSERT_EQ(allocations, 0);
+
+  globalArbitrationWaitFlag = false;
+  globalArbitrationWait.notifyAll();
+
+  runThread.join();
+  waitThread.join();
+  ASSERT_EQ(allocations, 2);
+  ASSERT_EQ(runTask->capacity(), memoryCapacity / 2);
+  ASSERT_EQ(waitTask->capacity(), memoryCapacity / 2);
+}
+
+// This test verifies that when a local arbitration is running, a global
+// arbitration run has to wait for the current running local arbitration run
+// to complete.
+DEBUG_ONLY_TEST_F(
+    MockSharedArbitrationTest,
+    globalArbitrationWaitForLocalArbitration) {
+  const int64_t memoryCapacity = 512 << 20;
+  const uint64_t memoryPoolInitCapacity = memoryCapacity / 4;
+  setupMemory(memoryCapacity, 0, memoryPoolInitCapacity, 0);
+  auto runTask = addTask(memoryCapacity / 2);
+  auto* runPool = runTask->addMemoryOp(true);
+  runPool->allocate(memoryCapacity / 4);
+  auto waitTask = addTask(memoryCapacity);
+  auto* waitPool = waitTask->addMemoryOp(true);
+  waitPool->allocate(memoryCapacity / 4);
+  waitPool->allocate(memoryCapacity / 4);
+
+  std::atomic_bool allocationWaitFlag{true};
+  folly::EventCount allocationWait;
+  std::atomic_bool localArbitrationWaitFlag{true};
+  folly::EventCount localArbitrationWait;
+  std::atomic_int allocationCount{0};
+  SCOPED_TESTVALUE_SET(
+      "facebook::velox::memory::SharedArbitrator::runLocalArbitration",
+      std::function<void(const SharedArbitrator*)>(
+          ([&](const SharedArbitrator* /*unused*/) {
+            if (!allocationWaitFlag.exchange(false)) {
+              return;
+            }
+            allocationWait.notifyAll();
+
+            localArbitrationWait.await(
+                [&]() { return !localArbitrationWaitFlag.load(); });
+          })));
+
+  auto runThread = std::thread([&]() {
+    std::unordered_map<std::string, RuntimeMetric> runtimeStats;
+    auto statsWriter = std::make_unique<TestRuntimeStatWriter>(runtimeStats);
+    setThreadLocalRunTimeStatWriter(statsWriter.get());
+    runPool->allocate(memoryCapacity / 4);
+    ASSERT_EQ(
+        runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1);
+    ASSERT_GT(
+        runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0);
+    ASSERT_EQ(
+        runtimeStats[SharedArbitrator::kLocalArbitrationQueueWallNanos].count,
+        0);
+    ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 0);
+    ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0);
+    ASSERT_EQ(
+        runtimeStats[SharedArbitrator::kGlobalArbitrationLockWaitWallNanos]
+            .count,
+        0);
+    ++allocationCount;
+  });
+
+  auto waitThread = std::thread([&]() {
+    allocationWait.await([&]() { return !allocationWaitFlag.load(); });
+    std::unordered_map<std::string, RuntimeMetric> runtimeStats;
+    auto statsWriter = std::make_unique<TestRuntimeStatWriter>(runtimeStats);
+    setThreadLocalRunTimeStatWriter(statsWriter.get());
+    waitPool->allocate(memoryCapacity / 2);
+    ASSERT_EQ(
+        runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1);
+    ASSERT_GT(
+        runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0);
+    ASSERT_EQ(
+        runtimeStats[SharedArbitrator::kLocalArbitrationQueueWallNanos].count,
+        0);
+    ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 1);
+    ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0);
+    ASSERT_EQ(
+        runtimeStats[SharedArbitrator::kGlobalArbitrationLockWaitWallNanos]
+            .count,
+        1);
+    ASSERT_GT(
runtimeStats[SharedArbitrator::kGlobalArbitrationLockWaitWallNanos].sum, + 0); + ++allocationCount; + }); + + allocationWait.await([&]() { return !allocationWaitFlag.load(); }); + std::this_thread::sleep_for(std::chrono::seconds(2)); // NOLINT + ASSERT_EQ(allocationCount, 0); + + localArbitrationWaitFlag = false; + localArbitrationWait.notifyAll(); + + runThread.join(); + waitThread.join(); + ASSERT_EQ(allocationCount, 2); + ASSERT_EQ(runTask->capacity(), memoryCapacity / 2); + ASSERT_EQ(waitTask->capacity(), memoryCapacity / 2); +} + +TEST_F(MockSharedArbitrationTest, singlePoolShrinkWithoutArbitration) { + const int64_t memoryCapacity = 512 * MB; + struct TestParam { + uint64_t memoryPoolReservedBytes; + uint64_t memoryPoolMinFreeCapacity; + double memoryPoolMinFreeCapacityPct; + uint64_t requestBytes; + bool expectThrow; + uint64_t expectedCapacity; + std::string debugString() const { + return fmt::format( + "memoryPoolReservedBytes {}, " + "memoryPoolMinFreeCapacity {}, " + "memoryPoolMinFreeCapacityPct {}, " + "requestBytes {}, ", + succinctBytes(memoryPoolReservedBytes), + succinctBytes(memoryPoolMinFreeCapacity), + memoryPoolMinFreeCapacityPct, + succinctBytes(requestBytes)); + } + } testParams[] = { + {0, 128 * MB, 0, 256 * MB, true, 0}, + {0, 0, 0.1, 256 * MB, true, 0}, + {256 * MB, 128 * MB, 0.5, 256 * MB, false, 384 * MB}, + {256 * MB, 128 * MB, 0.125, 256 * MB, false, 320 * MB}, + {0, 128 * MB, 0.25, 0 * MB, false, 0}, + {256 * MB, 128 * MB, 0.125, 0 * MB, false, 256 * MB}, + {256 * MB, 128 * MB, 0.125, 512 * MB, false, 320 * MB}}; + + for (const auto& testParam : testParams) { + SCOPED_TRACE(testParam.debugString()); + if (testParam.expectThrow) { + VELOX_ASSERT_THROW( + setupMemory( + memoryCapacity, + 0, + memoryCapacity, + 0, + 0, + 0, + 0, + testParam.memoryPoolMinFreeCapacity, + testParam.memoryPoolMinFreeCapacityPct), + "both need to be set (non-zero) at the same time to enable shrink " + "capacity adjustment."); + continue; + } else { + setupMemory( + memoryCapacity, + 0, + memoryCapacity, + 0, + 0, + 0, + 0, + testParam.memoryPoolMinFreeCapacity, + testParam.memoryPoolMinFreeCapacityPct); + } + + auto* memOp = addMemoryOp(); + memOp->allocate(testParam.memoryPoolReservedBytes); + + ASSERT_EQ( + memOp->pool()->reservedBytes(), testParam.memoryPoolReservedBytes); + arbitrator_->shrinkCapacity(memOp->pool(), testParam.requestBytes); + ASSERT_EQ(memOp->pool()->capacity(), testParam.expectedCapacity); + clearTasks(); + } +} + +TEST_F(MockSharedArbitrationTest, singlePoolGrowWithoutArbitration) { + const int64_t memoryCapacity = 512 << 20; + const uint64_t memoryPoolInitCapacity = 32 << 20; + struct TestParam { + uint64_t fastExponentialGrowthCapacityLimit; + double slowCapacityGrowPct; + std::string debugString() const { + return fmt::format( + "fastExponentialGrowthCapacityLimit {}, " + "slowCapacityGrowPct {}", + succinctBytes(fastExponentialGrowthCapacityLimit), + slowCapacityGrowPct); + } + }; + + // Try to make each test allocation larger than the largest memory pool + // quantization(8MB) to not have noise. 
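The testParams below keep each allocation above that 8MB quantum; the number of arbitration requests they are expected to produce follows the two-phase growth model that the loop later in this test simulates. The same computation pulled out as a standalone helper (editorial sketch, not part of the patch):

```
#include <cstdint>

// Capacity doubles while the doubled value stays within the fast exponential
// growth limit, then grows by slowCapacityGrowPct per arbitration request
// until the pool reaches full capacity (assumes slowCapacityGrowPct > 0).
uint64_t expectedGrowRequests(
    uint64_t capacity, // memoryPoolInitCapacity
    uint64_t fastExponentialGrowthCapacityLimit,
    double slowCapacityGrowPct,
    uint64_t maxCapacity) {
  uint64_t requests = 0;
  while (capacity * 2 <= fastExponentialGrowthCapacityLimit) {
    capacity += capacity; // Fast path: exponential doubling.
    ++requests;
  }
  while (capacity < maxCapacity) {
    capacity += static_cast<uint64_t>(capacity * slowCapacityGrowPct);
    ++requests; // Slow path: fractional growth.
  }
  return requests;
}
```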
+ std::vector testParams{ + {128 << 20, 0.1}, + {128 << 20, 0.1}, + {128 << 20, 0.5}, + }; + + for (const auto& testParam : testParams) { + SCOPED_TRACE(testParam.debugString()); + setupMemory( + memoryCapacity, + 0, + memoryPoolInitCapacity, + 0, + 0, + testParam.fastExponentialGrowthCapacityLimit, + testParam.slowCapacityGrowPct); + + auto* memOp = addMemoryOp(); + const int allocateSize = 1 * MB; + while (memOp->capacity() < memoryCapacity) { + memOp->allocate(allocateSize); + } + + // Computations of expected number of requests depending on capacity grow + // strategy (fast path or not). + uint64_t expectedNumRequests{0}; + + uint64_t simulateCapacity = memoryPoolInitCapacity; + while (simulateCapacity * 2 <= + testParam.fastExponentialGrowthCapacityLimit) { + simulateCapacity += simulateCapacity; + expectedNumRequests++; + } + while (simulateCapacity < memoryCapacity) { + auto growth = static_cast( + simulateCapacity * testParam.slowCapacityGrowPct); + simulateCapacity += growth; + expectedNumRequests++; + } + + verifyArbitratorStats( + arbitrator_->stats(), memoryCapacity, 0, 0, expectedNumRequests); + + verifyReclaimerStats(memOp->reclaimer()->stats(), 0, expectedNumRequests); + + clearTasks(); + verifyArbitratorStats( + arbitrator_->stats(), + memoryCapacity, + memoryCapacity, + 0, + expectedNumRequests); + } } TEST_F(MockSharedArbitrationTest, maxCapacityReserve) { const int memCapacity = 256 * MB; const int minPoolCapacity = 32 * MB; - setupMemory(memCapacity, minPoolCapacity); struct { - uint64_t maxCapacity; - uint64_t expectedInitialCapacity; + uint64_t memCapacity; + uint64_t reservedCapacity; + uint64_t poolInitCapacity; + uint64_t poolReservedCapacity; + uint64_t poolMaxCapacity; + uint64_t expectedPoolInitCapacity; std::string debugString() const { return fmt::format( - "maxCapacity {}, expectedInitialCapacity {}", - succinctBytes(maxCapacity), - succinctBytes(expectedInitialCapacity)); + "memCapacity {}, reservedCapacity {}, poolInitCapacity {}, poolReservedCapacity {}, poolMaxCapacity {}, expectedPoolInitCapacity {}", + succinctBytes(memCapacity), + succinctBytes(reservedCapacity), + succinctBytes(poolInitCapacity), + succinctBytes(poolReservedCapacity), + succinctBytes(poolMaxCapacity), + succinctBytes(expectedPoolInitCapacity)); } } testSettings[] = { - {minPoolCapacity, minPoolCapacity}, - {minPoolCapacity / 2, minPoolCapacity / 2}, - {minPoolCapacity * 2, minPoolCapacity}}; + {256 << 20, 256 << 20, 128 << 20, 64 << 20, 256 << 20, 64 << 20}, + {256 << 20, 0, 128 << 20, 64 << 20, 256 << 20, 128 << 20}, + {256 << 20, 0, 512 << 20, 64 << 20, 256 << 20, 256 << 20}, + {256 << 20, 0, 128 << 20, 64 << 20, 256 << 20, 128 << 20}, + {256 << 20, 128 << 20, 128 << 20, 64 << 20, 256 << 20, 128 << 20}, + {256 << 20, 128 << 20, 256 << 20, 64 << 20, 256 << 20, 128 << 20}, + {256 << 20, 128 << 20, 256 << 20, 256 << 20, 256 << 20, 256 << 20}, + {256 << 20, 128 << 20, 256 << 20, 256 << 20, 128 << 20, 128 << 20}}; for (const auto& testData : testSettings) { SCOPED_TRACE(testData.debugString()); - auto task = addTask(testData.maxCapacity); - ASSERT_EQ(task->pool()->maxCapacity(), testData.maxCapacity); - ASSERT_EQ(task->pool()->capacity(), testData.expectedInitialCapacity); + setupMemory( + testData.memCapacity, + testData.reservedCapacity, + testData.poolInitCapacity, + testData.poolReservedCapacity); + auto task = addTask(testData.poolMaxCapacity); + ASSERT_EQ(task->pool()->maxCapacity(), testData.poolMaxCapacity); + ASSERT_EQ(task->pool()->capacity(), 
testData.expectedPoolInitCapacity); } } TEST_F(MockSharedArbitrationTest, ensureMemoryPoolMaxCapacity) { const int memCapacity = 256 * MB; - const int minPoolCapacity = 8 * MB; + const int poolInitCapacity = 8 * MB; struct { - uint64_t maxCapacity; + uint64_t poolMaxCapacity; bool isReclaimable; uint64_t allocatedBytes; uint64_t requestBytes; bool hasOtherTask; + uint64_t otherAllocatedBytes; bool expectedSuccess; bool expectedReclaimFromOther; std::string debugString() const { return fmt::format( - "maxCapacity {} isReclaimable {} allocatedBytes {} requestBytes {} hasOtherTask {} expectedSuccess {} expectedReclaimFromOther {}", - succinctBytes(maxCapacity), + "poolMaxCapacity {} isReclaimable {} allocatedBytes {} requestBytes {} hasOtherTask {} otherAllocatedBytes {} expectedSuccess {} expectedReclaimFromOther {}", + succinctBytes(poolMaxCapacity), isReclaimable, succinctBytes(allocatedBytes), succinctBytes(requestBytes), hasOtherTask, + succinctBytes(otherAllocatedBytes), expectedSuccess, expectedReclaimFromOther); } @@ -678,6 +2036,7 @@ TEST_F(MockSharedArbitrationTest, ensureMemoryPoolMaxCapacity) { memCapacity / 4, memCapacity / 2, false, + 0, true, false}, {memCapacity / 2, @@ -685,6 +2044,7 @@ TEST_F(MockSharedArbitrationTest, ensureMemoryPoolMaxCapacity) { memCapacity / 4, memCapacity / 8, false, + 0, true, false}, {memCapacity / 2, @@ -692,6 +2052,7 @@ TEST_F(MockSharedArbitrationTest, ensureMemoryPoolMaxCapacity) { memCapacity / 4, memCapacity / 2, false, + 0, true, false}, {memCapacity / 2, @@ -699,6 +2060,7 @@ TEST_F(MockSharedArbitrationTest, ensureMemoryPoolMaxCapacity) { memCapacity / 2, memCapacity / 4, false, + 0, true, false}, {memCapacity / 2, @@ -706,6 +2068,7 @@ TEST_F(MockSharedArbitrationTest, ensureMemoryPoolMaxCapacity) { memCapacity / 4, memCapacity / 2, false, + 0, false, false}, {memCapacity / 2, @@ -713,6 +2076,7 @@ TEST_F(MockSharedArbitrationTest, ensureMemoryPoolMaxCapacity) { memCapacity / 2, memCapacity / 4, false, + 0, false, false}, {memCapacity / 2, @@ -720,6 +2084,7 @@ TEST_F(MockSharedArbitrationTest, ensureMemoryPoolMaxCapacity) { memCapacity / 4, memCapacity / 2, true, + memCapacity - memCapacity / 4, true, true}, {memCapacity / 2, @@ -727,6 +2092,7 @@ TEST_F(MockSharedArbitrationTest, ensureMemoryPoolMaxCapacity) { memCapacity / 4, memCapacity / 8, true, + memCapacity - memCapacity / 4, true, true}, {memCapacity / 2, @@ -734,6 +2100,7 @@ TEST_F(MockSharedArbitrationTest, ensureMemoryPoolMaxCapacity) { memCapacity / 4, memCapacity / 2, true, + memCapacity - memCapacity / 4, true, true}, {memCapacity / 2, @@ -741,6 +2108,7 @@ TEST_F(MockSharedArbitrationTest, ensureMemoryPoolMaxCapacity) { memCapacity / 2, memCapacity / 4, true, + memCapacity - memCapacity / 2, true, false}, {memCapacity / 2, @@ -748,6 +2116,7 @@ TEST_F(MockSharedArbitrationTest, ensureMemoryPoolMaxCapacity) { memCapacity / 4, memCapacity / 2, true, + memCapacity - memCapacity / 4, false, false}, {memCapacity / 2, @@ -755,22 +2124,31 @@ TEST_F(MockSharedArbitrationTest, ensureMemoryPoolMaxCapacity) { memCapacity / 2, memCapacity / 4, false, + memCapacity - memCapacity / 2, false, false}}; - for (const auto& testData : testSettings) { SCOPED_TRACE(testData.debugString()); - setupMemory(memCapacity, minPoolCapacity); + setupMemory( + memCapacity, + 0, + poolInitCapacity, + 0, + 0, + kFastExponentialGrowthCapacityLimit, + kSlowCapacityGrowPct, + 0, + 0); - auto requestor = addTask(testData.maxCapacity); - auto requestorOp = addMemoryOp(requestor, testData.isReclaimable); + auto 
requestor = addTask(testData.poolMaxCapacity); + auto* requestorOp = addMemoryOp(requestor, testData.isReclaimable); requestorOp->allocate(testData.allocatedBytes); std::shared_ptr other; MockMemoryOperator* otherOp; if (testData.hasOtherTask) { other = addTask(); otherOp = addMemoryOp(other, true); - otherOp->allocate(memCapacity - testData.allocatedBytes); + otherOp->allocate(testData.otherAllocatedBytes); } const auto numRequests = arbitrator_->stats().numRequests; if (testData.expectedSuccess) { @@ -778,7 +2156,7 @@ TEST_F(MockSharedArbitrationTest, ensureMemoryPoolMaxCapacity) { } else { VELOX_ASSERT_THROW( requestorOp->allocate(testData.requestBytes), - "Exceeded memory pool cap of"); + "Exceeded memory pool capacity"); } if (testData.expectedReclaimFromOther) { ASSERT_GT(otherOp->reclaimer()->stats().numReclaims, 0); @@ -787,7 +2165,7 @@ TEST_F(MockSharedArbitrationTest, ensureMemoryPoolMaxCapacity) { } if (testData.expectedSuccess && (((testData.allocatedBytes + testData.requestBytes) > - testData.maxCapacity) || + testData.poolMaxCapacity) || testData.hasOtherTask)) { ASSERT_GT(arbitrator_->stats().numReclaimedBytes, 0); } else { @@ -809,7 +2187,9 @@ TEST_F(MockSharedArbitrationTest, ensureNodeMaxCapacity) { std::string debugString() const { return fmt::format( - "nodeCapacity {} poolMaxCapacity {} isReclaimable {} allocatedBytes {} requestBytes {} expectedSuccess {} expectedReclaimedBytes {}", + "nodeCapacity {} poolMaxCapacity {} isReclaimable {} " + "allocatedBytes {} requestBytes {} expectedSuccess {} " + "expectedReclaimedBytes {}", succinctBytes(nodeCapacity), succinctBytes(poolMaxCapacity), isReclaimable, @@ -830,10 +2210,10 @@ TEST_F(MockSharedArbitrationTest, ensureNodeMaxCapacity) { for (const auto& testData : testSettings) { SCOPED_TRACE(testData.debugString()); - setupMemory(testData.nodeCapacity); + setupMemory(testData.nodeCapacity, 0, 0, 0); auto requestor = addTask(testData.poolMaxCapacity); - auto requestorOp = addMemoryOp(requestor, testData.isReclaimable); + auto* requestorOp = addMemoryOp(requestor, testData.isReclaimable); requestorOp->allocate(testData.allocatedBytes); const auto numRequests = arbitrator_->stats().numRequests; if (testData.expectedSuccess) { @@ -855,70 +2235,88 @@ TEST_F(MockSharedArbitrationTest, ensureNodeMaxCapacity) { TEST_F(MockSharedArbitrationTest, failedArbitration) { const int memCapacity = 256 * MB; const int minPoolCapacity = 8 * MB; - setupMemory(memCapacity, minPoolCapacity); - auto reclaimableOp = addMemoryOp(); + setupMemory(memCapacity, 0, minPoolCapacity, 0); + auto* reclaimableOp = addMemoryOp(); ASSERT_EQ(reclaimableOp->capacity(), minPoolCapacity); - auto nonReclaimableOp = addMemoryOp(nullptr, false); + auto* nonReclaimableOp = addMemoryOp(nullptr, false); ASSERT_EQ(nonReclaimableOp->capacity(), minPoolCapacity); - auto arbitrateOp = addMemoryOp(); + auto* arbitrateOp = addMemoryOp(); ASSERT_EQ(arbitrateOp->capacity(), minPoolCapacity); reclaimableOp->allocate(minPoolCapacity); ASSERT_EQ(reclaimableOp->capacity(), minPoolCapacity); nonReclaimableOp->allocate(minPoolCapacity); ASSERT_EQ(nonReclaimableOp->capacity(), minPoolCapacity); - ASSERT_ANY_THROW(arbitrateOp->allocate(memCapacity)); + VELOX_ASSERT_THROW( + arbitrateOp->allocate(memCapacity), "Exceeded memory pool cap"); verifyReclaimerStats(nonReclaimableOp->reclaimer()->stats()); verifyReclaimerStats(reclaimableOp->reclaimer()->stats(), 1); - verifyReclaimerStats(arbitrateOp->reclaimer()->stats(), 1, 1); + verifyReclaimerStats(arbitrateOp->reclaimer()->stats(), 0, 
1);
   verifyArbitratorStats(
-      arbitrator_->stats(), memCapacity, 260046848, 1, 0, 1, 8388608, 8388608);
-  ASSERT_EQ(arbitrator_->stats().queueTimeUs, 0);
+      arbitrator_->stats(), memCapacity, 260046848, 0, 1, 1, 8388608, 8388608);
+  ASSERT_GE(arbitrator_->stats().queueTimeUs, 0);
 }
 
 TEST_F(MockSharedArbitrationTest, singlePoolGrowCapacityWithArbitration) {
-  std::vector<bool> isLeafReclaimables = {true, false};
+  const std::vector<bool> isLeafReclaimables = {true, false};
   for (const auto isLeafReclaimable : isLeafReclaimables) {
     SCOPED_TRACE(fmt::format("isLeafReclaimable {}", isLeafReclaimable));
     setupMemory();
     auto op = addMemoryOp(nullptr, isLeafReclaimable);
     const int allocateSize = MB;
-    while (op->pool()->currentBytes() < kMemoryCapacity) {
+    while (op->pool()->usedBytes() <
+           kMemoryCapacity - kReservedMemoryCapacity) {
       op->allocate(allocateSize);
     }
-    verifyArbitratorStats(arbitrator_->stats(), kMemoryCapacity, 0, 62, 62);
-    verifyReclaimerStats(op->reclaimer()->stats(), 0, 62);
+    verifyArbitratorStats(
+        arbitrator_->stats(),
+        kMemoryCapacity,
+        kReservedMemoryCapacity,
+        kReservedMemoryCapacity,
+        13);
+    verifyReclaimerStats(op->reclaimer()->stats(), 0, 13);
     if (!isLeafReclaimable) {
-      ASSERT_ANY_THROW(op->allocate(allocateSize));
+      VELOX_ASSERT_THROW(
+          op->allocate(allocateSize), "Exceeded memory pool cap");
       verifyArbitratorStats(
-          arbitrator_->stats(), kMemoryCapacity, 0, 63, 62, 1);
-      verifyReclaimerStats(op->reclaimer()->stats(), 1, 63);
+          arbitrator_->stats(),
+          kMemoryCapacity,
+          kReservedMemoryCapacity,
+          kReservedMemoryCapacity,
+          14,
+          1);
+      verifyReclaimerStats(op->reclaimer()->stats(), 0, 14);
       continue;
     }
     // Do more allocations to trigger arbitration.
-    for (int i = 0; i < kMemoryPoolTransferCapacity / allocateSize; ++i) {
-      op->allocate(allocateSize);
-    }
+    op->allocate(
+        op->pool()->capacity() - op->pool()->root()->reservedBytes() + MB);
     verifyArbitratorStats(
-        arbitrator_->stats(), kMemoryCapacity, 0, 63, 63, 0, 8388608);
-    verifyReclaimerStats(op->reclaimer()->stats(), 1, 63);
+        arbitrator_->stats(),
+        kMemoryCapacity,
+        kReservedMemoryCapacity,
+        kReservedMemoryCapacity,
+        14,
+        0,
+        8388608);
+    verifyReclaimerStats(op->reclaimer()->stats(), 1, 14);
     clearTasks();
     verifyArbitratorStats(
         arbitrator_->stats(),
         kMemoryCapacity,
         kMemoryCapacity,
-        63,
-        63,
+        kReservedMemoryCapacity,
+        14,
         0,
         8388608);
   }
 }
 
 TEST_F(MockSharedArbitrationTest, arbitrateWithCapacityShrink) {
-  std::vector<bool> isLeafReclaimables = {true, false};
+  const std::vector<bool> isLeafReclaimables = {true, false};
   for (const auto isLeafReclaimable : isLeafReclaimables) {
     SCOPED_TRACE(fmt::format("isLeafReclaimable {}", isLeafReclaimable));
     setupMemory();
@@ -932,7 +2330,7 @@ TEST_F(MockSharedArbitrationTest, arbitrateWithCapacityShrink) {
     ASSERT_GT(freeCapacity, 0);
     reclaimedOp->freeAll();
     ASSERT_GT(reclaimedOp->pool()->freeBytes(), 0);
-    ASSERT_EQ(reclaimedOp->pool()->currentBytes(), 0);
+    ASSERT_EQ(reclaimedOp->pool()->usedBytes(), 0);
     ASSERT_EQ(arbitrator_->stats().freeCapacityBytes, freeCapacity);
 
     auto* arbitrateOp = addMemoryOp(nullptr, isLeafReclaimable);
@@ -943,8 +2341,8 @@ TEST_F(MockSharedArbitrationTest, arbitrateWithCapacityShrink) {
     ASSERT_GT(arbitratorStats.numShrunkBytes, 0);
     ASSERT_EQ(arbitratorStats.numReclaimedBytes, 0);
 
-    verifyReclaimerStats(reclaimedOp->reclaimer()->stats(), 0, 11);
-    verifyReclaimerStats(arbitrateOp->reclaimer()->stats(), 0, 5);
+    verifyReclaimerStats(reclaimedOp->reclaimer()->stats(), 0, 8);
+    verifyReclaimerStats(arbitrateOp->reclaimer()->stats(), 0, 1);
 
     clearTasks();
   }
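The tests above pivot on the arbitrator's two-phase capacity growth: a pool's capacity doubles per arbitration request until the doubled size would exceed the fast exponential growth limit, then grows by a fixed percentage per request. A standalone sketch of that arithmetic, using illustrative constants rather than any particular test's values for `fastExponentialGrowthCapacityLimit` and `slowCapacityGrowPct`:

```
#include <cstdint>
#include <iostream>

// Sketch of the expected-request computation the tests simulate: double the
// capacity per arbitration request while within the fast-growth limit, then
// grow by a fixed percentage per request. Constants are illustrative only.
int main() {
  const uint64_t mb = 1ULL << 20;
  const uint64_t memoryCapacity = 512 * mb;
  const uint64_t initCapacity = 8 * mb;
  const uint64_t fastGrowthLimit = 128 * mb; // fastExponentialGrowthCapacityLimit
  const double slowGrowPct = 0.1; // slowCapacityGrowPct

  uint64_t capacity = initCapacity;
  uint64_t numRequests = 0;
  // Fast path: double while the doubled capacity stays within the limit.
  while (capacity * 2 <= fastGrowthLimit) {
    capacity += capacity;
    ++numRequests;
  }
  // Slow path: grow by a percentage of the current capacity per request.
  while (capacity < memoryCapacity) {
    capacity += static_cast<uint64_t>(capacity * slowGrowPct);
    ++numRequests;
  }
  std::cout << "expected arbitration requests: " << numRequests << '\n';
  return 0;
}
```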
@@ -952,34 +2350,47 @@ TEST_F(MockSharedArbitrationTest, arbitrateWithCapacityShrink)
 
 TEST_F(MockSharedArbitrationTest, arbitrateWithMemoryReclaim) {
   const uint64_t memoryCapacity = 256 * MB;
-  const uint64_t minPoolCapacity = 8 * MB;
+  const uint64_t reservedMemoryCapacity = 128 * MB;
+  const uint64_t initPoolCapacity = 8 * MB;
+  const uint64_t reservedPoolCapacity = 8 * MB;
   const std::vector<bool> isLeafReclaimables = {true, false};
   for (const auto isLeafReclaimable : isLeafReclaimables) {
     SCOPED_TRACE(fmt::format("isLeafReclaimable {}", isLeafReclaimable));
-    setupMemory(memoryCapacity, minPoolCapacity);
+    setupMemory(
+        memoryCapacity,
+        reservedMemoryCapacity,
+        initPoolCapacity,
+        reservedPoolCapacity);
     auto* reclaimedOp = addMemoryOp(nullptr, isLeafReclaimable);
     const int allocateSize = 8 * MB;
-    while (reclaimedOp->pool()->currentBytes() < memoryCapacity) {
+    while (reclaimedOp->pool()->usedBytes() <
+           memoryCapacity - reservedMemoryCapacity) {
       reclaimedOp->allocate(allocateSize);
     }
     auto* arbitrateOp = addMemoryOp();
     if (!isLeafReclaimable) {
       auto leafTask = tasks().front();
-      ASSERT_NO_THROW(arbitrateOp->allocate(allocateSize));
+      ASSERT_NO_THROW(arbitrateOp->allocate(reservedMemoryCapacity / 2));
+      ASSERT_NE(leafTask->error(), nullptr);
       ASSERT_EQ(arbitrator_->stats().numFailures, 0);
       continue;
     }
-    arbitrateOp->allocate(allocateSize);
+    arbitrateOp->allocate(reservedMemoryCapacity / 2);
 
     verifyArbitratorStats(
-        arbitrator_->stats(), memoryCapacity, 0, 32, 32, 0, 8388608);
+        arbitrator_->stats(),
+        memoryCapacity,
+        kReservedMemoryCapacity - reservedPoolCapacity,
+        kReservedMemoryCapacity - reservedPoolCapacity,
+        10,
+        0,
+        58720256,
+        10559488);
 
-    verifyReclaimerStats(
-        arbitrateOp->reclaimer()->stats(), 0, 1, kMemoryPoolTransferCapacity);
+    verifyReclaimerStats(arbitrateOp->reclaimer()->stats(), 0, 1, 0);
 
-    verifyReclaimerStats(
-        reclaimedOp->reclaimer()->stats(), 1, 31, kMemoryPoolTransferCapacity);
+    verifyReclaimerStats(reclaimedOp->reclaimer()->stats(), 1, 9, 0);
     clearTasks();
   }
 }
 
@@ -989,29 +2400,34 @@ TEST_F(MockSharedArbitrationTest, arbitrateBySelfMemoryReclaim) {
   for (const auto isLeafReclaimable : isLeafReclaimables) {
     SCOPED_TRACE(fmt::format("isLeafReclaimable {}", isLeafReclaimable));
     const uint64_t memCapacity = 128 * MB;
-    setupMemory(memCapacity);
+    const uint64_t reservedCapacity = 8 * MB;
+    const uint64_t poolReservedCapacity = 4 * MB;
+    setupMemory(
+        memCapacity, reservedCapacity, reservedCapacity, poolReservedCapacity);
     std::shared_ptr<MockTask> task = addTask(kMemoryCapacity);
     auto* memOp = addMemoryOp(task, isLeafReclaimable);
     const int allocateSize = 8 * MB;
-    while (memOp->pool()->currentBytes() < memCapacity / 2) {
+    while (memOp->pool()->usedBytes() < memCapacity / 2) {
       memOp->allocate(allocateSize);
     }
-    ASSERT_EQ(memOp->pool()->freeBytes(), 0);
+    // Extra free bytes are left over by the fast/slow capacity grow strategy.
+    ASSERT_EQ(memOp->pool()->freeBytes(), 14811136);
     const int oldNumRequests = arbitrator_->stats().numRequests;
     // Allocate a large chunk of memory to trigger arbitration.
if (!isLeafReclaimable) { - ASSERT_ANY_THROW(memOp->allocate(memCapacity)); + VELOX_ASSERT_THROW( + memOp->allocate(memCapacity), "Exceeded memory pool cap"); ASSERT_EQ(oldNumRequests + 1, arbitrator_->stats().numRequests); ASSERT_EQ(arbitrator_->stats().numFailures, 1); continue; } else { - memOp->allocate(memCapacity); + memOp->allocate(memCapacity / 2); ASSERT_EQ(oldNumRequests + 1, arbitrator_->stats().numRequests); ASSERT_EQ(arbitrator_->stats().numFailures, 0); - ASSERT_EQ(arbitrator_->stats().numShrunkBytes, 0); + ASSERT_EQ(arbitrator_->stats().numShrunkBytes, 14811136); ASSERT_GT(arbitrator_->stats().numReclaimedBytes, 0); } - ASSERT_EQ(arbitrator_->stats().queueTimeUs, 0); + ASSERT_GE(arbitrator_->stats().queueTimeUs, 0); } } @@ -1060,7 +2476,7 @@ TEST_F(MockSharedArbitrationTest, noAbortOnRequestWhenArbitrationFails) { DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, orderedArbitration) { SCOPED_TESTVALUE_SET( - "facebook::velox::memory::SharedArbitrator::sortCandidatesByFreeCapacity", + "facebook::velox::memory::SharedArbitrator::sortCandidatesByReclaimableFreeCapacity", std::function*)>( ([&](const std::vector* candidates) { for (int i = 1; i < candidates->size(); ++i) { @@ -1069,7 +2485,7 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, orderedArbitration) { } }))); SCOPED_TESTVALUE_SET( - "facebook::velox::memory::SharedArbitrator::sortCandidatesByReclaimableMemory", + "facebook::velox::memory::SharedArbitrator::sortCandidatesByReclaimableUsedCapacity", std::function*)>( ([&](const std::vector* candidates) { for (int i = 1; i < candidates->size(); ++i) { @@ -1078,11 +2494,14 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, orderedArbitration) { (*candidates)[i - 1].reclaimableBytes); } }))); + folly::Random::DefaultGenerator rng; rng.seed(512); const uint64_t memCapacity = 512 * MB; - const uint64_t minPoolCapacity = 32 * MB; - const uint64_t minPoolCapacityTransferSize = 8 * MB; + const uint64_t reservedMemCapacity = 128 * MB; + const uint64_t initPoolCapacity = 32 * MB; + const uint64_t reservedPoolCapacity = 8 * MB; + const uint64_t baseAllocationSize = 8 * MB; const int numTasks = 8; struct { bool freeCapacity; @@ -1097,147 +2516,43 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, orderedArbitration) { for (const auto& testData : testSettings) { SCOPED_TRACE(testData.debugString()); - setupMemory(memCapacity, minPoolCapacity, minPoolCapacityTransferSize); + + setupMemory( + memCapacity, + reservedMemCapacity, + initPoolCapacity, + reservedPoolCapacity); std::vector memOps; - std::vector memOpCapacities; for (int i = 0; i < numTasks; ++i) { auto* memOp = addMemoryOp(); + ASSERT_GE(memOp->capacity(), reservedPoolCapacity); int allocationSize = testData.sameSize ? 
memCapacity / numTasks - : minPoolCapacity + + : baseAllocationSize + folly::Random::rand32(rng) % - ((memCapacity / numTasks) - minPoolCapacity); + ((memCapacity / numTasks) - baseAllocationSize); allocationSize = allocationSize / MB * MB; memOp->allocate(allocationSize); if (testData.freeCapacity) { memOp->freeAll(); - ASSERT_EQ(memOp->pool()->currentBytes(), 0); + ASSERT_EQ(memOp->pool()->usedBytes(), 0); } memOps.push_back(memOp); } auto* arbitrateOp = addMemoryOp(); - arbitrateOp->allocate(memCapacity); + arbitrateOp->allocate(memCapacity / 2); for (auto* memOp : memOps) { - ASSERT_EQ(memOp->capacity(), 0); + ASSERT_GE(memOp->capacity(), 0) << memOp->pool()->name(); } - ASSERT_EQ(arbitrator_->stats().queueTimeUs, 0); + ASSERT_GE(arbitrator_->stats().queueTimeUs, 0); clearTasks(); } } -TEST_F(MockSharedArbitrationTest, poolCapacityTransferWithFreeCapacity) { - const uint64_t memCapacity = 512 * MB; - const uint64_t minPoolCapacity = 32 * MB; - const uint64_t minPoolCapacityTransferSize = 16 * MB; - setupMemory(memCapacity, minPoolCapacity, minPoolCapacityTransferSize); - auto* memOp = addMemoryOp(); - ASSERT_EQ(memOp->capacity(), minPoolCapacity); - memOp->allocate(minPoolCapacity); - ASSERT_EQ(memOp->pool()->freeBytes(), 0); - const uint64_t allocationSize = 8 * MB; - uint64_t capacity = memOp->pool()->capacity(); - while (capacity < memCapacity) { - memOp->allocate(allocationSize); - ASSERT_EQ(capacity + minPoolCapacityTransferSize, memOp->capacity()); - while (memOp->pool()->freeBytes() > 0) { - memOp->allocate(allocationSize); - } - capacity = memOp->capacity(); - } - const int expectedArbitrationRequests = - (memCapacity - minPoolCapacity) / minPoolCapacityTransferSize; - verifyReclaimerStats( - memOp->reclaimer()->stats(), 0, expectedArbitrationRequests); - verifyArbitratorStats( - arbitrator_->stats(), - memCapacity, - 0, - expectedArbitrationRequests, - expectedArbitrationRequests); - ASSERT_EQ(arbitrator_->stats().queueTimeUs, 0); -} - -TEST_F(MockSharedArbitrationTest, poolCapacityTransferSizeWithCapacityShrunk) { - const int numCandidateOps = 8; - const uint64_t minPoolCapacity = 64 * MB; - const uint64_t minPoolCapacityTransferSize = 32 * MB; - const uint64_t memCapacity = minPoolCapacity * numCandidateOps; - setupMemory(memCapacity, minPoolCapacity, minPoolCapacityTransferSize); - const int allocationSize = 8 * MB; - std::vector candidateOps; - for (int i = 0; i < numCandidateOps; ++i) { - candidateOps.push_back(addMemoryOp()); - ASSERT_EQ(candidateOps.back()->capacity(), minPoolCapacity); - candidateOps.back()->allocate(allocationSize); - ASSERT_EQ(candidateOps.back()->capacity(), minPoolCapacity); - ASSERT_GT(candidateOps.back()->pool()->freeBytes(), 0); - } - auto* arbitrateOp = addMemoryOp(); - ASSERT_EQ(arbitrateOp->capacity(), 0); - arbitrateOp->allocate(allocationSize); - ASSERT_EQ(arbitrateOp->capacity(), minPoolCapacityTransferSize); - verifyReclaimerStats(arbitrateOp->reclaimer()->stats(), 0, 1); - ASSERT_EQ(arbitrator_->stats().numShrunkBytes, minPoolCapacityTransferSize); - ASSERT_EQ(arbitrator_->stats().numReclaimedBytes, 0); - ASSERT_EQ(arbitrator_->stats().numRequests, 1); -} - -TEST_F(MockSharedArbitrationTest, partialPoolCapacityTransferSize) { - const int numCandidateOps = 8; - const uint64_t minPoolCapacity = 64 * MB; - const uint64_t minPoolCapacityTransferSize = 32 * MB; - const uint64_t memCapacity = minPoolCapacity * numCandidateOps; - setupMemory(memCapacity, minPoolCapacity, minPoolCapacityTransferSize); - const int allocationSize = 8 * MB; - 
std::vector candidateOps; - for (int i = 0; i < numCandidateOps; ++i) { - candidateOps.push_back(addMemoryOp()); - ASSERT_EQ(candidateOps.back()->capacity(), minPoolCapacity); - candidateOps.back()->allocate(allocationSize); - ASSERT_EQ(candidateOps.back()->capacity(), minPoolCapacity); - ASSERT_GT(candidateOps.back()->pool()->freeBytes(), 0); - } - auto* arbitrateOp = addMemoryOp(); - ASSERT_EQ(arbitrateOp->capacity(), 0); - arbitrateOp->allocate(allocationSize); - ASSERT_EQ(arbitrateOp->capacity(), minPoolCapacityTransferSize); - verifyReclaimerStats(arbitrateOp->reclaimer()->stats(), 0, 1); - ASSERT_EQ(arbitrator_->stats().numShrunkBytes, minPoolCapacityTransferSize); - ASSERT_EQ(arbitrator_->stats().numReclaimedBytes, 0); - ASSERT_EQ(arbitrator_->stats().numRequests, 1); -} - -TEST_F(MockSharedArbitrationTest, poolCapacityTransferSizeWithMemoryReclaim) { - const uint64_t memCapacity = 128 * MB; - const uint64_t minPoolCapacity = memCapacity; - const uint64_t minPoolCapacityTransferSize = 64 * MB; - setupMemory(memCapacity, minPoolCapacity, minPoolCapacityTransferSize); - auto* reclaimedOp = addMemoryOp(); - ASSERT_EQ(reclaimedOp->capacity(), memCapacity); - const int allocationSize = 8 * MB; - std::vector> candidateOps; - for (int i = 0; i < memCapacity / allocationSize; ++i) { - reclaimedOp->allocate(allocationSize); - } - ASSERT_EQ(reclaimedOp->pool()->freeBytes(), 0); - - auto* arbitrateOp = addMemoryOp(); - ASSERT_EQ(arbitrateOp->capacity(), 0); - arbitrateOp->allocate(allocationSize); - ASSERT_EQ(arbitrateOp->capacity(), minPoolCapacityTransferSize); - verifyReclaimerStats(arbitrateOp->reclaimer()->stats(), 0, 1); - verifyReclaimerStats(reclaimedOp->reclaimer()->stats(), 1); - ASSERT_EQ(arbitrator_->stats().numShrunkBytes, 0); - ASSERT_EQ( - arbitrator_->stats().numReclaimedBytes, minPoolCapacityTransferSize); - ASSERT_EQ(arbitrator_->stats().numRequests, 1); -} - TEST_F(MockSharedArbitrationTest, enterArbitrationException) { const uint64_t memCapacity = 128 * MB; - const uint64_t minPoolCapacity = memCapacity; - const uint64_t minPoolCapacityTransferSize = 64 * MB; - setupMemory(memCapacity, minPoolCapacity, minPoolCapacityTransferSize); + const uint64_t initPoolCapacity = memCapacity; + setupMemory(memCapacity, 0, initPoolCapacity, 0); auto* reclaimedOp = addMemoryOp(); ASSERT_EQ(reclaimedOp->capacity(), memCapacity); const int allocationSize = 8 * MB; @@ -1251,17 +2566,19 @@ TEST_F(MockSharedArbitrationTest, enterArbitrationException) { VELOX_FAIL("enterArbitrationException failed"); }); ASSERT_EQ(failedArbitrateOp->capacity(), 0); - ASSERT_ANY_THROW(failedArbitrateOp->allocate(allocationSize)); + VELOX_ASSERT_THROW( + failedArbitrateOp->allocate(allocationSize), + "enterArbitrationException failed"); + ASSERT_FALSE(failedArbitrateOp->pool()->aborted()); verifyReclaimerStats(failedArbitrateOp->reclaimer()->stats()); ASSERT_EQ(failedArbitrateOp->capacity(), 0); auto* arbitrateOp = addMemoryOp(); arbitrateOp->allocate(allocationSize); - ASSERT_EQ(arbitrateOp->capacity(), minPoolCapacityTransferSize); + ASSERT_EQ(arbitrateOp->capacity(), allocationSize); verifyReclaimerStats(arbitrateOp->reclaimer()->stats(), 0, 1); verifyReclaimerStats(reclaimedOp->reclaimer()->stats(), 1); ASSERT_EQ(arbitrator_->stats().numShrunkBytes, 0); - ASSERT_EQ( - arbitrator_->stats().numReclaimedBytes, minPoolCapacityTransferSize); + ASSERT_EQ(arbitrator_->stats().numReclaimedBytes, allocationSize); ASSERT_EQ(arbitrator_->stats().numRequests, 1); ASSERT_EQ(arbitrator_->stats().numFailures, 0); } @@ 
-1285,13 +2602,13 @@ TEST_F(MockSharedArbitrationTest, noArbitratiognFromAbortedPool) { // Check we don't allow memory reservation increase or trigger memory // arbitration at root memory pool. ASSERT_EQ(reclaimedOp->pool()->capacity(), kMemoryPoolInitCapacity); - ASSERT_EQ(reclaimedOp->pool()->currentBytes(), 0); + ASSERT_EQ(reclaimedOp->pool()->usedBytes(), 0); VELOX_ASSERT_THROW(reclaimedOp->allocate(128), ""); - ASSERT_EQ(reclaimedOp->pool()->currentBytes(), 0); + ASSERT_EQ(reclaimedOp->pool()->usedBytes(), 0); ASSERT_EQ(reclaimedOp->pool()->capacity(), kMemoryPoolInitCapacity); VELOX_ASSERT_THROW(reclaimedOp->allocate(kMemoryPoolInitCapacity * 2), ""); ASSERT_EQ(reclaimedOp->pool()->capacity(), kMemoryPoolInitCapacity); - ASSERT_EQ(reclaimedOp->pool()->currentBytes(), 0); + ASSERT_EQ(reclaimedOp->pool()->usedBytes(), 0); ASSERT_EQ(arbitrator_->stats().numRequests, 0); ASSERT_EQ(arbitrator_->stats().numAborted, 0); ASSERT_EQ(arbitrator_->stats().numFailures, 0); @@ -1352,7 +2669,7 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, failedToReclaimFromRequestor) { 0}}; for (const auto& testData : testSettings) { SCOPED_TRACE(testData.debugString()); - setupMemory(); + setupMemory(kMemoryCapacity, 0, kMemoryPoolInitCapacity, 0, 0, 0, 0, 0, 0); std::vector> otherTasks; std::vector otherTaskOps; @@ -1361,7 +2678,7 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, failedToReclaimFromRequestor) { otherTaskOps.push_back(addMemoryOp(otherTasks.back(), false)); otherTaskOps.back()->allocate(otherTaskMemoryCapacity); ASSERT_EQ( - otherTasks.back()->pool()->currentBytes(), otherTaskMemoryCapacity); + otherTasks.back()->pool()->usedBytes(), otherTaskMemoryCapacity); } std::shared_ptr failedTask = addTask(); MockMemoryOperator* failedTaskOp = addMemoryOp( @@ -1391,12 +2708,12 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, failedToReclaimFromRequestor) { numFailedTaskAllocationsAfterAbort + numOtherAllocationsAfterAbort + 1); folly::futures::Barrier arbitrationBarrier( numFailedTaskAllocationsAfterAbort + numOtherAllocationsAfterAbort + 1); - std::atomic testInjectionCount{0}; - std::atomic arbitrationStarted{false}; + std::atomic_int testInjectionCount{0}; + std::atomic_bool arbitrationStarted{false}; SCOPED_TESTVALUE_SET( "facebook::velox::memory::SharedArbitrator::startArbitration", - std::function( - ([&](const MemoryPool* /*unsed*/) { + std::function( + ([&](const SharedArbitrator* /*unused*/) { if (!arbitrationStarted) { return; } @@ -1407,7 +2724,7 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, failedToReclaimFromRequestor) { }))); SCOPED_TESTVALUE_SET( - "facebook::velox::memory::SharedArbitrator::sortCandidatesByFreeCapacity", + "facebook::velox::memory::SharedArbitrator::sortCandidatesByReclaimableFreeCapacity", std::function*)>( ([&](const std::vector* /*unused*/) { if (!arbitrationStarted.exchange(true)) { @@ -1427,7 +2744,8 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, failedToReclaimFromRequestor) { arbitrationStartBarrier.wait().wait(); if (i < numFailedTaskAllocationsAfterAbort) { VELOX_ASSERT_THROW( - failedTaskOp->allocate(failedTaskMemoryCapacity), ""); + failedTaskOp->allocate(failedTaskMemoryCapacity), + "The requestor pool has been aborted"); } else { otherTaskOps[i - numFailedTaskAllocationsAfterAbort]->allocate( otherTaskMemoryCapacity); @@ -1436,14 +2754,16 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, failedToReclaimFromRequestor) { } // Trigger memory arbitration to reclaim from itself which throws. 
- VELOX_ASSERT_THROW(failedTaskOp->allocate(failedTaskMemoryCapacity), ""); + VELOX_ASSERT_THROW( + failedTaskOp->allocate(failedTaskMemoryCapacity), + "The requestor pool has been aborted"); // Wait for all the allocation threads to complete. for (auto& allocationThread : allocationThreadsAfterAbort) { allocationThread.join(); } ASSERT_TRUE(failedTaskOp->pool()->aborted()); ASSERT_EQ( - failedTaskOp->pool()->currentBytes(), + failedTaskOp->pool()->usedBytes(), testData.expectedFailedTaskMemoryCapacity); ASSERT_EQ( failedTaskOp->pool()->capacity(), @@ -1468,8 +2788,7 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, failedToReclaimFromRequestor) { ASSERT_EQ( taskOp->pool()->capacity(), testData.expectedOtherTaskMemoryCapacity); ASSERT_EQ( - taskOp->pool()->currentBytes(), - testData.expectedOtherTaskMemoryUsage); + taskOp->pool()->usedBytes(), testData.expectedOtherTaskMemoryUsage); } VELOX_ASSERT_THROW(failedTaskOp->allocate(failedTaskMemoryCapacity), ""); @@ -1534,7 +2853,7 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, failedToReclaimFromOtherTask) { nonFailTaskMemoryCapacity}}; for (const auto& testData : testSettings) { SCOPED_TRACE(testData.debugString()); - setupMemory(); + setupMemory(kMemoryCapacity, 0, kMemoryPoolInitCapacity, 0, 0, 0, 0, 0, 0); std::vector> nonFailedTasks; std::vector nonFailedTaskOps; @@ -1543,7 +2862,7 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, failedToReclaimFromOtherTask) { nonFailedTaskOps.push_back(addMemoryOp(nonFailedTasks.back(), false)); nonFailedTaskOps.back()->allocate(nonFailTaskMemoryCapacity); ASSERT_EQ( - nonFailedTasks.back()->pool()->currentBytes(), + nonFailedTasks.back()->pool()->usedBytes(), nonFailTaskMemoryCapacity); } std::shared_ptr failedTask = addTask(); @@ -1582,8 +2901,8 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, failedToReclaimFromOtherTask) { std::atomic arbitrationStarted{false}; SCOPED_TESTVALUE_SET( "facebook::velox::memory::SharedArbitrator::startArbitration", - std::function( - ([&](const MemoryPool* /*unsed*/) { + std::function( + ([&](const SharedArbitrator* /*unsed*/) { if (!arbitrationStarted) { return; } @@ -1594,7 +2913,7 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, failedToReclaimFromOtherTask) { }))); SCOPED_TESTVALUE_SET( - "facebook::velox::memory::SharedArbitrator::sortCandidatesByFreeCapacity", + "facebook::velox::memory::SharedArbitrator::sortCandidatesByReclaimableFreeCapacity", std::function*)>( ([&](const std::vector* /*unused*/) { if (!arbitrationStarted.exchange(true)) { @@ -1630,7 +2949,7 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, failedToReclaimFromOtherTask) { } ASSERT_TRUE(failedTaskOp->pool()->aborted()); ASSERT_EQ( - failedTaskOp->pool()->currentBytes(), + failedTaskOp->pool()->usedBytes(), testData.expectedFailedTaskMemoryCapacity); ASSERT_EQ( failedTaskOp->pool()->capacity(), @@ -1659,7 +2978,7 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, failedToReclaimFromOtherTask) { testData.expectedNonFailedTaskMemoryCapacity + nonFailTaskMemoryCapacity); ASSERT_EQ( - taskOp->pool()->currentBytes(), + taskOp->pool()->usedBytes(), testData.expectedNonFailedTaskMemoryUsage + nonFailTaskMemoryCapacity); } else { @@ -1667,7 +2986,7 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, failedToReclaimFromOtherTask) { taskOp->pool()->capacity(), testData.expectedNonFailedTaskMemoryCapacity); ASSERT_EQ( - taskOp->pool()->currentBytes(), + taskOp->pool()->usedBytes(), testData.expectedNonFailedTaskMemoryUsage); } } @@ -1679,6 +2998,16 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, 
failedToReclaimFromOtherTask) {
 }
 
 TEST_F(MockSharedArbitrationTest, memoryPoolAbortThrow) {
+  setupMemory(
+      kMemoryCapacity,
+      0,
+      kMemoryPoolInitCapacity,
+      0,
+      0,
+      kFastExponentialGrowthCapacityLimit,
+      kSlowCapacityGrowPct,
+      0,
+      0);
   const int numTasks = 4;
   const int smallTaskMemoryCapacity = kMemoryCapacity / 8;
   const int largeTaskMemoryCapacity = kMemoryCapacity / 2;
@@ -1700,7 +3029,9 @@ TEST_F(MockSharedArbitrationTest, memoryPoolAbortThrow) {
   ASSERT_EQ(oldStats.numAborted, 0);
 
   // Trigger memory arbitration to reclaim from itself which throws.
-  VELOX_ASSERT_THROW(largeTaskOp->allocate(largeTaskMemoryCapacity), "");
+  VELOX_ASSERT_THROW(
+      largeTaskOp->allocate(largeTaskMemoryCapacity),
+      "The requestor pool has been aborted");
   const auto newStats = arbitrator_->stats();
   ASSERT_EQ(newStats.numRequests, oldStats.numRequests + 1);
   ASSERT_EQ(newStats.numAborted, 1);
@@ -1720,10 +3051,44 @@ TEST_F(MockSharedArbitrationTest, memoryPoolAbortThrow) {
   ASSERT_EQ(arbitrator_->stats().numAborted, 1);
 }
 
+// This test makes sure the memory capacity grows as expected when two
+// arbitration requests run concurrently.
+DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, concurrentArbitrationRequests) {
+  setupMemory(kMemoryCapacity, 0, 0, 0, 128 << 20);
+  std::shared_ptr<MockTask> task = addTask();
+  MockMemoryOperator* op1 = addMemoryOp(task);
+  MockMemoryOperator* op2 = addMemoryOp(task);
+
+  std::atomic_bool arbitrationWaitFlag{true};
+  folly::EventCount arbitrationWait;
+  std::atomic_bool injectOnce{true};
+  SCOPED_TESTVALUE_SET(
+      "facebook::velox::memory::SharedArbitrator::startArbitration",
+      std::function<void(const SharedArbitrator*)>(
+          ([&](const SharedArbitrator* arbitrator) {
+            if (!injectOnce.exchange(false)) {
+              return;
+            }
+            arbitrationWaitFlag = false;
+            arbitrationWait.notifyAll();
+            while (arbitrator->testingNumRequests() != 2) {
+              std::this_thread::sleep_for(std::chrono::seconds(5)); // NOLINT
+            }
+          })));
+
+  std::thread firstArbitrationThread([&]() { op1->allocate(64 << 20); });
+
+  std::thread secondArbitrationThread([&]() { op2->allocate(64 << 20); });
+
+  firstArbitrationThread.join();
+  secondArbitrationThread.join();
+
+  ASSERT_EQ(task->capacity(), 128 << 20);
+}
+
 DEBUG_ONLY_TEST_F(
     MockSharedArbitrationTest,
     freeUnusedCapacityWhenReclaimMemoryPool) {
-  setupMemory(kMemoryCapacity, 0);
+  setupMemory(kMemoryCapacity, 0, 0, 0);
   const int allocationSize = kMemoryCapacity / 4;
   std::shared_ptr<MockTask> reclaimedTask = addTask();
   MockMemoryOperator* reclaimedTaskOp = addMemoryOp(reclaimedTask);
@@ -1738,7 +3103,7 @@ DEBUG_ONLY_TEST_F(
   folly::EventCount reclaimBlock;
   auto reclaimBlockKey = reclaimBlock.prepareWait();
   SCOPED_TESTVALUE_SET(
-      "facebook::velox::memory::SharedArbitrator::sortCandidatesByReclaimableMemory",
+      "facebook::velox::memory::SharedArbitrator::sortCandidatesByReclaimableUsedCapacity",
       std::function<void(const MemoryPool*)>(([&](const MemoryPool* /*unsed*/) {
         reclaimWait.notify();
         reclaimBlock.wait(reclaimBlockKey);
@@ -1780,10 +3145,11 @@ DEBUG_ONLY_TEST_F(
 
   SCOPED_TESTVALUE_SET(
       "facebook::velox::memory::SharedArbitrator::startArbitration",
-      std::function<void(const MemoryPool*)>(([&](const MemoryPool* /*unsed*/) {
-        arbitrationRun.notify();
-        arbitrationBlock.wait(arbitrationBlockKey);
-      })));
+      std::function<void(const SharedArbitrator*)>(
+          ([&](const SharedArbitrator* /*unused*/) {
+            arbitrationRun.notify();
+            arbitrationBlock.wait(arbitrationBlockKey);
+          })));
 
   std::thread allocThread([&]() {
     // Allocate more than its capacity to trigger arbitration which is blocked
@@ -1793,11 +3159,11 @@ DEBUG_ONLY_TEST_F(
 
   arbitrationRun.wait(arbitrationRunKey);
 
-  // Allocate a new root memory pool and check its initial memory reservation is
-  // zero.
+  // Allocate a new root memory pool and check it has its initial capacity
+  // allocated.
   std::shared_ptr<MockTask> skipTask = addTask(kMemoryCapacity);
   MockMemoryOperator* skipTaskOp = addMemoryOp(skipTask);
-  ASSERT_EQ(skipTaskOp->pool()->capacity(), 0);
+  ASSERT_EQ(skipTaskOp->pool()->capacity(), kMemoryPoolInitCapacity);
   arbitrationBlock.notify();
   allocThread.join();
@@ -1806,7 +3172,6 @@ DEBUG_ONLY_TEST_F(
 
 TEST_F(MockSharedArbitrationTest, arbitrationFailure) {
   int64_t maxCapacity = 128 * MB;
   int64_t initialCapacity = 0 * MB;
-  int64_t minTransferCapacity = 1 * MB;
   struct {
     int64_t requestorCapacity;
     int64_t requestorRequestBytes;
@@ -1832,7 +3197,7 @@ TEST_F(MockSharedArbitrationTest, arbitrationFailure) {
   for (const auto& testData : testSettings) {
     SCOPED_TRACE(testData.debugString());
-    setupMemory(maxCapacity, initialCapacity, minTransferCapacity);
+    setupMemory(maxCapacity, 0, initialCapacity, 0);
     std::shared_ptr<MockTask> requestorTask = addTask();
     MockMemoryOperator* requestorOp = addMemoryOp(requestorTask, false);
     requestorOp->allocate(testData.requestorCapacity);
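The changes that follow leave the mock arbitrator behind and exercise arbitration end to end, through real query plans. The recurring pattern, condensed below, is a hypothetical helper assuming the HiveConnectorTestBase fixtures the diff itself relies on (duckDbQueryRunner_, fuzzed input vectors, and a queryCtx whose root pool competes for the arbitrator's capacity):

```
// Hypothetical helper condensing the query-level arbitration test pattern;
// names like DuckDbQueryRunner and AssertQueryBuilder come from the Velox
// test utilities referenced in the diff below.
void runSpillingAggregation(
    const std::shared_ptr<core::QueryCtx>& queryCtx,
    const std::vector<RowVectorPtr>& vectors,
    DuckDbQueryRunner& duckDbQueryRunner) {
  // Spilling needs a spill directory and the spill config flag enabled.
  const auto spillDirectory = exec::test::TempDirectoryPath::create();
  AssertQueryBuilder(duckDbQueryRunner)
      .queryCtx(queryCtx)
      .spillDirectory(spillDirectory->getPath())
      .config(core::QueryConfig::kSpillEnabled, "true")
      .plan(PlanBuilder()
                .values(vectors)
                .singleAggregation({"c0", "c1"}, {"array_agg(c2)"})
                .planNode())
      // Results are cross-checked against DuckDB running the same SQL.
      .assertResults("SELECT c0, c1, array_agg(c2) FROM tmp GROUP BY c0, c1");
}
```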
diff --git a/velox/common/memory/tests/SharedArbitratorTest.cpp b/velox/common/memory/tests/SharedArbitratorTest.cpp
index 3ffde0fc0a0d8..512a731e7d0fd 100644
--- a/velox/common/memory/tests/SharedArbitratorTest.cpp
+++ b/velox/common/memory/tests/SharedArbitratorTest.cpp
@@ -16,28 +16,29 @@
 
 #include
-#include
 #include
 #include
+#include
+#include
+#include
 #include "folly/experimental/EventCount.h"
-#include "folly/futures/Barrier.h"
+#include "velox/common/base/Exceptions.h"
 #include "velox/common/base/tests/GTestUtils.h"
 #include "velox/common/memory/MallocAllocator.h"
-#include "velox/common/memory/Memory.h"
 #include "velox/common/memory/SharedArbitrator.h"
 #include "velox/common/testutil/TestValue.h"
-#include "velox/connectors/hive/HiveDataSink.h"
+#include "velox/connectors/hive/HiveConfig.h"
 #include "velox/core/PlanNode.h"
+#include "velox/dwio/dwrf/writer/Writer.h"
 #include "velox/exec/Driver.h"
-#include "velox/exec/HashBuild.h"
+#include "velox/exec/HashAggregation.h"
+#include "velox/exec/PlanNodeStats.h"
 #include "velox/exec/TableWriter.h"
 #include "velox/exec/Values.h"
-#include "velox/exec/tests/utils/AssertQueryBuilder.h"
+#include "velox/exec/tests/utils/ArbitratorTestUtil.h"
 #include "velox/exec/tests/utils/HiveConnectorTestBase.h"
-#include "velox/exec/tests/utils/PlanBuilder.h"
-#include "velox/exec/tests/utils/TempDirectoryPath.h"
-#include "velox/vector/fuzzer/VectorFuzzer.h"
+#include "velox/exec/tests/utils/SumNonPODAggregate.h"
 
 DECLARE_bool(velox_memory_leak_check_enabled);
 DECLARE_bool(velox_suppress_memory_capacity_exceeding_error_message);
 
@@ -48,33 +49,6 @@ using namespace facebook::velox::exec;
 using namespace facebook::velox::exec::test;
 
 namespace facebook::velox::memory {
-constexpr int64_t KB = 1024L;
-constexpr int64_t MB = 1024L * KB;
-
-constexpr uint64_t kMemoryCapacity = 512 * MB;
-constexpr uint64_t kMemoryPoolInitCapacity = 16 * MB;
-constexpr uint64_t kMemoryPoolTransferCapacity = 8 * MB;
-
-struct TestAllocation {
-  MemoryPool* pool{nullptr};
-  void* buffer{nullptr};
-  size_t size{0};
-
-  size_t free() {
-    const size_t freedBytes = size;
-    if (pool == nullptr) {
-      VELOX_CHECK_EQ(freedBytes, 0);
-      return freedBytes;
-    }
-    VELOX_CHECK_GT(freedBytes, 0);
-    pool->free(buffer, freedBytes);
-    pool = nullptr;
-    buffer = nullptr;
-    size = 0;
-    return freedBytes;
-  }
-};
-
 // Custom node for the custom factory.
class FakeMemoryNode : public core::PlanNode { public: @@ -101,8 +75,10 @@ class FakeMemoryNode : public core::PlanNode { using AllocationCallback = std::function; // If return true, the caller will terminate execution and return early. -using ReclaimInjectionCallback = std::function< - bool(MemoryPool* pool, uint64_t targetByte, MemoryReclaimer::Stats& stats)>; +using ReclaimInjectionCallback = std::function; // Custom operator for the custom factory. class FakeMemoryOperator : public Operator { @@ -168,7 +144,7 @@ class FakeMemoryOperator : public Operator { override { VELOX_CHECK(canReclaim()); auto* driver = operatorCtx_->driver(); - VELOX_CHECK(!driver->state().isOnThread() || driver->state().isSuspended); + VELOX_CHECK(!driver->state().isOnThread() || driver->state().suspended()); VELOX_CHECK(driver->task()->pauseRequested()); VELOX_CHECK_GT(targetBytes, 0); @@ -185,17 +161,15 @@ class FakeMemoryOperator : public Operator { pool()->free(allocIt->buffer, allocIt->size); allocIt = allocations_.erase(allocIt); } - VELOX_CHECK_GE(totalBytes_, 0); } private: void clear() { for (auto& allocation : allocations_) { totalBytes_ -= allocation.free(); - VELOX_CHECK_GE(totalBytes_, 0); } allocations_.clear(); - VELOX_CHECK_EQ(totalBytes_, 0); + VELOX_CHECK_EQ(totalBytes_.load(), 0); } const bool canReclaim_; @@ -252,38 +226,19 @@ class FakeMemoryOperatorFactory : public Operator::PlanNodeTranslator { uint32_t maxDrivers_{1}; }; -class FakeMemoryReclaimer : public MemoryReclaimer { - public: - FakeMemoryReclaimer() = default; - - static std::unique_ptr create() { - return std::make_unique(); - } - - void enterArbitration() override { - auto* driverThreadCtx = driverThreadContext(); - if (driverThreadCtx == nullptr) { - return; - } - auto* driver = driverThreadCtx->driverCtx.driver; - ASSERT_TRUE(driver != nullptr); - if (driver->task()->enterSuspended(driver->state()) != StopReason::kNone) { - VELOX_FAIL("Terminate detected when entering suspension"); - } - } +namespace { +std::unique_ptr newParallelExecutor() { + return std::make_unique(32); +} - void leaveArbitration() noexcept override { - auto* driverThreadCtx = driverThreadContext(); - if (driverThreadCtx == nullptr) { - return; - } - auto* driver = driverThreadCtx->driverCtx.driver; - ASSERT_TRUE(driver != nullptr); - driver->task()->leaveSuspended(driver->state()); - } +struct TestParam { + bool isSerialExecutionMode{false}; }; +} // namespace -class SharedArbitrationTest : public exec::test::HiveConnectorTestBase { +class SharedArbitrationTest : public testing::WithParamInterface, + public exec::test::HiveConnectorTestBase { + public: protected: static void SetUpTestCase() { exec::test::HiveConnectorTestBase::SetUpTestCase(); @@ -308,339 +263,275 @@ class SharedArbitrationTest : public exec::test::HiveConnectorTestBase { fuzzerOpts_.stringVariableLength = false; fuzzerOpts_.stringLength = 1024; fuzzerOpts_.allowLazyVector = false; - VectorFuzzer fuzzer(fuzzerOpts_, pool()); - vector_ = newVector(); - executor_ = std::make_unique(32); + vector_ = makeRowVector(rowType_, fuzzerOpts_); + numAddedPools_ = 0; + isSerialExecutionMode_ = GetParam().isSerialExecutionMode; + if (isSerialExecutionMode_) { + executor_ = nullptr; + } else { + executor_ = newParallelExecutor(); + } } void TearDown() override { + vector_.reset(); HiveConnectorTestBase::TearDown(); } void setupMemory( int64_t memoryCapacity = 0, - uint64_t memoryPoolInitCapacity = kMemoryPoolInitCapacity, - uint64_t memoryPoolTransferCapacity = kMemoryPoolTransferCapacity) { + uint64_t 
memoryPoolInitCapacity = kMemoryPoolInitCapacity) { memoryCapacity = (memoryCapacity != 0) ? memoryCapacity : kMemoryCapacity; - allocator_ = std::make_shared(memoryCapacity); - MemoryManagerOptions options; - options.allocator = allocator_.get(); - options.capacity = allocator_->capacity(); - options.arbitratorKind = "SHARED"; - options.capacity = options.capacity; - options.memoryPoolInitCapacity = memoryPoolInitCapacity; - options.memoryPoolTransferCapacity = memoryPoolTransferCapacity; - options.checkUsageLeak = true; - options.arbitrationStateCheckCb = memoryArbitrationStateCheck; - memoryManager_ = std::make_unique(options); + memoryManager_ = + createMemoryManager(memoryCapacity, memoryPoolInitCapacity); ASSERT_EQ(memoryManager_->arbitrator()->kind(), "SHARED"); arbitrator_ = static_cast(memoryManager_->arbitrator()); + numAddedPools_ = 0; + } + + void checkOperatorStatsForArbitration( + PlanNodeStats& stats, + bool expectGlobalArbitration) { + if (expectGlobalArbitration) { + VELOX_CHECK_EQ( + stats.customStats.count(SharedArbitrator::kGlobalArbitrationCount), + 1); + VELOX_CHECK_GE( + stats.customStats.at(SharedArbitrator::kGlobalArbitrationCount).sum, + 1); + VELOX_CHECK_EQ( + stats.customStats.count(SharedArbitrator::kLocalArbitrationCount), 0); + } else { + VELOX_CHECK_EQ( + stats.customStats.count(SharedArbitrator::kLocalArbitrationCount), 1); + VELOX_CHECK_EQ( + stats.customStats.at(SharedArbitrator::kLocalArbitrationCount).sum, + 1); + VELOX_CHECK_EQ( + stats.customStats.count(SharedArbitrator::kGlobalArbitrationCount), + 0); + } } - RowVectorPtr newVector() { - VectorFuzzer fuzzer(fuzzerOpts_, pool()); - return fuzzer.fuzzRow(rowType_); + AssertQueryBuilder newQueryBuilder() { + AssertQueryBuilder builder = AssertQueryBuilder(duckDbQueryRunner_); + builder.serialExecution(isSerialExecutionMode_); + return builder; } - std::shared_ptr newQueryCtx( - int64_t memoryCapacity = kMaxMemory, - std::unique_ptr&& reclaimer = nullptr) { - std::unordered_map> configs; - std::shared_ptr pool = memoryManager_->addRootPool( - "", - memoryCapacity, - reclaimer != nullptr ? std::move(reclaimer) - : MemoryReclaimer::create()); - auto queryCtx = std::make_shared( - executor_.get(), - core::QueryConfig({}), - configs, - cache::AsyncDataCache::getInstance(), - std::move(pool)); - return queryCtx; + AssertQueryBuilder newQueryBuilder(const core::PlanNodePtr& plan) { + AssertQueryBuilder builder = AssertQueryBuilder(plan); + builder.serialExecution(isSerialExecutionMode_); + return builder; } static inline FakeMemoryOperatorFactory* fakeOperatorFactory_; - std::shared_ptr allocator_; - std::unique_ptr memoryManager_; - SharedArbitrator* arbitrator_; + std::unique_ptr memoryManager_; + SharedArbitrator* arbitrator_{nullptr}; RowTypePtr rowType_; VectorFuzzer::Options fuzzerOpts_; RowVectorPtr vector_; - std::unique_ptr executor_; + std::atomic_uint64_t numAddedPools_{0}; + bool isSerialExecutionMode_{false}; }; -DEBUG_ONLY_TEST_F(SharedArbitrationTest, reclaimFromOrderBy) { - const int numVectors = 32; - std::vector vectors; - for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); - } +/// A test fixture that runs cases within parallel execution mode. +class SharedArbitrationTestWithParallelExecutionModeOnly + : public SharedArbitrationTest {}; +/// A test fixture that runs cases within both serial and +/// parallel execution modes. 
+class SharedArbitrationTestWithThreadingModes : public SharedArbitrationTest {}; + +DEBUG_ONLY_TEST_P( + SharedArbitrationTestWithThreadingModes, + queryArbitrationStateCheck) { + const std::vector vectors = + createVectors(rowType_, 32, 32 << 20); createDuckDbTable(vectors); - std::vector sameQueries = {false, true}; - for (bool sameQuery : sameQueries) { - SCOPED_TRACE(fmt::format("sameQuery {}", sameQuery)); - const auto spillDirectory = exec::test::TempDirectoryPath::create(); - std::shared_ptr fakeMemoryQueryCtx = - newQueryCtx(kMemoryCapacity); - std::shared_ptr orderByQueryCtx; - if (sameQuery) { - orderByQueryCtx = fakeMemoryQueryCtx; - } else { - orderByQueryCtx = newQueryCtx(kMemoryCapacity); - } - - folly::EventCount fakeAllocationWait; - auto fakeAllocationWaitKey = fakeAllocationWait.prepareWait(); - folly::EventCount taskPauseWait; - auto taskPauseWaitKey = taskPauseWait.prepareWait(); + std::shared_ptr queryCtx = + newQueryCtx(memory::memoryManager(), executor_.get(), kMemoryCapacity); - const auto orderByMemoryUsage = 32L << 20; - const auto fakeAllocationSize = kMemoryCapacity - orderByMemoryUsage / 2; - - std::atomic injectAllocationOnce{true}; - fakeOperatorFactory_->setAllocationCallback([&](Operator* op) { - if (!injectAllocationOnce.exchange(false)) { - return TestAllocation{}; - } - fakeAllocationWait.wait(fakeAllocationWaitKey); - auto buffer = op->pool()->allocate(fakeAllocationSize); - return TestAllocation{op->pool(), buffer, fakeAllocationSize}; - }); + std::atomic_bool queryCtxStateChecked{false}; + SCOPED_TESTVALUE_SET( + "facebook::velox::exec::Task::requestPauseLocked", + std::function(([&](Task* /*unused*/) { + ASSERT_TRUE(queryCtx->testingUnderArbitration()); + queryCtxStateChecked = true; + }))); - std::atomic injectOrderByOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal::addInput", - std::function(([&](Operator* op) { - if (op->operatorType() != "OrderBy") { - return; - } - if (op->pool()->capacity() < orderByMemoryUsage) { - return; - } - if (!injectOrderByOnce.exchange(false)) { - return; - } - fakeAllocationWait.notify(); - // Wait for pause to be triggered. 
- taskPauseWait.wait(taskPauseWaitKey); - }))); + const auto spillDirectory = exec::test::TempDirectoryPath::create(); + TestScopedSpillInjection scopedSpillInjection(100); + core::PlanNodeId aggregationNodeId; + newQueryBuilder() + .queryCtx(queryCtx) + .spillDirectory(spillDirectory->getPath()) + .config(core::QueryConfig::kSpillEnabled, "true") + .plan(PlanBuilder() + .values(vectors) + .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) + .capturePlanNodeId(aggregationNodeId) + .planNode()) + .assertResults("SELECT c0, c1, array_agg(c2) FROM tmp GROUP BY c0, c1"); + ASSERT_TRUE(queryCtxStateChecked); + ASSERT_FALSE(queryCtx->testingUnderArbitration()); + waitForAllTasksToBeDeleted(); + ASSERT_FALSE(queryCtx->testingUnderArbitration()); +} - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Task::requestPauseLocked", - std::function( - ([&](Task* /*unused*/) { taskPauseWait.notify(); }))); +DEBUG_ONLY_TEST_P( + SharedArbitrationTestWithThreadingModes, + raceBetweenAbortAndArbitrationLeave) { + const std::vector vectors = + createVectors(rowType_, 32, 32 << 20); + setupMemory(kMemoryCapacity, /*memoryPoolInitCapacity=*/0); + std::shared_ptr queryCtx = + newQueryCtx(memoryManager_.get(), executor_.get(), 32 << 20); + + folly::EventCount abortWait; + std::atomic_bool abortWaitFlag{true}; + std::atomic task{nullptr}; + const std::string errorMsg{"injected abort error"}; + SCOPED_TESTVALUE_SET( + "facebook::velox::exec::Task::leaveSuspended", + std::function(([&](exec::Task* _task) { + if (task.exchange(_task) != nullptr) { + return; + } + abortWaitFlag = false; + abortWait.notifyAll(); + // Let memory pool abort thread to run first. We inject a randomized + // delay here to trigger all the potential timing race conditions but + // the test result should be the same. + std::this_thread::sleep_for( + std::chrono::milliseconds(folly::Random::rand32() % 1'000)); + }))); - std::thread orderByThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .spillDirectory(spillDirectory->path) - .config(core::QueryConfig::kSpillEnabled, "true") - .config(core::QueryConfig::kOrderBySpillEnabled, "true") - .queryCtx(orderByQueryCtx) - .plan( - PlanBuilder() - .values(vectors) - .orderBy({fmt::format("{} ASC NULLS LAST", "c0")}, false) - .planNode()) - .assertResults("SELECT * FROM tmp ORDER BY c0 ASC NULLS LAST"); - auto stats = task->taskStats().pipelineStats; - ASSERT_GT(stats[0].operatorStats[1].spilledBytes, 0); - }); + std::thread queryThread([&] { + const auto spillDirectory = exec::test::TempDirectoryPath::create(); + core::PlanNodeId aggregationNodeId; + auto plan = PlanBuilder() + .values(vectors) + .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) + .capturePlanNodeId(aggregationNodeId) + .planNode(); + VELOX_ASSERT_THROW( + newQueryBuilder(plan) + .queryCtx(queryCtx) + .spillDirectory(spillDirectory->getPath()) + .config(core::QueryConfig::kSpillEnabled, "true") + .copyResults(pool()), + errorMsg); + }); - std::thread memThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(fakeMemoryQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .addNode([&](std::string id, core::PlanNodePtr input) { - return std::make_shared(id, input); - }) - .planNode()) - .assertResults("SELECT * FROM tmp"); - }); + abortWait.await([&] { return !abortWaitFlag.load(); }); - orderByThread.join(); - memThread.join(); - waitForAllTasksToBeDeleted(); + try { + VELOX_FAIL(errorMsg); + } catch (...) 
{ + task.load()->pool()->abort(std::current_exception()); } + queryThread.join(); + waitForAllTasksToBeDeleted(); } -DEBUG_ONLY_TEST_F(SharedArbitrationTest, reclaimFromEmptyOrderBy) { - const int numVectors = 32; - std::vector vectors; - for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); - } - createDuckDbTable(vectors); - - const auto spillDirectory = exec::test::TempDirectoryPath::create(); - std::shared_ptr orderByQueryCtx = - newQueryCtx(kMemoryCapacity); - - folly::EventCount fakeAllocationWait; - auto fakeAllocationWaitKey = fakeAllocationWait.prepareWait(); - folly::EventCount taskPauseWait; - auto taskPauseWaitKey = taskPauseWait.prepareWait(); - - std::atomic injectAllocations{0}; - fakeOperatorFactory_->setAllocationCallback([&](Operator* op) { - const auto injectionCount = ++injectAllocations; - if (injectionCount > 2) { - return TestAllocation{}; - } - if (injectionCount == 1) { - return TestAllocation{ - op->pool(), - op->pool()->allocate(kMemoryCapacity / 2), - kMemoryCapacity / 2}; - } - fakeAllocationWait.wait(fakeAllocationWaitKey); - EXPECT_ANY_THROW(op->pool()->allocate(kMemoryCapacity)); - return TestAllocation{}; - }); - fakeOperatorFactory_->setCanReclaim(false); - - core::PlanNodeId orderByPlanNodeId; - auto orderByPlan = - PlanBuilder() - .values(vectors) - .orderBy({fmt::format("{} ASC NULLS LAST", "c0")}, false) - .capturePlanNodeId(orderByPlanNodeId) - .planNode(); - - std::atomic injectDriverBlockOnce{true}; +DEBUG_ONLY_TEST_P( + SharedArbitrationTestWithThreadingModes, + skipNonReclaimableTaskTest) { + const std::vector vectors = + createVectors(rowType_, 32, 32 << 20); + std::shared_ptr queryCtx = + newQueryCtx(memory::memoryManager(), executor_.get(), kMemoryCapacity); + std::unordered_map configs; + configs.emplace(core::QueryConfig::kSpillEnabled, "true"); + queryCtx->testingOverrideConfigUnsafe(std::move(configs)); + + std::atomic_bool blockedAggregation{false}; + std::atomic_bool blockedPartialAggregation{false}; + folly::EventCount arbitrationWait; + std::atomic arbitrationWaitFlag{true}; SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal", - std::function(([&](Driver* driver) { - Operator* op = driver->findOperator(orderByPlanNodeId); - if (op == nullptr) { + "facebook::velox::exec::Driver::runInternal::addInput", + std::function(([&](exec::Operator* op) { + if (op->testingOperatorCtx()->operatorType() != "Aggregation" && + op->testingOperatorCtx()->operatorType() != "PartialAggregation") { return; } - if (op->operatorType() != "OrderBy") { + if (op->pool()->usedBytes() == 0) { return; } - if (!injectDriverBlockOnce.exchange(false)) { - return; + if (op->testingOperatorCtx()->operatorType() == "PartialAggregation") { + if (blockedPartialAggregation.exchange(true)) { + return; + } + } else { + if (blockedAggregation.exchange(true)) { + return; + } } - fakeAllocationWait.notify(); - // Wait for pause to be triggered. 
- taskPauseWait.wait(taskPauseWaitKey); + auto* driver = op->testingOperatorCtx()->driver(); + SuspendedSection suspendedSection(driver); + arbitrationWait.await([&]() { return !arbitrationWaitFlag.load(); }); }))); + std::atomic_int taskPausedCount{0}; SCOPED_TESTVALUE_SET( "facebook::velox::exec::Task::requestPauseLocked", - std::function( - ([&](Task* /*unused*/) { taskPauseWait.notify(); }))); + std::function(([&](Task* /*unused*/) { + ASSERT_TRUE(queryCtx->testingUnderArbitration()); + ++taskPausedCount; + }))); - std::thread orderByThread([&]() { - std::shared_ptr task = - AssertQueryBuilder(duckDbQueryRunner_) - .spillDirectory(spillDirectory->path) - .maxDrivers(1) - .config(core::QueryConfig::kSpillEnabled, "true") - .config(core::QueryConfig::kOrderBySpillEnabled, "true") - .queryCtx(orderByQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .orderBy({fmt::format("{} ASC NULLS LAST", "c0")}, false) - .planNode()) - .assertResults("SELECT * FROM tmp ORDER BY c0 ASC NULLS LAST"); - // Verify no spill has been triggered. - const auto stats = task->taskStats().pipelineStats; - ASSERT_EQ(stats[0].operatorStats[1].spilledBytes, 0); - ASSERT_EQ(stats[0].operatorStats[1].spilledPartitions, 0); + const auto spillPlan = PlanBuilder() + .values(vectors) + .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) + .planNode(); + std::thread spillableThread([&]() { + const auto spillDirectory = exec::test::TempDirectoryPath::create(); + newQueryBuilder(spillPlan) + .queryCtx(queryCtx) + .spillDirectory(spillDirectory->getPath()) + .copyResults(pool()); }); - std::thread memThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(orderByQueryCtx) - .maxDrivers(1) - .plan(PlanBuilder() - .values(vectors) - .addNode([&](std::string id, core::PlanNodePtr input) { - return std::make_shared(id, input); - }) - .planNode()) - .assertResults("SELECT * FROM tmp"); + const auto nonSpillPlan = PlanBuilder() + .values(vectors) + .aggregation( + {"c0", "c1"}, + {"array_agg(c2)"}, + {}, + core::AggregationNode::Step::kPartial, + false) + .planNode(); + std::thread nonSpillableThread([&]() { + newQueryBuilder(nonSpillPlan).queryCtx(queryCtx).copyResults(pool()); }); - orderByThread.join(); - memThread.join(); - waitForAllTasksToBeDeleted(); -} - -class TestMemoryReclaimer : public MemoryReclaimer { - public: - TestMemoryReclaimer(std::function reclaimCb) - : reclaimCb_(std::move(reclaimCb)) {} - - uint64_t reclaim(MemoryPool* pool, uint64_t targetBytes, Stats& stats) - override { - if (pool->kind() == MemoryPool::Kind::kLeaf) { - return 0; - } - std::vector> children; - { - children.reserve(pool->children_.size()); - for (auto& entry : pool->children_) { - auto child = entry.second.lock(); - if (child != nullptr) { - children.push_back(std::move(child)); - } - } - } + while (!blockedPartialAggregation || !blockedAggregation) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } - std::vector candidates(children.size()); - for (uint32_t i = 0; i < children.size(); ++i) { - candidates[i].pool = children[i].get(); - children[i]->reclaimableBytes(candidates[i].reclaimableBytes); - } - sortCandidatesByReclaimableMemoryAsc(candidates); + testingRunArbitration(); - uint64_t reclaimedBytes{0}; - for (const auto& candidate : candidates) { - const auto bytes = candidate.pool->reclaim(targetBytes, stats); - if (reclaimCb_ != nullptr) { - reclaimCb_(candidate.pool); - } - reclaimedBytes += bytes; - if (targetBytes != 0) { - if (bytes >= targetBytes) { - break; - } - targetBytes -= 
bytes; - } - } - return reclaimedBytes; - } + arbitrationWaitFlag = false; + arbitrationWait.notifyAll(); - private: - struct ArbitrationCandidate { - uint64_t reclaimableBytes{0}; - MemoryPool* pool{nullptr}; - }; - - void sortCandidatesByReclaimableMemoryAsc( - std::vector& candidates) { - std::sort( - candidates.begin(), - candidates.end(), - [](const ArbitrationCandidate& lhs, const ArbitrationCandidate& rhs) { - return lhs.reclaimableBytes < rhs.reclaimableBytes; - }); - } + spillableThread.join(); + nonSpillableThread.join(); - std::function reclaimCb_{nullptr}; -}; + // We shall only reclaim from the reclaimable task but not non-reclaimable + // task. + ASSERT_EQ(taskPausedCount, 1); + ASSERT_FALSE(queryCtx->testingUnderArbitration()); + waitForAllTasksToBeDeleted(); + ASSERT_FALSE(queryCtx->testingUnderArbitration()); + ASSERT_EQ(taskPausedCount, 1); +} -DEBUG_ONLY_TEST_F(SharedArbitrationTest, reclaimToOrderBy) { +DEBUG_ONLY_TEST_P(SharedArbitrationTestWithThreadingModes, reclaimToOrderBy) { const int numVectors = 32; std::vector vectors; for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); + vectors.push_back(makeRowVector(rowType_, fuzzerOpts_)); } createDuckDbTable(vectors); std::vector sameQueries = {false, true}; @@ -648,12 +539,15 @@ DEBUG_ONLY_TEST_F(SharedArbitrationTest, reclaimToOrderBy) { SCOPED_TRACE(fmt::format("sameQuery {}", sameQuery)); const auto oldStats = arbitrator_->stats(); std::shared_ptr fakeMemoryQueryCtx = - newQueryCtx(kMemoryCapacity); + newQueryCtx(memoryManager_.get(), executor_.get(), kMemoryCapacity); + ++numAddedPools_; std::shared_ptr orderByQueryCtx; if (sameQuery) { orderByQueryCtx = fakeMemoryQueryCtx; } else { - orderByQueryCtx = newQueryCtx(kMemoryCapacity); + orderByQueryCtx = + newQueryCtx(memoryManager_.get(), executor_.get(), kMemoryCapacity); + ++numAddedPools_; } folly::EventCount orderByWait; @@ -694,21 +588,28 @@ DEBUG_ONLY_TEST_F(SharedArbitrationTest, reclaimToOrderBy) { ([&](Task* /*unused*/) { taskPauseWait.notify(); }))); std::thread orderByThread([&]() { + core::PlanNodeId orderByNodeId; auto task = - AssertQueryBuilder(duckDbQueryRunner_) + newQueryBuilder() .queryCtx(orderByQueryCtx) - .plan( - PlanBuilder() - .values(vectors) - .orderBy({fmt::format("{} ASC NULLS LAST", "c0")}, false) - .planNode()) + .serialExecution(isSerialExecutionMode_) + .plan(PlanBuilder() + .values(vectors) + .orderBy({"c0 ASC NULLS LAST"}, false) + .capturePlanNodeId(orderByNodeId) + .planNode()) .assertResults("SELECT * FROM tmp ORDER BY c0 ASC NULLS LAST"); + auto taskStats = exec::toPlanStats(task->taskStats()); + auto& stats = taskStats.at(orderByNodeId); + checkOperatorStatsForArbitration( + stats, !sameQuery /*expectGlobalArbitration*/); }); std::thread memThread([&]() { auto task = - AssertQueryBuilder(duckDbQueryRunner_) + newQueryBuilder() .queryCtx(fakeMemoryQueryCtx) + .serialExecution(isSerialExecutionMode_) .plan(PlanBuilder() .values(vectors) .addNode([&](std::string id, core::PlanNodePtr input) { @@ -724,62 +625,96 @@ DEBUG_ONLY_TEST_F(SharedArbitrationTest, reclaimToOrderBy) { const auto newStats = arbitrator_->stats(); ASSERT_GT(newStats.numReclaimedBytes, oldStats.numReclaimedBytes); ASSERT_GT(newStats.reclaimTimeUs, oldStats.reclaimTimeUs); + ASSERT_GT(orderByQueryCtx->pool()->stats().numCapacityGrowths, 0); } } -TEST_F(SharedArbitrationTest, reclaimFromCompletedOrderBy) { - const int numVectors = 2; +DEBUG_ONLY_TEST_P( + SharedArbitrationTestWithThreadingModes, + reclaimToAggregation) { + const int numVectors = 
32; std::vector vectors; for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); + vectors.push_back(makeRowVector(rowType_, fuzzerOpts_)); } createDuckDbTable(vectors); std::vector sameQueries = {false, true}; for (bool sameQuery : sameQueries) { SCOPED_TRACE(fmt::format("sameQuery {}", sameQuery)); - const auto spillDirectory = exec::test::TempDirectoryPath::create(); + const auto oldStats = arbitrator_->stats(); std::shared_ptr fakeMemoryQueryCtx = - newQueryCtx(kMemoryCapacity); - std::shared_ptr orderByQueryCtx; + newQueryCtx(memoryManager_.get(), executor_.get(), kMemoryCapacity); + ++numAddedPools_; + std::shared_ptr aggregationQueryCtx; if (sameQuery) { - orderByQueryCtx = fakeMemoryQueryCtx; + aggregationQueryCtx = fakeMemoryQueryCtx; } else { - orderByQueryCtx = newQueryCtx(kMemoryCapacity); + aggregationQueryCtx = + newQueryCtx(memoryManager_.get(), executor_.get(), kMemoryCapacity); + ++numAddedPools_; } - folly::EventCount fakeAllocationWait; - auto fakeAllocationWaitKey = fakeAllocationWait.prepareWait(); + folly::EventCount aggregationWait; + auto aggregationWaitKey = aggregationWait.prepareWait(); + folly::EventCount taskPauseWait; + auto taskPauseWaitKey = taskPauseWait.prepareWait(); - const auto fakeAllocationSize = kMemoryCapacity; + const auto fakeAllocationSize = kMemoryCapacity - (32L << 20); std::atomic injectAllocationOnce{true}; fakeOperatorFactory_->setAllocationCallback([&](Operator* op) { if (!injectAllocationOnce.exchange(false)) { return TestAllocation{}; } - fakeAllocationWait.wait(fakeAllocationWaitKey); auto buffer = op->pool()->allocate(fakeAllocationSize); + aggregationWait.notify(); + // Wait for pause to be triggered. + taskPauseWait.wait(taskPauseWaitKey); return TestAllocation{op->pool(), buffer, fakeAllocationSize}; }); - std::thread orderByThread([&]() { + std::atomic injectAggregationOnce{true}; + SCOPED_TESTVALUE_SET( + "facebook::velox::exec::Driver::runInternal::addInput", + std::function(([&](Operator* op) { + if (op->operatorType() != "Aggregation") { + return; + } + if (!injectAggregationOnce.exchange(false)) { + return; + } + aggregationWait.wait(aggregationWaitKey); + }))); + + SCOPED_TESTVALUE_SET( + "facebook::velox::exec::Task::requestPauseLocked", + std::function( + ([&](Task* /*unused*/) { taskPauseWait.notify(); }))); + + std::thread aggregationThread([&]() { + core::PlanNodeId aggregationNodeId; auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(orderByQueryCtx) - .plan( - PlanBuilder() - .values(vectors) - .orderBy({fmt::format("{} ASC NULLS LAST", "c0")}, false) - .planNode()) - .assertResults("SELECT * FROM tmp ORDER BY c0 ASC NULLS LAST"); - waitForTaskCompletion(task.get()); - fakeAllocationWait.notify(); + newQueryBuilder() + .queryCtx(aggregationQueryCtx) + .serialExecution(isSerialExecutionMode_) + .plan(PlanBuilder() + .values(vectors) + .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) + .capturePlanNodeId(aggregationNodeId) + .planNode()) + .assertResults( + "SELECT c0, c1, array_agg(c2) FROM tmp GROUP BY c0, c1"); + auto taskStats = exec::toPlanStats(task->taskStats()); + auto& stats = taskStats.at(aggregationNodeId); + checkOperatorStatsForArbitration( + stats, !sameQuery /*expectGlobalArbitration*/); }); std::thread memThread([&]() { auto task = - AssertQueryBuilder(duckDbQueryRunner_) + newQueryBuilder() .queryCtx(fakeMemoryQueryCtx) + .serialExecution(isSerialExecutionMode_) .plan(PlanBuilder() .values(vectors) .addNode([&](std::string id, core::PlanNodePtr input) { @@ -789,67 +724,71 
@@ TEST_F(SharedArbitrationTest, reclaimFromCompletedOrderBy) { .assertResults("SELECT * FROM tmp"); }); - orderByThread.join(); + aggregationThread.join(); memThread.join(); waitForAllTasksToBeDeleted(); + + const auto newStats = arbitrator_->stats(); + ASSERT_GT(newStats.numReclaimedBytes, oldStats.numReclaimedBytes); + ASSERT_GT(newStats.reclaimTimeUs, oldStats.reclaimTimeUs); } } -DEBUG_ONLY_TEST_F(SharedArbitrationTest, reclaimFromAggregation) { +DEBUG_ONLY_TEST_P( + SharedArbitrationTestWithThreadingModes, + reclaimToJoinBuilder) { const int numVectors = 32; std::vector vectors; for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); + vectors.push_back(makeRowVector(rowType_, fuzzerOpts_)); } createDuckDbTable(vectors); std::vector sameQueries = {false, true}; for (bool sameQuery : sameQueries) { SCOPED_TRACE(fmt::format("sameQuery {}", sameQuery)); - const auto spillDirectory = exec::test::TempDirectoryPath::create(); + const auto oldStats = arbitrator_->stats(); std::shared_ptr fakeMemoryQueryCtx = - newQueryCtx(kMemoryCapacity); - std::shared_ptr aggregationQueryCtx; + newQueryCtx(memoryManager_.get(), executor_.get(), kMemoryCapacity); + ++numAddedPools_; + std::shared_ptr joinQueryCtx; if (sameQuery) { - aggregationQueryCtx = fakeMemoryQueryCtx; + joinQueryCtx = fakeMemoryQueryCtx; } else { - aggregationQueryCtx = newQueryCtx(kMemoryCapacity); + joinQueryCtx = + newQueryCtx(memoryManager_.get(), executor_.get(), kMemoryCapacity); + ++numAddedPools_; } - folly::EventCount fakeAllocationWait; - auto fakeAllocationWaitKey = fakeAllocationWait.prepareWait(); + folly::EventCount joinWait; + auto joinWaitKey = joinWait.prepareWait(); folly::EventCount taskPauseWait; auto taskPauseWaitKey = taskPauseWait.prepareWait(); - const auto aggregationMemoryUsage = 32L << 20; - const auto fakeAllocationSize = - kMemoryCapacity - aggregationMemoryUsage + 1; + const auto fakeAllocationSize = kMemoryCapacity - (32L << 20); std::atomic injectAllocationOnce{true}; fakeOperatorFactory_->setAllocationCallback([&](Operator* op) { if (!injectAllocationOnce.exchange(false)) { return TestAllocation{}; } - fakeAllocationWait.wait(fakeAllocationWaitKey); auto buffer = op->pool()->allocate(fakeAllocationSize); + joinWait.notify(); + // Wait for pause to be triggered. + taskPauseWait.wait(taskPauseWaitKey); return TestAllocation{op->pool(), buffer, fakeAllocationSize}; }); - std::atomic injectAggregationByOnce{true}; + std::atomic injectJoinOnce{true}; SCOPED_TESTVALUE_SET( "facebook::velox::exec::Driver::runInternal::addInput", std::function(([&](Operator* op) { - if (op->operatorType() != "Aggregation") { - return; - } - if (op->pool()->capacity() < aggregationMemoryUsage) { + if (op->operatorType() != "HashBuild") { return; } - if (!injectAggregationByOnce.exchange(false)) { + if (!injectJoinOnce.exchange(false)) { return; } - fakeAllocationWait.notify(); - // Wait for pause to be triggered. 
- taskPauseWait.wait(taskPauseWaitKey); + joinWait.wait(joinWaitKey); }))); SCOPED_TESTVALUE_SET( @@ -857,1400 +796,69 @@ DEBUG_ONLY_TEST_F(SharedArbitrationTest, reclaimFromAggregation) { std::function( ([&](Task* /*unused*/) { taskPauseWait.notify(); }))); - std::thread aggregationThread([&]() { + std::thread joinThread([&]() { + auto planNodeIdGenerator = std::make_shared(); + core::PlanNodeId joinNodeId; auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .spillDirectory(spillDirectory->path) - .config(core::QueryConfig::kSpillEnabled, "true") - .config(core::QueryConfig::kAggregationSpillEnabled, "true") - .config(core::QueryConfig::kAggregationSpillPartitionBits, "2") - .queryCtx(aggregationQueryCtx) - .plan(PlanBuilder() + newQueryBuilder() + .queryCtx(joinQueryCtx) + .serialExecution(isSerialExecutionMode_) + .plan(PlanBuilder(planNodeIdGenerator) .values(vectors) - .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) - .planNode()) - .assertResults( - "SELECT c0, c1, array_agg(c2) FROM tmp GROUP BY c0, c1"); - auto stats = task->taskStats().pipelineStats; - ASSERT_GT(stats[0].operatorStats[1].spilledBytes, 0); - }); - - std::thread memThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(fakeMemoryQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .addNode([&](std::string id, core::PlanNodePtr input) { - return std::make_shared(id, input); - }) - .planNode()) - .assertResults("SELECT * FROM tmp"); - }); - - aggregationThread.join(); - memThread.join(); - waitForAllTasksToBeDeleted(); - } -} - -DEBUG_ONLY_TEST_F(SharedArbitrationTest, reclaimFromAggregationOnNoMoreInput) { - const int numVectors = 32; - std::vector vectors; - for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); - } - createDuckDbTable(vectors); - for (bool sameQuery : {false, true}) { - SCOPED_TRACE(fmt::format("sameQuery {}", sameQuery)); - const auto spillDirectory = exec::test::TempDirectoryPath::create(); - std::shared_ptr fakeMemoryQueryCtx = - newQueryCtx(kMemoryCapacity); - std::shared_ptr aggregationQueryCtx; - if (sameQuery) { - aggregationQueryCtx = fakeMemoryQueryCtx; - } else { - aggregationQueryCtx = newQueryCtx(kMemoryCapacity); - } - - std::atomic fakeAllocationBlocked{true}; - folly::EventCount fakeAllocationWait; - - std::atomic injectedPool{nullptr}; - - std::atomic injectAllocationOnce{true}; - fakeOperatorFactory_->setAllocationCallback([&](Operator* op) { - if (!injectAllocationOnce.exchange(false)) { - return TestAllocation{}; - } - fakeAllocationWait.await([&]() { return !fakeAllocationBlocked.load(); }); - EXPECT_TRUE(injectedPool != nullptr); - const auto fakeAllocationSize = - kMemoryCapacity - injectedPool.load()->reservedBytes() + 1; - return TestAllocation{ - op->pool(), - op->pool()->allocate(fakeAllocationSize), - fakeAllocationSize}; - }); - - folly::EventCount taskPauseWait; - std::atomic taskPaused{false}; - std::atomic injectNoMoreInputOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal::noMoreInput", - std::function(([&](Operator* op) { - if (op->operatorType() != "Aggregation") { - return; - } - if (!injectNoMoreInputOnce.exchange(false)) { - return; - } - injectedPool = op->pool(); - fakeAllocationBlocked = false; - fakeAllocationWait.notifyAll(); - taskPauseWait.await([&]() { return taskPaused.load(); }); - }))); - - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Task::requestPauseLocked", - std::function(([&](Task* /*unused*/) { - taskPaused = true; - taskPauseWait.notifyAll(); - }))); - - 
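The joinWait and taskPauseWait handshake above follows the folly::EventCount protocol: the waiter registers interest with prepareWait() before the other side is allowed to notify(), so the wakeup cannot be lost even if notify() fires first. A minimal standalone sketch of that protocol, assuming folly's EventCount header and API; the names arrived and producer are illustrative, not from the patch:

```
#include <folly/experimental/EventCount.h>

#include <atomic>
#include <iostream>
#include <thread>

int main() {
  folly::EventCount arrived;
  // prepareWait() must happen before the other thread may notify();
  // otherwise the wakeup could be lost. The tests above call it up front
  // for exactly this reason.
  auto key = arrived.prepareWait();

  std::atomic<bool> produced{false};
  std::thread producer([&]() {
    produced = true;   // Publish state first...
    arrived.notify();  // ...then wake the waiter.
  });

  arrived.wait(key);  // Returns once notify() has been called.
  std::cout << "produced = " << produced.load() << std::endl;
  producer.join();
  return 0;
}
```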
std::thread aggregationThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .spillDirectory(spillDirectory->path) - .config(core::QueryConfig::kSpillEnabled, "true") - .config(core::QueryConfig::kAggregationSpillEnabled, "true") - .config(core::QueryConfig::kAggregationSpillPartitionBits, "2") - .queryCtx(aggregationQueryCtx) - .maxDrivers(1) - .plan(PlanBuilder() - .values(vectors) - .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) - .planNode()) - .assertResults( - "SELECT c0, c1, array_agg(c2) FROM tmp GROUP BY c0, c1"); - auto stats = task->taskStats().pipelineStats; - ASSERT_GT(stats[0].operatorStats[1].spilledBytes, 0); - }); - - std::thread memThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(fakeMemoryQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .addNode([&](std::string id, core::PlanNodePtr input) { - return std::make_shared(id, input); - }) - .planNode()) - .assertResults("SELECT * FROM tmp"); - }); - - aggregationThread.join(); - memThread.join(); - waitForAllTasksToBeDeleted(); - } -} - -DEBUG_ONLY_TEST_F(SharedArbitrationTest, reclaimFromAggregationDuringOutput) { - const int numVectors = 32; - std::vector vectors; - int numRows{0}; - for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); - numRows += vectors.back()->size(); - } - createDuckDbTable(vectors); - for (bool sameQuery : {false, true}) { - SCOPED_TRACE(fmt::format("sameQuery {}", sameQuery)); - const auto spillDirectory = exec::test::TempDirectoryPath::create(); - std::shared_ptr fakeMemoryQueryCtx = - newQueryCtx(kMemoryCapacity); - std::shared_ptr aggregationQueryCtx; - if (sameQuery) { - aggregationQueryCtx = fakeMemoryQueryCtx; - } else { - aggregationQueryCtx = newQueryCtx(kMemoryCapacity); - } - - std::atomic fakeAllocationBlocked{true}; - folly::EventCount fakeAllocationWait; - - std::atomic injectedPool{nullptr}; - - std::atomic injectAllocationOnce{true}; - fakeOperatorFactory_->setAllocationCallback([&](Operator* op) { - if (!injectAllocationOnce.exchange(false)) { - return TestAllocation{}; - } - fakeAllocationWait.await([&]() { return !fakeAllocationBlocked.load(); }); - EXPECT_TRUE(injectedPool != nullptr); - const auto fakeAllocationSize = - kMemoryCapacity - injectedPool.load()->reservedBytes() + 1; - return TestAllocation{ - op->pool(), - op->pool()->allocate(fakeAllocationSize), - fakeAllocationSize}; - }); - - folly::EventCount taskPauseWait; - std::atomic taskPaused{false}; - std::atomic injectGetOutputCount{0}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal::getOutput", - std::function(([&](Operator* op) { - if (op->operatorType() != "Aggregation") { - return; - } - if (!op->testingNoMoreInput()) { - return; - } - if (++injectGetOutputCount != 3) { - return; - } - injectedPool = op->pool(); - fakeAllocationBlocked = false; - fakeAllocationWait.notifyAll(); - taskPauseWait.await([&]() { return taskPaused.load(); }); - }))); - - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Task::requestPauseLocked", - std::function(([&](Task* /*unused*/) { - taskPaused = true; - taskPauseWait.notifyAll(); - }))); - - std::thread aggregationThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .spillDirectory(spillDirectory->path) - .config(core::QueryConfig::kSpillEnabled, "true") - .config(core::QueryConfig::kAggregationSpillEnabled, "true") - .config(core::QueryConfig::kAggregationSpillPartitionBits, "2") - .config( - core::QueryConfig::kPreferredOutputBatchRows, - std::to_string(numRows / 
10)) - .maxDrivers(1) - .queryCtx(aggregationQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) - .planNode()) - .assertResults( - "SELECT c0, c1, array_agg(c2) FROM tmp GROUP BY c0, c1"); - auto stats = task->taskStats().pipelineStats; - ASSERT_GT(stats[0].operatorStats[1].spilledBytes, 0); - }); - - std::thread memThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(fakeMemoryQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .addNode([&](std::string id, core::PlanNodePtr input) { - return std::make_shared(id, input); - }) - .planNode()) - .assertResults("SELECT * FROM tmp"); - }); - - aggregationThread.join(); - memThread.join(); - waitForAllTasksToBeDeleted(); - } -} - -DEBUG_ONLY_TEST_F(SharedArbitrationTest, reclaimToAggregation) { - const int numVectors = 32; - std::vector vectors; - for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); - } - createDuckDbTable(vectors); - std::vector sameQueries = {false, true}; - for (bool sameQuery : sameQueries) { - SCOPED_TRACE(fmt::format("sameQuery {}", sameQuery)); - const auto oldStats = arbitrator_->stats(); - std::shared_ptr fakeMemoryQueryCtx = - newQueryCtx(kMemoryCapacity); - std::shared_ptr aggregationQueryCtx; - if (sameQuery) { - aggregationQueryCtx = fakeMemoryQueryCtx; - } else { - aggregationQueryCtx = newQueryCtx(kMemoryCapacity); - } - - folly::EventCount aggregationWait; - auto aggregationWaitKey = aggregationWait.prepareWait(); - folly::EventCount taskPauseWait; - auto taskPauseWaitKey = taskPauseWait.prepareWait(); - - const auto fakeAllocationSize = kMemoryCapacity - (32L << 20); - - std::atomic injectAllocationOnce{true}; - fakeOperatorFactory_->setAllocationCallback([&](Operator* op) { - if (!injectAllocationOnce.exchange(false)) { - return TestAllocation{}; - } - auto buffer = op->pool()->allocate(fakeAllocationSize); - aggregationWait.notify(); - // Wait for pause to be triggered. 
- taskPauseWait.wait(taskPauseWaitKey); - return TestAllocation{op->pool(), buffer, fakeAllocationSize}; - }); - - std::atomic injectAggregationOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal::addInput", - std::function(([&](Operator* op) { - if (op->operatorType() != "Aggregation") { - return; - } - if (!injectAggregationOnce.exchange(false)) { - return; - } - aggregationWait.wait(aggregationWaitKey); - }))); - - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Task::requestPauseLocked", - std::function( - ([&](Task* /*unused*/) { taskPauseWait.notify(); }))); - - std::thread aggregationThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(aggregationQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) - .planNode()) - .assertResults( - "SELECT c0, c1, array_agg(c2) FROM tmp GROUP BY c0, c1"); - }); - - std::thread memThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(fakeMemoryQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .addNode([&](std::string id, core::PlanNodePtr input) { - return std::make_shared(id, input); - }) - .planNode()) - .assertResults("SELECT * FROM tmp"); - }); - - aggregationThread.join(); - memThread.join(); - waitForAllTasksToBeDeleted(); - - const auto newStats = arbitrator_->stats(); - ASSERT_GT(newStats.numReclaimedBytes, oldStats.numReclaimedBytes); - ASSERT_GT(newStats.reclaimTimeUs, oldStats.reclaimTimeUs); - } -} - -TEST_F(SharedArbitrationTest, reclaimFromCompletedAggregation) { - const int numVectors = 2; - std::vector vectors; - for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); - } - createDuckDbTable(vectors); - std::vector sameQueries = {false, true}; - for (bool sameQuery : sameQueries) { - SCOPED_TRACE(fmt::format("sameQuery {}", sameQuery)); - const auto spillDirectory = exec::test::TempDirectoryPath::create(); - std::shared_ptr fakeMemoryQueryCtx = - newQueryCtx(kMemoryCapacity); - std::shared_ptr aggregationQueryCtx; - if (sameQuery) { - aggregationQueryCtx = fakeMemoryQueryCtx; - } else { - aggregationQueryCtx = newQueryCtx(kMemoryCapacity); - } - - folly::EventCount fakeAllocationWait; - auto fakeAllocationWaitKey = fakeAllocationWait.prepareWait(); - - const auto fakeAllocationSize = kMemoryCapacity; - - std::atomic injectAllocationOnce{true}; - fakeOperatorFactory_->setAllocationCallback([&](Operator* op) { - if (!injectAllocationOnce.exchange(false)) { - return TestAllocation{}; - } - fakeAllocationWait.wait(fakeAllocationWaitKey); - auto buffer = op->pool()->allocate(fakeAllocationSize); - return TestAllocation{op->pool(), buffer, fakeAllocationSize}; - }); - - std::thread aggregationThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(aggregationQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) - .planNode()) - .assertResults( - "SELECT c0, c1, array_agg(c2) FROM tmp GROUP BY c0, c1"); - waitForTaskCompletion(task.get()); - fakeAllocationWait.notify(); - }); - - std::thread memThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(fakeMemoryQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .addNode([&](std::string id, core::PlanNodePtr input) { - return std::make_shared(id, input); - }) - .planNode()) - .assertResults("SELECT * FROM tmp"); - }); - - aggregationThread.join(); - memThread.join(); - waitForAllTasksToBeDeleted(); - } -} - 
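The reclaimTo* tests above prove that arbitration actually ran by snapshotting arbitrator_->stats() before the contending queries start and asserting that numReclaimedBytes and reclaimTimeUs grew afterwards. A reduced sketch of that before/after check; ArbitratorStats and expectReclaimProgress are stand-ins for the Velox types, not real API:

```
#include <cstdint>
#include <functional>
#include <stdexcept>

// Stand-in for the two arbitrator counters the tests assert on.
struct ArbitratorStats {
  uint64_t numReclaimedBytes{0};
  uint64_t reclaimTimeUs{0};
};

// Runs a workload and requires that memory arbitration reclaimed something,
// mirroring ASSERT_GT(newStats.numReclaimedBytes, oldStats.numReclaimedBytes)
// and ASSERT_GT(newStats.reclaimTimeUs, oldStats.reclaimTimeUs) above.
void expectReclaimProgress(
    const std::function<ArbitratorStats()>& currentStats,
    const std::function<void()>& runWorkload) {
  const ArbitratorStats before = currentStats();
  runWorkload();  // Drive the queries that should trigger arbitration.
  const ArbitratorStats after = currentStats();
  if (after.numReclaimedBytes <= before.numReclaimedBytes ||
      after.reclaimTimeUs <= before.reclaimTimeUs) {
    throw std::runtime_error("memory arbitration reclaimed nothing");
  }
}
```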
-DEBUG_ONLY_TEST_F(SharedArbitrationTest, reclaimFromJoinBuilder) { - const int numVectors = 32; - std::vector vectors; - for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); - } - createDuckDbTable(vectors); - std::vector sameQueries = {false, true}; - for (bool sameQuery : sameQueries) { - SCOPED_TRACE(fmt::format("sameQuery {}", sameQuery)); - const auto spillDirectory = exec::test::TempDirectoryPath::create(); - std::shared_ptr fakeMemoryQueryCtx = - newQueryCtx(kMemoryCapacity); - std::shared_ptr joinQueryCtx; - if (sameQuery) { - joinQueryCtx = fakeMemoryQueryCtx; - } else { - joinQueryCtx = newQueryCtx(kMemoryCapacity); - } - - std::atomic_bool fakeAllocationReady{false}; - folly::EventCount fakeAllocationWait; - std::atomic_bool taskPauseDone{false}; - folly::EventCount taskPauseWait; - - const auto joinMemoryUsage = 32L << 20; - const auto fakeAllocationSize = kMemoryCapacity - joinMemoryUsage / 2; - - std::atomic injectAllocationOnce{true}; - fakeOperatorFactory_->setAllocationCallback([&](Operator* op) { - if (!injectAllocationOnce.exchange(false)) { - return TestAllocation{}; - } - fakeAllocationWait.await([&]() { return fakeAllocationReady.load(); }); - auto buffer = op->pool()->allocate(fakeAllocationSize); - return TestAllocation{op->pool(), buffer, fakeAllocationSize}; - }); - - std::atomic injectAggregationByOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal::addInput", - std::function(([&](Operator* op) { - if (op->operatorType() != "HashBuild") { - return; - } - if (op->pool()->currentBytes() < joinMemoryUsage) { - return; - } - if (!injectAggregationByOnce.exchange(false)) { - return; - } - fakeAllocationReady.store(true); - fakeAllocationWait.notifyAll(); - // Wait for pause to be triggered. - taskPauseWait.await([&]() { return taskPauseDone.load(); }); - }))); - - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Task::requestPauseLocked", - std::function(([&](Task* /*unused*/) { - taskPauseDone.store(true); - taskPauseWait.notifyAll(); - }))); - - // joinQueryCtx and fakeMemoryQueryCtx may be the same and thus share the - // same underlying QueryConfig. We apply the changes here instead of using - // the AssertQueryBuilder to avoid a potential race condition caused by - // writing the config in the join thread, and reading it in the memThread. 
- std::unordered_map config{ - {core::QueryConfig::kSpillEnabled, "true"}, - {core::QueryConfig::kJoinSpillEnabled, "true"}, - {core::QueryConfig::kJoinSpillPartitionBits, "2"}, - }; - joinQueryCtx->testingOverrideConfigUnsafe(std::move(config)); - - std::thread aggregationThread([&]() { - auto planNodeIdGenerator = std::make_shared(); - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .spillDirectory(spillDirectory->path) - .queryCtx(joinQueryCtx) - .plan(PlanBuilder(planNodeIdGenerator) - .values(vectors) - .project({"c0 AS t0", "c1 AS t1", "c2 AS t2"}) - .hashJoin( - {"t0"}, - {"u0"}, - PlanBuilder(planNodeIdGenerator) - .values(vectors) - .project({"c0 AS u0", "c1 AS u1", "c2 AS u2"}) - .planNode(), - "", - {"t1"}, - core::JoinType::kAnti) - .planNode()) - .assertResults( - "SELECT c1 FROM tmp WHERE c0 NOT IN (SELECT c0 FROM tmp)"); - auto stats = task->taskStats().pipelineStats; - ASSERT_GT(stats[1].operatorStats[2].spilledBytes, 0); - }); - - std::thread memThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(fakeMemoryQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .addNode([&](std::string id, core::PlanNodePtr input) { - return std::make_shared(id, input); - }) - .planNode()) - .assertResults("SELECT * FROM tmp"); - }); - - aggregationThread.join(); - memThread.join(); - waitForAllTasksToBeDeleted(); - } -} - -DEBUG_ONLY_TEST_F(SharedArbitrationTest, reclaimToJoinBuilder) { - const int numVectors = 32; - std::vector vectors; - for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); - } - createDuckDbTable(vectors); - std::vector sameQueries = {false, true}; - for (bool sameQuery : sameQueries) { - SCOPED_TRACE(fmt::format("sameQuery {}", sameQuery)); - const auto oldStats = arbitrator_->stats(); - std::shared_ptr fakeMemoryQueryCtx = - newQueryCtx(kMemoryCapacity); - std::shared_ptr joinQueryCtx; - if (sameQuery) { - joinQueryCtx = fakeMemoryQueryCtx; - } else { - joinQueryCtx = newQueryCtx(kMemoryCapacity); - } - - folly::EventCount joinWait; - auto joinWaitKey = joinWait.prepareWait(); - folly::EventCount taskPauseWait; - auto taskPauseWaitKey = taskPauseWait.prepareWait(); - - const auto fakeAllocationSize = kMemoryCapacity - (32L << 20); - - std::atomic injectAllocationOnce{true}; - fakeOperatorFactory_->setAllocationCallback([&](Operator* op) { - if (!injectAllocationOnce.exchange(false)) { - return TestAllocation{}; - } - auto buffer = op->pool()->allocate(fakeAllocationSize); - joinWait.notify(); - // Wait for pause to be triggered. 
- taskPauseWait.wait(taskPauseWaitKey); - return TestAllocation{op->pool(), buffer, fakeAllocationSize}; - }); - - std::atomic injectJoinOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal::addInput", - std::function(([&](Operator* op) { - if (op->operatorType() != "HashBuild") { - return; - } - if (!injectJoinOnce.exchange(false)) { - return; - } - joinWait.wait(joinWaitKey); - }))); - - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Task::requestPauseLocked", - std::function( - ([&](Task* /*unused*/) { taskPauseWait.notify(); }))); - - std::thread joinThread([&]() { - auto planNodeIdGenerator = std::make_shared(); - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(joinQueryCtx) - .plan(PlanBuilder(planNodeIdGenerator) - .values(vectors) - .project({"c0 AS t0", "c1 AS t1", "c2 AS t2"}) - .hashJoin( - {"t0"}, - {"u0"}, - PlanBuilder(planNodeIdGenerator) - .values(vectors) - .project({"c0 AS u0", "c1 AS u1", "c2 AS u2"}) - .planNode(), - "", - {"t1"}, - core::JoinType::kAnti) - .planNode()) - .assertResults( - "SELECT c1 FROM tmp WHERE c0 NOT IN (SELECT c0 FROM tmp)"); - }); - - std::thread memThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(fakeMemoryQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .addNode([&](std::string id, core::PlanNodePtr input) { - return std::make_shared(id, input); - }) - .planNode()) - .assertResults("SELECT * FROM tmp"); - }); - - joinThread.join(); - memThread.join(); - waitForAllTasksToBeDeleted(); - - const auto newStats = arbitrator_->stats(); - ASSERT_GT(newStats.numReclaimedBytes, oldStats.numReclaimedBytes); - ASSERT_GT(newStats.reclaimTimeUs, oldStats.reclaimTimeUs); - } -} - -TEST_F(SharedArbitrationTest, reclaimFromCompletedJoinBuilder) { - const int numVectors = 2; - std::vector vectors; - for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); - } - createDuckDbTable(vectors); - std::vector sameQueries = {false, true}; - for (bool sameQuery : sameQueries) { - SCOPED_TRACE(fmt::format("sameQuery {}", sameQuery)); - const auto spillDirectory = exec::test::TempDirectoryPath::create(); - const uint64_t numCreatedTasks = Task::numCreatedTasks(); - std::shared_ptr fakeMemoryQueryCtx = - newQueryCtx(kMemoryCapacity); - std::shared_ptr joinQueryCtx; - if (sameQuery) { - joinQueryCtx = fakeMemoryQueryCtx; - } else { - joinQueryCtx = newQueryCtx(kMemoryCapacity); - } - - folly::EventCount fakeAllocationWait; - auto fakeAllocationWaitKey = fakeAllocationWait.prepareWait(); - - const auto fakeAllocationSize = kMemoryCapacity; - - std::atomic injectAllocationOnce{true}; - fakeOperatorFactory_->setAllocationCallback([&](Operator* op) { - if (!injectAllocationOnce.exchange(false)) { - return TestAllocation{}; - } - fakeAllocationWait.wait(fakeAllocationWaitKey); - auto buffer = op->pool()->allocate(fakeAllocationSize); - return TestAllocation{op->pool(), buffer, fakeAllocationSize}; - }); - - std::thread joinThread([&]() { - auto planNodeIdGenerator = std::make_shared(); - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(joinQueryCtx) - .plan(PlanBuilder(planNodeIdGenerator) - .values(vectors) - .project({"c0 AS t0", "c1 AS t1", "c2 AS t2"}) - .hashJoin( - {"t0"}, - {"u0"}, - PlanBuilder(planNodeIdGenerator) - .values(vectors) - .project({"c0 AS u0", "c1 AS u1", "c2 AS u2"}) - .planNode(), - "", - {"t1"}, - core::JoinType::kAnti) - .planNode()) - .assertResults( - "SELECT c1 FROM tmp WHERE c0 NOT IN (SELECT c0 FROM tmp)"); - 
waitForTaskCompletion(task.get()); - task.reset(); - // Make sure the join query task has been destroyed. - waitForAllTasksToBeDeleted(numCreatedTasks + 1, 3'000'000); - fakeAllocationWait.notify(); - }); - - std::thread memThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(fakeMemoryQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .addNode([&](std::string id, core::PlanNodePtr input) { - return std::make_shared(id, input); - }) - .planNode()) - .assertResults("SELECT * FROM tmp"); - }); - - joinThread.join(); - memThread.join(); - waitForAllTasksToBeDeleted(); - } -} - -DEBUG_ONLY_TEST_F( - SharedArbitrationTest, - reclaimFromJoinBuilderWithMultiDrivers) { - const int numVectors = 32; - std::vector vectors; - fuzzerOpts_.vectorSize = 128; - fuzzerOpts_.stringVariableLength = false; - fuzzerOpts_.stringLength = 512; - for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); - } - const int numDrivers = 4; - createDuckDbTable(vectors); - std::vector sameQueries = {false, true}; - for (bool sameQuery : sameQueries) { - SCOPED_TRACE(fmt::format("sameQuery {}", sameQuery)); - const auto spillDirectory = exec::test::TempDirectoryPath::create(); - std::shared_ptr fakeMemoryQueryCtx = - newQueryCtx(kMemoryCapacity); - std::shared_ptr joinQueryCtx; - if (sameQuery) { - joinQueryCtx = fakeMemoryQueryCtx; - } else { - joinQueryCtx = newQueryCtx(kMemoryCapacity); - } - - folly::EventCount fakeAllocationWait; - auto fakeAllocationWaitKey = fakeAllocationWait.prepareWait(); - folly::EventCount taskPauseWait; - - const auto joinMemoryUsage = 8L << 20; - const auto fakeAllocationSize = kMemoryCapacity - joinMemoryUsage / 2; - - std::atomic injectAllocationOnce{true}; - fakeOperatorFactory_->setAllocationCallback([&](Operator* op) { - if (!injectAllocationOnce.exchange(false)) { - return TestAllocation{}; - } - fakeAllocationWait.wait(fakeAllocationWaitKey); - auto buffer = op->pool()->allocate(fakeAllocationSize); - return TestAllocation{op->pool(), buffer, fakeAllocationSize}; - }); - - std::atomic injectCount{0}; - folly::futures::Barrier builderBarrier(numDrivers); - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal::addInput", - std::function(([&](Operator* op) { - if (op->operatorType() != "HashBuild") { - return; - } - // Check all the hash build operators' memory usage instead of - // individual operator. - if (op->pool()->parent()->currentBytes() < joinMemoryUsage) { - return; - } - if (++injectCount > numDrivers) { - return; - } - auto future = builderBarrier.wait(); - if (future.wait().value()) { - fakeAllocationWait.notify(); - } - - auto taskPauseWaitKey = taskPauseWait.prepareWait(); - // Wait for pause to be triggered. - taskPauseWait.wait(taskPauseWaitKey); - }))); - - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Task::requestPauseLocked", - std::function( - [&](Task* /*unused*/) { taskPauseWait.notifyAll(); })); - - // joinQueryCtx and fakeMemoryQueryCtx may be the same and thus share the - // same underlying QueryConfig. We apply the changes here instead of using - // the AssertQueryBuilder to avoid a potential race condition caused by - // writing the config in the join thread, and reading it in the memThread. - std::unordered_map config{ - {core::QueryConfig::kSpillEnabled, "true"}, - {core::QueryConfig::kJoinSpillEnabled, "true"}, - {core::QueryConfig::kJoinSpillPartitionBits, "2"}, - // NOTE: set an extreme large value to avoid non-reclaimable - // section in test. 
- {core::QueryConfig::kSpillableReservationGrowthPct, "8000"}, - }; - joinQueryCtx->testingOverrideConfigUnsafe(std::move(config)); - - std::thread joinThread([&]() { - auto planNodeIdGenerator = std::make_shared(); - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .spillDirectory(spillDirectory->path) - .maxDrivers(numDrivers) - .queryCtx(joinQueryCtx) - .plan(PlanBuilder(planNodeIdGenerator) - .values(vectors, true) - .project({"c0 AS t0", "c1 AS t1", "c2 AS t2"}) - .hashJoin( - {"t0"}, - {"u1"}, - PlanBuilder(planNodeIdGenerator) - .values(vectors, true) - .project({"c0 AS u0", "c1 AS u1", "c2 AS u2"}) - .planNode(), - "", - {"t1"}, - core::JoinType::kInner) - .planNode()) - .assertResults( - "SELECT t.c1 FROM tmp as t, tmp AS u WHERE t.c0 == u.c1"); - auto stats = task->taskStats().pipelineStats; - ASSERT_GT(stats[1].operatorStats[2].spilledBytes, 0); - }); - - std::thread memThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(fakeMemoryQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .addNode([&](std::string id, core::PlanNodePtr input) { - return std::make_shared(id, input); - }) - .planNode()) - .assertResults("SELECT * FROM tmp"); - }); - joinThread.join(); - memThread.join(); - waitForAllTasksToBeDeleted(); - } -} - -DEBUG_ONLY_TEST_F( - SharedArbitrationTest, - failedToReclaimFromHashJoinBuildersInNonReclaimableSection) { - const int numVectors = 32; - std::vector vectors; - fuzzerOpts_.vectorSize = 128; - fuzzerOpts_.stringVariableLength = false; - fuzzerOpts_.stringLength = 512; - for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); - } - const int numDrivers = 4; - createDuckDbTable(vectors); - const auto spillDirectory = exec::test::TempDirectoryPath::create(); - std::shared_ptr fakeMemoryQueryCtx = - newQueryCtx(kMemoryCapacity); - std::shared_ptr joinQueryCtx = newQueryCtx(kMemoryCapacity); - - folly::EventCount allocationWait; - auto allocationWaitKey = allocationWait.prepareWait(); - folly::EventCount allocationDoneWait; - auto allocationDoneWaitKey = allocationDoneWait.prepareWait(); - - const auto joinMemoryUsage = 8L << 20; - const auto fakeAllocationSize = kMemoryCapacity - joinMemoryUsage / 2; - - std::atomic injectAllocationOnce{true}; - fakeOperatorFactory_->setAllocationCallback([&](Operator* op) { - if (!injectAllocationOnce.exchange(false)) { - return TestAllocation{}; - } - allocationWait.wait(allocationWaitKey); - EXPECT_ANY_THROW(op->pool()->allocate(fakeAllocationSize)); - allocationDoneWait.notify(); - return TestAllocation{}; - }); - - std::atomic injectCount{0}; - folly::futures::Barrier builderBarrier(numDrivers); - folly::futures::Barrier pauseBarrier(numDrivers + 1); - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal::addInput", - std::function(([&](Operator* op) { - if (op->operatorType() != "HashBuild") { - return; - } - // Check all the hash build operators' memory usage instead of - // individual operator. 
- if (op->pool()->parent()->currentBytes() < joinMemoryUsage) { - return; - } - if (++injectCount > numDrivers - 1) { - return; - } - if (builderBarrier.wait().get()) { - allocationWait.notify(); - } - pauseBarrier.wait(); - }))); - - std::atomic injectNonReclaimableSectionOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::common::memory::MemoryPoolImpl::allocateNonContiguous", - std::function( - ([&](memory::MemoryPoolImpl* pool) { - const std::string re(".*HashBuild"); - if (!RE2::FullMatch(pool->name(), re)) { - return; - } - if (pool->parent()->currentBytes() < joinMemoryUsage) { - return; - } - if (!injectNonReclaimableSectionOnce.exchange(false)) { - return; - } - if (builderBarrier.wait().get()) { - allocationWait.notify(); - } - pauseBarrier.wait(); - // Suspend the driver to simulate the arbitration. - pool->reclaimer()->enterArbitration(); - allocationDoneWait.wait(allocationDoneWaitKey); - pool->reclaimer()->leaveArbitration(); - }))); - - std::atomic injectPauseOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Task::requestPauseLocked", - std::function([&](Task* /*unused*/) { - if (!injectPauseOnce.exchange(false)) { - return; - } - pauseBarrier.wait(); - })); - - // Verifies that we only trigger the hash build reclaim once. - std::atomic numHashBuildReclaims{0}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::HashBuild::reclaim", - std::function([&](Operator* /*unused*/) { - ++numHashBuildReclaims; - ASSERT_EQ(numHashBuildReclaims, 1); - })); - - std::thread joinThread([&]() { - auto planNodeIdGenerator = std::make_shared(); - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .spillDirectory(spillDirectory->path) - .config(core::QueryConfig::kSpillEnabled, "true") - .config(core::QueryConfig::kJoinSpillEnabled, "true") - .config(core::QueryConfig::kJoinSpillPartitionBits, "2") - // NOTE: set an extreme large value to avoid non-reclaimable - // section in test. - .config(core::QueryConfig::kSpillableReservationGrowthPct, "8000") - .maxDrivers(numDrivers) - .queryCtx(joinQueryCtx) - .plan(PlanBuilder(planNodeIdGenerator) - .values(vectors, true) - .project({"c0 AS t0", "c1 AS t1", "c2 AS t2"}) - .hashJoin( - {"t0"}, - {"u1"}, - PlanBuilder(planNodeIdGenerator) - .values(vectors, true) - .project({"c0 AS u0", "c1 AS u1", "c2 AS u2"}) - .planNode(), - "", - {"t1"}, - core::JoinType::kInner) - .planNode()) - .assertResults( - "SELECT t.c1 FROM tmp as t, tmp AS u WHERE t.c0 == u.c1"); - // We expect the spilling is not triggered because of non-reclaimable - // section. - auto stats = task->taskStats().pipelineStats; - ASSERT_EQ(stats[1].operatorStats[2].spilledBytes, 0); - }); - - std::thread memThread([&]() { - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(fakeMemoryQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .addNode([&](std::string id, core::PlanNodePtr input) { - return std::make_shared(id, input); - }) - .planNode()) - .assertResults("SELECT * FROM tmp"); - }); - joinThread.join(); - memThread.join(); - // We only expect to reclaim from one hash build operator once. 
- ASSERT_EQ(numHashBuildReclaims, 1); - waitForAllTasksToBeDeleted(); - ASSERT_EQ(arbitrator_->stats().numNonReclaimableAttempts, 1); -} - -DEBUG_ONLY_TEST_F( - SharedArbitrationTest, - reclaimFromJoinBuildWaitForTableBuild) { - setupMemory(kMemoryCapacity, 0); - const int numVectors = 32; - std::vector vectors; - fuzzerOpts_.vectorSize = 128; - fuzzerOpts_.stringVariableLength = false; - fuzzerOpts_.stringLength = 512; - for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); - } - const int numDrivers = 4; - createDuckDbTable(vectors); - - std::shared_ptr fakeMemoryQueryCtx = - newQueryCtx(kMemoryCapacity); - std::shared_ptr joinQueryCtx = newQueryCtx(kMemoryCapacity); - - folly::EventCount fakeAllocationWait; - std::atomic fakeAllocationUnblock{false}; - std::atomic injectFakeAllocationOnce{true}; - - fakeOperatorFactory_->setAllocationCallback([&](Operator* op) { - if (!injectFakeAllocationOnce.exchange(false)) { - return TestAllocation{}; - } - fakeAllocationWait.await([&]() { return fakeAllocationUnblock.load(); }); - // Set the fake allocation size to trigger memory reclaim. - const auto fakeAllocationSize = arbitrator_->stats().freeCapacityBytes + - joinQueryCtx->pool()->freeBytes() + 1; - return TestAllocation{ - op->pool(), - op->pool()->allocate(fakeAllocationSize), - fakeAllocationSize}; - }); - - folly::futures::Barrier builderBarrier(numDrivers); - folly::futures::Barrier pauseBarrier(numDrivers + 1); - std::atomic addInputInjectCount{0}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal", - std::function(([&](Driver* driver) { - // Check if the driver is from join query. - if (driver->task()->queryCtx()->pool()->name() != - joinQueryCtx->pool()->name()) { - return; - } - // Check if the driver is from the pipeline with hash build. - if (driver->driverCtx()->pipelineId != 1) { - return; - } - if (++addInputInjectCount > numDrivers - 1) { - return; - } - if (builderBarrier.wait().get()) { - fakeAllocationUnblock = true; - fakeAllocationWait.notifyAll(); - } - // Wait for pause to be triggered. - pauseBarrier.wait().get(); - }))); - - std::atomic injectNoMoreInputOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal::noMoreInput", - std::function(([&](Operator* op) { - if (op->operatorType() != "HashBuild") { - return; - } - if (!injectNoMoreInputOnce.exchange(false)) { - return; - } - if (builderBarrier.wait().get()) { - fakeAllocationUnblock = true; - fakeAllocationWait.notifyAll(); - } - // Wait for pause to be triggered. - pauseBarrier.wait().get(); - }))); - - std::atomic injectPauseOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Task::requestPauseLocked", - std::function([&](Task* /*unused*/) { - if (!injectPauseOnce.exchange(false)) { - return; - } - pauseBarrier.wait().get(); - })); - - // Verifies that we only trigger the hash build reclaim once. 
- std::atomic numHashBuildReclaims{0}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::HashBuild::reclaim", - std::function( - ([&](Operator* /*unused*/) { ++numHashBuildReclaims; }))); - - const auto spillDirectory = exec::test::TempDirectoryPath::create(); - std::thread joinThread([&]() { - auto planNodeIdGenerator = std::make_shared(); - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .spillDirectory(spillDirectory->path) - .config(core::QueryConfig::kSpillEnabled, "true") - .config(core::QueryConfig::kJoinSpillEnabled, "true") - .config(core::QueryConfig::kJoinSpillPartitionBits, "2") - // NOTE: set an extreme large value to avoid non-reclaimable - // section in test. - .config(core::QueryConfig::kSpillableReservationGrowthPct, "8000") - .maxDrivers(numDrivers) - .queryCtx(joinQueryCtx) - .plan(PlanBuilder(planNodeIdGenerator) - .values(vectors, true) - .project({"c0 AS t0", "c1 AS t1", "c2 AS t2"}) - .hashJoin( - {"t0"}, - {"u1"}, - PlanBuilder(planNodeIdGenerator) - .values(vectors, true) - .project({"c0 AS u0", "c1 AS u1", "c2 AS u2"}) - .planNode(), - "", - {"t1"}, - core::JoinType::kInner) - .planNode()) - .assertResults( - "SELECT t.c1 FROM tmp as t, tmp AS u WHERE t.c0 == u.c1"); - // We expect the spilling triggered. - auto stats = task->taskStats().pipelineStats; - ASSERT_GT(stats[1].operatorStats[2].spilledBytes, 0); - }); - - std::thread memThread([&]() { - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(fakeMemoryQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .addNode([&](std::string id, core::PlanNodePtr input) { - return std::make_shared(id, input); - }) - .planNode()) - .assertResults("SELECT * FROM tmp"); - }); - - joinThread.join(); - memThread.join(); - - // We only expect to reclaim from one hash build operator once. - ASSERT_EQ(numHashBuildReclaims, 1); - waitForAllTasksToBeDeleted(); -} - -DEBUG_ONLY_TEST_F( - SharedArbitrationTest, - arbitrationTriggeredDuringParallelJoinBuild) { - const int numVectors = 2; - std::vector vectors; - // Build a large vector to trigger memory arbitration. - fuzzerOpts_.vectorSize = 10'000; - for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); - } - createDuckDbTable(vectors); - - std::shared_ptr joinQueryCtx = newQueryCtx(kMemoryCapacity); - - // Make sure the parallel build has been triggered. - std::atomic parallelBuildTriggered{false}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::HashTable::parallelJoinBuild", - std::function( - [&](void*) { parallelBuildTriggered = true; })); - - // TODO: add driver context to test if the memory allocation is triggered in - // driver context or not. - - auto planNodeIdGenerator = std::make_shared(); - AssertQueryBuilder(duckDbQueryRunner_) - // Set very low table size threshold to trigger parallel build. - .config( - core::QueryConfig::kMinTableRowsForParallelJoinBuild, - std::to_string(0)) - // Set multiple hash build drivers to trigger parallel build. 
- .maxDrivers(4) - .queryCtx(joinQueryCtx) - .plan(PlanBuilder(planNodeIdGenerator) - .values(vectors, true) - .project({"c0 AS t0", "c1 AS t1", "c2 AS t2"}) - .hashJoin( - {"t0", "t1"}, - {"u1", "u0"}, - PlanBuilder(planNodeIdGenerator) - .values(vectors, true) - .project({"c0 AS u0", "c1 AS u1", "c2 AS u2"}) - .planNode(), - "", - {"t1"}, - core::JoinType::kInner) - .planNode()) - .assertResults( - "SELECT t.c1 FROM tmp as t, tmp AS u WHERE t.c0 == u.c1 AND t.c1 == u.c0"); - ASSERT_TRUE(parallelBuildTriggered); - waitForAllTasksToBeDeleted(); -} - -DEBUG_ONLY_TEST_F( - SharedArbitrationTest, - arbitrationTriggeredByEnsureJoinTableFit) { - setupMemory(kMemoryCapacity, 0); - const int numVectors = 2; - std::vector vectors; - fuzzerOpts_.vectorSize = 10'000; - for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); - } - createDuckDbTable(vectors); - - std::shared_ptr queryCtx = newQueryCtx(kMemoryCapacity); - std::shared_ptr fakeCtx = newQueryCtx(kMemoryCapacity); - auto fakePool = fakeCtx->pool()->addLeafChild( - "fakePool", true, FakeMemoryReclaimer::create()); - std::vector> injectAllocations; - std::atomic injectAllocationOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::HashBuild::ensureTableFits", - std::function([&](HashBuild* buildOp) { - // Inject the allocation once to ensure the merged table allocation will - // trigger memory arbitration. - if (!injectAllocationOnce.exchange(false)) { - return; - } - auto* buildPool = buildOp->pool(); - // Free up available reservation from the leaf build memory pool. - uint64_t injectAllocationSize = buildPool->availableReservation(); - injectAllocations.emplace_back(new TestAllocation{ - buildPool, - buildPool->allocate(injectAllocationSize), - injectAllocationSize}); - // Free up available memory from the system. 
- injectAllocationSize = arbitrator_->stats().freeCapacityBytes + - queryCtx->pool()->freeBytes(); - injectAllocations.emplace_back(new TestAllocation{ - fakePool.get(), - fakePool->allocate(injectAllocationSize), - injectAllocationSize}); - })); + .project({"c0 AS t0", "c1 AS t1", "c2 AS t2"}) + .hashJoin( + {"t0"}, + {"u0"}, + PlanBuilder(planNodeIdGenerator) + .values(vectors) + .project({"c0 AS u0", "c1 AS u1", "c2 AS u2"}) + .planNode(), + "", + {"t1"}, + core::JoinType::kAnti) + .capturePlanNodeId(joinNodeId) + .planNode()) + .assertResults( + "SELECT c1 FROM tmp WHERE c0 NOT IN (SELECT c0 FROM tmp)"); + auto taskStats = exec::toPlanStats(task->taskStats()); + auto& stats = taskStats.at(joinNodeId); + checkOperatorStatsForArbitration( + stats, !sameQuery /*expectGlobalArbitration*/); + }); - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::HashBuild::reclaim", - std::function([&](Operator* /*unused*/) { - ASSERT_EQ(injectAllocations.size(), 2); - for (auto& injectAllocation : injectAllocations) { - injectAllocation->free(); - } - })); + std::thread memThread([&]() { + auto task = + newQueryBuilder() + .queryCtx(fakeMemoryQueryCtx) + .serialExecution(isSerialExecutionMode_) + .plan(PlanBuilder() + .values(vectors) + .addNode([&](std::string id, core::PlanNodePtr input) { + return std::make_shared(id, input); + }) + .planNode()) + .assertResults("SELECT * FROM tmp"); + }); - auto planNodeIdGenerator = std::make_shared(); - const auto spillDirectory = exec::test::TempDirectoryPath::create(); - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .spillDirectory(spillDirectory->path) - .config(core::QueryConfig::kSpillEnabled, "true") - .config(core::QueryConfig::kJoinSpillEnabled, "true") - .config(core::QueryConfig::kJoinSpillPartitionBits, "2") - // Set multiple hash build drivers to trigger parallel build. 
- .maxDrivers(4) - .queryCtx(queryCtx) - .plan(PlanBuilder(planNodeIdGenerator) - .values(vectors, true) - .project({"c0 AS t0", "c1 AS t1", "c2 AS t2"}) - .hashJoin( - {"t0", "t1"}, - {"u1", "u0"}, - PlanBuilder(planNodeIdGenerator) - .values(vectors, true) - .project({"c0 AS u0", "c1 AS u1", "c2 AS u2"}) - .planNode(), - "", - {"t1"}, - core::JoinType::kInner) - .planNode()) - .assertResults( - "SELECT t.c1 FROM tmp as t, tmp AS u WHERE t.c0 == u.c1 AND t.c1 == u.c0"); - task.reset(); - waitForAllTasksToBeDeleted(); - ASSERT_EQ(injectAllocations.size(), 2); -} + joinThread.join(); + memThread.join(); + waitForAllTasksToBeDeleted(); -DEBUG_ONLY_TEST_F(SharedArbitrationTest, reclaimDuringJoinTableBuild) { - setupMemory(kMemoryCapacity, 0); - const int numVectors = 2; - std::vector vectors; - fuzzerOpts_.vectorSize = 10'000; - for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); + const auto newStats = arbitrator_->stats(); + ASSERT_GT(newStats.numReclaimedBytes, oldStats.numReclaimedBytes); + ASSERT_GT(newStats.reclaimTimeUs, oldStats.reclaimTimeUs); } - createDuckDbTable(vectors); - - std::shared_ptr queryCtx = newQueryCtx(kMemoryCapacity); - - std::atomic blockTableBuildOpOnce{true}; - std::atomic tableBuildBlocked{false}; - folly::EventCount tableBuildBlockWait; - std::atomic unblockTableBuild{false}; - folly::EventCount unblockTableBuildWait; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::HashTable::parallelJoinBuild", - std::function(([&](MemoryPool* pool) { - if (!blockTableBuildOpOnce.exchange(false)) { - return; - } - tableBuildBlocked = true; - tableBuildBlockWait.notifyAll(); - unblockTableBuildWait.await([&]() { return unblockTableBuild.load(); }); - void* buffer = pool->allocate(kMemoryCapacity / 4); - pool->free(buffer, kMemoryCapacity / 4); - }))); - - std::thread queryThread([&]() { - auto planNodeIdGenerator = std::make_shared(); - const auto spillDirectory = exec::test::TempDirectoryPath::create(); - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .spillDirectory(spillDirectory->path) - .config(core::QueryConfig::kSpillEnabled, "true") - .config(core::QueryConfig::kJoinSpillEnabled, "true") - .config(core::QueryConfig::kJoinSpillPartitionBits, "2") - // Set multiple hash build drivers to trigger parallel build. 
- .maxDrivers(4) - .queryCtx(queryCtx) - .plan(PlanBuilder(planNodeIdGenerator) - .values(vectors, true) - .project({"c0 AS t0", "c1 AS t1", "c2 AS t2"}) - .hashJoin( - {"t0", "t1"}, - {"u1", "u0"}, - PlanBuilder(planNodeIdGenerator) - .values(vectors, true) - .project({"c0 AS u0", "c1 AS u1", "c2 AS u2"}) - .planNode(), - "", - {"t1"}, - core::JoinType::kInner) - .planNode()) - .assertResults( - "SELECT t.c1 FROM tmp as t, tmp AS u WHERE t.c0 == u.c1 AND t.c1 == u.c0"); - }); - - tableBuildBlockWait.await([&]() { return tableBuildBlocked.load(); }); - - folly::EventCount taskPauseWait; - std::atomic taskPaused{false}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Task::requestPauseLocked", - std::function(([&](Task* /*unused*/) { - taskPaused = true; - taskPauseWait.notifyAll(); - }))); - - std::unique_ptr fakeAllocation; - std::thread memThread([&]() { - std::shared_ptr fakeCtx = newQueryCtx(kMemoryCapacity); - auto fakePool = fakeCtx->pool()->addLeafChild("fakePool"); - const auto fakeAllocationSize = arbitrator_->stats().freeCapacityBytes + - queryCtx->pool()->freeBytes() + 1; - VELOX_ASSERT_THROW( - fakePool->allocate(fakeAllocationSize), "Exceeded memory pool cap"); - }); - - taskPauseWait.await([&]() { return taskPaused.load(); }); - - unblockTableBuild = true; - unblockTableBuildWait.notifyAll(); - - memThread.join(); - queryThread.join(); - - waitForAllTasksToBeDeleted(); } -DEBUG_ONLY_TEST_F(SharedArbitrationTest, driverInitTriggeredArbitration) { +DEBUG_ONLY_TEST_P( + SharedArbitrationTestWithThreadingModes, + driverInitTriggeredArbitration) { const int numVectors = 2; std::vector vectors; const int vectorSize = 100; fuzzerOpts_.vectorSize = vectorSize; for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); + vectors.push_back(makeRowVector(rowType_, fuzzerOpts_)); } const int expectedResultVectorSize = numVectors * vectorSize; const auto expectedVector = makeRowVector( @@ -2262,12 +870,13 @@ DEBUG_ONLY_TEST_F(SharedArbitrationTest, driverInitTriggeredArbitration) { createDuckDbTable(vectors); setupMemory(kMemoryCapacity, 0); - std::shared_ptr queryCtx = newQueryCtx(kMemoryCapacity); + std::shared_ptr queryCtx = + newQueryCtx(memoryManager_.get(), executor_.get(), kMemoryCapacity); ASSERT_EQ(queryCtx->pool()->capacity(), 0); ASSERT_EQ(queryCtx->pool()->maxCapacity(), kMemoryCapacity); auto planNodeIdGenerator = std::make_shared(); - AssertQueryBuilder(duckDbQueryRunner_) + newQueryBuilder() .config(core::QueryConfig::kSpillEnabled, "false") .queryCtx(queryCtx) .plan(PlanBuilder(planNodeIdGenerator, pool()) @@ -2277,77 +886,26 @@ DEBUG_ONLY_TEST_F(SharedArbitrationTest, driverInitTriggeredArbitration) { .project({"1+1+4 as t0", "1+3+3 as t1"}) .planNode()) .assertResults(expectedVector); + waitForAllTasksToBeDeleted(); } -DEBUG_ONLY_TEST_F(SharedArbitrationTest, reclaimerStats) { - std::vector vectors; - const int vectorSize = 100; - fuzzerOpts_.vectorSize = vectorSize; - vectors.push_back(newVector()); - - createDuckDbTable(vectors); - setupMemory(32 * MB, 0); - std::shared_ptr queryCtx = newQueryCtx(32 * MB); - ASSERT_EQ(queryCtx->pool()->capacity(), 0); - ASSERT_EQ(queryCtx->pool()->maxCapacity(), 32 * MB); - auto additionalCtxLeafPool = queryCtx->pool()->addLeafChild("ctx_leaf"); - TestAllocation outerAlloc; - outerAlloc.buffer = additionalCtxLeafPool->allocate(8 * MB); - outerAlloc.pool = additionalCtxLeafPool.get(); - outerAlloc.size = 8 * MB; - fakeOperatorFactory_->setAllocationCallback([&](Operator* op) { - TestAllocation allocation0; - 
TestAllocation allocation1; - auto guard = folly::makeGuard([&]() { - allocation0.free(); - allocation1.free(); - }); - allocation0.buffer = op->pool()->allocate(16 * MB); - allocation0.pool = op->pool(); - allocation0.size = 16 * MB; - - allocation1.buffer = op->pool()->allocate(16 * MB); - allocation1.pool = op->pool(); - allocation1.size = 16 * MB; - - return TestAllocation{}; - }); - fakeOperatorFactory_->setReclaimCallback([&](MemoryPool* /*unused*/, - uint64_t /*unused*/, - MemoryReclaimer::Stats& stats) { - ++stats.numNonReclaimableAttempts; - outerAlloc.free(); - return true; - }); - auto planNodeIdGenerator = std::make_shared(); - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(queryCtx) - .plan(PlanBuilder(planNodeIdGenerator, pool()) - .values(vectors) - .addNode([&](std::string id, core::PlanNodePtr input) { - return std::make_shared(id, input); - }) - .planNode()) - .assertResults(vectors); - ASSERT_EQ(arbitrator_->stats().numNonReclaimableAttempts, 1); -} - -DEBUG_ONLY_TEST_F( - SharedArbitrationTest, +DEBUG_ONLY_TEST_P( + SharedArbitrationTestWithThreadingModes, DISABLED_raceBetweenTaskTerminateAndReclaim) { setupMemory(kMemoryCapacity, 0); const int numVectors = 10; std::vector vectors; for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); + vectors.push_back(makeRowVector(rowType_, fuzzerOpts_)); } createDuckDbTable(vectors); - std::shared_ptr queryCtx = newQueryCtx(kMemoryCapacity); + std::shared_ptr queryCtx = + newQueryCtx(memoryManager_.get(), executor_.get(), kMemoryCapacity); ASSERT_EQ(queryCtx->pool()->capacity(), 0); - // Allocate a large chunk of memory to trigger memory reclaim during the query - // execution. + // Allocate a large chunk of memory to trigger memory reclaim during the + // query execution. 
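TestAllocation, used throughout these tests and released via folly::makeGuard at the top of the removed reclaimerStats test, is simply a buffer paired with its owning pool so the test can free it at a chosen point. A stand-in sketch under that assumption; PoolLike reduces velox's memory::MemoryPool to the two calls the tests exercise, and neither name is real Velox API:

```
#include <cstddef>

// Reduced stand-in for velox's memory::MemoryPool: only the two calls the
// tests exercise are modeled here.
struct PoolLike {
  virtual void* allocate(size_t bytes) = 0;
  virtual void free(void* buffer, size_t bytes) = 0;
  virtual ~PoolLike() = default;
};

// Mirrors the shape of the TestAllocation helper these tests pass around:
// a buffer plus the pool that owns it, releasable from a scope guard or at
// an explicit point after the arbitration under test has happened.
struct TestAllocationSketch {
  PoolLike* pool{nullptr};
  void* buffer{nullptr};
  size_t size{0};

  void free() {
    if (pool != nullptr && buffer != nullptr) {
      pool->free(buffer, size);
      buffer = nullptr;
      pool = nullptr;
      size = 0;
    }
  }
};
```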
auto fakeLeafPool = queryCtx->pool()->addLeafChild("fakeLeaf"); const size_t fakeAllocationSize = kMemoryCapacity / 2; TestAllocation fakeAllocation{ @@ -2366,7 +924,7 @@ DEBUG_ONLY_TEST_F( return; } task = values->testingOperatorCtx()->task(); - MemoryPool* pool = values->pool(); + memory::MemoryPool* pool = values->pool(); VELOX_ASSERT_THROW( pool->allocate(kMemoryCapacity * 2 / 3), "Exceeded memory pool cap"); @@ -2380,7 +938,7 @@ DEBUG_ONLY_TEST_F( std::atomic taskAborted{false}; SCOPED_TESTVALUE_SET( "facebook::velox::exec::Operator::MemoryReclaimer::reclaim", - std::function(([&](MemoryPool* pool) { + std::function(([&](memory::MemoryPool* pool) { const std::string re(".*Aggregation"); if (!RE2::FullMatch(pool->name(), re)) { return; @@ -2395,12 +953,12 @@ DEBUG_ONLY_TEST_F( const auto spillDirectory = exec::test::TempDirectoryPath::create(); std::thread queryThread([&]() { VELOX_ASSERT_THROW( - AssertQueryBuilder(duckDbQueryRunner_) + newQueryBuilder() .queryCtx(queryCtx) - .spillDirectory(spillDirectory->path) + .spillDirectory(spillDirectory->getPath()) .config(core::QueryConfig::kSpillEnabled, "true") .config(core::QueryConfig::kJoinSpillEnabled, "true") - .config(core::QueryConfig::kJoinSpillPartitionBits, "2") + .config(core::QueryConfig::kSpillNumPartitionBits, "2") .maxDrivers(numDrivers) .plan(PlanBuilder() .values(vectors) @@ -2428,91 +986,28 @@ DEBUG_ONLY_TEST_F( waitForAllTasksToBeDeleted(); } -DEBUG_ONLY_TEST_F(SharedArbitrationTest, raceBetweenMaybeReserveAndTaskAbort) { - setupMemory(kMemoryCapacity, 0); - const int numVectors = 10; - std::vector vectors; - for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); - } - createDuckDbTable(vectors); - - auto queryCtx = newQueryCtx(kMemoryCapacity); - ASSERT_EQ(queryCtx->pool()->capacity(), 0); - - // Create a fake query to hold some memory to trigger memory arbitration. - auto fakeQueryCtx = newQueryCtx(kMemoryCapacity); - auto fakeLeafPool = fakeQueryCtx->pool()->addLeafChild( - "fakeLeaf", true, FakeMemoryReclaimer::create()); - TestAllocation fakeAllocation{ - fakeLeafPool.get(), - fakeLeafPool->allocate(kMemoryCapacity / 3), - kMemoryCapacity / 3}; - - std::unique_ptr injectAllocation; - std::atomic injectAllocationOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::common::memory::MemoryPoolImpl::maybeReserve", - std::function([&](memory::MemoryPool* pool) { - if (!injectAllocationOnce.exchange(false)) { - return; - } - // The injection memory allocation (with the given size) makes sure that - // maybeReserve fails and abort this query itself. 
- const size_t injectAllocationSize = - pool->freeBytes() + arbitrator_->stats().freeCapacityBytes; - injectAllocation.reset(new TestAllocation{ - fakeLeafPool.get(), - fakeLeafPool->allocate(injectAllocationSize), - injectAllocationSize}); - })); - - const int numDrivers = 1; - const auto spillDirectory = exec::test::TempDirectoryPath::create(); - std::thread queryThread([&]() { - VELOX_ASSERT_THROW( - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(queryCtx) - .spillDirectory(spillDirectory->path) - .config(core::QueryConfig::kSpillEnabled, "true") - .config(core::QueryConfig::kJoinSpillEnabled, "true") - .config(core::QueryConfig::kJoinSpillPartitionBits, "2") - .maxDrivers(numDrivers) - .plan(PlanBuilder() - .values(vectors) - .localPartition({"c0", "c1"}) - .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) - .localPartition(std::vector{}) - .planNode()) - .copyResults(pool()), - "Exceeded memory pool cap"); - }); - - queryThread.join(); - fakeAllocation.free(); - injectAllocation->free(); - waitForAllTasksToBeDeleted(); -} - -DEBUG_ONLY_TEST_F(SharedArbitrationTest, asyncArbitratonFromNonDriverContext) { +DEBUG_ONLY_TEST_P( + SharedArbitrationTestWithParallelExecutionModeOnly, + asyncArbitratonFromNonDriverContext) { setupMemory(kMemoryCapacity, 0); const int numVectors = 10; std::vector vectors; for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); + vectors.push_back(makeRowVector(rowType_, fuzzerOpts_)); } createDuckDbTable(vectors); - std::shared_ptr queryCtx = newQueryCtx(kMemoryCapacity); + std::shared_ptr queryCtx = + newQueryCtx(memoryManager_.get(), executor_.get(), kMemoryCapacity); ASSERT_EQ(queryCtx->pool()->capacity(), 0); folly::EventCount aggregationAllocationWait; std::atomic aggregationAllocationOnce{true}; folly::EventCount aggregationAllocationUnblockWait; std::atomic aggregationAllocationUnblocked{false}; - std::atomic injectPool{nullptr}; + std::atomic injectPool{nullptr}; SCOPED_TESTVALUE_SET( "facebook::velox::memory::MemoryPoolImpl::reserveThreadSafe", - std::function(([&](MemoryPool* pool) { + std::function(([&](memory::MemoryPool* pool) { const std::string re(".*Aggregation"); if (!RE2::FullMatch(pool->name(), re)) { return; @@ -2531,12 +1026,12 @@ DEBUG_ONLY_TEST_F(SharedArbitrationTest, asyncArbitratonFromNonDriverContext) { const auto spillDirectory = exec::test::TempDirectoryPath::create(); std::shared_ptr task; std::thread queryThread([&]() { - task = AssertQueryBuilder(duckDbQueryRunner_) + task = newQueryBuilder() .queryCtx(queryCtx) - .spillDirectory(spillDirectory->path) + .spillDirectory(spillDirectory->getPath()) .config(core::QueryConfig::kSpillEnabled, "true") .config(core::QueryConfig::kJoinSpillEnabled, "true") - .config(core::QueryConfig::kJoinSpillPartitionBits, "2") + .config(core::QueryConfig::kSpillNumPartitionBits, "2") .plan(PlanBuilder() .values(vectors) .localPartition({"c0", "c1"}) @@ -2551,8 +1046,8 @@ DEBUG_ONLY_TEST_F(SharedArbitrationTest, asyncArbitratonFromNonDriverContext) { [&]() { return !aggregationAllocationOnce.load(); }); ASSERT_TRUE(injectPool != nullptr); - // Trigger the memory arbitration with memory pool whose associated driver is - // running on driver thread. + // Trigger the memory arbitration with memory pool whose associated driver + // is running on driver thread. 
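Every SCOPED_TESTVALUE_SET in this file hangs a callback on a named injection point that is only compiled into debug builds, hence the DEBUG_ONLY prefixes on these tests. A simplified illustration of how such a scoped registry can work; this is not velox's actual TestValue implementation, and unlike the real one it does no synchronization:

```
#include <functional>
#include <map>
#include <string>

class TestValues {
 public:
  using Callback = std::function<void(void*)>;

  static void set(const std::string& point, Callback cb) {
    registry()[point] = std::move(cb);
  }

  static void clear(const std::string& point) {
    registry().erase(point);
  }

  // Called from production code at the named injection point.
  static void adjust(const std::string& point, void* arg) {
    auto it = registry().find(point);
    if (it != registry().end()) {
      it->second(arg);
    }
  }

 private:
  static std::map<std::string, Callback>& registry() {
    static std::map<std::string, Callback> map;
    return map;
  }
};

// RAII wrapper mirroring the SCOPED_ prefix: uninstalls the callback when
// the test scope exits, so one test cannot leak hooks into the next.
class ScopedTestValue {
 public:
  ScopedTestValue(std::string point, TestValues::Callback cb)
      : point_(std::move(point)) {
    TestValues::set(point_, std::move(cb));
  }

  ~ScopedTestValue() {
    TestValues::clear(point_);
  }

 private:
  const std::string point_;
};
```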
const size_t fakeAllocationSize = arbitrator_->stats().freeCapacityBytes / 2; TestAllocation fakeAllocation = { injectPool.load(), @@ -2569,152 +1064,105 @@ DEBUG_ONLY_TEST_F(SharedArbitrationTest, asyncArbitratonFromNonDriverContext) { waitForAllTasksToBeDeleted(); } -DEBUG_ONLY_TEST_F( - SharedArbitrationTest, - allocationMemoryFromNonSpillMemoryPoolUnderArbitration) { - setupMemory(kMemoryCapacity, 0); - const int numVectors = 10; - std::vector vectors; - for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); - } - createDuckDbTable(vectors); - std::shared_ptr queryCtx = newQueryCtx(kMemoryCapacity); - ASSERT_EQ(queryCtx->pool()->capacity(), 0); - - std::atomic aggregationMaybeReserveInjectionOnce{true}; - std::atomic injectPool{nullptr}; - SCOPED_TESTVALUE_SET( - "facebook::velox::common::memory::MemoryPoolImpl::maybeReserve", - std::function(([&](MemoryPool* pool) { - if (!aggregationMaybeReserveInjectionOnce.exchange(false)) { - return; - } - if (pool->currentBytes() == 0) { - return; - } - injectPool = pool; - VELOX_ASSERT_THROW( - pool->allocate(kMemoryCapacity - pool->reservedBytes() / 2), - "Exceeded memory pool cap"); - }))); - - std::atomic nonSpillMemoryPoolChecked{false}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Task::requestPauseLocked", - std::function(([&](Task* /*unused*/) { - VELOX_ASSERT_THROW( - injectPool.load()->allocate(20L << 20), - "Unexpected non-spilling memory reservation"); - nonSpillMemoryPoolChecked = true; - }))); - - const auto spillDirectory = exec::test::TempDirectoryPath::create(); - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(queryCtx) - .spillDirectory(spillDirectory->path) - .config(core::QueryConfig::kSpillEnabled, "true") - .config(core::QueryConfig::kJoinSpillEnabled, "true") - .config(core::QueryConfig::kJoinSpillPartitionBits, "2") - .plan(PlanBuilder() - .values(vectors) - .localPartition({"c0", "c1"}) - .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) - .localPartition(std::vector{}) - .planNode()) - .assertResults("SELECT c0, c1, array_agg(c2) FROM tmp GROUP BY c0, c1"); - - waitForAllTasksToBeDeleted(); - - ASSERT_TRUE(injectPool != nullptr); - ASSERT_TRUE(nonSpillMemoryPoolChecked); -} - -DEBUG_ONLY_TEST_F(SharedArbitrationTest, arbitrationFromTableWriter) { - setupMemory(kMemoryCapacity, 0); - - VectorFuzzer::Options options; - const int batchSize = 1000; - options.vectorSize = batchSize; - VectorFuzzer fuzzer(options, pool()); - const int numBatches = 10; +DEBUG_ONLY_TEST_P(SharedArbitrationTestWithThreadingModes, runtimeStats) { + const uint64_t memoryCapacity = 128 * MB; + setupMemory(memoryCapacity); + fuzzerOpts_.vectorSize = 1000; + fuzzerOpts_.stringLength = 1024; + fuzzerOpts_.stringVariableLength = false; + VectorFuzzer fuzzer(fuzzerOpts_, pool()); std::vector vectors; int numRows{0}; - for (int i = 0; i < numBatches; ++i) { - numRows += batchSize; - vectors.push_back(fuzzer.fuzzRow(rowType_)); + for (int i = 0; i < 10; ++i) { + vectors.push_back(fuzzer.fuzzInputRow(rowType_)); + numRows += vectors.back()->size(); } - createDuckDbTable(vectors); - - std::shared_ptr queryCtx = newQueryCtx(kMemoryCapacity); - ASSERT_EQ(queryCtx->pool()->capacity(), 0); - - std::atomic injectArbitrationOnce{true}; + std::atomic outputCount{0}; SCOPED_TESTVALUE_SET( - "facebook::velox::memory::MemoryPoolImpl::reserveThreadSafe", - std::function([&](memory::MemoryPool* pool) { - const std::string dictPoolRe(".*dictionary"); - const std::string generalPoolRe(".*general"); - const std::string 
compressionPoolRe(".*compression"); - if (!RE2::FullMatch(pool->name(), dictPoolRe) && - !RE2::FullMatch(pool->name(), generalPoolRe) && - !RE2::FullMatch(pool->name(), compressionPoolRe)) { - return; - } - if (pool->currentBytes() == 0) { - return; - } - if (!injectArbitrationOnce.exchange(false)) { - return; - } - const auto fakeAllocationSize = - arbitrator_->stats().maxCapacityBytes - pool->currentBytes(); - VELOX_ASSERT_THROW( - pool->allocate(fakeAllocationSize), "Exceeded memory pool"); - })); + "facebook::velox::exec::Values::getOutput", + std::function( + ([&](const facebook::velox::exec::Values* values) { + if (outputCount++ != 5) { + return; + } + const auto fakeAllocationSize = + arbitrator_->stats().maxCapacityBytes - + values->pool()->capacity() + 1; + void* buffer = values->pool()->allocate(fakeAllocationSize); + values->pool()->free(buffer, fakeAllocationSize); + }))); - auto outputDirectory = TempDirectoryPath::create(); + const auto spillDirectory = exec::test::TempDirectoryPath::create(); + const auto outputDirectory = TempDirectoryPath::create(); + const auto queryCtx = + newQueryCtx(memoryManager_.get(), executor_.get(), memoryCapacity); auto writerPlan = PlanBuilder() .values(vectors) - .tableWrite(outputDirectory->path) - .project({TableWriteTraits::rowCountColumnName()}) + .tableWrite(outputDirectory->getPath()) .singleAggregation( {}, {fmt::format("sum({})", TableWriteTraits::rowCountColumnName())}) .planNode(); - - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(queryCtx) - .spillDirectory(outputDirectory->path) - .config(core::QueryConfig::kSpillEnabled, "true") - .config(core::QueryConfig::kJoinSpillEnabled, "true") - .config(core::QueryConfig::kJoinSpillPartitionBits, "2") - .plan(std::move(writerPlan)) - .assertResults(fmt::format("SELECT {}", numRows)); - - ASSERT_EQ(arbitrator_->stats().numFailures, 1); + { + const std::shared_ptr task = + newQueryBuilder() + .queryCtx(queryCtx) + .maxDrivers(1) + .spillDirectory(spillDirectory->getPath()) + .config(core::QueryConfig::kSpillEnabled, "true") + .config(core::QueryConfig::kWriterSpillEnabled, "true") + // Set 0 file writer flush threshold to always trigger flush in + // test. + .config(core::QueryConfig::kWriterFlushThresholdBytes, "0") + // Set stripe size to extreme large to avoid writer internal + // triggered flush. + .connectorSessionProperty( + kHiveConnectorId, + connector::hive::HiveConfig::kOrcWriterMaxStripeSizeSession, + "1GB") + .connectorSessionProperty( + kHiveConnectorId, + connector::hive::HiveConfig:: + kOrcWriterMaxDictionaryMemorySession, + "1GB") + .plan(std::move(writerPlan)) + .assertResults(fmt::format("SELECT {}", numRows)); + + auto stats = task->taskStats().pipelineStats.front().operatorStats; + // TableWrite Operator's stripeSize runtime stats would be updated twice: + // - Values Operator's memory allocation triggers TableWrite's memory + // reclaim, which triggers data flush. + // - TableWrite Operator's close would trigger flush. + ASSERT_EQ(stats[1].runtimeStats["stripeSize"].count, 2); + // Values Operator won't be set stripeSize in its runtimeStats. 
+ ASSERT_EQ(stats[0].runtimeStats["stripeSize"].count, 0); + } + waitForAllTasksToBeDeleted(); } -DEBUG_ONLY_TEST_F(SharedArbitrationTest, arbitrateMemoryFromOtherOperator) { +DEBUG_ONLY_TEST_P( + SharedArbitrationTestWithParallelExecutionModeOnly, + arbitrateMemoryFromOtherOperator) { setupMemory(kMemoryCapacity, 0); const int numVectors = 10; std::vector vectors; for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); + vectors.push_back(makeRowVector(rowType_, fuzzerOpts_)); } createDuckDbTable(vectors); for (bool sameDriver : {false, true}) { SCOPED_TRACE(fmt::format("sameDriver {}", sameDriver)); - std::shared_ptr queryCtx = newQueryCtx(kMemoryCapacity); + std::shared_ptr queryCtx = + newQueryCtx(memoryManager_.get(), executor_.get(), kMemoryCapacity); ASSERT_EQ(queryCtx->pool()->capacity(), 0); std::atomic injectAllocationOnce{true}; const int initialBufferLen = 1 << 20; std::atomic buffer{nullptr}; - std::atomic bufferPool{nullptr}; + std::atomic bufferPool{nullptr}; SCOPED_TESTVALUE_SET( "facebook::velox::exec::Values::getOutput", std::function( @@ -2735,7 +1183,7 @@ DEBUG_ONLY_TEST_F(SharedArbitrationTest, arbitrateMemoryFromOtherOperator) { if (!RE2::FullMatch(pool->name(), re)) { return; } - if (pool->root()->currentBytes() == 0) { + if (pool->root()->usedBytes() == 0) { return; } if (!injectReallocateOnce.exchange(false)) { @@ -2752,24 +1200,27 @@ DEBUG_ONLY_TEST_F(SharedArbitrationTest, arbitrateMemoryFromOtherOperator) { }))); std::shared_ptr task; + core::PlanNodeId aggregationNodeId; std::thread queryThread([&]() { if (sameDriver) { - task = AssertQueryBuilder(duckDbQueryRunner_) + task = newQueryBuilder() .queryCtx(queryCtx) .plan(PlanBuilder() .values(vectors) .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) + .capturePlanNodeId(aggregationNodeId) .localPartition(std::vector{}) .planNode()) .assertResults( "SELECT c0, c1, array_agg(c2) FROM tmp GROUP BY c0, c1"); } else { - task = AssertQueryBuilder(duckDbQueryRunner_) + task = newQueryBuilder() .queryCtx(queryCtx) .plan(PlanBuilder() .values(vectors) .localPartition({"c0", "c1"}) .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) + .capturePlanNodeId(aggregationNodeId) .planNode()) .assertResults( "SELECT c0, c1, array_agg(c2) FROM tmp GROUP BY c0, c1"); @@ -2777,6 +1228,10 @@ DEBUG_ONLY_TEST_F(SharedArbitrationTest, arbitrateMemoryFromOtherOperator) { }); queryThread.join(); + auto taskStats = exec::toPlanStats(task->taskStats()); + auto& aggNodeStats = taskStats.at(aggregationNodeId); + checkOperatorStatsForArbitration( + aggNodeStats, false /*expectGlobalArbitration*/); ASSERT_TRUE(buffer != nullptr); ASSERT_TRUE(bufferPool != nullptr); bufferPool.load()->free(buffer, initialBufferLen); @@ -2786,141 +1241,212 @@ DEBUG_ONLY_TEST_F(SharedArbitrationTest, arbitrateMemoryFromOtherOperator) { } } -TEST_F(SharedArbitrationTest, concurrentArbitration) { +TEST_P( + SharedArbitrationTestWithParallelExecutionModeOnly, + concurrentArbitration) { + // Tries to replicate an actual workload by concurrently running multiple + // query shapes that support spilling (and hence can be forced to abort or + // spill by the arbitrator). Also adds an element of randomness by randomly + // keeping completed tasks alive (zombie tasks) hence holding on to some + // memory. Ensures that arbitration is engaged under memory contention and + // failed queries only have errors related to memory or arbitration. 
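  // A hedged sketch of the error filtering this test applies (it mirrors the
  // catch block further down; runQuery is a hypothetical stand-in for one of
  // the run*Task helpers used below):
  //
  //   try {
  //     runQuery();
  //   } catch (const VeloxException& e) {
  //     if (e.errorCode() != error_code::kMemCapExceeded.c_str() &&
  //         e.errorCode() != error_code::kMemAborted.c_str() &&
  //         e.errorCode() != error_code::kMemAllocError.c_str() &&
  //         e.message() != "Aborted for external error") {
  //       throw; // Anything unrelated to memory or arbitration fails the test.
  //     }
  //   }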
FLAGS_velox_suppress_memory_capacity_exceeding_error_message = true; const int numVectors = 8; std::vector vectors; fuzzerOpts_.vectorSize = 32; fuzzerOpts_.stringVariableLength = false; fuzzerOpts_.stringLength = 32; + vectors.reserve(numVectors); for (int i = 0; i < numVectors; ++i) { - vectors.push_back(newVector()); + vectors.push_back(makeRowVector(rowType_, fuzzerOpts_)); } const int numDrivers = 4; - createDuckDbTable(vectors); - - const auto queryPlan = - PlanBuilder() - .values(vectors, true) - .addNode([&](std::string id, core::PlanNodePtr input) { - return std::make_shared(id, input); - }) - .planNode(); - const std::string referenceSQL = "SELECT * FROM tmp"; - - std::atomic stopped{false}; - - std::mutex mutex; - std::vector> queries; - std::deque> zombieTasks; - - fakeOperatorFactory_->setAllocationCallback([&](Operator* op) { - if (folly::Random::oneIn(4)) { - auto task = op->testingOperatorCtx()->driverCtx()->task; - if (folly::Random::oneIn(3)) { - task->requestAbort(); - } else { - task->requestYield(); - } + const auto expectedWriteResult = runWriteTask( + vectors, + nullptr, + isSerialExecutionMode_, + numDrivers, + pool(), + kHiveConnectorId, + false) + .data; + const auto expectedJoinResult = + runHashJoinTask( + vectors, nullptr, isSerialExecutionMode_, numDrivers, pool(), false) + .data; + const auto expectedOrderResult = + runOrderByTask( + vectors, nullptr, isSerialExecutionMode_, numDrivers, pool(), false) + .data; + const auto expectedRowNumberResult = + runRowNumberTask( + vectors, nullptr, isSerialExecutionMode_, numDrivers, pool(), false) + .data; + const auto expectedTopNResult = + runTopNTask( + vectors, nullptr, isSerialExecutionMode_, numDrivers, pool(), false) + .data; + + struct { + uint64_t totalCapacity; + uint64_t queryCapacity; + + std::string debugString() const { + return fmt::format( + "totalCapacity = {}, queryCapacity = {}.", + succinctBytes(totalCapacity), + succinctBytes(queryCapacity)); } - const size_t allocationSize = std::max( - kMemoryCapacity / 16, folly::Random::rand32() % kMemoryCapacity); - auto buffer = op->pool()->allocate(allocationSize); - return TestAllocation{op->pool(), buffer, allocationSize}; - }); - fakeOperatorFactory_->setMaxDrivers(numDrivers); - const std::string injectReclaimErrorMessage("Inject reclaim failure"); - fakeOperatorFactory_->setReclaimCallback( - [&](MemoryPool* /*unused*/, - uint64_t /*unused*/, - MemoryReclaimer::Stats& /*unused*/) { - if (folly::Random::oneIn(10)) { - VELOX_FAIL(injectReclaimErrorMessage); - } - return false; - }); - - const int numThreads = 30; - const int maxNumZombieTasks = 128; - std::vector queryThreads; - for (int i = 0; i < numThreads; ++i) { - queryThreads.emplace_back([&, i]() { - DuckDbQueryRunner duckDbQueryRunner; - folly::Random::DefaultGenerator rng; - rng.seed(i); - while (!stopped) { - std::shared_ptr query; - { - std::lock_guard l(mutex); - if (queries.empty()) { - queries.emplace_back(newQueryCtx()); - } - const int index = folly::Random::rand32() % queries.size(); - query = queries[index]; - } + } testSettings[] = { + {16 * MB, 128 * MB}, {128 * MB, 16 * MB}, {128 * MB, 128 * MB}}; + + for (const auto& testData : testSettings) { + SCOPED_TRACE(testData.debugString()); + const auto totalCapacity = testData.totalCapacity; + const auto queryCapacity = testData.queryCapacity; + setupMemory(totalCapacity); + + std::mutex mutex; + std::vector> queries; + std::deque> zombieTasks; + + const int numThreads = 32; + const int maxNumZombieTasks = 8; + std::vector queryThreads; + 
queryThreads.reserve(numThreads); + TestScopedAbortInjection testScopedAbortInjection(10, numThreads); + for (int i = 0; i < numThreads; ++i) { + queryThreads.emplace_back([&, i]() { std::shared_ptr task; try { - task = AssertQueryBuilder(duckDbQueryRunner) - .queryCtx(query) - .plan(PlanBuilder() - .values(vectors) - .addNode([&](std::string id, - core::PlanNodePtr input) { - return std::make_shared( - id, input); - }) - .planNode()) - .assertResults("SELECT * FROM tmp"); + auto queryCtx = + newQueryCtx(memoryManager_.get(), executor_.get(), queryCapacity); + if (i == 0) { + // Write task contains aggregate node, which does not support + // multithread aggregation type resolver, so make sure it is built + // in a single thread. + task = runWriteTask( + vectors, + queryCtx, + isSerialExecutionMode_, + numDrivers, + pool(), + kHiveConnectorId, + true, + expectedWriteResult) + .task; + } else if ((i % 4) == 0) { + task = runHashJoinTask( + vectors, + queryCtx, + isSerialExecutionMode_, + numDrivers, + pool(), + true, + expectedJoinResult) + .task; + } else if ((i % 4) == 1) { + task = runOrderByTask( + vectors, + queryCtx, + isSerialExecutionMode_, + numDrivers, + pool(), + true, + expectedOrderResult) + .task; + } else if ((i % 4) == 2) { + task = runRowNumberTask( + vectors, + queryCtx, + isSerialExecutionMode_, + numDrivers, + pool(), + true, + expectedRowNumberResult) + .task; + } else { + task = runTopNTask( + vectors, + queryCtx, + isSerialExecutionMode_, + numDrivers, + pool(), + true, + expectedTopNResult) + .task; + } } catch (const VeloxException& e) { - continue; + if (e.errorCode() != error_code::kMemCapExceeded.c_str() && + e.errorCode() != error_code::kMemAborted.c_str() && + e.errorCode() != error_code::kMemAllocError.c_str() && + (e.message() != "Aborted for external error")) { + std::rethrow_exception(std::current_exception()); + } } + std::lock_guard l(mutex); - zombieTasks.emplace_back(std::move(task)); + if (folly::Random().oneIn(3)) { + zombieTasks.emplace_back(std::move(task)); + } while (zombieTasks.size() > maxNumZombieTasks) { zombieTasks.pop_front(); } - } - }); - } + }); + } - const int maxNumQueries = 64; - std::thread controlThread([&]() { - folly::Random::DefaultGenerator rng; - rng.seed(1000); - while (!stopped) { - std::shared_ptr queryToDelete; - { - std::lock_guard l(mutex); - if (queries.empty() || - ((queries.size() < maxNumQueries) && - folly::Random::oneIn(4, rng))) { - queries.emplace_back(newQueryCtx()); - } else { - const int deleteIndex = folly::Random::rand32(rng) % queries.size(); - queryToDelete = queries[deleteIndex]; - queries.erase(queries.begin() + deleteIndex); - } - } - std::this_thread::sleep_for(std::chrono::microseconds(5)); + for (auto& queryThread : queryThreads) { + queryThread.join(); } - }); + zombieTasks.clear(); + waitForAllTasksToBeDeleted(); + ASSERT_GT(arbitrator_->stats().numRequests, 0); + } +} - std::this_thread::sleep_for(std::chrono::seconds(5)); - stopped = true; +TEST_P(SharedArbitrationTestWithThreadingModes, reserveReleaseCounters) { + for (int i = 0; i < 37; ++i) { + folly::Random::DefaultGenerator rng(i); + auto numRootPools = folly::Random::rand32(rng) % 11 + 3; + std::vector threads; + threads.reserve(numRootPools); + std::mutex mutex; + setupMemory(kMemoryCapacity, 0); + { + std::vector> queries; + queries.reserve(numRootPools); + for (int j = 0; j < numRootPools; ++j) { + threads.emplace_back([&]() { + { + std::lock_guard l(mutex); + queries.emplace_back( + newQueryCtx(memoryManager_.get(), executor_.get())); + } 
+ }); + } - for (auto& queryThread : queryThreads) { - queryThread.join(); + for (auto& queryThread : threads) { + queryThread.join(); + } + ASSERT_EQ(arbitrator_->stats().numShrinks, 0); + } + ASSERT_EQ(arbitrator_->stats().numShrinks, numRootPools); } - controlThread.join(); } -// TODO: add more tests. +VELOX_INSTANTIATE_TEST_SUITE_P( + SharedArbitrationTest, + SharedArbitrationTestWithParallelExecutionModeOnly, + testing::ValuesIn(std::vector{{false}})); +VELOX_INSTANTIATE_TEST_SUITE_P( + SharedArbitrationTest, + SharedArbitrationTestWithThreadingModes, + testing::ValuesIn(std::vector{{false}, {true}})); } // namespace facebook::velox::memory int main(int argc, char** argv) { - folly::SingletonVault::singleton()->registrationComplete(); testing::InitGoogleTest(&argc, argv); - + folly::Init init{&argc, &argv, false}; return RUN_ALL_TESTS(); } diff --git a/velox/common/memory/tests/StreamArenaTest.cpp b/velox/common/memory/tests/StreamArenaTest.cpp index 7d4ee844b8623..3cafe734a320f 100644 --- a/velox/common/memory/tests/StreamArenaTest.cpp +++ b/velox/common/memory/tests/StreamArenaTest.cpp @@ -28,29 +28,24 @@ class StreamArenaTest : public testing::Test { protected: void SetUp() override { constexpr uint64_t kMaxMappedMemory = 64 << 20; - MmapAllocator::Options options; - options.capacity = kMaxMappedMemory; - mmapAllocator_ = std::make_shared(options); - MemoryAllocator::setDefaultInstance(mmapAllocator_.get()); - memoryManager_ = std::make_unique(MemoryManagerOptions{ - .capacity = kMaxMappedMemory, - .allocator = MemoryAllocator::getInstance()}); + MemoryManagerOptions options; + options.allocatorCapacity = kMaxMappedMemory; + options.useMmapAllocator = true; + memoryManager_ = std::make_unique(options); + mmapAllocator_ = static_cast(memoryManager_->allocator()); pool_ = memoryManager_->addLeafPool("ByteStreamTest"); rng_.seed(124); } - void TearDown() override { - MmapAllocator::testingDestroyInstance(); - MemoryAllocator::setDefaultInstance(nullptr); - } + void TearDown() override {} std::unique_ptr newArena() { return std::make_unique(pool_.get()); } folly::Random::DefaultGenerator rng_; - std::shared_ptr mmapAllocator_; std::unique_ptr memoryManager_; + MmapAllocator* mmapAllocator_; std::shared_ptr pool_; }; @@ -107,7 +102,7 @@ TEST_F(StreamArenaTest, newRange) { auto arena = newArena(); ByteRange range; for (int i = 0; i < testData.requestRangeSizes.size(); ++i) { - arena->newRange(testData.requestRangeSizes[i], &range); + arena->newRange(testData.requestRangeSizes[i], nullptr, &range); ASSERT_EQ(range.size, testData.expectedRangeSizes[i]) << range.toString(); ASSERT_EQ(range.position, 0); ASSERT_TRUE(range.buffer != nullptr); @@ -135,18 +130,18 @@ TEST_F(StreamArenaTest, randomRange) { if (folly::Random::oneIn(4)) { const int requestSize = 1 + folly::Random::rand32() % (2 * AllocationTraits::kPageSize); - arena->newTinyRange(requestSize, &range); + arena->newTinyRange(requestSize, nullptr, &range); ASSERT_EQ(range.size, requestSize); } else if (folly::Random::oneIn(3)) { const int requestSize = AllocationTraits::pageBytes(pool_->largestSizeClass()) + (folly::Random::rand32() % (4 << 20)); - arena->newRange(requestSize, &range); + arena->newRange(requestSize, nullptr, &range); ASSERT_EQ(AllocationTraits::roundUpPageBytes(requestSize), range.size); } else { const int requestSize = 1 + folly::Random::rand32() % pool_->largestSizeClass(); - arena->newRange(requestSize, &range); + arena->newRange(requestSize, nullptr, &range); ASSERT_LE(range.size, 
AllocationTraits::roundUpPageBytes(requestSize));
    }
    ASSERT_EQ(range.position, 0);
@@ -158,8 +153,9 @@ TEST_F(StreamArenaTest, error) {
   auto arena = newArena();
   ByteRange range;
   VELOX_ASSERT_THROW(
-      arena->newTinyRange(0, &range),
+      arena->newTinyRange(0, nullptr, &range),
       "StreamArena::newTinyRange can't be zero length");
   VELOX_ASSERT_THROW(
-      arena->newRange(0, &range), "StreamArena::newRange can't be zero length");
+      arena->newRange(0, nullptr, &range),
+      "StreamArena::newRange can't be zero length");
 }
diff --git a/velox/common/process/CMakeLists.txt b/velox/common/process/CMakeLists.txt
index 22182ed58f120..0cebd335cb982 100644
--- a/velox/common/process/CMakeLists.txt
+++ b/velox/common/process/CMakeLists.txt
@@ -12,12 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-add_library(velox_process ProcessBase.cpp StackTrace.cpp ThreadDebugInfo.cpp
-            TraceContext.cpp)
+velox_add_library(
+  velox_process
+  ProcessBase.cpp
+  Profiler.cpp
+  StackTrace.cpp
+  ThreadDebugInfo.cpp
+  TraceContext.cpp
+  TraceHistory.cpp)

-target_link_libraries(
+velox_link_libraries(
   velox_process
-  PUBLIC velox_flag_definitions Folly::folly
+  PUBLIC velox_file velox_flag_definitions Folly::folly
   PRIVATE fmt::fmt gflags::gflags glog::glog)

 if(${VELOX_BUILD_TESTING})
diff --git a/velox/common/process/ProcessBase.h b/velox/common/process/ProcessBase.h
index 990e7a86855e3..34edd6d14676f 100644
--- a/velox/common/process/ProcessBase.h
+++ b/velox/common/process/ProcessBase.h
@@ -18,51 +18,36 @@
 #include
 #include
+#include
 #include
 #include

-namespace facebook {
-namespace velox {
-namespace process {
+namespace facebook::velox::process {

-/**
- * Current executable's name.
- */
+/// Current executable's name.
 std::string getAppName();

-/**
- * This machine'a name.
- */
+/// This machine's name.
 std::string getHostName();

-/**
- * Process identifier.
- */
+/// Process identifier.
 pid_t getProcessId();

-/**
- * Current thread's identifier.
- */
+/// Current thread's identifier.
 pthread_t getThreadId();

-/**
- * Get current working directory.
- */
+/// Get current working directory.
 std::string getCurrentDirectory();

-/**
- * Returns elapsed CPU nanoseconds on the calling thread
- */
+/// Returns elapsed CPU nanoseconds on the calling thread.
 uint64_t threadCpuNanos();

-// True if the machine has Intel AVX2 instructions and these are not disabled by
-// flag.
+/// True if the machine has Intel AVX2 instructions and these are not disabled
+/// by flag.
 bool hasAvx2();

-// True if the machine has Intel BMI2 instructions and these are not disabled by
-// flag.
+/// True if the machine has Intel BMI2 instructions and these are not disabled
+/// by flag.
 bool hasBmi2();

-} // namespace process
-} // namespace velox
-} // namespace facebook
+} // namespace facebook::velox::process
diff --git a/velox/common/process/Profiler.cpp b/velox/common/process/Profiler.cpp
new file mode 100644
index 0000000000000..82a5e83652693
--- /dev/null
+++ b/velox/common/process/Profiler.cpp
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/common/process/Profiler.h"
+#include "velox/common/file/File.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+DEFINE_string(profiler_tmp_dir, "/tmp", "Writable temp for perf.data");
+
+DEFINE_int32(
+    profiler_check_interval_seconds,
+    60,
+    "Frequency of checking CPU load and turning profiling on/off");
+
+DEFINE_int32(
+    profiler_min_cpu_pct,
+    200,
+    "Minimum CPU percent to justify profile. 100 is one core busy");
+
+DEFINE_int32(
+    profiler_min_sample_seconds,
+    60,
+    "Minimum amount of time at above minimum load to justify producing a result file");
+
+DEFINE_int32(
+    profiler_max_sample_seconds,
+    300,
+    "Number of seconds before switching to new file");
+
+DEFINE_string(profiler_perf_flags, "", "Extra flags for Linux perf");
+
+namespace facebook::velox::process {
+
+tsan_atomic<bool> Profiler::profileStarted_;
+std::thread Profiler::profileThread_;
+std::mutex Profiler::profileMutex_;
+std::shared_ptr<velox::filesystems::FileSystem> Profiler::fileSystem_;
+tsan_atomic<bool> Profiler::isSleeping_;
+std::string Profiler::resultPath_;
+tsan_atomic<bool> Profiler::shouldStop_;
+folly::Promise<bool> Profiler::sleepPromise_;
+tsan_atomic<bool> Profiler::shouldSaveResult_;
+tsan_atomic<int64_t> Profiler::sampleStartTime_;
+int64_t Profiler::cpuAtSampleStart_;
+int64_t Profiler::cpuAtLastCheck_;
+std::function<void()> Profiler::startExtra_;
+std::function<std::string()> Profiler::extraReport_;
+
+namespace {
+std::string hostname;
+
+// Check that paths do not have shell escapes.
+void checkSafe(const std::string& str) {
+  if (strchr(str.c_str(), '`') != nullptr ||
+      strchr(str.c_str(), '$') != nullptr) {
+    LOG(ERROR) << "Unsafe path " << str << ". Exiting.";
+    ::exit(1);
+  }
+}
+
+void testWritable(const std::string& dir) {
+  auto testPath = fmt::format("{}/test", dir);
+  int32_t fd =
+      open(testPath.c_str(), O_RDWR | O_CREAT, S_IRWXU | S_IRWXG | S_IRWXO);
+  if (fd < 0) {
+    LOG(ERROR) << "Can't open " << testPath << " for write errno=" << errno;
+    return;
+  }
+  if (4 != write(fd, "test", 4)) {
+    LOG(ERROR) << "Can't write to " << testPath << " errno=" << errno;
+  }
+  close(fd);
+}
+
+// Returns user+system cpu seconds from getrusage().
+int64_t cpuSeconds() {
+  struct rusage ru;
+  getrusage(RUSAGE_SELF, &ru);
+  return ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
+}
+
+int64_t nowSeconds() {
+  struct timeval tv;
+  struct timezone tz;
+  gettimeofday(&tv, &tz);
+  return tv.tv_sec;
+}
+
+std::string timeString(time_t seconds) {
+  struct tm tm;
+  localtime_r(&seconds, &tm);
+  char temp[100];
+  strftime(temp, sizeof(temp), "%Y-%m-%d_%H:%M:%S", &tm);
+  return std::string(temp);
+}
+} // namespace
+
+void Profiler::copyToResult(const std::string* data) {
+  char* buffer;
+  int32_t resultSize;
+  std::string temp;
+  if (data) {
+    buffer = const_cast<char*>(data->data());
+    resultSize = std::min<int32_t>(data->size(), 400000);
+  } else {
+    testWritable(FLAGS_profiler_tmp_dir);
+    auto reportFile = fmt::format("{}/perf", FLAGS_profiler_tmp_dir);
+    int32_t fd = open(reportFile.c_str(), O_RDONLY);
+    if (fd < 0) {
+      LOG(ERROR) << "PROFILE: Could not open report file at " << reportFile;
+      return;
+    }
+    auto bufferSize = 400000;
+    temp.resize(400000);
+    buffer = temp.data();
+    resultSize = ::read(fd, buffer, bufferSize);
+    close(fd);
+  }
+
+  std::string dt = timeString(nowSeconds());
+  auto target =
+      fmt::format("{}/prof-{}-{}-{}", resultPath_, hostname, dt, getpid());
+  try {
+    try {
+      fileSystem_->remove(target);
+    } catch (const std::exception&) {
+      // ignore
+    }
+    auto out = fileSystem_->openFileForWrite(target);
+    auto now = nowSeconds();
+    auto elapsed = (now - sampleStartTime_);
+    auto cpu = cpuSeconds();
+    out->append(fmt::format(
+        "Profile from {} to {} at {}% CPU\n\n",
+        timeString(sampleStartTime_),
+        timeString(now),
+        100 * (cpu - cpuAtSampleStart_) / std::max<int64_t>(1, elapsed)));
+    out->append(std::string_view(buffer, resultSize));
+    if (extraReport_) {
+      std::string extra = extraReport_();
+      out->append(std::string_view(extra.data(), extra.size()));
+    }
+    out->flush();
+    LOG(INFO) << "PROFILE: Produced result " << target << " " << resultSize
+              << " bytes";
+  } catch (const std::exception& e) {
+    LOG(ERROR) << "PROFILE: Error opening/writing " << target << ":"
+               << e.what();
+  }
+}
+
+void Profiler::makeProfileDir(std::string path) {
+  try {
+    fileSystem_->mkdir(path);
+  } catch (const std::exception& e) {
+    LOG(ERROR) << "PROFILE: Failed to create directory " << path << ":"
+               << e.what();
+  }
+}
+
+std::thread Profiler::startSample() {
+  if (startExtra_) {
+    startExtra_();
+  }
+  std::thread thread([&]() {
+    // We run perf under a shell because running it with fork + exec
+    // and killing it with SIGINT produces a corrupt perf.data
+    // file. The perf.data file generated when called via system() is
+    // good, though. Unsolved mystery.
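+    // For reference only: system() runs the command line below through
+    // /bin/sh. A popen()-based variant that captures the command's output
+    // directly would look roughly like this (hypothetical helper, not part
+    // of this file):
+    //
+    //   std::string runShell(const std::string& cmd) {
+    //     std::string out;
+    //     if (FILE* pipe = ::popen(cmd.c_str(), "r")) {
+    //       char buf[4096];
+    //       size_t n;
+    //       while ((n = ::fread(buf, 1, sizeof(buf), pipe)) > 0) {
+    //         out.append(buf, n);
+    //       }
+    //       ::pclose(pipe);
+    //     }
+    //     return out;
+    //   }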
+ system(fmt::format( + "(cd {}; /usr/bin/perf record --pid {} {};" + "perf report --sort symbol > perf ;" + "sed --in-place 's/ / /'g perf;" + "sed --in-place 's/ / /'g perf; date) " + ">> {}/perftrace 2>>{}/perftrace2", + FLAGS_profiler_tmp_dir, + getpid(), + FLAGS_profiler_perf_flags, + FLAGS_profiler_tmp_dir, + FLAGS_profiler_tmp_dir) + .c_str()); // NOLINT + if (shouldSaveResult_) { + copyToResult(); + } + }); + + cpuAtSampleStart_ = cpuSeconds(); + sampleStartTime_ = nowSeconds(); + return thread; +} + +bool Profiler::interruptibleSleep(int32_t seconds) { + sleepPromise_ = folly::Promise(); + + folly::SemiFuture sleepFuture(false); + { + std::lock_guard l(profileMutex_); + isSleeping_ = true; + sleepPromise_ = folly::Promise(); + sleepFuture = sleepPromise_.getSemiFuture(); + } + if (!shouldStop_) { + try { + auto& executor = folly::QueuedImmediateExecutor::instance(); + std::move(sleepFuture) + .via(&executor) + .wait((std::chrono::seconds(seconds))); + } catch (std::exception&) { + } + } + { + std::lock_guard l(profileMutex_); + isSleeping_ = false; + } + + return shouldStop_; +} + +void Profiler::stopSample(std::thread systemThread) { + LOG(INFO) << "PROFILE: Signalling perf"; + + system("killall -2 perf"); + systemThread.join(); + + sampleStartTime_ = 0; +} + +void Profiler::threadFunction() { + makeProfileDir(resultPath_); + cpuAtLastCheck_ = cpuSeconds(); + std::thread sampleThread; + for (int32_t counter = 0;; ++counter) { + if (FLAGS_profiler_min_cpu_pct == 0) { + sampleThread = startSample(); + // First two times sleep for one interval and then five intervals. + if (interruptibleSleep( + FLAGS_profiler_check_interval_seconds * (counter < 2 ? 1 : 5))) { + break; + } + stopSample(std::move(sampleThread)); // NOLINT + } else { + int64_t now = nowSeconds(); + int64_t cpuNow = cpuSeconds(); + int64_t lastPct = counter == 0 ? 
0 + : (100 * (cpuNow - cpuAtLastCheck_) / + FLAGS_profiler_check_interval_seconds); + if (sampleStartTime_ != 0) { + if (now - sampleStartTime_ > FLAGS_profiler_max_sample_seconds) { + shouldSaveResult_ = true; + stopSample(std::move(sampleThread)); // NOLINT + } + } + if (lastPct > FLAGS_profiler_min_cpu_pct) { + if (sampleStartTime_ == 0) { + sampleThread = startSample(); + } + } else { + if (sampleStartTime_ != 0) { + shouldSaveResult_ = + now - sampleStartTime_ >= FLAGS_profiler_min_sample_seconds; + stopSample(std::move(sampleThread)); // NOLINT + } + } + cpuAtLastCheck_ = cpuNow; + if (interruptibleSleep(FLAGS_profiler_check_interval_seconds)) { + break; + } + } + } + if (sampleStartTime_ != 0) { + auto now = nowSeconds(); + shouldSaveResult_ = + now - sampleStartTime_ >= FLAGS_profiler_min_sample_seconds; + stopSample(std::move(sampleThread)); // NOLINT + } +} + +bool Profiler::isRunning() { + std::lock_guard l(profileMutex_); + return profileStarted_; +} + +void Profiler::start( + const std::string& path, + std::function extraStart, + std::function extraReport) { + { +#if !defined(linux) + VELOX_FAIL("Profiler is only available for Linux"); +#endif + resultPath_ = path; + startExtra_ = extraStart; + extraReport_ = extraReport; + std::lock_guard l(profileMutex_); + if (profileStarted_) { + return; + } + profileStarted_ = true; + } + checkSafe(FLAGS_profiler_tmp_dir); + checkSafe(FLAGS_profiler_perf_flags); + char temp[1000] = {}; + gethostname(temp, sizeof(temp) - 1); + hostname = std::string(temp); + fileSystem_ = velox::filesystems::getFileSystem(path, nullptr); + if (!fileSystem_) { + LOG(ERROR) << "PROFILE: Failed to find file system for " << path + << ". Profiler not started."; + return; + } + makeProfileDir(path); + atexit(Profiler::stop); + LOG(INFO) << "PROFILE: Starting profiling to " << path; + profileThread_ = std::thread([]() { threadFunction(); }); +} + +void Profiler::stop() { + { + std::lock_guard l(profileMutex_); + shouldStop_ = true; + if (!profileStarted_) { + return; + } + if (isSleeping_) { + sleepPromise_.setValue(true); + } + } + profileThread_.join(); + { + std::lock_guard l(profileMutex_); + profileStarted_ = false; + } + LOG(INFO) << "Stopped profiling"; +} + +} // namespace facebook::velox::process diff --git a/velox/common/process/Profiler.h b/velox/common/process/Profiler.h new file mode 100644 index 0000000000000..7626946e8b944 --- /dev/null +++ b/velox/common/process/Profiler.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "velox/common/file/FileSystems.h" + +DECLARE_int32(profiler_check_interval_seconds); +DECLARE_int32(profiler_min_cpu_pct); +DECLARE_int32(profiler_min_sample_seconds); +DECLARE_int32(profiler_max_sample_seconds); + +namespace facebook::velox::process { + +class Profiler { + public: + /// Starts periodic production of perf reports. 
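+  ///
+  /// Typical usage, mirroring ProfilerTest (the result path here is just an
+  /// example):
+  ///
+  ///   filesystems::registerLocalFileSystem();
+  ///   Profiler::start("/tmp/profile-results");
+  ///   // ... run the workload to be profiled ...
+  ///   Profiler::stop();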
+  static void start(
+      const std::string& path,
+      std::function<void()> extraStart = nullptr,
+      std::function<std::string()> extraReport = nullptr);
+
+  // Stops profiling and the associated background threads. The threads are
+  // stopped on return.
+  static void stop();
+
+  static bool isRunning();
+
+ private:
+  static void copyToResult(const std::string* result = nullptr);
+  static void makeProfileDir(std::string path);
+  static std::thread startSample();
+
+  // Returns after 'seconds' of wall time or sooner if interrupted by stop().
+  static bool interruptibleSleep(int32_t seconds);
+  static void stopSample(std::thread thread);
+  static void threadFunction();
+
+  static tsan_atomic<bool> profileStarted_;
+  static std::thread profileThread_;
+  static std::mutex profileMutex_;
+  static std::shared_ptr<velox::filesystems::FileSystem> fileSystem_;
+  static tsan_atomic<bool> isSleeping_;
+  static tsan_atomic<bool> shouldStop_;
+  static folly::Promise<bool> sleepPromise_;
+
+  // Directory where results are deposited. Results have unique names within
+  // this.
+  static std::string resultPath_;
+
+  // Indicates if the results of the profile should be saved at stop.
+  static tsan_atomic<bool> shouldSaveResult_;
+
+  // Time of starting the profile. Seconds from epoch.
+  static tsan_atomic<int64_t> sampleStartTime_;
+
+  // CPU time at start of profile.
+  static int64_t cpuAtSampleStart_;
+
+  // CPU time at last periodic check.
+  static int64_t cpuAtLastCheck_;
+
+  static std::function<void()> startExtra_;
+  static std::function<std::string()> extraReport_;
+};
+
+} // namespace facebook::velox::process
diff --git a/velox/common/process/ThreadDebugInfo.cpp b/velox/common/process/ThreadDebugInfo.cpp
index 75db6ed5ac102..ae681a2dcc436 100644
--- a/velox/common/process/ThreadDebugInfo.cpp
+++ b/velox/common/process/ThreadDebugInfo.cpp
@@ -42,24 +42,32 @@ static void printCurrentQueryId() {
     const char* msg2 = " Task Id= ";
     write(STDERR_FILENO, msg2, strlen(msg2));
     write(STDERR_FILENO, info->taskId_.c_str(), info->taskId_.length());
+    if (!fatalSignalProcessed && info->callback_) {
+      fatalSignalProcessed = true;
+      info->callback_();
+    }
   }
   write(STDERR_FILENO, "\n", 1);
-
-  if (!fatalSignalProcessed && info->callback_) {
-    fatalSignalProcessed = true;
-    info->callback_();
-  }
 }

 const ThreadDebugInfo* GetThreadDebugInfo() {
   return threadDebugInfo;
 }
+
 ScopedThreadDebugInfo::ScopedThreadDebugInfo(
-    const ThreadDebugInfo& localDebugInfo) {
-  prevThreadDebugInfo_ = threadDebugInfo;
+    const ThreadDebugInfo& localDebugInfo)
+    : prevThreadDebugInfo_(threadDebugInfo) {
   threadDebugInfo = &localDebugInfo;
 }

+ScopedThreadDebugInfo::ScopedThreadDebugInfo(
+    const ThreadDebugInfo* localDebugInfo)
+    : prevThreadDebugInfo_(threadDebugInfo) {
+  if (localDebugInfo != nullptr) {
+    threadDebugInfo = localDebugInfo;
+  }
+}
+
 ScopedThreadDebugInfo::~ScopedThreadDebugInfo() {
   threadDebugInfo = prevThreadDebugInfo_;
 }
diff --git a/velox/common/process/ThreadDebugInfo.h b/velox/common/process/ThreadDebugInfo.h
index 058a8a3c8e683..4b3145553106a 100644
--- a/velox/common/process/ThreadDebugInfo.h
+++ b/velox/common/process/ThreadDebugInfo.h
@@ -34,6 +34,9 @@ struct ThreadDebugInfo {
 class ScopedThreadDebugInfo {
  public:
   explicit ScopedThreadDebugInfo(const ThreadDebugInfo& localDebugInfo);
+
+  explicit ScopedThreadDebugInfo(const ThreadDebugInfo* localDebugInfo);
+
   ~ScopedThreadDebugInfo();

  private:
@@ -47,9 +50,9 @@ const ThreadDebugInfo* GetThreadDebugInfo();

 // Install a signal handler to dump thread local debug information. This should
 // be called before calling folly::symbolizer::installFatalSignalCallbacks()
-// which is usually called at startup via folly::Init(). This is just a default
-// implementation but you can install your own signal handler. Make sure to
-// install one at the start of your program.
+// which is usually called at startup via folly::Init. This is just a default
+// implementation but you can install your own signal handler. Make sure to
+// install one at the start of your program.
 void addDefaultFatalSignalHandler();

 } // namespace facebook::velox::process
diff --git a/velox/common/process/ThreadLocalRegistry.h b/velox/common/process/ThreadLocalRegistry.h
new file mode 100644
index 0000000000000..a69b92da02699
--- /dev/null
+++ b/velox/common/process/ThreadLocalRegistry.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+namespace facebook::velox::process {
+
+/// A registry for keeping static thread local objects of type T. Similar to
+/// folly::ThreadLocal but a little more efficient in terms of performance and
+/// memory usage, because we do not support thread locals with lexical scope.
+///
+/// NOTE: only one instance of ThreadLocalRegistry can be created for each T.
+template <typename T>
+class ThreadLocalRegistry {
+ public:
+  class Reference;
+
+  /// Access values from all threads. Takes a global lock and should be used
+  /// with caution.
+  template <typename F>
+  void forAllValues(F f) {
+    std::lock_guard entriesLock(entriesMutex_);
+    for (auto& entry : entries_) {
+      std::lock_guard lk(entry->mutex);
+      f(entry->value);
+    }
+  }
+
+ private:
+  struct Entry {
+    std::mutex mutex;
+    T value;
+  };
+
+  std::list<std::unique_ptr<Entry>> entries_;
+  std::mutex entriesMutex_;
+};
+
+/// Reference to one thread local value. Should be stored in thread local
+/// memory.
+template <typename T>
+class ThreadLocalRegistry<T>::Reference {
+ public:
+  explicit Reference(const std::shared_ptr<ThreadLocalRegistry>& registry)
+      : registry_(registry) {
+    auto entry = std::make_unique<Entry>();
+    std::lock_guard lk(registry_->entriesMutex_);
+    iterator_ =
+        registry_->entries_.insert(registry_->entries_.end(), std::move(entry));
+  }
+
+  ~Reference() {
+    std::lock_guard lk(registry_->entriesMutex_);
+    registry_->entries_.erase(iterator_);
+  }
+
+  /// Obtain the thread local value and process it with the functor `f'.
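+  /// A minimal usage sketch (the int counter is illustrative; TraceContext
+  /// uses the same pattern with a map of per-label counts):
+  ///
+  ///   auto registry = std::make_shared<ThreadLocalRegistry<int>>();
+  ///   thread_local ThreadLocalRegistry<int>::Reference ref(registry);
+  ///   ref.withValue([](int& v) { ++v; });  // cheap, per-thread lock
+  ///   registry->forAllValues(
+  ///       [](int& v) { LOG(INFO) << v; });  // global lock, use sparingly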
+ template + auto withValue(F f) { + auto* entry = iterator_->get(); + std::lock_guard lk(entry->mutex); + return f(entry->value); + } + + private: + std::shared_ptr const registry_; + typename std::list>::iterator iterator_; +}; + +} // namespace facebook::velox::process diff --git a/velox/common/process/TraceContext.cpp b/velox/common/process/TraceContext.cpp index cad158f48ee78..b0ee5b7240970 100644 --- a/velox/common/process/TraceContext.cpp +++ b/velox/common/process/TraceContext.cpp @@ -16,23 +16,34 @@ #include "velox/common/process/TraceContext.h" +#include "velox/common/process/TraceHistory.h" + #include namespace facebook::velox::process { namespace { -folly::Synchronized>& traceMap() { - static folly::Synchronized> - staticTraceMap; - return staticTraceMap; -} + +// We use thread local instead lock here since the critical path is on write +// side. +auto registry = std::make_shared(); +thread_local auto threadLocalTraceData = + std::make_shared(registry); + } // namespace TraceContext::TraceContext(std::string label, bool isTemporary) : label_(std::move(label)), enterTime_(std::chrono::steady_clock::now()), - isTemporary_(isTemporary) { - traceMap().withWLock([&](auto& counts) { + isTemporary_(isTemporary), + traceData_(threadLocalTraceData) { + TraceHistory::push([&](auto& entry) { + entry.time = enterTime_; + entry.file = __FILE__; + entry.line = __LINE__; + snprintf(entry.label, entry.kLabelCapacity, "%s", label_.c_str()); + }); + traceData_->withValue([&](auto& counts) { auto& data = counts[label_]; ++data.numThreads; if (data.numThreads == 1) { @@ -43,17 +54,18 @@ TraceContext::TraceContext(std::string label, bool isTemporary) } TraceContext::~TraceContext() { - traceMap().withWLock([&](auto& counts) { - auto& data = counts[label_]; - --data.numThreads; + traceData_->withValue([&](auto& counts) { + auto it = counts.find(label_); + auto& data = it->second; + if (--data.numThreads == 0 && isTemporary_) { + counts.erase(it); + return; + } auto ms = std::chrono::duration_cast( std::chrono::steady_clock::now() - enterTime_) .count(); data.totalMs += ms; data.maxMs = std::max(data.maxMs, ms); - if (!data.numThreads && isTemporary_) { - counts.erase(label_); - } }); } @@ -61,27 +73,39 @@ TraceContext::~TraceContext() { std::string TraceContext::statusLine() { std::stringstream out; auto now = std::chrono::steady_clock::now(); - traceMap().withRLock([&](auto& counts) { - for (auto& pair : counts) { - if (pair.second.numThreads) { - auto continued = std::chrono::duration_cast( - now - pair.second.startTime) - .count(); - - out << pair.first << "=" << pair.second.numThreads << " entered " - << pair.second.numEnters << " avg ms " - << (pair.second.totalMs / pair.second.numEnters) << " max ms " - << pair.second.maxMs << " continuous for " << continued - << std::endl; - } + auto counts = status(); + for (auto& [label, data] : counts) { + if (data.numThreads > 0) { + auto continued = std::chrono::duration_cast( + now - data.startTime) + .count(); + out << label << ": numThreads=" << data.numThreads + << " numEnters=" << data.numEnters + << " avgMs=" << (data.totalMs / data.numEnters) + << " maxMs=" << data.maxMs << " continued=" << continued << std::endl; } - }); + } return out.str(); } // static -std::unordered_map TraceContext::status() { - return traceMap().withRLock([&](auto& map) { return map; }); +folly::F14FastMap TraceContext::status() { + folly::F14FastMap total; + registry->forAllValues([&](auto& counts) { + for (auto& [k, v] : counts) { + auto& sofar = total[k]; + if 
(sofar.numEnters == 0) { + sofar.startTime = v.startTime; + } else if (v.numEnters > 0) { + sofar.startTime = std::min(sofar.startTime, v.startTime); + } + sofar.numThreads += v.numThreads; + sofar.numEnters += v.numEnters; + sofar.totalMs += v.totalMs; + sofar.maxMs = std::max(sofar.maxMs, v.maxMs); + } + }); + return total; } } // namespace facebook::velox::process diff --git a/velox/common/process/TraceContext.h b/velox/common/process/TraceContext.h index c3d3a18be1420..6e718515b58d0 100644 --- a/velox/common/process/TraceContext.h +++ b/velox/common/process/TraceContext.h @@ -16,11 +16,13 @@ #pragma once +#include "velox/common/process/ThreadLocalRegistry.h" + #include #include #include -#include +#include namespace facebook::velox::process { @@ -47,6 +49,8 @@ struct TraceData { // produces a concise report of what the system is doing at any one // time. This is good for diagnosing crashes or hangs which are // difficult to figure out from stacks in a core dump. +// +// NOTE: TraceContext is not sharable between different threads. class TraceContext { public: // Starts a trace context. isTemporary is false if this is a generic @@ -56,6 +60,9 @@ class TraceContext { // which the record should be dropped once the last thread finishes. explicit TraceContext(std::string label, bool isTemporary = false); + TraceContext(const TraceContext&) = delete; + TraceContext& operator=(const TraceContext&) = delete; + ~TraceContext(); // Produces a human readable report of all TraceContexts in existence at the @@ -63,12 +70,18 @@ class TraceContext { static std::string statusLine(); // Returns a copy of the trace status. - static std::unordered_map status(); + static folly::F14FastMap status(); + + // Implementation detail type. Made public to be available with + // std::make_shared. Do not use outside this class. + using Registry = + ThreadLocalRegistry>; private: const std::string label_; const std::chrono::steady_clock::time_point enterTime_; const bool isTemporary_; + std::shared_ptr traceData_; }; } // namespace facebook::velox::process diff --git a/velox/common/process/TraceHistory.cpp b/velox/common/process/TraceHistory.cpp new file mode 100644 index 0000000000000..bf7524590802b --- /dev/null +++ b/velox/common/process/TraceHistory.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/common/process/TraceHistory.h" + +#include + +#include + +namespace facebook::velox::process { + +namespace { +auto registry = std::make_shared>(); +} + +namespace detail { +thread_local ThreadLocalRegistry::Reference traceHistory( + registry); +} + +TraceHistory::TraceHistory() + : threadId_(std::this_thread::get_id()), osTid_(folly::getOSThreadID()) {} + +std::vector TraceHistory::listAll() { + std::vector results; + registry->forAllValues([&](auto& history) { + EntriesWithThreadInfo result; + result.threadId = history.threadId_; + result.osTid = history.osTid_; + for (int i = 0; i < kCapacity; ++i) { + const int j = (history.index_ + kCapacity - 1 - i) % kCapacity; + if (!populated(history.data_[j])) { + break; + } + result.entries.push_back(history.data_[j]); + } + std::reverse(result.entries.begin(), result.entries.end()); + results.push_back(std::move(result)); + }); + return results; +} + +} // namespace facebook::velox::process diff --git a/velox/common/process/TraceHistory.h b/velox/common/process/TraceHistory.h new file mode 100644 index 0000000000000..bcee2cec69d74 --- /dev/null +++ b/velox/common/process/TraceHistory.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "velox/common/process/ThreadLocalRegistry.h" + +#include +#include +#include +#include +#include + +/// Push an entry to the history ring buffer with a label from format string +/// (same as printf) and optional arguments. +#define VELOX_TRACE_HISTORY_PUSH(_format, ...) \ + ::facebook::velox::process::TraceHistory::push([&](auto& entry) { \ + entry.time = ::std::chrono::steady_clock::now(); \ + entry.file = __FILE__; \ + entry.line = __LINE__; \ + ::snprintf(entry.label, entry.kLabelCapacity, _format, ##__VA_ARGS__); \ + }) + +namespace facebook::velox::process { + +class TraceHistory; + +namespace detail { +extern thread_local ThreadLocalRegistry::Reference traceHistory; +} + +/// Keep list of labels in a ring buffer that is fixed sized and thread local. +class TraceHistory { + public: + TraceHistory(); + + /// An entry with tracing information and custom label. + struct Entry { + std::chrono::steady_clock::time_point time; + const char* file; + int32_t line; + + static constexpr int kLabelCapacity = + 64 - sizeof(time) - sizeof(file) - sizeof(line); + char label[kLabelCapacity]; + }; + + /// NOTE: usually VELOX_TRACE_HISTORY_PUSH should be used instead of calling + /// this function directly. + /// + /// Add a new entry to the thread local instance. If there are more than + /// `kCapacity' entries, overwrite the oldest ones. All the mutation on the + /// new entry should be done in the functor `init'. 
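+  ///
+  /// Illustrative use, normally via the macro above (batchIndex is a
+  /// hypothetical variable):
+  ///
+  ///   VELOX_TRACE_HISTORY_PUSH("Processing batch %d", batchIndex);
+  ///   for (const auto& info : TraceHistory::listAll()) {
+  ///     for (const auto& entry : info.entries) {
+  ///       LOG(INFO) << entry.file << ":" << entry.line << " " << entry.label;
+  ///     }
+  ///   }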
+ template + static void push(F&& init) { + detail::traceHistory.withValue( + [init = std::forward(init)](auto& history) { + auto& entry = history.data_[history.index_]; + init(entry); + assert(populated(entry)); + history.index_ = (history.index_ + 1) % kCapacity; + }); + } + + /// All entries in a specific thread. + struct EntriesWithThreadInfo { + std::thread::id threadId; + uint64_t osTid; + std::vector entries; + }; + + /// List all entries from all threads. + static std::vector listAll(); + + /// Keep the last `kCapacity' entries per thread. Must be a power of 2. + static constexpr int kCapacity = 16; + + private: + static_assert((kCapacity & (kCapacity - 1)) == 0); + static_assert(sizeof(Entry) == 64); + + static bool populated(const Entry& entry) { + return entry.file != nullptr; + } + + alignas(64) Entry data_[kCapacity]{}; + const std::thread::id threadId_; + const uint64_t osTid_; + int index_ = 0; +}; + +} // namespace facebook::velox::process diff --git a/velox/common/process/tests/CMakeLists.txt b/velox/common/process/tests/CMakeLists.txt index d64466568a291..23ef279c21af0 100644 --- a/velox/common/process/tests/CMakeLists.txt +++ b/velox/common/process/tests/CMakeLists.txt @@ -12,9 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_executable(velox_process_test TraceContextTest.cpp) +add_executable(velox_process_test ProfilerTest.cpp ThreadLocalRegistryTest.cpp + TraceContextTest.cpp TraceHistoryTest.cpp) add_test(velox_process_test velox_process_test) -target_link_libraries(velox_process_test PRIVATE velox_process fmt::fmt gtest - gtest_main) +target_link_libraries( + velox_process_test + PRIVATE + velox_process + fmt::fmt + velox_time + GTest::gtest + GTest::gtest_main) diff --git a/velox/common/process/tests/ProfilerTest.cpp b/velox/common/process/tests/ProfilerTest.cpp new file mode 100644 index 0000000000000..788e02ef27d2d --- /dev/null +++ b/velox/common/process/tests/ProfilerTest.cpp @@ -0,0 +1,143 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/process/Profiler.h" +#include +#include +#include +#include +#include +#include +#include "velox/common/process/TraceContext.h" + +using namespace facebook::velox::process; +using namespace facebook::velox; + +namespace { +int32_t fi(int32_t x) { + return x < 2 ? 
x : fi(x - 1) + fi(x - 2);
+}
+
+void compute(int32_t seconds) {
+  auto start = getCurrentTimeMs();
+  constexpr int32_t kNumThreads = 10;
+  for (;;) {
+    std::vector<std::thread> threads;
+    threads.reserve(kNumThreads);
+    std::atomic<int64_t> sum = 0;
+    for (int32_t i = 0; i < kNumThreads; ++i) {
+      threads.push_back(std::thread([&]() {
+        sum += fi(40);
+        std::this_thread::sleep_for(std::chrono::milliseconds(3)); // NOLINT
+      }));
+    }
+    for (auto& thread : threads) {
+      thread.join();
+    }
+    LOG(INFO) << "Sum " << sum;
+    if (getCurrentTimeMs() - start > seconds * 1000) {
+      break;
+    }
+  }
+}
+
+} // namespace
+
+TEST(ProfilerTest, basic) {
+#if !defined(linux)
+  return;
+#endif
+  filesystems::registerLocalFileSystem();
+  // We have seconds of busy and idle activity. We set the profiler to
+  // check every second and to trigger after 1s at 200%. A burst of
+  // under 2s is not recorded and a new file is started after every 4s
+  // of cpu busy.
+
+  FLAGS_profiler_check_interval_seconds = 1;
+  FLAGS_profiler_min_cpu_pct = 200;
+  FLAGS_profiler_max_sample_seconds = 4;
+  FLAGS_profiler_min_sample_seconds = 2;
+
+  Profiler::start("/tmp/profilertest");
+  compute(5);
+  std::this_thread::sleep_for(std::chrono::seconds(2)); // NOLINT
+  compute(1);
+  std::this_thread::sleep_for(std::chrono::seconds(2)); // NOLINT
+
+  compute(3);
+  Profiler::stop();
+
+  // We set the profiler to start regardless of load and wait 30s before
+  // producing the next result.
+  FLAGS_profiler_check_interval_seconds = 30;
+  FLAGS_profiler_min_cpu_pct = 0;
+  FLAGS_profiler_min_sample_seconds = 0;
+  Profiler::start("/tmp/profilertest");
+  compute(2);
+  // The test exits during the measurement interval. We expect no
+  // crash on exit if the threads are properly joined.
+}
+
+int main(int argc, char** argv) {
+  // Fork a child process to run all the tests.
+  int32_t pid = fork();
+  if (pid < 0) {
+    LOG(ERROR) << "Failed to fork child";
+    exit(1);
+  }
+  if (pid > 0) {
+    // The parent waits for the child to return. If the child returns
+    // in time, the child's return code is returned. If the child does
+    // not return in time, we return 0 and the test fails silently.
+    std::atomic<bool> timedOut = false;
+    std::atomic<bool> completed = false;
+    auto sleepPromise = folly::Promise<bool>();
+    folly::SemiFuture<bool> sleepFuture(false);
+    sleepFuture = sleepPromise.getSemiFuture();
+    std::thread timer([&]() {
+      try {
+        auto& executor = folly::QueuedImmediateExecutor::instance();
+        // Wait for up to 100 seconds. The test is normally ~20s unless it
+        // hangs.
+        std::move(sleepFuture).via(&executor).wait(std::chrono::seconds(100));
+      } catch (std::exception&) {
+      }
+      if (completed) {
+        return;
+      }
+      timedOut = true;
+      LOG(INFO) << "Killing the test process for timeout";
+      kill(pid, SIGKILL);
+    });
+
+    int wstatus;
+    int w = waitpid(pid, &wstatus, WUNTRACED | WCONTINUED);
+    LOG(INFO) << "Test completed";
+    completed = true;
+    sleepPromise.setValue(true);
+    timer.join();
+
+    if (timedOut) {
+      return 0;
+    }
+    return WEXITSTATUS(wstatus);
+  }
+
+  testing::InitGoogleTest(&argc, argv);
+  // Signal handler required for ThreadDebugInfoTest
+  folly::Init init(&argc, &argv, false);
+  return RUN_ALL_TESTS();
+}
diff --git a/velox/common/process/tests/ThreadLocalRegistryTest.cpp b/velox/common/process/tests/ThreadLocalRegistryTest.cpp
new file mode 100644
index 0000000000000..0887f89dd165a
--- /dev/null
+++ b/velox/common/process/tests/ThreadLocalRegistryTest.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/process/ThreadLocalRegistry.h" + +#include +#include +#include + +#include + +namespace facebook::velox::process { +namespace { + +template +class TestObject { + public: + static std::atomic_int& count() { + static std::atomic_int value; + return value; + } + + TestObject() : threadId_(std::this_thread::get_id()) { + ++count(); + } + + ~TestObject() { + --count(); + } + + std::thread::id threadId() const { + return threadId_; + } + + private: + const std::thread::id threadId_; +}; + +TEST(ThreadLocalRegistryTest, basic) { + struct Tag {}; + using T = TestObject; + ASSERT_EQ(T::count(), 0); + auto registry = std::make_shared>(); + registry->forAllValues([](const T&) { FAIL(); }); + thread_local ThreadLocalRegistry::Reference ref(registry); + const T* object = ref.withValue([](const T& x) { + EXPECT_EQ(T::count(), 1); + return &x; + }); + ASSERT_EQ(object->threadId(), std::this_thread::get_id()); + ref.withValue([&](const T& x) { ASSERT_EQ(&x, object); }); + int count = 0; + registry->forAllValues([&](const T& x) { + ++count; + ASSERT_EQ(x.threadId(), std::this_thread::get_id()); + }); + ASSERT_EQ(count, 1); + ASSERT_EQ(T::count(), 1); +} + +TEST(ThreadLocalRegistryTest, multiThread) { + struct Tag {}; + using T = TestObject; + ASSERT_EQ(T::count(), 0); + auto registry = std::make_shared>(); + constexpr int kNumThreads = 7; + std::vector threads; + folly::Latch latch(kNumThreads); + folly::Baton<> batons[kNumThreads]; + const T* objects[kNumThreads]; + for (int i = 0; i < kNumThreads; ++i) { + threads.emplace_back([&, i] { + thread_local ThreadLocalRegistry::Reference ref(registry); + objects[i] = ref.withValue([](const T& x) { return &x; }); + latch.count_down(); + batons[i].wait(); + }); + } + latch.wait(); + std::vector indices; + registry->forAllValues([&](const T& x) { + auto it = std::find(std::begin(objects), std::end(objects), &x); + indices.push_back(it - std::begin(objects)); + }); + ASSERT_EQ(indices.size(), kNumThreads); + std::sort(indices.begin(), indices.end()); + for (int i = 0; i < kNumThreads; ++i) { + ASSERT_EQ(indices[i], i); + ASSERT_EQ(objects[i]->threadId(), threads[i].get_id()); + ASSERT_EQ(T::count(), kNumThreads - i); + batons[i].post(); + threads[i].join(); + } + ASSERT_EQ(T::count(), 0); + registry->forAllValues([](const T&) { FAIL(); }); +} + +} // namespace +} // namespace facebook::velox::process diff --git a/velox/common/process/tests/TraceContextTest.cpp b/velox/common/process/tests/TraceContextTest.cpp index cfa021432a8a7..130055e568fad 100644 --- a/velox/common/process/tests/TraceContextTest.cpp +++ b/velox/common/process/tests/TraceContextTest.cpp @@ -15,33 +15,125 @@ */ #include "velox/common/process/TraceContext.h" +#include "velox/common/process/TraceHistory.h" + #include +#include +#include +#include #include + #include -using namespace facebook::velox::process; +namespace facebook::velox::process { +namespace { + +class TraceContextTest : public testing::Test { + 
+ public:
+  void SetUp() override {
+    ASSERT_TRUE(TraceContext::status().empty());
+  }
+
+  void TearDown() override {
+    ASSERT_TRUE(TraceContext::status().empty());
+  }
+};
+
-TEST(TraceContextTest, basic) {
-  constexpr int32_t kNumThreads = 10;
+TEST_F(TraceContextTest, basic) {
+  constexpr int kNumThreads = 3;
   std::vector<std::thread> threads;
+  folly::Baton<> batons[2][kNumThreads];
+  folly::Latch latches[2] = {
+      folly::Latch(kNumThreads),
+      folly::Latch(kNumThreads),
+  };
   threads.reserve(kNumThreads);
-  for (int32_t i = 0; i < kNumThreads; ++i) {
-    threads.push_back(std::thread([i]() {
-      TraceContext trace1("process data");
-      TraceContext trace2(fmt::format("Process chunk {}", i), true);
-      std::this_thread::sleep_for(std::chrono::milliseconds(3));
-    }));
+  for (int i = 0; i < kNumThreads; ++i) {
+    threads.emplace_back([&, i]() {
+      {
+        TraceContext trace1("process data");
+        TraceContext trace2(fmt::format("Process chunk {}", i), true);
+        latches[0].count_down();
+        batons[0][i].wait();
+      }
+      latches[1].count_down();
+      batons[1][i].wait();
+    });
+  }
+  latches[0].wait();
+  auto status = TraceContext::status();
+  ASSERT_EQ(1 + kNumThreads, status.size());
+  ASSERT_EQ(kNumThreads, status.at("process data").numThreads);
+  for (int i = 0; i < kNumThreads; ++i) {
+    ASSERT_EQ(1, status.at(fmt::format("Process chunk {}", i)).numThreads);
   }
-  std::this_thread::sleep_for(std::chrono::milliseconds(1));
-  LOG(INFO) << TraceContext::statusLine();
-  for (auto& thread : threads) {
-    thread.join();
+  for (int i = 0; i < kNumThreads; ++i) {
+    batons[0][i].post();
   }
-  LOG(INFO) << TraceContext::statusLine();
-  // We expect one entry for "process data". The temporary entries
-  // are deleted after the treads complete.
-  auto after = TraceContext::status();
-  EXPECT_EQ(1, after.size());
-  EXPECT_EQ(kNumThreads, after["process data"].numEnters);
-  EXPECT_EQ(0, after["process data"].numThreads);
+  latches[1].wait();
+  status = TraceContext::status();
+  ASSERT_EQ(1, status.size());
+  ASSERT_EQ(0, status.at("process data").numThreads);
+  ASSERT_EQ(kNumThreads, status.at("process data").numEnters);
+  for (int i = 0; i < kNumThreads; ++i) {
+    batons[1][i].post();
+    threads[i].join();
+  }
+}
+
+TEST_F(TraceContextTest, traceHistory) {
+  std::thread([] {
+    TraceContext trace("test");
+    TraceContext trace2(
+        std::string(TraceHistory::Entry::kLabelCapacity + 10, 'x'));
+    auto results = TraceHistory::listAll();
+    ASSERT_EQ(results.size(), 1);
+    ASSERT_EQ(results[0].entries.size(), 2);
+    ASSERT_STREQ(results[0].entries[0].label, "test");
+    ASSERT_EQ(
+        results[0].entries[1].label,
+        std::string(TraceHistory::Entry::kLabelCapacity - 1, 'x'));
+  }).join();
+}
+
+TEST_F(TraceContextTest, transferBetweenThreads) {
+  auto [promise, future] =
+      folly::makePromiseContract<std::unique_ptr<TraceContext>>();
+  folly::Baton<> batons[2];
+  std::chrono::steady_clock::time_point timeLow, timeHigh;
+  std::thread receiver([&, future = std::move(future)]() mutable {
+    auto trace = std::move(future).get(std::chrono::seconds(1));
+    {
+      SCOPE_EXIT {
+        batons[0].post();
+      };
+      auto status = TraceContext::status();
+      ASSERT_EQ(1, status.size());
+      auto& data = status.at("test");
+      ASSERT_EQ(data.numThreads, 1);
+      ASSERT_EQ(data.numEnters, 1);
+      ASSERT_LE(timeLow, data.startTime);
+      ASSERT_LE(data.startTime, timeHigh);
+    }
+    batons[1].wait();
+    auto status = TraceContext::status();
+    ASSERT_EQ(1, status.size());
+    auto& data = status.at("test");
+    ASSERT_EQ(data.numThreads, 1);
+    ASSERT_EQ(data.numEnters, 1);
+    ASSERT_LE(timeLow, data.startTime);
+    ASSERT_LE(data.startTime, timeHigh);
+  });
+  timeLow = std::chrono::steady_clock::now();
+  std::thread([&, promise = std::move(promise)]() mutable {
+    auto trace = std::make_unique<TraceContext>("test");
+    timeHigh = std::chrono::steady_clock::now();
+    promise.setValue(std::move(trace));
+    batons[0].wait();
+  }).join();
+  batons[1].post();
+  receiver.join();
+}
+
+} // namespace
+} // namespace facebook::velox::process
diff --git a/velox/common/process/tests/TraceHistoryTest.cpp b/velox/common/process/tests/TraceHistoryTest.cpp
new file mode 100644
index 0000000000000..754fe6f389c31
--- /dev/null
+++ b/velox/common/process/tests/TraceHistoryTest.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/common/process/TraceHistory.h"
+
+#include <folly/synchronization/Baton.h>
+#include <folly/synchronization/Latch.h>
+#include <folly/system/ThreadId.h>
+#include <gtest/gtest.h>
+
+namespace facebook::velox::process {
+namespace {
+
+class TraceHistoryTest : public testing::Test {
+ public:
+  void SetUp() override {
+    ASSERT_TRUE(TraceHistory::listAll().empty());
+  }
+
+  void TearDown() override {
+    ASSERT_TRUE(TraceHistory::listAll().empty());
+  }
+};
+
+TEST_F(TraceHistoryTest, basic) {
+  std::thread([] {
+    auto timeLow = std::chrono::steady_clock::now();
+    constexpr int kStartLine = __LINE__;
+    for (int i = 0; i < TraceHistory::kCapacity + 10; ++i) {
+      VELOX_TRACE_HISTORY_PUSH("Test %d", i);
+    }
+    auto timeHigh = std::chrono::steady_clock::now();
+    auto results = TraceHistory::listAll();
+    ASSERT_EQ(results.size(), 1);
+    ASSERT_EQ(results[0].threadId, std::this_thread::get_id());
+    ASSERT_EQ(results[0].osTid, folly::getOSThreadID());
+    ASSERT_EQ(results[0].entries.size(), TraceHistory::kCapacity);
+    auto lastTime = timeLow;
+    for (int i = 0; i < TraceHistory::kCapacity; ++i) {
+      auto& entry = results[0].entries[i];
+      ASSERT_EQ(entry.line, kStartLine + 2);
+      ASSERT_STREQ(
+          entry.file + strlen(entry.file) - 20, "TraceHistoryTest.cpp");
+      ASSERT_LE(lastTime, entry.time);
+      lastTime = entry.time;
+      ASSERT_EQ(strncmp(entry.label, "Test ", 5), 0);
+      ASSERT_EQ(atoi(entry.label + 5), i + 10);
+    }
+    ASSERT_LE(lastTime, timeHigh);
+  }).join();
+}
+
+TEST_F(TraceHistoryTest, multiThread) {
+  constexpr int kNumThreads = 3;
+  folly::Latch latch(kNumThreads);
+  folly::Baton<> batons[kNumThreads];
+  std::vector<std::thread> threads;
+  auto timeLow = std::chrono::steady_clock::now();
+  constexpr int kStartLine = __LINE__;
+  for (int i = 0; i < kNumThreads; ++i) {
+    threads.emplace_back([&, i] {
+      VELOX_TRACE_HISTORY_PUSH("Test");
+      VELOX_TRACE_HISTORY_PUSH("Test %d", i);
+      latch.count_down();
+      batons[i].wait();
+    });
+  }
+  latch.wait();
+  auto timeHigh = std::chrono::steady_clock::now();
+  auto results = TraceHistory::listAll();
+  ASSERT_EQ(results.size(), kNumThreads);
+  for (auto& result : results) {
+    auto threadIndex =
+        std::find_if(
+            threads.begin(),
+            threads.end(),
+            [&](auto& t) { return t.get_id() == result.threadId; }) -
+        threads.begin();
+    ASSERT_EQ(result.entries.size(), 2);
+    ASSERT_EQ(result.entries[0].line, kStartLine + 3);
ASSERT_EQ(result.entries[1].line, kStartLine + 4); + ASSERT_STREQ(result.entries[0].label, "Test"); + ASSERT_EQ(result.entries[1].label, fmt::format("Test {}", threadIndex)); + for (auto& entry : result.entries) { + ASSERT_LE(timeLow, entry.time); + ASSERT_LE(entry.time, timeHigh); + ASSERT_TRUE(entry.file); + ASSERT_STREQ( + entry.file + strlen(entry.file) - 20, "TraceHistoryTest.cpp"); + } + } + for (int i = 0; i < kNumThreads; ++i) { + ASSERT_EQ(TraceHistory::listAll().size(), kNumThreads - i); + batons[i].post(); + threads[i].join(); + } +} + +TEST_F(TraceHistoryTest, largeLabel) { + std::thread([] { + VELOX_TRACE_HISTORY_PUSH( + "%s", + std::string(TraceHistory::Entry::kLabelCapacity + 10, 'x').c_str()); + auto results = TraceHistory::listAll(); + ASSERT_EQ(results.size(), 1); + ASSERT_EQ(results[0].entries.size(), 1); + ASSERT_EQ( + results[0].entries[0].label, + std::string(TraceHistory::Entry::kLabelCapacity - 1, 'x')); + }).join(); +} + +} // namespace +} // namespace facebook::velox::process diff --git a/velox/common/serialization/CMakeLists.txt b/velox/common/serialization/CMakeLists.txt index 4d5ccaebe7c24..c818597442d2c 100644 --- a/velox/common/serialization/CMakeLists.txt +++ b/velox/common/serialization/CMakeLists.txt @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_serialization DeserializationRegistry.cpp) +velox_add_library(velox_serialization DeserializationRegistry.cpp) -target_link_libraries(velox_serialization PUBLIC velox_exception Folly::folly - glog::glog) +velox_link_libraries(velox_serialization PUBLIC velox_exception Folly::folly + glog::glog) if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) diff --git a/velox/common/serialization/Registry.h b/velox/common/serialization/Registry.h index 06b51d826d946..a290dd7ecd929 100644 --- a/velox/common/serialization/Registry.h +++ b/velox/common/serialization/Registry.h @@ -28,6 +28,7 @@ #include #include "folly/Preprocessor.h" +#include "folly/container/F14Map.h" #include "velox/common/base/Exceptions.h" #include "velox/core/Metaprogramming.h" @@ -47,7 +48,7 @@ template class Registry { public: using Creator = std::function; - using CreatorMap = std::unordered_map; + using CreatorMap = folly::F14NodeMap; Registry() : Create(creatorMap_) {} diff --git a/velox/common/serialization/Serializable.h b/velox/common/serialization/Serializable.h index cda8a4593f648..befe526f4b849 100644 --- a/velox/common/serialization/Serializable.h +++ b/velox/common/serialization/Serializable.h @@ -21,6 +21,7 @@ #include "folly/json.h" #include "velox/common/base/Exceptions.h" #include "velox/common/serialization/DeserializationRegistry.h" +#include "velox/core/Metaprogramming.h" namespace facebook { namespace velox { @@ -95,19 +96,27 @@ class ISerializable { template < typename T, - typename = std::enable_if_t::value>> - static folly::dynamic serialize(T& obj) { + std::enable_if_t< + has_serialize_type::value || + std::is_base_of_v>* = nullptr> + static folly::dynamic serialize(const T& obj) { return obj.serialize(); } template < typename T, - typename = std::enable_if_t< - is_any_of::value>> + std::enable_if_t>* = nullptr> static folly::dynamic serialize(const T& val) { return val; } + template < + typename T, + typename = std::enable_if_t::value>> + static folly::dynamic serialize(T val) { + return val; + } + static folly::dynamic serialize(int32_t val) { return folly::dynamic{(int64_t)val}; } @@ -132,7 +141,7 @@ class ISerializable { } template < 
- class T, + typename T, std::enable_if_t< std::is_same_v>>> static folly::dynamic serialize(const folly::Optional& val) { @@ -143,8 +152,8 @@ class ISerializable { return serialize(val.value()); } - template - static folly::dynamic serialize(const std::map& map) { + template ::value>* = nullptr> + static folly::dynamic serialize(const T& map) { folly::dynamic keys = folly::dynamic::array; folly::dynamic values = folly::dynamic::array; for (auto& pair : map) { @@ -160,11 +169,11 @@ class ISerializable { } template < - class T, + typename T, typename = std::enable_if_t>> static std::shared_ptr deserialize( const folly::dynamic& obj, - void* context) { + void* context = nullptr) { VELOX_USER_CHECK(obj.isObject()); // use the key to lookup creator and call it. // creator generally be a static method in the class. @@ -192,20 +201,13 @@ class ISerializable { return std::dynamic_pointer_cast(registry.Create(name, obj)); } - template < - class T, - typename = std::enable_if_t>> - static std::shared_ptr deserialize(const folly::dynamic& obj) { - return deserialize(obj, nullptr); - } - template < typename T, typename = std::enable_if_t::value>> using createReturnType = decltype(T::create(std::declval())); template < - class T, + typename T, typename = std::enable_if_t::value>> static createReturnType deserialize( const folly::dynamic& obj, @@ -214,7 +216,7 @@ class ISerializable { } template < - class T, + typename T, typename = std::enable_if_t && !std::is_same_v>> static T deserialize(const folly::dynamic& obj, void* context = nullptr) { @@ -224,12 +226,12 @@ class ISerializable { return (T)raw; } - template >> + template >> static bool deserialize(const folly::dynamic& obj, void* context = nullptr) { return obj.asBool(); } - template >> + template >> static double deserialize( const folly::dynamic& obj, void* context = nullptr) { @@ -237,7 +239,7 @@ class ISerializable { } template < - class T, + typename T, typename = std::enable_if_t>> static std::string deserialize( const folly::dynamic& obj, @@ -246,7 +248,7 @@ class ISerializable { } template < - class T, + typename T, typename = std::enable_if_t< std::is_same_v>>> static folly::Optional< @@ -268,13 +270,13 @@ class ISerializable { using deserializeType = decltype(ISerializable::deserialize(std::declval())); - template ::value>> + template ::value>* = nullptr> static auto deserialize( const folly::dynamic& array, void* context = nullptr) { using deserializeValType = decltype(ISerializable::deserialize( - std::declval(), context)); + std::declval())); VELOX_USER_CHECK(array.isArray()); std::vector exprs; @@ -286,34 +288,44 @@ class ISerializable { } template < - class T, - typename = std::enable_if_t>>> - static std::map< - decltype(ISerializable::deserialize( - std::declval())), - decltype(ISerializable::deserialize( - std::declval()))> - deserialize(const folly::dynamic& obj, void* context = nullptr) { - using deserializeKeyType = - decltype(ISerializable::deserialize( - std::declval())); - - using deserializeMappedType = - decltype(ISerializable::deserialize( - std::declval())); + std::map>>* = nullptr> + static auto deserialize(const folly::dynamic& obj, void* context = nullptr) { + return deserialize( + obj, context); + } - std::map map; + template < + template + typename TMap, + typename TKey, + typename TMapped, + typename... 
TArgs, + typename = std::enable_if_t< + util::is_mappish>::value && + std::is_same_v< + typename TMap::key_type, + TKey> && + std::is_same_v< + typename TMap::mapped_type, + TMapped>>> + static auto deserialize(const folly::dynamic& obj, void* context = nullptr) { + using deserializeKeyType = decltype(ISerializable::deserialize( + std::declval())); + + using deserializeMappedType = decltype(ISerializable::deserialize( + std::declval())); + + TMap map; const folly::dynamic& keys = obj["keys"]; const folly::dynamic& values = obj["values"]; VELOX_USER_CHECK(keys.isArray() && values.isArray()); VELOX_USER_CHECK_EQ(keys.size(), values.size()); for (size_t idx = 0; idx < keys.size(); ++idx) { - auto first = - ISerializable::deserialize(keys[idx], context); - auto second = ISerializable::deserialize( - values[idx], context); + auto first = ISerializable::deserialize(keys[idx], context); + auto second = ISerializable::deserialize(values[idx], context); map.insert({first, second}); } return map; @@ -323,7 +335,7 @@ class ISerializable { private: template < - class T, + typename T, typename = std::enable_if_t>> static auto deserializeAsUniquePtr(const folly::dynamic& obj) { auto name = obj["name"].asString(); diff --git a/velox/common/serialization/tests/CMakeLists.txt b/velox/common/serialization/tests/CMakeLists.txt index d135a48750550..74b2baf630eaf 100644 --- a/velox/common/serialization/tests/CMakeLists.txt +++ b/velox/common/serialization/tests/CMakeLists.txt @@ -16,5 +16,11 @@ add_executable(velox_serialization_test TestRegistry.cpp SerializableTest.cpp) add_test(velox_serialization_test velox_serialization_test) target_link_libraries( - velox_serialization_test PRIVATE velox_exception velox_serialization - Folly::folly glog::glog gtest gtest_main) + velox_serialization_test + PRIVATE + velox_exception + velox_serialization + Folly::folly + glog::glog + GTest::gtest + GTest::gtest_main) diff --git a/velox/common/serialization/tests/SerializableTest.cpp b/velox/common/serialization/tests/SerializableTest.cpp index 8ec543d2d262c..90b60852b9b78 100644 --- a/velox/common/serialization/tests/SerializableTest.cpp +++ b/velox/common/serialization/tests/SerializableTest.cpp @@ -16,6 +16,7 @@ #include "velox/common/serialization/Serializable.h" #include +#include "folly/container/F14Map.h" #include "folly/json.h" using namespace ::facebook::velox; @@ -158,4 +159,26 @@ TEST(SerializableTest, context) { } } +template < + template + typename TMap, + typename TKey, + typename TMapped, + typename TIt, + typename... TArgs> +void testMap(TIt first, TIt last) { + TMap map{first, last}; + auto serialized = ISerializable::serialize(map); + auto copy = ISerializable::deserialize(serialized); + ASSERT_EQ(map, copy); +} + +TEST(SerializableTest, map) { + std::vector> vals{ + {1, "a"}, {2, "b"}, {3, "c"}}; + testMap(vals.begin(), vals.end()); + testMap(vals.begin(), vals.end()); + testMap(vals.begin(), vals.end()); +} + } // namespace diff --git a/velox/common/testutil/CMakeLists.txt b/velox/common/testutil/CMakeLists.txt index 04d43ce5a9fba..f33bb98be06f2 100644 --- a/velox/common/testutil/CMakeLists.txt +++ b/velox/common/testutil/CMakeLists.txt @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
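The rewritten map overloads above generalize serialization from `std::map` to any "mappish" container (selected via `util::is_mappish`), which is what lets the new `SerializableTest` cover `folly::F14FastMap` and `folly::F14NodeMap` alongside `std::map`. As a usage sketch under that API, using the `std::map` form of `deserialize` shown in the patch:

```
#include "velox/common/serialization/Serializable.h"

using namespace facebook::velox;

void mapRoundTripSketch() {
  std::map<int64_t, std::string> map{{1, "a"}, {2, "b"}, {3, "c"}};
  // The serialized dynamic holds two parallel arrays:
  // "keys": [1, 2, 3] and "values": ["a", "b", "c"].
  folly::dynamic serialized = ISerializable::serialize(map);
  auto copy =
      ISerializable::deserialize<std::map<int64_t, std::string>>(serialized);
  // copy now equals map.
}
```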
-add_library(velox_test_util TestValue.cpp) -target_link_libraries(velox_test_util PUBLIC velox_exception) +velox_add_library(velox_test_util ScopedTestTime.cpp TestValue.cpp) +velox_link_libraries(velox_test_util PUBLIC velox_exception) if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) diff --git a/velox/common/testutil/ScopedTestTime.cpp b/velox/common/testutil/ScopedTestTime.cpp new file mode 100644 index 0000000000000..ec25f40cfdb5f --- /dev/null +++ b/velox/common/testutil/ScopedTestTime.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/testutil/ScopedTestTime.h" + +#include "velox/common/base/Exceptions.h" + +namespace facebook::velox::common::testutil { +bool ScopedTestTime::enabled_ = false; +std::optional ScopedTestTime::testTimeUs_ = {}; + +ScopedTestTime::ScopedTestTime() { +#ifndef NDEBUG + VELOX_CHECK(!enabled_, "Only one ScopedTestTime can be active at a time"); + enabled_ = true; +#else + VELOX_UNREACHABLE("ScopedTestTime should only be used in debug mode"); +#endif +} + +ScopedTestTime::~ScopedTestTime() { + testTimeUs_.reset(); + enabled_ = false; +} + +void ScopedTestTime::setCurrentTestTimeSec(size_t currentTimeSec) { + setCurrentTestTimeMicro(currentTimeSec * 1000000); +} + +void ScopedTestTime::setCurrentTestTimeMs(size_t currentTimeMs) { + setCurrentTestTimeMicro(currentTimeMs * 1000); +} + +void ScopedTestTime::setCurrentTestTimeMicro(size_t currentTimeUs) { + testTimeUs_ = currentTimeUs; +} + +std::optional ScopedTestTime::getCurrentTestTimeSec() { + return testTimeUs_.has_value() ? std::make_optional(*testTimeUs_ / 1000000L) + : testTimeUs_; +} +std::optional ScopedTestTime::getCurrentTestTimeMs() { + return testTimeUs_.has_value() ? std::make_optional(*testTimeUs_ / 1000L) + : testTimeUs_; +} + +std::optional ScopedTestTime::getCurrentTestTimeMicro() { + return testTimeUs_; +} +} // namespace facebook::velox::common::testutil diff --git a/velox/common/testutil/ScopedTestTime.h b/velox/common/testutil/ScopedTestTime.h new file mode 100644 index 0000000000000..3a03a603fb311 --- /dev/null +++ b/velox/common/testutil/ScopedTestTime.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace facebook::velox::common::testutil { +// Used to override the current time for testing purposes. 
+class ScopedTestTime { + public: + ScopedTestTime(); + ~ScopedTestTime(); + + void setCurrentTestTimeSec(size_t currentTimeSec); + void setCurrentTestTimeMs(size_t currentTimeMs); + void setCurrentTestTimeMicro(size_t currentTimeUs); + + static std::optional getCurrentTestTimeSec(); + static std::optional getCurrentTestTimeMs(); + static std::optional getCurrentTestTimeMicro(); + + private: + // Used to verify only one instance of ScopedTestTime exists at a time. + static bool enabled_; + // The overridden value of current time only. + static std::optional testTimeUs_; +}; +} // namespace facebook::velox::common::testutil diff --git a/velox/common/testutil/tests/CMakeLists.txt b/velox/common/testutil/tests/CMakeLists.txt index 02b7ee5f1045b..d164600656a38 100644 --- a/velox/common/testutil/tests/CMakeLists.txt +++ b/velox/common/testutil/tests/CMakeLists.txt @@ -12,15 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. include(GoogleTest) -add_executable(velox_test_util_test TestValueTest.cpp SpillConfigTest.cpp) +add_executable(velox_test_util_test TestScopedTestTime.cpp TestValueTest.cpp) gtest_add_tests(velox_test_util_test "" AUTO) target_link_libraries( velox_test_util_test PRIVATE - velox_test_util - velox_exception - velox_spill_config - velox_exec - gtest - gtest_main) + velox_test_util + velox_exception + velox_exec + velox_time + GTest::gtest + GTest::gtest_main) diff --git a/velox/common/testutil/tests/TestScopedTestTime.cpp b/velox/common/testutil/tests/TestScopedTestTime.cpp new file mode 100644 index 0000000000000..4052ea256729e --- /dev/null +++ b/velox/common/testutil/tests/TestScopedTestTime.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "velox/common/base/Exceptions.h" +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/common/testutil/ScopedTestTime.h" +#include "velox/common/time/Timer.h" + +namespace { + +using namespace facebook::velox; +using namespace facebook::velox::common::testutil; + +// NOTE: we can only construct ScopedTestTime in debug builds. +DEBUG_ONLY_TEST(TestScopedTestTime, testSetCurrentTimeMs) { + { + ScopedTestTime scopedTestTime; + scopedTestTime.setCurrentTestTimeMs(1); + ASSERT_EQ(getCurrentTimeMs(), 1); + ASSERT_EQ(getCurrentTimeMicro(), 1000); + scopedTestTime.setCurrentTestTimeMs(2); + ASSERT_EQ(getCurrentTimeMs(), 2); + ASSERT_EQ(getCurrentTimeMicro(), 2000); + } + + // This should be the actual time, so we don't know what it is, but it + // shouldn't be equal to the overridden value. 
+ ASSERT_NE(getCurrentTimeMs(), 2); + ASSERT_NE(getCurrentTimeMicro(), 2000); +} + +DEBUG_ONLY_TEST(TestScopedTestTime, testSetCurrentTimeMicro) { + { + ScopedTestTime scopedTestTime; + scopedTestTime.setCurrentTestTimeMicro(1000); + ASSERT_EQ(getCurrentTimeMs(), 1); + ASSERT_EQ(getCurrentTimeMicro(), 1000); + scopedTestTime.setCurrentTestTimeMicro(2000); + ASSERT_EQ(getCurrentTimeMs(), 2); + ASSERT_EQ(getCurrentTimeMicro(), 2000); + } + + // This should be the actual time, so we don't know what it is, but it + // shouldn't be equal to the overridden value. + ASSERT_NE(getCurrentTimeMs(), 2); + ASSERT_NE(getCurrentTimeMicro(), 2000); +} + +DEBUG_ONLY_TEST(TestScopedTestTime, multipleScopedTestTimes) { + { + ScopedTestTime scopedTestTime; + scopedTestTime.setCurrentTestTimeMs(1); + ASSERT_EQ(getCurrentTimeMs(), 1); + ASSERT_EQ(getCurrentTimeMicro(), 1000); + } + + { + ScopedTestTime scopedTestTime; + // The previous scoped test time should have been cleared. + ASSERT_NE(getCurrentTimeMs(), 1); + ASSERT_NE(getCurrentTimeMicro(), 1000); + + scopedTestTime.setCurrentTestTimeMs(1); + ASSERT_EQ(getCurrentTimeMs(), 1); + ASSERT_EQ(getCurrentTimeMicro(), 1000); + + // Trying to create another ScopedTestTime with one already in scope should + // fail. + auto createScopedTestTime = []() { ScopedTestTime scopedTestTime2; }; + VELOX_ASSERT_THROW( + createScopedTestTime(), + "Only one ScopedTestTime can be active at a time"); + } +} +} // namespace diff --git a/velox/common/time/CMakeLists.txt b/velox/common/time/CMakeLists.txt index fc5ca7c80403c..8c9e39f518351 100644 --- a/velox/common/time/CMakeLists.txt +++ b/velox/common/time/CMakeLists.txt @@ -15,5 +15,6 @@ if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) endif() -add_library(velox_time Timer.cpp CpuWallTimer.cpp) -target_link_libraries(velox_time PUBLIC velox_process Folly::folly fmt::fmt) +velox_add_library(velox_time CpuWallTimer.cpp Timer.cpp) +velox_link_libraries(velox_time PUBLIC velox_process velox_test_util + Folly::folly fmt::fmt) diff --git a/velox/common/time/CpuWallTimer.h b/velox/common/time/CpuWallTimer.h index 223cde7751246..f60f23c19dc9e 100644 --- a/velox/common/time/CpuWallTimer.h +++ b/velox/common/time/CpuWallTimer.h @@ -65,23 +65,28 @@ template class DeltaCpuWallTimer { public: explicit DeltaCpuWallTimer(F&& func) - : cpuTimeStart_(process::threadCpuNanos()), - wallTimeStart_(std::chrono::steady_clock::now()), + : wallTimeStart_(std::chrono::steady_clock::now()), + cpuTimeStart_(process::threadCpuNanos()), func_(std::move(func)) {} ~DeltaCpuWallTimer() { - const CpuWallTiming deltaTiming{ - 1, - uint64_t(std::chrono::duration_cast( - std::chrono::steady_clock::now() - wallTimeStart_) - .count()), - process::threadCpuNanos() - cpuTimeStart_}; + // NOTE: End the cpu-time timing first, and then end the wall-time timing, + // so as to avoid the counter-intuitive phenomenon that the final calculated + // cpu-time is slightly larger than the wall-time. + uint64_t cpuTimeDuration = process::threadCpuNanos() - cpuTimeStart_; + uint64_t wallTimeDuration = + std::chrono::duration_cast( + std::chrono::steady_clock::now() - wallTimeStart_) + .count(); + const CpuWallTiming deltaTiming{1, wallTimeDuration, cpuTimeDuration}; func_(deltaTiming); } private: - const uint64_t cpuTimeStart_; + // NOTE: Put `wallTimeStart_` before `cpuTimeStart_`, so that wall-time starts + // counting earlier than cpu-time. 
const std::chrono::steady_clock::time_point wallTimeStart_; + const uint64_t cpuTimeStart_; F func_; }; diff --git a/velox/common/time/Timer.cpp b/velox/common/time/Timer.cpp index be82932f306c5..5598fb0ee6dbb 100644 --- a/velox/common/time/Timer.cpp +++ b/velox/common/time/Timer.cpp @@ -16,9 +16,36 @@ #include "velox/common/time/Timer.h" +#include "velox/common/testutil/ScopedTestTime.h" + namespace facebook::velox { using namespace std::chrono; +using common::testutil::ScopedTestTime; + +#ifndef NDEBUG + +size_t getCurrentTimeSec() { + return ScopedTestTime::getCurrentTestTimeSec().value_or( + duration_cast(system_clock::now().time_since_epoch()).count()); +} + +size_t getCurrentTimeMs() { + return ScopedTestTime::getCurrentTestTimeMs().value_or( + duration_cast(system_clock::now().time_since_epoch()) + .count()); +} + +size_t getCurrentTimeMicro() { + return ScopedTestTime::getCurrentTestTimeMicro().value_or( + duration_cast(system_clock::now().time_since_epoch()) + .count()); +} +#else + +size_t getCurrentTimeSec() { + return duration_cast(system_clock::now().time_since_epoch()).count(); +} size_t getCurrentTimeMs() { return duration_cast(system_clock::now().time_since_epoch()) @@ -29,5 +56,6 @@ size_t getCurrentTimeMicro() { return duration_cast(system_clock::now().time_since_epoch()) .count(); } +#endif } // namespace facebook::velox diff --git a/velox/common/time/Timer.h b/velox/common/time/Timer.h index e3325cbdba25e..ce3d8ceb03860 100644 --- a/velox/common/time/Timer.h +++ b/velox/common/time/Timer.h @@ -19,6 +19,7 @@ #include #include #include +#include namespace facebook::velox { @@ -43,6 +44,24 @@ class MicrosecondTimer { uint64_t* timer_; }; +class NanosecondTimer { + public: + explicit NanosecondTimer(uint64_t* timer) : timer_(timer) { + start_ = std::chrono::steady_clock::now(); + } + + ~NanosecondTimer() { + auto duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start_); + + (*timer_) += duration.count(); + } + + private: + std::chrono::steady_clock::time_point start_; + uint64_t* timer_; +}; + /// Measures the time between construction and destruction with CPU clock /// counter (rdtsc on X86) and increments a user-supplied counter with the cycle /// count. @@ -69,6 +88,9 @@ class ClockTimer { uint64_t start_; }; +// Returns the current epoch time in seconds. +size_t getCurrentTimeSec(); + /// Returns the current epoch time in milliseconds. size_t getCurrentTimeMs(); diff --git a/velox/common/time/tests/CMakeLists.txt b/velox/common/time/tests/CMakeLists.txt index 5dd964b97abfc..2b2bc5caa9dc6 100644 --- a/velox/common/time/tests/CMakeLists.txt +++ b/velox/common/time/tests/CMakeLists.txt @@ -15,7 +15,8 @@ include(GoogleTest) add_executable(velox_time_test CpuWallTimerTest.cpp) -target_link_libraries(velox_time_test PRIVATE velox_time glog::glog gtest - gtest_main) +target_link_libraries( + velox_time_test + PRIVATE velox_time glog::glog GTest::gtest GTest::gtest_main) gtest_add_tests(velox_time_test "" AUTO) diff --git a/velox/connectors/CMakeLists.txt b/velox/connectors/CMakeLists.txt index 6bf3f5b5f17a1..3cc600201f6b8 100644 --- a/velox/connectors/CMakeLists.txt +++ b/velox/connectors/CMakeLists.txt @@ -11,10 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
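Taken together, the `ScopedTestTime` and `Timer` changes above let any code that reads the clock through `getCurrentTimeSec()`, `getCurrentTimeMs()`, or `getCurrentTimeMicro()` be driven by a fake clock. A minimal sketch, assuming a debug build (under `NDEBUG` the override path is compiled out and constructing a `ScopedTestTime` is unreachable):

```
#include <cassert>

#include "velox/common/testutil/ScopedTestTime.h"
#include "velox/common/time/Timer.h"

using facebook::velox::getCurrentTimeMicro;
using facebook::velox::getCurrentTimeMs;
using facebook::velox::common::testutil::ScopedTestTime;

void clockOverrideSketch() {
  {
    ScopedTestTime testTime; // Only one may be active at a time.
    testTime.setCurrentTestTimeMs(5000);
    // All Timer accessors now report the injected instant.
    assert(getCurrentTimeMs() == 5000);
    assert(getCurrentTimeMicro() == 5000000);
  }
  // Destruction clears the override; the real system_clock is used again.
  assert(getCurrentTimeMs() != 5000);
}
```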
-add_library(velox_connector Connector.cpp) +velox_add_library(velox_connector Connector.cpp) -target_link_libraries(velox_connector velox_config velox_spill_config - velox_vector) +velox_link_libraries(velox_connector velox_common_config velox_vector) add_subdirectory(fuzzer) diff --git a/velox/connectors/Connector.cpp b/velox/connectors/Connector.cpp index a14e7538b9b5c..ed10fad669d4b 100644 --- a/velox/connectors/Connector.cpp +++ b/velox/connectors/Connector.cpp @@ -31,6 +31,18 @@ std::unordered_map>& connectors() { } } // namespace +bool DataSink::Stats::empty() const { + return numWrittenBytes == 0 && numWrittenFiles == 0 && spillStats.empty(); +} + +std::string DataSink::Stats::toString() const { + return fmt::format( + "numWrittenBytes {} numWrittenFiles {} {}", + succinctBytes(numWrittenBytes), + numWrittenFiles, + spillStats.toString()); +} + bool registerConnectorFactory(std::shared_ptr factory) { factory->initialize(); bool ok = @@ -42,6 +54,15 @@ bool registerConnectorFactory(std::shared_ptr factory) { return true; } +bool hasConnectorFactory(const std::string& connectorName) { + return connectorFactories().count(connectorName) == 1; +} + +bool unregisterConnectorFactory(const std::string& connectorName) { + auto count = connectorFactories().erase(connectorName); + return count == 1; +} + std::shared_ptr getConnectorFactory( const std::string& connectorName) { auto it = connectorFactories().find(connectorName); diff --git a/velox/connectors/Connector.h b/velox/connectors/Connector.h index 2f3ec9def722d..6aa1b55c9e2c4 100644 --- a/velox/connectors/Connector.h +++ b/velox/connectors/Connector.h @@ -15,38 +15,49 @@ */ #pragma once +#include "folly/CancellationToken.h" #include "velox/common/base/AsyncSource.h" +#include "velox/common/base/PrefixSortConfig.h" #include "velox/common/base/RuntimeMetrics.h" +#include "velox/common/base/SpillConfig.h" +#include "velox/common/base/SpillStats.h" #include "velox/common/caching/AsyncDataCache.h" #include "velox/common/caching/ScanTracker.h" -#include "velox/common/config/SpillConfig.h" #include "velox/common/future/VeloxPromise.h" #include "velox/core/ExpressionEvaluator.h" #include "velox/vector/ComplexVector.h" #include +namespace facebook::velox { +class Config; +} +namespace facebook::velox::wave { +class WaveDataSource; +} namespace facebook::velox::common { class Filter; } - -namespace facebook::velox { -class Config; +namespace facebook::velox::config { +class ConfigBase; } namespace facebook::velox::connector { class DataSource; -// A split represents a chunk of data that a connector should load and return -// as a RowVectorPtr, potentially after processing pushdowns. +/// A split represents a chunk of data that a connector should load and return +/// as a RowVectorPtr, potentially after processing pushdowns. 
struct ConnectorSplit { const std::string connectorId; + const int64_t splitWeight{0}; std::unique_ptr> dataSource; - explicit ConnectorSplit(const std::string& _connectorId) - : connectorId(_connectorId) {} + explicit ConnectorSplit( + const std::string& _connectorId, + int64_t _splitWeight = 0) + : connectorId(_connectorId), splitWeight(_splitWeight) {} virtual ~ConnectorSplit() {} @@ -93,19 +104,19 @@ class ConnectorTableHandle : public ISerializable { using ConnectorTableHandlePtr = std::shared_ptr; -/** - * Represents a request for writing to connector - */ +/// Represents a request for writing to connector class ConnectorInsertTableHandle : public ISerializable { public: virtual ~ConnectorInsertTableHandle() {} - // Whether multi-threaded write is supported by this connector. Planner uses - // this flag to determine number of drivers. + /// Whether multi-threaded write is supported by this connector. Planner uses + /// this flag to determine number of drivers. virtual bool supportsMultiThreading() const { return false; } + virtual std::string toString() const = 0; + folly::dynamic serialize() const override { VELOX_NYI(); } @@ -113,8 +124,10 @@ class ConnectorInsertTableHandle : public ISerializable { /// Represents the commit strategy for writing to connector. enum class CommitStrategy { - kNoCommit, // No more commit actions are needed. - kTaskCommit // Task level commit is needed. + /// No more commit actions are needed. + kNoCommit, + /// Task level commit is needed. + kTaskCommit }; /// Return a string encoding of the given commit strategy. @@ -135,29 +148,34 @@ CommitStrategy stringToCommitStrategy(const std::string& strategy); /// to be thread-safe. class DataSink { public: + struct Stats { + uint64_t numWrittenBytes{0}; + uint32_t numWrittenFiles{0}; + uint64_t writeIOTimeUs{0}; + common::SpillStats spillStats; + + bool empty() const; + + std::string toString() const; + }; + virtual ~DataSink() = default; /// Add the next data (vector) to be written. This call is blocking. /// TODO maybe at some point we want to make it async. virtual void appendData(RowVectorPtr input) = 0; - /// Returns the number of bytes written on disk by this data sink so far. - virtual int64_t getCompletedBytes() const { - return 0; - } - - /// Returns the number of files written on disk by this data sink so far. - virtual int32_t numWrittenFiles() const { - return 0; - } + /// Returns the stats of this data sink. + virtual Stats stats() const = 0; /// Called once after all data has been added via possibly multiple calls to /// appendData(). The function returns the metadata of written data in string - /// form on success. If 'success' is false, this function aborts any pending - /// data processing inside this data sink. - /// - /// NOTE: we don't expect any appendData() calls on a closed data sink object. - virtual std::vector close(bool success) = 0; + /// form. We don't expect any appendData() calls on a closed data sink object. + virtual std::vector close() = 0; + + /// Called to abort this data sink object and we don't expect any appendData() + /// calls on an aborted data sink object. + virtual void abort() = 0; }; class DataSource { @@ -165,62 +183,71 @@ class DataSource { static constexpr int64_t kUnknownRowSize = -1; virtual ~DataSource() = default; - // Add split to process, then call next multiple times to process the split. - // A split must be fully processed by next before another split can be - // added. Next returns nullptr to indicate that current split is fully - // processed. 
+ /// Add split to process, then call next multiple times to process the split. + /// A split must be fully processed by next before another split can be + /// added. Next returns nullptr to indicate that current split is fully + /// processed. virtual void addSplit(std::shared_ptr split) = 0; - // Process a split added via addSplit. Returns nullptr if split has been fully - // processed. Returns std::nullopt and sets the 'future' if started - // asynchronous work and needs to wait for it to complete to continue - // processing. The caller will wait for the 'future' to complete before - // calling 'next' again. + /// Process a split added via addSplit. Returns nullptr if split has been + /// fully processed. Returns std::nullopt and sets the 'future' if started + /// asynchronous work and needs to wait for it to complete to continue + /// processing. The caller will wait for the 'future' to complete before + /// calling 'next' again. virtual std::optional next( uint64_t size, velox::ContinueFuture& future) = 0; - // Add dynamically generated filter. - // @param outputChannel index into outputType specified in - // Connector::createDataSource() that identifies the column this filter - // applies to. + /// Add dynamically generated filter. + /// @param outputChannel index into outputType specified in + /// Connector::createDataSource() that identifies the column this filter + /// applies to. virtual void addDynamicFilter( column_index_t outputChannel, const std::shared_ptr& filter) = 0; - // Returns the number of input bytes processed so far. + /// Returns the number of input bytes processed so far. virtual uint64_t getCompletedBytes() = 0; - // Returns the number of input rows processed so far. + /// Returns the number of input rows processed so far. virtual uint64_t getCompletedRows() = 0; virtual std::unordered_map runtimeStats() = 0; - // Returns true if 'this' has initiated all the prefetch this will - // initiate. This means that the caller should schedule next splits - // to prefetch in the background. false if the source does not - // prefetch. + /// Returns true if 'this' has initiated all the prefetch this will initiate. + /// This means that the caller should schedule next splits to prefetch in the + /// background. false if the source does not prefetch. virtual bool allPrefetchIssued() const { return false; } - // Initializes this from 'source'. 'source' is effectively moved - // into 'this' Adaptation like dynamic filters stay in effect but - // the parts dealing with open files, prefetched data etc. are moved. 'source' - // is freed after the move. + /// Initializes this from 'source'. 'source' is effectively moved into 'this' + /// Adaptation like dynamic filters stay in effect but the parts dealing with + /// open files, prefetched data etc. are moved. 'source' is freed after the + /// move. virtual void setFromDataSource(std::unique_ptr /*source*/) { VELOX_UNSUPPORTED("setFromDataSource"); } - // Returns a connector dependent row size if available. This can be - // called after addSplit(). This estimates uncompressed data - // sizes. This is better than getCompletedBytes()/getCompletedRows() - // since these track sizes before decompression and may include - // read-ahead and extra IO from coalescing reads and will not - // fully account for size of sparsely accessed columns. + /// Returns a connector dependent row size if available. This can be + /// called after addSplit(). This estimates uncompressed data + /// sizes. 
This is better than getCompletedBytes()/getCompletedRows() + /// since these track sizes before decompression and may include + /// read-ahead and extra IO from coalescing reads and will not + /// fully account for size of sparsely accessed columns. virtual int64_t estimatedRowSize() { return kUnknownRowSize; } + + /// Returns a Wave delegate that implements the Wave Operator + /// interface for a GPU table scan. This should be called after + /// construction and no other methods should be called on 'this' + /// after creating the delegate. Splits, dynamic filters etc. will + /// be added to the WaveDataSource instead of 'this'. 'this' should + /// stay live until after the destruction of the delegate. + virtual std::shared_ptr toWaveDataSource() { + VELOX_UNSUPPORTED(); + } }; /// Collection of context data for use in a DataSource or DataSink. One instance @@ -232,28 +259,32 @@ class ConnectorQueryCtx { ConnectorQueryCtx( memory::MemoryPool* operatorPool, memory::MemoryPool* connectorPool, - memory::SetMemoryReclaimer setMemoryReclaimer, - const Config* connectorConfig, + const config::ConfigBase* sessionProperties, const common::SpillConfig* spillConfig, + common::PrefixSortConfig prefixSortConfig, std::unique_ptr expressionEvaluator, cache::AsyncDataCache* cache, const std::string& queryId, const std::string& taskId, const std::string& planNodeId, - int driverId) + int driverId, + const std::string& sessionTimezone, + folly::CancellationToken cancellationToken = {}) : operatorPool_(operatorPool), connectorPool_(connectorPool), - setMemoryReclaimer_(std::move(setMemoryReclaimer)), - config_(connectorConfig), + sessionProperties_(sessionProperties), spillConfig_(spillConfig), + prefixSortConfig_(prefixSortConfig), expressionEvaluator_(std::move(expressionEvaluator)), cache_(cache), scanId_(fmt::format("{}.{}", taskId, planNodeId)), queryId_(queryId), taskId_(taskId), driverId_(driverId), - planNodeId_(planNodeId) { - VELOX_CHECK_NOT_NULL(connectorConfig); + planNodeId_(planNodeId), + sessionTimezone_(sessionTimezone), + cancellationToken_(std::move(cancellationToken)) { + VELOX_CHECK_NOT_NULL(sessionProperties); } /// Returns the associated operator's memory pool which is a leaf kind of @@ -269,19 +300,16 @@ class ConnectorQueryCtx { return connectorPool_; } - /// Returns the callback to set memory pool reclaimer if set. This is used by - /// file writer to set memory reclaimer for its internal used memory pools to - /// integrate with memory arbitration. - const memory::SetMemoryReclaimer& setMemoryReclaimer() const { - return setMemoryReclaimer_; + const config::ConfigBase* sessionProperties() const { + return sessionProperties_; } - const Config* config() const { - return config_; + const common::SpillConfig* spillConfig() const { + return spillConfig_; } - const common::SpillConfig* getSpillConfig() const { - return spillConfig_; + const common::PrefixSortConfig& prefixSortConfig() const { + return prefixSortConfig_; } core::ExpressionEvaluator* expressionEvaluator() const { @@ -292,10 +320,10 @@ class ConnectorQueryCtx { return cache_; } - // This is a combination of task id and the scan's PlanNodeId. This is an id - // that allows sharing state between different threads of the same scan. This - // is used for locating a scanTracker, which tracks the read density of - // columns for prefetch and other memory hierarchy purposes. + /// This is a combination of task id and the scan's PlanNodeId. This is an id + /// that allows sharing state between different threads of the same scan. 
This + /// is used for locating a scanTracker, which tracks the read density of + /// columns for prefetch and other memory hierarchy purposes. const std::string& scanId() const { return scanId_; } @@ -316,12 +344,32 @@ class ConnectorQueryCtx { return planNodeId_; } + /// Session timezone used for reading Timestamp. Stores a string with the + /// actual timezone name. If the session timezone is not set in the + /// QueryConfig, it will return an empty string. + const std::string& sessionTimezone() const { + return sessionTimezone_; + } + + /// Returns the cancellation token associated with this task. + const folly::CancellationToken& cancellationToken() const { + return cancellationToken_; + } + + bool selectiveNimbleReaderEnabled() const { + return selectiveNimbleReaderEnabled_; + } + + void setSelectiveNimbleReaderEnabled(bool value) { + selectiveNimbleReaderEnabled_ = value; + } + private: memory::MemoryPool* const operatorPool_; memory::MemoryPool* const connectorPool_; - const memory::SetMemoryReclaimer setMemoryReclaimer_; - const Config* config_; + const config::ConfigBase* const sessionProperties_; const common::SpillConfig* const spillConfig_; + const common::PrefixSortConfig prefixSortConfig_; std::unique_ptr expressionEvaluator_; cache::AsyncDataCache* cache_; const std::string scanId_; @@ -329,14 +377,14 @@ class ConnectorQueryCtx { const std::string taskId_; const int driverId_; const std::string planNodeId_; + const std::string sessionTimezone_; + const folly::CancellationToken cancellationToken_; + bool selectiveNimbleReaderEnabled_{false}; }; class Connector { public: - explicit Connector( - const std::string& id, - std::shared_ptr properties) - : id_(id), properties_(std::move(properties)) {} + explicit Connector(const std::string& id) : id_(id) {} virtual ~Connector() = default; @@ -344,12 +392,13 @@ class Connector { return id_; } - const std::shared_ptr& connectorProperties() const { - return properties_; + virtual const std::shared_ptr& connectorConfig() + const { + VELOX_NYI("connectorConfig is not supported yet"); } - // Returns true if this connector would accept a filter dynamically generated - // during query execution. + /// Returns true if this connector would accept a filter dynamically generated + /// during query execution. virtual bool canAddDynamicFilter() const { return false; } @@ -362,10 +411,10 @@ class Connector { std::shared_ptr>& columnHandles, ConnectorQueryCtx* connectorQueryCtx) = 0; - // Returns true if addSplit of DataSource can use 'dataSource' from - // ConnectorSplit in addSplit(). If so, TableScan can preload splits - // so that file opening and metadata operations are off the Driver' - // thread. + /// Returns true if addSplit of DataSource can use 'dataSource' from + /// ConnectorSplit in addSplit(). If so, TableScan can preload splits + /// so that file opening and metadata operations are off the Driver' + /// thread. virtual bool supportsSplitPreload() { return false; } @@ -376,15 +425,15 @@ class Connector { ConnectorQueryCtx* connectorQueryCtx, CommitStrategy commitStrategy) = 0; - // Returns a ScanTracker for 'id'. 'id' uniquely identifies the - // tracker and different threads will share the same - // instance. 'loadQuantum' is the largest single IO for the query - // being tracked. + /// Returns a ScanTracker for 'id'. 'id' uniquely identifies the + /// tracker and different threads will share the same + /// instance. 'loadQuantum' is the largest single IO for the query + /// being tracked. 
static std::shared_ptr getTracker( const std::string& scanId, int32_t loadQuantum); - virtual folly::Executor* FOLLY_NULLABLE executor() const { + virtual folly::Executor* executor() const { return nullptr; } @@ -396,8 +445,6 @@ class Connector { static folly::Synchronized< std::unordered_map>> trackers_; - - const std::shared_ptr properties_; }; class ConnectorFactory { @@ -406,7 +453,7 @@ class ConnectorFactory { virtual ~ConnectorFactory() = default; - // Initialize is called during the factory registration. + /// Initialize is called during the factory registration. virtual void initialize() {} const std::string& connectorName() const { @@ -415,8 +462,8 @@ class ConnectorFactory { virtual std::shared_ptr newConnector( const std::string& id, - std::shared_ptr properties, - folly::Executor* FOLLY_NULLABLE executor = nullptr) = 0; + std::shared_ptr config, + folly::Executor* executor = nullptr) = 0; private: const std::string name_; @@ -428,6 +475,15 @@ class ConnectorFactory { /// FB_ANONYMOUS_VARIABLE. bool registerConnectorFactory(std::shared_ptr factory); +/// Returns true if a connector with the specified name has been registered, +/// false otherwise. +bool hasConnectorFactory(const std::string& connectorName); + +/// Unregister a connector factory by name. +/// Returns true if a connector with the specified name has been unregistered, +/// false otherwise. +bool unregisterConnectorFactory(const std::string& connectorName); + /// Returns a factory for creating connectors with the specified name. Throws if /// factory doesn't exist. std::shared_ptr getConnectorFactory( diff --git a/velox/connectors/fuzzer/CMakeLists.txt b/velox/connectors/fuzzer/CMakeLists.txt index 06bc812de021d..a777f8a29a4f5 100644 --- a/velox/connectors/fuzzer/CMakeLists.txt +++ b/velox/connectors/fuzzer/CMakeLists.txt @@ -14,8 +14,8 @@ add_library(velox_fuzzer_connector OBJECT FuzzerConnector.cpp) -target_link_libraries(velox_fuzzer_connector velox_connector - velox_vector_fuzzer) +target_link_libraries( + velox_fuzzer_connector velox_connector velox_vector_fuzzer) if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) diff --git a/velox/connectors/fuzzer/FuzzerConnector.cpp b/velox/connectors/fuzzer/FuzzerConnector.cpp index d2f44c6039dbd..7c75c956863a8 100644 --- a/velox/connectors/fuzzer/FuzzerConnector.cpp +++ b/velox/connectors/fuzzer/FuzzerConnector.cpp @@ -22,7 +22,7 @@ namespace facebook::velox::connector::fuzzer { FuzzerDataSource::FuzzerDataSource( const std::shared_ptr& outputType, const std::shared_ptr& tableHandle, - velox::memory::MemoryPool* FOLLY_NONNULL pool) + velox::memory::MemoryPool* pool) : outputType_(outputType), pool_(pool) { auto fuzzerTableHandle = std::dynamic_pointer_cast(tableHandle); diff --git a/velox/connectors/fuzzer/FuzzerConnector.h b/velox/connectors/fuzzer/FuzzerConnector.h index 89d544360b517..4e8f665608b40 100644 --- a/velox/connectors/fuzzer/FuzzerConnector.h +++ b/velox/connectors/fuzzer/FuzzerConnector.h @@ -15,6 +15,7 @@ */ #pragma once +#include "velox/common/config/Config.h" #include "velox/connectors/Connector.h" #include "velox/connectors/fuzzer/FuzzerConnectorSplit.h" #include "velox/vector/fuzzer/VectorFuzzer.h" @@ -56,7 +57,7 @@ class FuzzerDataSource : public DataSource { FuzzerDataSource( const std::shared_ptr& outputType, const std::shared_ptr& tableHandle, - velox::memory::MemoryPool* FOLLY_NONNULL pool); + velox::memory::MemoryPool* pool); void addSplit(std::shared_ptr split) override; @@ -96,16 +97,16 @@ class FuzzerDataSource : public DataSource { size_t 
   completedRows_{0};
  size_t completedBytes_{0};

-  memory::MemoryPool* FOLLY_NONNULL pool_;
+  memory::MemoryPool* pool_;
 };

 class FuzzerConnector final : public Connector {
  public:
   FuzzerConnector(
       const std::string& id,
-      std::shared_ptr<const Config> properties,
-      folly::Executor* FOLLY_NULLABLE /*executor*/)
-      : Connector(id, properties) {}
+      std::shared_ptr<const config::ConfigBase> config,
+      folly::Executor* /*executor*/)
+      : Connector(id) {}

   std::unique_ptr<DataSource> createDataSource(
       const std::shared_ptr<const RowType>& outputType,
@@ -113,7 +114,7 @@ class FuzzerConnector final : public Connector {
       const std::unordered_map<
           std::string,
           std::shared_ptr<connector::ColumnHandle>>& /*columnHandles*/,
-      ConnectorQueryCtx* FOLLY_NONNULL connectorQueryCtx) override final {
+      ConnectorQueryCtx* connectorQueryCtx) override final {
     return std::make_unique<FuzzerDataSource>(
         outputType, tableHandle, connectorQueryCtx->memoryPool());
   }
@@ -130,18 +131,18 @@ class FuzzerConnector final : public Connector {

 class FuzzerConnectorFactory : public ConnectorFactory {
  public:
-  static constexpr const char* FOLLY_NONNULL kFuzzerConnectorName{"fuzzer"};
+  static constexpr const char* kFuzzerConnectorName{"fuzzer"};

   FuzzerConnectorFactory() : ConnectorFactory(kFuzzerConnectorName) {}

-  explicit FuzzerConnectorFactory(const char* FOLLY_NONNULL connectorName)
+  explicit FuzzerConnectorFactory(const char* connectorName)
       : ConnectorFactory(connectorName) {}

   std::shared_ptr<Connector> newConnector(
       const std::string& id,
-      std::shared_ptr<const Config> properties,
-      folly::Executor* FOLLY_NULLABLE executor = nullptr) override {
-    return std::make_shared<FuzzerConnector>(id, properties, executor);
+      std::shared_ptr<const config::ConfigBase> config,
+      folly::Executor* executor = nullptr) override {
+    return std::make_shared<FuzzerConnector>(id, config, executor);
   }
 };
diff --git a/velox/connectors/fuzzer/FuzzerConnectorSplit.h b/velox/connectors/fuzzer/FuzzerConnectorSplit.h
index b82749e0be9b6..11080aa4382e5 100644
--- a/velox/connectors/fuzzer/FuzzerConnectorSplit.h
+++ b/velox/connectors/fuzzer/FuzzerConnectorSplit.h
@@ -28,3 +28,25 @@ struct FuzzerConnectorSplit : public connector::ConnectorSplit {
 };

 } // namespace facebook::velox::connector::fuzzer
+
+template <>
+struct fmt::formatter<facebook::velox::connector::fuzzer::FuzzerConnectorSplit>
+    : formatter<std::string> {
+  auto format(
+      facebook::velox::connector::fuzzer::FuzzerConnectorSplit s,
+      format_context& ctx) const {
+    return formatter<std::string>::format(s.toString(), ctx);
+  }
+};
+
+template <>
+struct fmt::formatter<
+    std::shared_ptr<facebook::velox::connector::fuzzer::FuzzerConnectorSplit>>
+    : formatter<std::string> {
+  auto format(
+      std::shared_ptr<facebook::velox::connector::fuzzer::FuzzerConnectorSplit>
+          s,
+      format_context& ctx) const {
+    return formatter<std::string>::format(s->toString(), ctx);
+  }
+};
diff --git a/velox/connectors/fuzzer/tests/CMakeLists.txt b/velox/connectors/fuzzer/tests/CMakeLists.txt
index 18e5c913fc16e..8036193390975 100644
--- a/velox/connectors/fuzzer/tests/CMakeLists.txt
+++ b/velox/connectors/fuzzer/tests/CMakeLists.txt
@@ -21,5 +21,5 @@ target_link_libraries(
   velox_vector_test_lib
   velox_exec_test_lib
   velox_aggregates
-  gtest
-  gtest_main)
+  GTest::gtest
+  GTest::gtest_main)
diff --git a/velox/connectors/fuzzer/tests/FuzzerConnectorTest.cpp b/velox/connectors/fuzzer/tests/FuzzerConnectorTest.cpp
index fad06218be862..5b5fe277ac595 100644
--- a/velox/connectors/fuzzer/tests/FuzzerConnectorTest.cpp
+++ b/velox/connectors/fuzzer/tests/FuzzerConnectorTest.cpp
@@ -31,8 +31,12 @@ TEST_F(FuzzerConnectorTest, singleSplit) {
   const size_t numRows = 100;
   auto type = ROW({BIGINT(), DOUBLE(), VARCHAR()});

-  auto plan =
-      PlanBuilder().tableScan(type, makeFuzzerTableHandle(), {}).planNode();
+  auto plan = PlanBuilder()
+                  .startTableScan()
+                  .outputType(type)
+                  .tableHandle(makeFuzzerTableHandle())
+                  .endTableScan()
+                  .planNode();

   exec::test::AssertQueryBuilder(plan)
       .split(makeFuzzerSplit(numRows))
@@ -43,8 +47,12 @@ TEST_F(FuzzerConnectorTest, floatingPoints) {
   const size_t numRows = 1000;
   auto type = ROW({REAL(), DOUBLE()});

-  auto plan =
-      PlanBuilder().tableScan(type, makeFuzzerTableHandle(), {}).planNode();
+  auto plan = PlanBuilder()
+                  .startTableScan()
+                  .outputType(type)
+                  .tableHandle(makeFuzzerTableHandle())
+                  .endTableScan()
+                  .planNode();

   exec::test::AssertQueryBuilder(plan)
       .split(makeFuzzerSplit(numRows))
@@ -59,8 +67,12 @@ TEST_F(FuzzerConnectorTest, complexTypes) {
       REAL(),
   });

-  auto plan =
-      PlanBuilder().tableScan(type, makeFuzzerTableHandle(), {}).planNode();
+  auto plan = PlanBuilder()
+                  .startTableScan()
+                  .outputType(type)
+                  .tableHandle(makeFuzzerTableHandle())
+                  .endTableScan()
+                  .planNode();

   exec::test::AssertQueryBuilder(plan)
       .split(makeFuzzerSplit(numRows))
@@ -72,8 +84,12 @@ TEST_F(FuzzerConnectorTest, multipleSplits) {
   const size_t numSplits = 10;
   auto type = ROW({BIGINT(), DOUBLE(), VARCHAR()});

-  auto plan =
-      PlanBuilder().tableScan(type, makeFuzzerTableHandle(), {}).planNode();
+  auto plan = PlanBuilder()
+                  .startTableScan()
+                  .outputType(type)
+                  .tableHandle(makeFuzzerTableHandle())
+                  .endTableScan()
+                  .planNode();

   exec::test::AssertQueryBuilder(plan)
       .splits(makeFuzzerSplits(rowsPerSplit, numSplits))
@@ -89,8 +105,12 @@ TEST_F(FuzzerConnectorTest, randomTypes) {
   for (size_t i = 0; i < iterations; ++i) {
     auto type = VectorFuzzer({}, pool()).randRowType();

-    auto plan =
-        PlanBuilder().tableScan(type, makeFuzzerTableHandle(), {}).planNode();
+    auto plan = PlanBuilder()
+                    .startTableScan()
+                    .outputType(type)
+                    .tableHandle(makeFuzzerTableHandle())
+                    .endTableScan()
+                    .planNode();
     exec::test::AssertQueryBuilder(plan)
         .splits(makeFuzzerSplits(rowsPerSplit, numSplits))
         .assertTypeAndNumRows(type, rowsPerSplit * numSplits);
@@ -101,14 +121,18 @@ TEST_F(FuzzerConnectorTest, reproducible) {
   const size_t numRows = 100;
   auto type = ROW({BIGINT(), ARRAY(INTEGER()), VARCHAR()});

-  auto plan1 =
-      PlanBuilder()
-          .tableScan(type, makeFuzzerTableHandle(/*fuzerSeed=*/1234), {})
-          .planNode();
-  auto plan2 =
-      PlanBuilder()
-          .tableScan(type, makeFuzzerTableHandle(/*fuzerSeed=*/1234), {})
-          .planNode();
+  auto plan1 = PlanBuilder()
+                   .startTableScan()
+                   .outputType(type)
+                   .tableHandle(makeFuzzerTableHandle(/*fuzerSeed=*/1234))
+                   .endTableScan()
+                   .planNode();
+  auto plan2 = PlanBuilder()
+                   .startTableScan()
+                   .outputType(type)
+                   .tableHandle(makeFuzzerTableHandle(/*fuzerSeed=*/1234))
+                   .endTableScan()
+                   .planNode();

   auto results1 = exec::test::AssertQueryBuilder(plan1)
                       .split(makeFuzzerSplit(numRows))
@@ -124,6 +148,6 @@

 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
-  folly::init(&argc, &argv, false);
+  folly::Init init(&argc, &argv, false);
   return RUN_ALL_TESTS();
 }
diff --git a/velox/connectors/fuzzer/tests/FuzzerConnectorTestBase.h b/velox/connectors/fuzzer/tests/FuzzerConnectorTestBase.h
index 9e7f536157e8a..e47cc810012fb 100644
--- a/velox/connectors/fuzzer/tests/FuzzerConnectorTestBase.h
+++ b/velox/connectors/fuzzer/tests/FuzzerConnectorTestBase.h
@@ -26,10 +26,11 @@ class FuzzerConnectorTestBase : public exec::test::OperatorTestBase {
   void SetUp() override {
     OperatorTestBase::SetUp();
+    std::shared_ptr<const config::ConfigBase> config;
     auto fuzzerConnector =
         connector::getConnectorFactory(
             connector::fuzzer::FuzzerConnectorFactory::kFuzzerConnectorName)
-            ->newConnector(kFuzzerConnectorId, nullptr);
+            ->newConnector(kFuzzerConnectorId, config);
     connector::registerConnector(fuzzerConnector);
   }
diff --git a/velox/connectors/hive/CMakeLists.txt b/velox/connectors/hive/CMakeLists.txt
index 5fbd19245169e..8beee704f79e6 100644
--- a/velox/connectors/hive/CMakeLists.txt
+++ b/velox/connectors/hive/CMakeLists.txt
@@ -12,15 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-add_library(velox_hive_config OBJECT HiveConfig.cpp)
+velox_add_library(velox_hive_config OBJECT HiveConfig.cpp)
+velox_link_libraries(velox_hive_config velox_core velox_exception)

-target_link_libraries(velox_hive_config velox_exception)
+add_subdirectory(iceberg)

-add_library(
-  velox_hive_connector OBJECT
+velox_add_library(
+  velox_hive_connector
+  OBJECT
   FileHandle.cpp
   HiveConfig.cpp
   HiveConnector.cpp
+  HiveConnectorUtil.cpp
   HiveDataSink.cpp
   HiveDataSource.cpp
   HivePartitionUtil.cpp
@@ -28,24 +31,29 @@ add_library(
   SplitReader.cpp
   TableHandle.cpp)

-target_link_libraries(
+velox_link_libraries(
   velox_hive_connector
-  velox_common_io
-  velox_connector
-  velox_dwio_catalog_fbhive
-  velox_dwio_dwrf_reader
-  velox_dwio_dwrf_writer
-  velox_dwio_parquet_reader
-  velox_dwio_parquet_writer
-  velox_file
-  velox_hive_partition_function
-  velox_s3fs
-  velox_hdfs
-  velox_gcs)
-
-add_library(velox_hive_partition_function HivePartitionFunction.cpp)
-
-target_link_libraries(velox_hive_partition_function velox_core velox_exec)
+  PUBLIC velox_hive_iceberg_splitreader
+  PRIVATE
+    velox_common_io
+    velox_connector
+    velox_dwio_catalog_fbhive
+    velox_dwio_dwrf_reader
+    velox_dwio_dwrf_writer
+    velox_dwio_orc_reader
+    velox_dwio_parquet_reader
+    velox_dwio_parquet_writer
+    velox_file
+    velox_hive_partition_function
+    velox_type_tz
+    velox_s3fs
+    velox_hdfs
+    velox_gcs
+    velox_abfs)
+
+velox_add_library(velox_hive_partition_function HivePartitionFunction.cpp)
+
+velox_link_libraries(velox_hive_partition_function velox_core velox_exec)

 add_subdirectory(storage_adapters)
diff --git a/velox/connectors/hive/FileHandle.cpp b/velox/connectors/hive/FileHandle.cpp
index 44c219b328505..7678fb7a6c35c 100644
--- a/velox/connectors/hive/FileHandle.cpp
+++ b/velox/connectors/hive/FileHandle.cpp
@@ -24,6 +24,12 @@

 namespace facebook::velox {

+uint64_t FileHandleSizer::operator()(const FileHandle& fileHandle) {
+  // TODO: add to support variable file cache size support when the file system
+  // underneath supports.
+  return 1;
+}
+
 namespace {
 // The group tracking is at the level of the directory, i.e. Hive partition.
 std::string groupName(const std::string& filename) {
@@ -33,25 +39,30 @@ std::string groupName(const std::string& filename) {
 }
 } // namespace

-std::shared_ptr<FileHandle> FileHandleGenerator::operator()(
-    const std::string& filename) {
+std::unique_ptr<FileHandle> FileHandleGenerator::operator()(
+    const std::string& filename,
+    const FileProperties* properties) {
   // We have seen cases where drivers are stuck when creating file handles.
   // Adding a trace here to spot this more easily in future.
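
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this diff): the generator shape the
// FileHandle.cpp hunks above introduce. The generator now returns a
// std::unique_ptr and accepts optional FileProperties, so a split that
// already knows the file size lets the generator skip a filesystem metadata
// lookup. The types below are simplified stand-ins, not the Velox definitions.
#include <cstdint>
#include <iostream>
#include <memory>
#include <optional>
#include <string>

struct FilePropertiesSketch {
  std::optional<int64_t> fileSize;
};

struct FileHandleSketch {
  std::string path;
  int64_t size{-1};
};

struct FileHandleGeneratorSketch {
  // Called by the cache on a miss; `properties` may be null.
  std::unique_ptr<FileHandleSketch> operator()(
      const std::string& filename,
      const FilePropertiesSketch* properties) const {
    auto handle = std::make_unique<FileHandleSketch>();
    handle->path = filename;
    // Use the pre-known size if the caller supplied one; a real
    // implementation would otherwise stat the file here.
    handle->size =
        (properties && properties->fileSize) ? *properties->fileSize : 0;
    return handle;
  }
};

int main() {
  FilePropertiesSketch props{1024};
  auto handle = FileHandleGeneratorSketch{}("/tmp/part-0.orc", &props);
  std::cout << handle->path << " size=" << handle->size << "\n";
}
// ---------------------------------------------------------------------------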
process::TraceContext trace("FileHandleGenerator::operator()"); uint64_t elapsedTimeUs{0}; - std::shared_ptr fileHandle; + std::unique_ptr fileHandle; { MicrosecondTimer timer(&elapsedTimeUs); - fileHandle = std::make_shared(); + fileHandle = std::make_unique(); + filesystems::FileOptions options; + if (properties) { + options.fileSize = properties->fileSize; + } fileHandle->file = filesystems::getFileSystem(filename, properties_) - ->openFileForRead(filename); + ->openFileForRead(filename, options); fileHandle->uuid = StringIdLease(fileIds(), filename); fileHandle->groupId = StringIdLease(fileIds(), groupName(filename)); VLOG(1) << "Generating file handle for: " << filename << " uuid: " << fileHandle->uuid.id(); } - REPORT_ADD_HISTOGRAM_VALUE( - kCounterHiveFileHandleGenerateLatencyMs, elapsedTimeUs / 1000); + RECORD_HISTOGRAM_METRIC_VALUE( + kMetricHiveFileHandleGenerateLatencyMs, elapsedTimeUs / 1000); // TODO: build the hash map/etc per file type -- presumably after reading // the appropriate magic number from the file, or perhaps we include the file // type in the file handle key. diff --git a/velox/connectors/hive/FileHandle.h b/velox/connectors/hive/FileHandle.h index 15edd9d2ac2f4..5db30b1d7f4c3 100644 --- a/velox/connectors/hive/FileHandle.h +++ b/velox/connectors/hive/FileHandle.h @@ -25,19 +25,14 @@ #pragma once -#include -#include -#include - #include "velox/common/caching/CachedFactory.h" #include "velox/common/caching/FileIds.h" +#include "velox/common/config/Config.h" #include "velox/common/file/File.h" -#include "velox/dwio/common/InputStream.h" +#include "velox/connectors/hive/FileProperties.h" namespace facebook::velox { -class Config; - // See the file comment. struct FileHandle { std::shared_ptr file; @@ -59,24 +54,35 @@ struct FileHandle { // first diff we'll not include the map. }; +/// Estimates the memory usage of a FileHandle object. +struct FileHandleSizer { + uint64_t operator()(const FileHandle& a); +}; + using FileHandleCache = SimpleLRUCache; // Creates FileHandles via the Generator interface the CachedFactory requires. class FileHandleGenerator { public: FileHandleGenerator() {} - FileHandleGenerator(std::shared_ptr properties) + FileHandleGenerator(std::shared_ptr properties) : properties_(std::move(properties)) {} - std::shared_ptr operator()(const std::string& filename); + std::unique_ptr operator()( + const std::string& filename, + const FileProperties* properties); private: - const std::shared_ptr properties_; + const std::shared_ptr properties_; }; using FileHandleFactory = CachedFactory< std::string, - std::shared_ptr, - FileHandleGenerator>; + FileHandle, + FileHandleGenerator, + FileProperties, + FileHandleSizer>; + +using FileHandleCachedPtr = CachedPtr; using FileHandleCacheStats = SimpleLRUCacheStats; diff --git a/velox/connectors/hive/FileProperties.h b/velox/connectors/hive/FileProperties.h new file mode 100644 index 0000000000000..6e6a0dbe5ccb0 --- /dev/null +++ b/velox/connectors/hive/FileProperties.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// A FileHandle is a File pointer plus some (optional, file-type-dependent) +// extra information for speeding up loading columnar data. For example, when +// we open a file we might build a hash map saying what region(s) on disk +// correspond to a given column in a given stripe. +// +// The FileHandle will normally be used in conjunction with a CachedFactory +// to speed up queries that hit the same files repeatedly; see the +// FileHandleCache and FileHandleFactory. + +#pragma once + +#include + +namespace facebook::velox { + +struct FileProperties { + std::optional fileSize; + std::optional modificationTime; +}; + +} // namespace facebook::velox diff --git a/velox/connectors/hive/HiveConfig.cpp b/velox/connectors/hive/HiveConfig.cpp index 003e4e448166b..27ee4cd8c6338 100644 --- a/velox/connectors/hive/HiveConfig.cpp +++ b/velox/connectors/hive/HiveConfig.cpp @@ -15,7 +15,8 @@ */ #include "velox/connectors/hive/HiveConfig.h" -#include "velox/core/Config.h" +#include "velox/common/config/Config.h" +#include "velox/core/QueryConfig.h" #include @@ -38,16 +39,6 @@ stringToInsertExistingPartitionsBehavior(const std::string& strValue) { } // namespace -// static -HiveConfig::InsertExistingPartitionsBehavior -HiveConfig::insertExistingPartitionsBehavior(const Config* config) { - const auto behavior = - config->get(kInsertExistingPartitionsBehavior); - return behavior.has_value() - ? stringToInsertExistingPartitionsBehavior(behavior.value()) - : InsertExistingPartitionsBehavior::kError; -} - // static std::string HiveConfig::insertExistingPartitionsBehaviorString( InsertExistingPartitionsBehavior behavior) { @@ -61,108 +52,287 @@ std::string HiveConfig::insertExistingPartitionsBehaviorString( } } -// static -uint32_t HiveConfig::maxPartitionsPerWriters(const Config* config) { - return config->get(kMaxPartitionsPerWriters, 100); +HiveConfig::InsertExistingPartitionsBehavior +HiveConfig::insertExistingPartitionsBehavior( + const config::ConfigBase* session) const { + return stringToInsertExistingPartitionsBehavior(session->get( + kInsertExistingPartitionsBehaviorSession, + config_->get(kInsertExistingPartitionsBehavior, "ERROR"))); } -// static -bool HiveConfig::immutablePartitions(const Config* config) { - return config->get(kImmutablePartitions, false); +uint32_t HiveConfig::maxPartitionsPerWriters( + const config::ConfigBase* session) const { + return session->get( + kMaxPartitionsPerWritersSession, + config_->get(kMaxPartitionsPerWriters, 100)); } -// static -bool HiveConfig::s3UseVirtualAddressing(const Config* config) { - return !config->get(kS3PathStyleAccess, false); +bool HiveConfig::immutablePartitions() const { + return config_->get(kImmutablePartitions, false); } -// static -std::string HiveConfig::s3GetLogLevel(const Config* config) { - return config->get(kS3LogLevel, std::string("FATAL")); +bool HiveConfig::s3UseVirtualAddressing() const { + return !config_->get(kS3PathStyleAccess, false); } -// static -bool HiveConfig::s3UseSSL(const Config* config) { - return config->get(kS3SSLEnabled, true); +std::string HiveConfig::s3GetLogLevel() const { + return config_->get(kS3LogLevel, std::string("FATAL")); } -// static -bool HiveConfig::s3UseInstanceCredentials(const Config* config) { - return config->get(kS3UseInstanceCredentials, false); +bool HiveConfig::s3UseSSL() const { + return config_->get(kS3SSLEnabled, true); } -// static -std::string HiveConfig::s3Endpoint(const 
Config* config) { - return config->get(kS3Endpoint, std::string("")); +bool HiveConfig::s3UseInstanceCredentials() const { + return config_->get(kS3UseInstanceCredentials, false); } -// static -std::optional HiveConfig::s3AccessKey(const Config* config) { - if (config->isValueExists(kS3AwsAccessKey)) { - return config->get(kS3AwsAccessKey).value(); - } - return {}; +std::string HiveConfig::s3Endpoint() const { + return config_->get(kS3Endpoint, std::string("")); } -// static -std::optional HiveConfig::s3SecretKey(const Config* config) { - if (config->isValueExists(kS3AwsSecretKey)) { - return config->get(kS3AwsSecretKey).value(); - } - return {}; +std::optional HiveConfig::s3AccessKey() const { + return static_cast>( + config_->get(kS3AwsAccessKey)); } -// static -std::optional HiveConfig::s3IAMRole(const Config* config) { - if (config->isValueExists(kS3IamRole)) { - return config->get(kS3IamRole).value(); +std::optional HiveConfig::s3SecretKey() const { + return static_cast>( + config_->get(kS3AwsSecretKey)); +} + +std::optional HiveConfig::s3IAMRole() const { + return static_cast>( + config_->get(kS3IamRole)); +} + +std::string HiveConfig::s3IAMRoleSessionName() const { + return config_->get(kS3IamRoleSessionName, std::string("velox-session")); +} + +std::optional HiveConfig::s3ConnectTimeout() const { + return static_cast>( + config_->get(kS3ConnectTimeout)); +} + +std::optional HiveConfig::s3SocketTimeout() const { + return static_cast>( + config_->get(kS3SocketTimeout)); +} + +std::optional HiveConfig::s3MaxConnections() const { + return static_cast>( + config_->get(kS3MaxConnections)); +} + +std::optional HiveConfig::s3MaxAttempts() const { + return static_cast>( + config_->get(kS3MaxAttempts)); +} + +std::optional HiveConfig::s3RetryMode() const { + return static_cast>( + config_->get(kS3RetryMode)); +} + +std::string HiveConfig::gcsEndpoint() const { + return config_->get(kGCSEndpoint, std::string("")); +} + +std::string HiveConfig::gcsScheme() const { + return config_->get(kGCSScheme, std::string("https")); +} + +std::string HiveConfig::gcsCredentials() const { + return config_->get(kGCSCredentials, std::string("")); +} + +std::optional HiveConfig::gcsMaxRetryCount() const { + return static_cast>(config_->get(kGCSMaxRetryCount)); +} + +std::optional HiveConfig::gcsMaxRetryTime() const { + return static_cast>( + config_->get(kGCSMaxRetryTime)); +} + +bool HiveConfig::isOrcUseColumnNames(const config::ConfigBase* session) const { + return session->get( + kOrcUseColumnNamesSession, config_->get(kOrcUseColumnNames, false)); +} + +bool HiveConfig::isFileColumnNamesReadAsLowerCase( + const config::ConfigBase* session) const { + return session->get( + kFileColumnNamesReadAsLowerCaseSession, + config_->get(kFileColumnNamesReadAsLowerCase, false)); +} + +bool HiveConfig::isPartitionPathAsLowerCase( + const config::ConfigBase* session) const { + return session->get(kPartitionPathAsLowerCaseSession, true); +} + +bool HiveConfig::allowNullPartitionKeys( + const config::ConfigBase* session) const { + return session->get( + kAllowNullPartitionKeysSession, + config_->get(kAllowNullPartitionKeys, true)); +} + +bool HiveConfig::ignoreMissingFiles(const config::ConfigBase* session) const { + return session->get(kIgnoreMissingFilesSession, false); +} + +int64_t HiveConfig::maxCoalescedBytes() const { + return config_->get(kMaxCoalescedBytes, 128 << 20); +} + +int32_t HiveConfig::maxCoalescedDistanceBytes() const { + return config_->get(kMaxCoalescedDistanceBytes, 512 << 10); +} + +int32_t 
HiveConfig::prefetchRowGroups() const { + return config_->get(kPrefetchRowGroups, 1); +} + +int32_t HiveConfig::loadQuantum() const { + return config_->get(kLoadQuantum, 8 << 20); +} + +int32_t HiveConfig::numCacheFileHandles() const { + return config_->get(kNumCacheFileHandles, 20'000); +} + +bool HiveConfig::isFileHandleCacheEnabled() const { + return config_->get(kEnableFileHandleCache, true); +} + +uint64_t HiveConfig::orcWriterMaxStripeSize( + const config::ConfigBase* session) const { + return config::toCapacity( + session->get( + kOrcWriterMaxStripeSizeSession, + config_->get(kOrcWriterMaxStripeSize, "64MB")), + config::CapacityUnit::BYTE); +} + +uint64_t HiveConfig::orcWriterMaxDictionaryMemory( + const config::ConfigBase* session) const { + return config::toCapacity( + session->get( + kOrcWriterMaxDictionaryMemorySession, + config_->get(kOrcWriterMaxDictionaryMemory, "16MB")), + config::CapacityUnit::BYTE); +} + +bool HiveConfig::isOrcWriterIntegerDictionaryEncodingEnabled( + const config::ConfigBase* session) const { + return session->get( + kOrcWriterIntegerDictionaryEncodingEnabledSession, + config_->get(kOrcWriterIntegerDictionaryEncodingEnabled, true)); +} + +bool HiveConfig::isOrcWriterStringDictionaryEncodingEnabled( + const config::ConfigBase* session) const { + return session->get( + kOrcWriterStringDictionaryEncodingEnabledSession, + config_->get(kOrcWriterStringDictionaryEncodingEnabled, true)); +} + +bool HiveConfig::orcWriterLinearStripeSizeHeuristics( + const config::ConfigBase* session) const { + return session->get( + kOrcWriterLinearStripeSizeHeuristicsSession, + config_->get(kOrcWriterLinearStripeSizeHeuristics, true)); +} + +uint64_t HiveConfig::orcWriterMinCompressionSize( + const config::ConfigBase* session) const { + return session->get( + kOrcWriterMinCompressionSizeSession, + config_->get(kOrcWriterMinCompressionSize, 1024)); +} + +std::optional HiveConfig::orcWriterCompressionLevel( + const config::ConfigBase* session) const { + auto sessionProp = session->get(kOrcWriterCompressionLevelSession); + + if (sessionProp.has_value()) { + return sessionProp.value(); + } + + auto configProp = config_->get(kOrcWriterCompressionLevel); + + if (configProp.has_value()) { + return configProp.value(); } - return {}; + + // Presto has a single config controlling this value, but different defaults + // depending on the compression kind. 
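
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this diff): the stripe and dictionary
// getters above take human-readable capacities ("64MB", "16MB") and convert
// them to bytes via config::toCapacity. A minimal converter, assuming only
// B/KB/MB/GB suffixes here (the real helper is more general):
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>

uint64_t toBytesSketch(const std::string& capacity) {
  size_t pos = 0;
  const double value = std::stod(capacity, &pos); // Numeric prefix.
  const std::string unit = capacity.substr(pos); // Remaining suffix.
  if (unit == "B") return static_cast<uint64_t>(value);
  if (unit == "KB") return static_cast<uint64_t>(value * (1ull << 10));
  if (unit == "MB") return static_cast<uint64_t>(value * (1ull << 20));
  if (unit == "GB") return static_cast<uint64_t>(value * (1ull << 30));
  throw std::invalid_argument("Unknown capacity unit: " + unit);
}

int main() {
  std::cout << toBytesSketch("64MB") << "\n"; // 67108864
}
// ---------------------------------------------------------------------------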
+ return std::nullopt; } -// static -std::string HiveConfig::s3IAMRoleSessionName(const Config* config) { - return config->get(kS3IamRoleSessionName, std::string("velox-session")); +uint8_t HiveConfig::orcWriterZLIBCompressionLevel( + const config::ConfigBase* session) const { + constexpr uint8_t kDefaultZlibCompressionLevel = 4; + return orcWriterCompressionLevel(session).value_or( + kDefaultZlibCompressionLevel); } -// static -std::string HiveConfig::gcsEndpoint(const Config* config) { - return config->get(kGCSEndpoint, std::string("")); +uint8_t HiveConfig::orcWriterZSTDCompressionLevel( + const config::ConfigBase* session) const { + constexpr uint8_t kDefaultZstdCompressionLevel = 3; + return orcWriterCompressionLevel(session).value_or( + kDefaultZstdCompressionLevel); } -// static -std::string HiveConfig::gcsScheme(const Config* config) { - return config->get(kGCSScheme, std::string("https")); +std::string HiveConfig::writeFileCreateConfig() const { + return config_->get(kWriteFileCreateConfig, ""); } -// static -std::string HiveConfig::gcsCredentials(const Config* config) { - return config->get(kGCSCredentials, std::string("")); +uint32_t HiveConfig::sortWriterMaxOutputRows( + const config::ConfigBase* session) const { + return session->get( + kSortWriterMaxOutputRowsSession, + config_->get(kSortWriterMaxOutputRows, 1024)); +} + +uint64_t HiveConfig::sortWriterMaxOutputBytes( + const config::ConfigBase* session) const { + return config::toCapacity( + session->get( + kSortWriterMaxOutputBytesSession, + config_->get(kSortWriterMaxOutputBytes, "10MB")), + config::CapacityUnit::BYTE); } -// static. -bool HiveConfig::isOrcUseColumnNames(const Config* config) { - return config->get(kOrcUseColumnNames, false); +uint64_t HiveConfig::footerEstimatedSize() const { + return config_->get(kFooterEstimatedSize, 1UL << 20); } -// static. -bool HiveConfig::isFileColumnNamesReadAsLowerCase(const Config* config) { - return config->get(kFileColumnNamesReadAsLowerCase, false); +uint64_t HiveConfig::filePreloadThreshold() const { + return config_->get(kFilePreloadThreshold, 8UL << 20); } -// static. -int64_t HiveConfig::maxCoalescedBytes(const Config* config) { - return config->get(kMaxCoalescedBytes, 128 << 20); +bool HiveConfig::s3UseProxyFromEnv() const { + return config_->get(kS3UseProxyFromEnv, false); } -// static. -int32_t HiveConfig::maxCoalescedDistanceBytes(const Config* config) { - return config->get(kMaxCoalescedDistanceBytes, 512 << 10); +uint8_t HiveConfig::readTimestampUnit(const config::ConfigBase* session) const { + const auto unit = session->get( + kReadTimestampUnitSession, + config_->get(kReadTimestampUnit, 3 /*milli*/)); + VELOX_CHECK( + unit == 3 || unit == 6 /*micro*/ || unit == 9 /*nano*/, + "Invalid timestamp unit."); + return unit; } -// static. 
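
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this diff): the lookup order shared by the
// rewritten HiveConfig getters above. A session property overrides the
// connector config, which overrides a hard-coded default. ConfigSketch is a
// simplified stand-in for config::ConfigBase.
#include <iostream>
#include <map>
#include <string>

struct ConfigSketch {
  std::map<std::string, int> values;
  int get(const std::string& key, int defaultValue) const {
    auto it = values.find(key);
    return it == values.end() ? defaultValue : it->second;
  }
};

int maxPartitionsPerWriters(
    const ConfigSketch& session,
    const ConfigSketch& config) {
  // Mirrors: session->get(kSessionKey, config_->get(kConfigKey, 100)).
  return session.get(
      "max_partitions_per_writers",
      config.get("max-partitions-per-writers", 100));
}

int main() {
  ConfigSketch session, config;
  std::cout << maxPartitionsPerWriters(session, config) << "\n"; // 100
  config.values["max-partitions-per-writers"] = 200;
  std::cout << maxPartitionsPerWriters(session, config) << "\n"; // 200
  session.values["max_partitions_per_writers"] = 50;
  std::cout << maxPartitionsPerWriters(session, config) << "\n"; // 50
}
// ---------------------------------------------------------------------------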
-int32_t HiveConfig::numCacheFileHandles(const Config* config) { - return config->get(kNumCacheFileHandles, 20'000); +bool HiveConfig::cacheNoRetention(const config::ConfigBase* session) const { + return session->get( + kCacheNoRetentionSession, + config_->get(kCacheNoRetention, /*defaultValue=*/false)); } } // namespace facebook::velox::connector::hive diff --git a/velox/connectors/hive/HiveConfig.h b/velox/connectors/hive/HiveConfig.h index 131081fb4039f..8f572a3dab57f 100644 --- a/velox/connectors/hive/HiveConfig.h +++ b/velox/connectors/hive/HiveConfig.h @@ -17,9 +17,10 @@ #include #include +#include "velox/common/base/Exceptions.h" -namespace facebook::velox { -class Config; +namespace facebook::velox::config { +class ConfigBase; } namespace facebook::velox::connector::hive { @@ -36,12 +37,19 @@ class HiveConfig { InsertExistingPartitionsBehavior behavior); /// Behavior on insert into existing partitions. - static constexpr const char* kInsertExistingPartitionsBehavior = + static constexpr const char* kInsertExistingPartitionsBehaviorSession = "insert_existing_partitions_behavior"; + static constexpr const char* kInsertExistingPartitionsBehavior = + "insert-existing-partitions-behavior"; /// Maximum number of (bucketed) partitions per a single table writer /// instance. + /// + /// TODO: remove hive_orc_use_column_names since it doesn't exist in presto, + /// right now this is only used for testing. static constexpr const char* kMaxPartitionsPerWriters = + "max-partitions-per-writers"; + static constexpr const char* kMaxPartitionsPerWritersSession = "max_partitions_per_writers"; /// Whether new data can be inserted into an unpartition table. @@ -81,72 +89,292 @@ class HiveConfig { static constexpr const char* kS3IamRoleSessionName = "hive.s3.iam-role-session-name"; - // The GCS storage endpoint server. + /// Socket connect timeout. + static constexpr const char* kS3ConnectTimeout = "hive.s3.connect-timeout"; + + /// Socket read timeout. + static constexpr const char* kS3SocketTimeout = "hive.s3.socket-timeout"; + + /// Maximum concurrent TCP connections for a single http client. + static constexpr const char* kS3MaxConnections = "hive.s3.max-connections"; + + /// Maximum retry attempts for a single http client. + static constexpr const char* kS3MaxAttempts = "hive.s3.max-attempts"; + + /// Retry mode for a single http client. + static constexpr const char* kS3RetryMode = "hive.s3.retry-mode"; + + /// The GCS storage endpoint server. static constexpr const char* kGCSEndpoint = "hive.gcs.endpoint"; - // The GCS storage scheme, https for default credentials. + /// The GCS storage scheme, https for default credentials. static constexpr const char* kGCSScheme = "hive.gcs.scheme"; - // The GCS service account configuration as json string + /// The GCS service account configuration as json string static constexpr const char* kGCSCredentials = "hive.gcs.credentials"; - // Map table field names to file field names using names, not indices. + /// The GCS maximum retry counter of transient errors. + static constexpr const char* kGCSMaxRetryCount = "hive.gcs.max-retry-count"; + + /// The GCS maximum time allowed to retry transient errors. + static constexpr const char* kGCSMaxRetryTime = "hive.gcs.max-retry-time"; + + /// Maps table field names to file field names using names, not indices. + // TODO: remove hive_orc_use_column_names since it doesn't exist in presto, + // right now this is only used for testing. 
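
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this diff): the contract enforced by
// readTimestampUnit() in the HiveConfig.cpp hunk above. Only 3 (milli),
// 6 (micro) and 9 (nano) are accepted, and the value is later cast to the
// reader's timestamp precision. The enum below is a stand-in, not the Velox
// dwio type.
#include <cstdint>
#include <stdexcept>

enum class TimestampPrecisionSketch : int8_t {
  kMilliseconds = 3,
  kMicroseconds = 6,
  kNanoseconds = 9,
};

TimestampPrecisionSketch toPrecision(uint8_t unit) {
  switch (unit) {
    case 3:
    case 6:
    case 9:
      return static_cast<TimestampPrecisionSketch>(unit);
    default:
      throw std::invalid_argument("Invalid timestamp unit.");
  }
}

int main() {
  return toPrecision(3) == TimestampPrecisionSketch::kMilliseconds ? 0 : 1;
}
// ---------------------------------------------------------------------------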
static constexpr const char* kOrcUseColumnNames = "hive.orc.use-column-names"; + static constexpr const char* kOrcUseColumnNamesSession = + "hive_orc_use_column_names"; - // Read the source file column name as lower case. + /// Reads the source file column name as lower case. static constexpr const char* kFileColumnNamesReadAsLowerCase = + "file-column-names-read-as-lower-case"; + static constexpr const char* kFileColumnNamesReadAsLowerCaseSession = "file_column_names_read_as_lower_case"; - // Set the max coalesce bytes for a request. + static constexpr const char* kPartitionPathAsLowerCaseSession = + "partition_path_as_lower_case"; + + static constexpr const char* kAllowNullPartitionKeys = + "allow-null-partition-keys"; + static constexpr const char* kAllowNullPartitionKeysSession = + "allow_null_partition_keys"; + + static constexpr const char* kIgnoreMissingFilesSession = + "ignore_missing_files"; + + /// The max coalesce bytes for a request. static constexpr const char* kMaxCoalescedBytes = "max-coalesced-bytes"; - // Set the max coalesce distance bytes for combining requests. + /// The max coalesce distance bytes for combining requests. static constexpr const char* kMaxCoalescedDistanceBytes = "max-coalesced-distance-bytes"; + /// The number of prefetch rowgroups + static constexpr const char* kPrefetchRowGroups = "prefetch-rowgroups"; + + /// The total size in bytes for a direct coalesce request. Up to 8MB load + /// quantum size is supported when SSD cache is enabled. + static constexpr const char* kLoadQuantum = "load-quantum"; + /// Maximum number of entries in the file handle cache. static constexpr const char* kNumCacheFileHandles = "num_cached_file_handles"; - static InsertExistingPartitionsBehavior insertExistingPartitionsBehavior( - const Config* config); + /// Enable file handle cache. + static constexpr const char* kEnableFileHandleCache = + "file-handle-cache-enabled"; + + /// The size in bytes to be fetched with Meta data together, used when the + /// data after meta data will be used later. Optimization to decrease small IO + /// request + static constexpr const char* kFooterEstimatedSize = "footer-estimated-size"; + + /// The threshold of file size in bytes when the whole file is fetched with + /// meta data together. Optimization to decrease the small IO requests + static constexpr const char* kFilePreloadThreshold = "file-preload-threshold"; + + /// Maximum stripe size in orc writer. + static constexpr const char* kOrcWriterMaxStripeSize = + "hive.orc.writer.stripe-max-size"; + static constexpr const char* kOrcWriterMaxStripeSizeSession = + "orc_optimized_writer_max_stripe_size"; + + /// Maximum dictionary memory that can be used in orc writer. + static constexpr const char* kOrcWriterMaxDictionaryMemory = + "hive.orc.writer.dictionary-max-memory"; + static constexpr const char* kOrcWriterMaxDictionaryMemorySession = + "orc_optimized_writer_max_dictionary_memory"; + + /// Configs to control dictionary encoding. 
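
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this diff): one plausible reading of how
// footer-estimated-size and file-preload-threshold (declared above) interact,
// inferred from their key comments. Small files are fetched whole in one IO;
// larger files get only the estimated footer region up front.
#include <algorithm>
#include <cstdint>
#include <iostream>

struct PreloadPlanSketch {
  uint64_t offset;
  uint64_t length;
};

PreloadPlanSketch planInitialRead(
    uint64_t fileSize,
    uint64_t footerEstimate,
    uint64_t preloadThreshold) {
  if (fileSize <= preloadThreshold) {
    return {0, fileSize}; // Small file: fetch everything with the metadata.
  }
  const uint64_t length = std::min(footerEstimate, fileSize);
  return {fileSize - length, length}; // Fetch only the tail (footer) region.
}

int main() {
  const auto plan = planInitialRead(
      /*fileSize=*/4ull << 20,
      /*footerEstimate=*/1ull << 20,
      /*preloadThreshold=*/8ull << 20);
  std::cout << plan.offset << " " << plan.length << "\n"; // 0 4194304
}
// ---------------------------------------------------------------------------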
+ static constexpr const char* kOrcWriterIntegerDictionaryEncodingEnabled = + "hive.orc.writer.integer-dictionary-encoding-enabled"; + static constexpr const char* + kOrcWriterIntegerDictionaryEncodingEnabledSession = + "orc_optimized_writer_integer_dictionary_encoding_enabled"; + static constexpr const char* kOrcWriterStringDictionaryEncodingEnabled = + "hive.orc.writer.string-dictionary-encoding-enabled"; + static constexpr const char* + kOrcWriterStringDictionaryEncodingEnabledSession = + "orc_optimized_writer_string_dictionary_encoding_enabled"; + + /// Enables historical based stripe size estimation after compression. + static constexpr const char* kOrcWriterLinearStripeSizeHeuristics = + "hive.orc.writer.linear-stripe-size-heuristics"; + static constexpr const char* kOrcWriterLinearStripeSizeHeuristicsSession = + "orc_writer_linear_stripe_size_heuristics"; + + /// Minimal number of items in an encoded stream. + static constexpr const char* kOrcWriterMinCompressionSize = + "hive.orc.writer.min-compression-size"; + static constexpr const char* kOrcWriterMinCompressionSizeSession = + "orc_writer_min_compression_size"; + + /// The compression level to use with ZLIB and ZSTD. + static constexpr const char* kOrcWriterCompressionLevel = + "hive.orc.writer.compression-level"; + static constexpr const char* kOrcWriterCompressionLevelSession = + "orc_optimized_writer_compression_level"; + + /// Config used to create write files. This config is provided to underlying + /// file system through hive connector and data sink. The config is free form. + /// The form should be defined by the underlying file system. + static constexpr const char* kWriteFileCreateConfig = + "hive.write_file_create_config"; + + /// Maximum number of rows for sort writer in one batch of output. + static constexpr const char* kSortWriterMaxOutputRows = + "sort-writer-max-output-rows"; + static constexpr const char* kSortWriterMaxOutputRowsSession = + "sort_writer_max_output_rows"; + + /// Maximum bytes for sort writer in one batch of output. + static constexpr const char* kSortWriterMaxOutputBytes = + "sort-writer-max-output-bytes"; + static constexpr const char* kSortWriterMaxOutputBytesSession = + "sort_writer_max_output_bytes"; + + static constexpr const char* kS3UseProxyFromEnv = + "hive.s3.use-proxy-from-env"; + + // The unit for reading timestamps from files. 
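
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this diff): the batching the two sort
// writer knobs above control, under the assumption that a batch is emitted
// when either the row limit or the byte limit is reached.
#include <cstdint>
#include <iostream>

struct OutputBatcherSketch {
  uint32_t maxRows;
  uint64_t maxBytes;
  uint32_t rows = 0;
  uint64_t bytes = 0;

  // Returns true when the current batch should be emitted.
  bool add(uint64_t rowBytes) {
    ++rows;
    bytes += rowBytes;
    return rows >= maxRows || bytes >= maxBytes;
  }
  void reset() {
    rows = 0;
    bytes = 0;
  }
};

int main() {
  OutputBatcherSketch batcher{/*maxRows=*/1024, /*maxBytes=*/10ull << 20};
  int batches = 0;
  for (int i = 0; i < 3000; ++i) {
    if (batcher.add(/*rowBytes=*/100)) {
      ++batches;
      batcher.reset();
    }
  }
  std::cout << batches << "\n"; // 2 full batches of 1024 rows each.
}
// ---------------------------------------------------------------------------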
+ static constexpr const char* kReadTimestampUnit = + "hive.reader.timestamp-unit"; + static constexpr const char* kReadTimestampUnitSession = + "hive.reader.timestamp_unit"; + + static constexpr const char* kCacheNoRetention = "cache.no_retention"; + static constexpr const char* kCacheNoRetentionSession = "cache.no_retention"; + + InsertExistingPartitionsBehavior insertExistingPartitionsBehavior( + const config::ConfigBase* session) const; + + uint32_t maxPartitionsPerWriters(const config::ConfigBase* session) const; + + bool immutablePartitions() const; + + bool s3UseVirtualAddressing() const; + + std::string s3GetLogLevel() const; + + bool s3UseSSL() const; + + bool s3UseInstanceCredentials() const; + + std::string s3Endpoint() const; + + std::optional s3AccessKey() const; + + std::optional s3SecretKey() const; + + std::optional s3IAMRole() const; + + std::string s3IAMRoleSessionName() const; + + std::optional s3ConnectTimeout() const; + + std::optional s3SocketTimeout() const; + + std::optional s3MaxConnections() const; + + std::optional s3MaxAttempts() const; + + std::optional s3RetryMode() const; + + std::string gcsEndpoint() const; + + std::string gcsScheme() const; + + std::string gcsCredentials() const; + + std::optional gcsMaxRetryCount() const; + + std::optional gcsMaxRetryTime() const; + + bool isOrcUseColumnNames(const config::ConfigBase* session) const; + + bool isFileColumnNamesReadAsLowerCase( + const config::ConfigBase* session) const; + + bool isPartitionPathAsLowerCase(const config::ConfigBase* session) const; + + bool allowNullPartitionKeys(const config::ConfigBase* session) const; + + bool ignoreMissingFiles(const config::ConfigBase* session) const; + + int64_t maxCoalescedBytes() const; + + int32_t maxCoalescedDistanceBytes() const; + + int32_t prefetchRowGroups() const; + + int32_t loadQuantum() const; + + int32_t numCacheFileHandles() const; + + bool isFileHandleCacheEnabled() const; + + uint64_t fileWriterFlushThresholdBytes() const; + + uint64_t orcWriterMaxStripeSize(const config::ConfigBase* session) const; - static uint32_t maxPartitionsPerWriters(const Config* config); + uint64_t orcWriterMaxDictionaryMemory( + const config::ConfigBase* session) const; - static bool immutablePartitions(const Config* config); + bool isOrcWriterIntegerDictionaryEncodingEnabled( + const config::ConfigBase* session) const; - static bool s3UseVirtualAddressing(const Config* config); + bool isOrcWriterStringDictionaryEncodingEnabled( + const config::ConfigBase* session) const; - static std::string s3GetLogLevel(const Config* config); + bool orcWriterLinearStripeSizeHeuristics( + const config::ConfigBase* session) const; - static bool s3UseSSL(const Config* config); + uint64_t orcWriterMinCompressionSize(const config::ConfigBase* session) const; - static bool s3UseInstanceCredentials(const Config* config); + std::optional orcWriterCompressionLevel( + const config::ConfigBase* session) const; - static std::string s3Endpoint(const Config* config); + uint8_t orcWriterZLIBCompressionLevel( + const config::ConfigBase* session) const; - static std::optional s3AccessKey(const Config* config); + uint8_t orcWriterZSTDCompressionLevel( + const config::ConfigBase* session) const; - static std::optional s3SecretKey(const Config* config); + std::string writeFileCreateConfig() const; - static std::optional s3IAMRole(const Config* config); + uint32_t sortWriterMaxOutputRows(const config::ConfigBase* session) const; - static std::string s3IAMRoleSessionName(const Config* config); + uint64_t 
sortWriterMaxOutputBytes(const config::ConfigBase* session) const; - static std::string gcsEndpoint(const Config* config); + uint64_t footerEstimatedSize() const; - static std::string gcsScheme(const Config* config); + uint64_t filePreloadThreshold() const; - static std::string gcsCredentials(const Config* config); + bool s3UseProxyFromEnv() const; - static bool isOrcUseColumnNames(const Config* config); + // Returns the timestamp unit used when reading timestamps from files. + uint8_t readTimestampUnit(const config::ConfigBase* session) const; - static bool isFileColumnNamesReadAsLowerCase(const Config* config); + /// Returns true to evict out a query scanned data out of in-memory cache + /// right after the access, and also skip staging to the ssd cache. This helps + /// to prevent the cache space pollution from the one-time table scan by large + /// batch query when mixed running with interactive query which has high data + /// locality. + bool cacheNoRetention(const config::ConfigBase* session) const; - static int64_t maxCoalescedBytes(const Config* config); + HiveConfig(std::shared_ptr config) { + VELOX_CHECK_NOT_NULL( + config, "Config is null for HiveConfig initialization"); + config_ = std::move(config); + // TODO: add sanity check + } - static int32_t maxCoalescedDistanceBytes(const Config* config); + const std::shared_ptr& config() const { + return config_; + } - static int32_t numCacheFileHandles(const Config* config); + private: + std::shared_ptr config_; }; } // namespace facebook::velox::connector::hive diff --git a/velox/connectors/hive/HiveConnector.cpp b/velox/connectors/hive/HiveConnector.cpp index 522ec99058f4c..4b1a5c608b283 100644 --- a/velox/connectors/hive/HiveConnector.cpp +++ b/velox/connectors/hive/HiveConnector.cpp @@ -21,23 +21,18 @@ #include "velox/connectors/hive/HiveDataSink.h" #include "velox/connectors/hive/HiveDataSource.h" #include "velox/connectors/hive/HivePartitionFunction.h" -// Meta's buck build system needs this check. -#ifdef VELOX_ENABLE_GCS +#include "velox/dwio/dwrf/RegisterDwrfReader.h" +#include "velox/dwio/dwrf/RegisterDwrfWriter.h" + +#include "velox/connectors/hive/storage_adapters/abfs/RegisterAbfsFileSystem.h" // @manual #include "velox/connectors/hive/storage_adapters/gcs/RegisterGCSFileSystem.h" // @manual -#endif -#ifdef VELOX_ENABLE_HDFS3 #include "velox/connectors/hive/storage_adapters/hdfs/RegisterHdfsFileSystem.h" // @manual -#endif -#ifdef VELOX_ENABLE_S3 #include "velox/connectors/hive/storage_adapters/s3fs/RegisterS3FileSystem.h" // @manual -#endif #include "velox/dwio/dwrf/reader/DwrfReader.h" #include "velox/dwio/dwrf/writer/Writer.h" -// Meta's buck build system needs this check. -#ifdef VELOX_ENABLE_PARQUET +#include "velox/dwio/orc/reader/OrcReader.h" #include "velox/dwio/parquet/RegisterParquetReader.h" // @manual #include "velox/dwio/parquet/RegisterParquetWriter.h" // @manual -#endif #include "velox/expression/FieldReference.h" #include @@ -48,24 +43,27 @@ using namespace facebook::velox::dwrf; namespace facebook::velox::connector::hive { -int32_t numCachedFileHandles(const Config* properties) { - return properties ? 
HiveConfig::numCacheFileHandles(properties) : 20'000; -} - HiveConnector::HiveConnector( const std::string& id, - std::shared_ptr properties, - folly::Executor* FOLLY_NULLABLE executor) - : Connector(id, properties), + std::shared_ptr config, + folly::Executor* executor) + : Connector(id), + hiveConfig_(std::make_shared(config)), fileHandleFactory_( - std::make_unique< - SimpleLRUCache>>( - numCachedFileHandles(properties.get())), - std::make_unique(properties)), + hiveConfig_->isFileHandleCacheEnabled() + ? std::make_unique>( + hiveConfig_->numCacheFileHandles()) + : nullptr, + std::make_unique(config)), executor_(executor) { - LOG(INFO) << "Hive connector " << connectorId() << " created with maximum of " - << numCachedFileHandles(properties.get()) - << " cached file handles."; + if (hiveConfig_->isFileHandleCacheEnabled()) { + LOG(INFO) << "Hive connector " << connectorId() + << " created with maximum of " + << hiveConfig_->numCacheFileHandles() << " cached file handles."; + } else { + LOG(INFO) << "Hive connector " << connectorId() + << " created with file handle cache disabled"; + } } std::unique_ptr HiveConnector::createDataSource( @@ -75,27 +73,14 @@ std::unique_ptr HiveConnector::createDataSource( std::string, std::shared_ptr>& columnHandles, ConnectorQueryCtx* connectorQueryCtx) { - dwio::common::ReaderOptions options(connectorQueryCtx->memoryPool()); - options.setMaxCoalesceBytes( - HiveConfig::maxCoalescedBytes(connectorQueryCtx->config())); - options.setMaxCoalesceDistance( - HiveConfig::maxCoalescedDistanceBytes(connectorQueryCtx->config())); - options.setFileColumnNamesReadAsLowerCase( - HiveConfig::isFileColumnNamesReadAsLowerCase( - connectorQueryCtx->config())); - options.setUseColumnNamesForColumnMapping( - HiveConfig::isOrcUseColumnNames(connectorQueryCtx->config())); - return std::make_unique( outputType, tableHandle, columnHandles, &fileHandleFactory_, - connectorQueryCtx->expressionEvaluator(), - connectorQueryCtx->cache(), - connectorQueryCtx->scanId(), executor_, - options); + connectorQueryCtx, + hiveConfig_); } std::unique_ptr HiveConnector::createDataSink( @@ -112,7 +97,7 @@ std::unique_ptr HiveConnector::createDataSink( hiveInsertHandle, connectorQueryCtx, commitStrategy, - connectorProperties()); + hiveConfig_); } std::unique_ptr HivePartitionFunctionSpec::create( @@ -136,25 +121,19 @@ std::unique_ptr HivePartitionFunctionSpec::create( } void HiveConnectorFactory::initialize() { - static bool once = []() { + [[maybe_unused]] static bool once = []() { dwio::common::registerFileSinks(); dwrf::registerDwrfReaderFactory(); dwrf::registerDwrfWriterFactory(); -// Meta's buck build system needs this check. -#ifdef VELOX_ENABLE_PARQUET + orc::registerOrcReaderFactory(); + parquet::registerParquetReaderFactory(); parquet::registerParquetWriterFactory(); -#endif -// Meta's buck build system needs this check. 
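
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this diff): the run-once idiom kept by
// HiveConnectorFactory::initialize() above. A function-local static
// initialized from an immediately-invoked lambda runs exactly once and is
// thread-safe (C++11 magic statics); [[maybe_unused]] silences the
// unused-variable warning now that the variable is otherwise unreferenced.
#include <iostream>

void registerReadersAndWriters() {
  std::cout << "registered\n";
}

void initializeSketch() {
  [[maybe_unused]] static bool once = []() {
    registerReadersAndWriters();
    return true;
  }();
}

int main() {
  initializeSketch();
  initializeSketch(); // "registered" is printed only once.
}
// ---------------------------------------------------------------------------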
-#ifdef VELOX_ENABLE_S3 + filesystems::registerS3FileSystem(); -#endif -#ifdef VELOX_ENABLE_HDFS3 filesystems::registerHdfsFileSystem(); -#endif -#ifdef VELOX_ENABLE_GCS filesystems::registerGCSFileSystem(); -#endif + filesystems::abfs::registerAbfsFileSystem(); return true; }(); } diff --git a/velox/connectors/hive/HiveConnector.h b/velox/connectors/hive/HiveConnector.h index a8740594ad867..c0b0c1bf7013a 100644 --- a/velox/connectors/hive/HiveConnector.h +++ b/velox/connectors/hive/HiveConnector.h @@ -17,6 +17,7 @@ #include "velox/connectors/Connector.h" #include "velox/connectors/hive/FileHandle.h" +#include "velox/connectors/hive/HiveConfig.h" #include "velox/core/PlanNode.h" namespace facebook::velox::dwio::common { @@ -30,8 +31,13 @@ class HiveConnector : public Connector { public: HiveConnector( const std::string& id, - std::shared_ptr properties, - folly::Executor* FOLLY_NULLABLE executor); + std::shared_ptr config, + folly::Executor* executor); + + const std::shared_ptr& connectorConfig() + const override { + return hiveConfig_->config(); + } bool canAddDynamicFilter() const override { return true; @@ -55,7 +61,7 @@ class HiveConnector : public Connector { ConnectorQueryCtx* connectorQueryCtx, CommitStrategy commitStrategy) override final; - folly::Executor* FOLLY_NULLABLE executor() const override { + folly::Executor* executor() const override { return executor_; } @@ -70,19 +76,19 @@ class HiveConnector : public Connector { } protected: + const std::shared_ptr hiveConfig_; FileHandleFactory fileHandleFactory_; - folly::Executor* FOLLY_NULLABLE executor_; + folly::Executor* executor_; }; class HiveConnectorFactory : public ConnectorFactory { public: - static constexpr const char* FOLLY_NONNULL kHiveConnectorName = "hive"; - static constexpr const char* FOLLY_NONNULL kHiveHadoop2ConnectorName = - "hive-hadoop2"; + static constexpr const char* kHiveConnectorName = "hive"; + static constexpr const char* kHiveHadoop2ConnectorName = "hive-hadoop2"; HiveConnectorFactory() : ConnectorFactory(kHiveConnectorName) {} - explicit HiveConnectorFactory(const char* FOLLY_NONNULL connectorName) + explicit HiveConnectorFactory(const char* connectorName) : ConnectorFactory(connectorName) {} /// Register HiveConnector components such as Dwrf, Parquet readers and @@ -91,9 +97,9 @@ class HiveConnectorFactory : public ConnectorFactory { std::shared_ptr newConnector( const std::string& id, - std::shared_ptr properties, - folly::Executor* FOLLY_NULLABLE executor = nullptr) override { - return std::make_shared(id, properties, executor); + std::shared_ptr config, + folly::Executor* executor = nullptr) override { + return std::make_shared(id, config, executor); } }; diff --git a/velox/connectors/hive/HiveConnectorSplit.h b/velox/connectors/hive/HiveConnectorSplit.h index 5b46c5329a1ef..9c011ef3aea84 100644 --- a/velox/connectors/hive/HiveConnectorSplit.h +++ b/velox/connectors/hive/HiveConnectorSplit.h @@ -18,22 +18,50 @@ #include #include #include "velox/connectors/Connector.h" +#include "velox/connectors/hive/FileProperties.h" +#include "velox/connectors/hive/TableHandle.h" #include "velox/dwio/common/Options.h" namespace facebook::velox::connector::hive { +/// A bucket conversion that should happen on the split. 
This happens when we +/// increase the bucket count of a table, but the old partitions are still +/// generated using the old bucket count, so that multiple new buckets can exist +/// in the same file, and we need to apply extra filter when we read these files +/// to make sure we read the rows corresponding to the selected bucket number +/// only. +struct HiveBucketConversion { + int32_t tableBucketCount; + int32_t partitionBucketCount; + std::vector> bucketColumnHandles; +}; + struct HiveConnectorSplit : public connector::ConnectorSplit { const std::string filePath; dwio::common::FileFormat fileFormat; const uint64_t start; const uint64_t length; + + /// Mapping from partition keys to values. Values are specified as strings + /// formatted the same way as CAST(x as VARCHAR). Null values are specified as + /// std::nullopt. Date values must be formatted using ISO 8601 as YYYY-MM-DD. + /// All scalar types and date type are supported. const std::unordered_map> partitionKeys; std::optional tableBucketNumber; + std::optional bucketConversion; std::unordered_map customSplitInfo; std::shared_ptr extraFileInfo; std::unordered_map serdeParameters; + /// These represent columns like $file_size, $file_modified_time that are + /// associated with the HiveSplit. + std::unordered_map infoColumns; + + /// These represent file properties like file size that are used while opening + /// the file handle. + std::optional properties; + HiveConnectorSplit( const std::string& connectorId, const std::string& _filePath, @@ -45,8 +73,11 @@ struct HiveConnectorSplit : public connector::ConnectorSplit { std::optional _tableBucketNumber = std::nullopt, const std::unordered_map& _customSplitInfo = {}, const std::shared_ptr& _extraFileInfo = {}, - const std::unordered_map& _serdeParameters = {}) - : ConnectorSplit(connectorId), + const std::unordered_map& _serdeParameters = {}, + int64_t _splitWeight = 0, + const std::unordered_map& _infoColumns = {}, + std::optional _properties = std::nullopt) + : ConnectorSplit(connectorId, _splitWeight), filePath(_filePath), fileFormat(_fileFormat), start(_start), @@ -55,7 +86,9 @@ struct HiveConnectorSplit : public connector::ConnectorSplit { tableBucketNumber(_tableBucketNumber), customSplitInfo(_customSplitInfo), extraFileInfo(_extraFileInfo), - serdeParameters(_serdeParameters) {} + serdeParameters(_serdeParameters), + infoColumns(_infoColumns), + properties(_properties) {} std::string toString() const override { if (tableBucketNumber.has_value()) { diff --git a/velox/connectors/hive/HiveConnectorUtil.cpp b/velox/connectors/hive/HiveConnectorUtil.cpp new file mode 100644 index 0000000000000..b6de7566dc11a --- /dev/null +++ b/velox/connectors/hive/HiveConnectorUtil.cpp @@ -0,0 +1,1014 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/connectors/hive/HiveConnectorUtil.h" + +#include "velox/connectors/hive/FileHandle.h" +#include "velox/connectors/hive/HiveConfig.h" +#include "velox/connectors/hive/HiveConnectorSplit.h" +#include "velox/connectors/hive/TableHandle.h" +#include "velox/dwio/common/BufferedInput.h" +#include "velox/dwio/common/CachedBufferedInput.h" +#include "velox/dwio/common/DirectBufferedInput.h" +#include "velox/dwio/common/Reader.h" +#include "velox/dwio/dwrf/common/Config.h" +#include "velox/dwio/dwrf/writer/Writer.h" + +#ifdef VELOX_ENABLE_PARQUET +#include "velox/dwio/parquet/writer/Writer.h" // @manual +#endif + +#include "velox/expression/Expr.h" +#include "velox/expression/ExprToSubfieldFilter.h" +#include "velox/type/TimestampConversion.h" +#include "velox/type/tz/TimeZoneMap.h" + +namespace facebook::velox::connector::hive { +namespace { + +struct SubfieldSpec { + const common::Subfield* subfield; + bool filterOnly; +}; + +template +void deduplicate(std::vector& values) { + std::sort(values.begin(), values.end()); + values.erase(std::unique(values.begin(), values.end()), values.end()); +} + +// Floating point map key subscripts are truncated toward 0 in Presto. For +// example given `a' as a map with floating point key, if user queries a[0.99], +// Presto coordinator will generate a required subfield a[0]; for a[-1.99] it +// will generate a[-1]; for anything larger than 9223372036854775807, it +// generates a[9223372036854775807]; for anything smaller than +// -9223372036854775808 it generates a[-9223372036854775808]. +template +std::unique_ptr makeFloatingPointMapKeyFilter( + const std::vector& subscripts) { + std::vector> filters; + for (auto subscript : subscripts) { + T lower = subscript; + T upper = subscript; + bool lowerUnbounded = subscript == std::numeric_limits::min(); + bool upperUnbounded = subscript == std::numeric_limits::max(); + bool lowerExclusive = false; + bool upperExclusive = false; + if (lower <= 0 && !lowerUnbounded) { + if (lower > subscript - 1) { + lower = subscript - 1; + } else { + lower = std::nextafter(lower, -std::numeric_limits::infinity()); + } + lowerExclusive = true; + } + if (upper >= 0 && !upperUnbounded) { + if (upper < subscript + 1) { + upper = subscript + 1; + } else { + upper = std::nextafter(upper, std::numeric_limits::infinity()); + } + upperExclusive = true; + } + if (lowerUnbounded && upperUnbounded) { + continue; + } + filters.push_back(std::make_unique>( + lower, + lowerUnbounded, + lowerExclusive, + upper, + upperUnbounded, + upperExclusive, + false)); + } + if (filters.size() == 1) { + return std::move(filters[0]); + } + return std::make_unique(std::move(filters), false); +} + +// Recursively add subfields to scan spec. 
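
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this diff): the truncation rule described
// in the comment above makeFloatingPointMapKeyFilter(). Presto truncates a
// floating-point map subscript toward zero, so the filter built for an
// integer subscript k must accept every double whose truncation equals k.
#include <cmath>
#include <iostream>

long prestoMapSubscript(double d) {
  // std::trunc rounds toward zero: 0.99 -> 0, -1.99 -> -1.
  return static_cast<long>(std::trunc(d));
}

int main() {
  std::cout << prestoMapSubscript(0.99) << " " << prestoMapSubscript(-1.99)
            << "\n"; // 0 -1
}
// ---------------------------------------------------------------------------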
+void addSubfields( + const Type& type, + std::vector& subfields, + int level, + memory::MemoryPool* pool, + common::ScanSpec& spec) { + int newSize = 0; + for (int i = 0; i < subfields.size(); ++i) { + if (level < subfields[i].subfield->path().size()) { + subfields[newSize++] = subfields[i]; + } else if (!subfields[i].filterOnly) { + spec.addAllChildFields(type); + return; + } + } + subfields.resize(newSize); + switch (type.kind()) { + case TypeKind::ROW: { + folly::F14FastMap> required; + for (auto& subfield : subfields) { + auto* element = subfield.subfield->path()[level].get(); + auto* nestedField = + dynamic_cast(element); + VELOX_CHECK( + nestedField, + "Unsupported for row subfields pruning: {}", + element->toString()); + required[nestedField->name()].push_back(subfield); + } + auto& rowType = type.asRow(); + for (int i = 0; i < rowType.size(); ++i) { + auto& childName = rowType.nameOf(i); + auto& childType = rowType.childAt(i); + auto* child = spec.addField(childName, i); + auto it = required.find(childName); + if (it == required.end()) { + child->setConstantValue( + BaseVector::createNullConstant(childType, 1, pool)); + } else { + addSubfields(*childType, it->second, level + 1, pool, *child); + } + } + break; + } + case TypeKind::MAP: { + auto& keyType = type.childAt(0); + auto* keys = spec.addMapKeyFieldRecursively(*keyType); + addSubfields( + *type.childAt(1), + subfields, + level + 1, + pool, + *spec.addMapValueField()); + if (subfields.empty()) { + return; + } + bool stringKey = keyType->isVarchar() || keyType->isVarbinary(); + std::vector stringSubscripts; + std::vector longSubscripts; + for (auto& subfield : subfields) { + auto* element = subfield.subfield->path()[level].get(); + if (dynamic_cast(element)) { + return; + } + if (stringKey) { + auto* subscript = + dynamic_cast(element); + VELOX_CHECK( + subscript, + "Unsupported for string map pruning: {}", + element->toString()); + stringSubscripts.push_back(subscript->index()); + } else { + auto* subscript = + dynamic_cast(element); + VELOX_CHECK( + subscript, + "Unsupported for long map pruning: {}", + element->toString()); + longSubscripts.push_back(subscript->index()); + } + } + std::unique_ptr filter; + if (stringKey) { + deduplicate(stringSubscripts); + filter = std::make_unique(stringSubscripts, false); + spec.setFlatMapFeatureSelection(std::move(stringSubscripts)); + } else { + deduplicate(longSubscripts); + if (keyType->isReal()) { + filter = makeFloatingPointMapKeyFilter(longSubscripts); + } else if (keyType->isDouble()) { + filter = makeFloatingPointMapKeyFilter(longSubscripts); + } else { + filter = common::createBigintValues(longSubscripts, false); + } + std::vector features; + for (auto num : longSubscripts) { + features.push_back(std::to_string(num)); + } + spec.setFlatMapFeatureSelection(std::move(features)); + } + keys->setFilter(std::move(filter)); + break; + } + case TypeKind::ARRAY: { + addSubfields( + *type.childAt(0), + subfields, + level + 1, + pool, + *spec.addArrayElementField()); + if (subfields.empty()) { + return; + } + constexpr long kMaxIndex = std::numeric_limits::max(); + long maxIndex = -1; + for (auto& subfield : subfields) { + auto* element = subfield.subfield->path()[level].get(); + if (dynamic_cast(element)) { + return; + } + auto* subscript = + dynamic_cast(element); + VELOX_CHECK( + subscript, + "Unsupported for array pruning: {}", + element->toString()); + VELOX_USER_CHECK_GT( + subscript->index(), + 0, + "Non-positive array subscript cannot be push down"); + maxIndex = 
std::max(maxIndex, std::min(kMaxIndex, subscript->index())); + } + spec.setMaxArrayElementsCount(maxIndex); + break; + } + default: + break; + } +} + +inline uint8_t parseDelimiter(const std::string& delim) { + for (char const& ch : delim) { + if (!std::isdigit(ch)) { + return delim[0]; + } + } + return stoi(delim); +} + +inline bool isSynthesizedColumn( + const std::string& name, + const std::unordered_map>& + infoColumns) { + return name == kPath || name == kBucket || infoColumns.count(name) != 0; +} + +inline bool isRowIndexColumn( + const std::string& name, + std::shared_ptr rowIndexColumn) { + return rowIndexColumn != nullptr && rowIndexColumn->name() == name; +} + +} // namespace + +const std::string& getColumnName(const common::Subfield& subfield) { + VELOX_CHECK_GT(subfield.path().size(), 0); + auto* field = dynamic_cast( + subfield.path()[0].get()); + VELOX_CHECK_NOT_NULL(field); + return field->name(); +} + +void checkColumnNameLowerCase(const std::shared_ptr& type) { + switch (type->kind()) { + case TypeKind::ARRAY: + checkColumnNameLowerCase(type->asArray().elementType()); + break; + case TypeKind::MAP: { + checkColumnNameLowerCase(type->asMap().keyType()); + checkColumnNameLowerCase(type->asMap().valueType()); + + } break; + case TypeKind::ROW: { + for (const auto& outputName : type->asRow().names()) { + VELOX_CHECK( + !std::any_of(outputName.begin(), outputName.end(), isupper)); + } + for (auto& childType : type->asRow().children()) { + checkColumnNameLowerCase(childType); + } + } break; + default: + VLOG(1) << "No need to check type lowercase mode" << type->toString(); + } +} + +void checkColumnNameLowerCase( + const SubfieldFilters& filters, + const std::unordered_map>& + infoColumns) { + for (const auto& filterIt : filters) { + const auto name = filterIt.first.toString(); + if (isSynthesizedColumn(name, infoColumns)) { + continue; + } + const auto& path = filterIt.first.path(); + + for (int i = 0; i < path.size(); ++i) { + auto* nestedField = + dynamic_cast(path[i].get()); + if (nestedField == nullptr) { + continue; + } + VELOX_CHECK(!std::any_of( + nestedField->name().begin(), nestedField->name().end(), isupper)); + } + } +} + +void checkColumnNameLowerCase(const core::TypedExprPtr& typeExpr) { + if (typeExpr == nullptr) { + return; + } + checkColumnNameLowerCase(typeExpr->type()); + for (auto& type : typeExpr->inputs()) { + checkColumnNameLowerCase(type); + } +} + +namespace { + +void processFieldSpec( + const RowTypePtr& dataColumns, + const TypePtr& outputType, + common::ScanSpec& fieldSpec) { + fieldSpec.visit(*outputType, [](const Type& type, common::ScanSpec& spec) { + if (type.isMap() && !spec.isConstant()) { + auto* keys = spec.childByName(common::ScanSpec::kMapKeysFieldName); + VELOX_CHECK_NOT_NULL(keys); + keys->addFilter(common::IsNotNull()); + } + }); + if (dataColumns) { + auto i = dataColumns->getChildIdxIfExists(fieldSpec.fieldName()); + if (i.has_value()) { + if (dataColumns->childAt(*i)->isMap() && outputType->isRow()) { + fieldSpec.setFlatMapAsStruct(true); + } + } + } +} + +} // namespace + +std::shared_ptr makeScanSpec( + const RowTypePtr& rowType, + const folly::F14FastMap>& + outputSubfields, + const SubfieldFilters& filters, + const RowTypePtr& dataColumns, + const std::unordered_map>& + partitionKeys, + const std::unordered_map>& + infoColumns, + const std::shared_ptr& rowIndexColumn, + memory::MemoryPool* pool) { + auto spec = std::make_shared("root"); + folly::F14FastMap> + filterSubfields; + std::vector subfieldSpecs; + for (auto& [subfield, _] 
: filters) { + if (auto name = subfield.toString(); + !isSynthesizedColumn(name, infoColumns) && + !isRowIndexColumn(name, rowIndexColumn) && + partitionKeys.count(name) == 0) { + filterSubfields[getColumnName(subfield)].push_back(&subfield); + } + } + + // Process columns that will be projected out. + for (int i = 0; i < rowType->size(); ++i) { + auto& name = rowType->nameOf(i); + auto& type = rowType->childAt(i); + auto it = outputSubfields.find(name); + if (it == outputSubfields.end()) { + auto* fieldSpec = spec->addFieldRecursively(name, *type, i); + if (isRowIndexColumn(name, rowIndexColumn)) { + VELOX_CHECK(type->isBigint()); + fieldSpec->setExplicitRowNumber(true); + } + processFieldSpec(dataColumns, type, *fieldSpec); + filterSubfields.erase(name); + continue; + } + for (auto* subfield : it->second) { + subfieldSpecs.push_back({subfield, false}); + } + it = filterSubfields.find(name); + if (it != filterSubfields.end()) { + for (auto* subfield : it->second) { + subfieldSpecs.push_back({subfield, true}); + } + filterSubfields.erase(it); + } + auto* fieldSpec = spec->addField(name, i); + addSubfields(*type, subfieldSpecs, 1, pool, *fieldSpec); + processFieldSpec(dataColumns, type, *fieldSpec); + subfieldSpecs.clear(); + } + + // Now process the columns that will not be projected out. + if (!filterSubfields.empty()) { + VELOX_CHECK_NOT_NULL(dataColumns); + for (auto& [fieldName, subfields] : filterSubfields) { + for (auto* subfield : subfields) { + subfieldSpecs.push_back({subfield, true}); + } + auto& type = dataColumns->findChild(fieldName); + auto* fieldSpec = spec->getOrCreateChild(fieldName); + addSubfields(*type, subfieldSpecs, 1, pool, *fieldSpec); + processFieldSpec(dataColumns, type, *fieldSpec); + subfieldSpecs.clear(); + } + } + + for (auto& pair : filters) { + const auto name = pair.first.toString(); + // SelectiveColumnReader doesn't support constant columns with filters, + // hence, we can't have a filter for a $path or $bucket column. + // + // Unfortunately, Presto happens to specify a filter for $path, $file_size, + // $file_modified_time or $bucket column. This filter is redundant and needs + // to be removed. + // TODO Remove this check when Presto is fixed to not specify a filter + // on $path and $bucket column. + if (isSynthesizedColumn(name, infoColumns)) { + continue; + } + VELOX_CHECK(!isRowIndexColumn(name, rowIndexColumn)); + auto fieldSpec = spec->getOrCreateChild(pair.first); + fieldSpec->addFilter(*pair.second); + } + + return spec; +} + +std::unique_ptr parseSerdeParameters( + const std::unordered_map& serdeParameters, + const std::unordered_map& tableParameters) { + auto fieldIt = serdeParameters.find(dwio::common::SerDeOptions::kFieldDelim); + if (fieldIt == serdeParameters.end()) { + fieldIt = serdeParameters.find("serialization.format"); + } + auto collectionIt = + serdeParameters.find(dwio::common::SerDeOptions::kCollectionDelim); + if (collectionIt == serdeParameters.end()) { + // For collection delimiter, Hive 1.x, 2.x uses "colelction.delim", but + // Hive 3.x uses "collection.delim". 
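
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this diff): the delimiter convention
// handled by parseDelimiter() above. Hive serde properties may carry either a
// literal character ("|") or a decimal character code ("2" meaning '\2').
#include <cctype>
#include <cstdint>
#include <iostream>
#include <string>

uint8_t parseDelimiterSketch(const std::string& delim) {
  for (const char ch : delim) {
    if (!std::isdigit(static_cast<unsigned char>(ch))) {
      return delim[0]; // Any non-digit: treat as a literal character.
    }
  }
  return static_cast<uint8_t>(std::stoi(delim)); // All digits: a code.
}

int main() {
  std::cout << int(parseDelimiterSketch("|")) << " "
            << int(parseDelimiterSketch("2")) << "\n"; // 124 2
}
// ---------------------------------------------------------------------------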
+ // See: https://issues.apache.org/jira/browse/HIVE-16922) + collectionIt = serdeParameters.find("colelction.delim"); + } + auto mapKeyIt = + serdeParameters.find(dwio::common::SerDeOptions::kMapKeyDelim); + + auto escapeCharIt = + serdeParameters.find(dwio::common::SerDeOptions::kEscapeChar); + + auto nullStringIt = tableParameters.find( + dwio::common::TableParameter::kSerializationNullFormat); + + if (fieldIt == serdeParameters.end() && + collectionIt == serdeParameters.end() && + mapKeyIt == serdeParameters.end() && + escapeCharIt == serdeParameters.end() && + nullStringIt == tableParameters.end()) { + return nullptr; + } + + uint8_t fieldDelim = '\1'; + uint8_t collectionDelim = '\2'; + uint8_t mapKeyDelim = '\3'; + if (fieldIt != serdeParameters.end()) { + fieldDelim = parseDelimiter(fieldIt->second); + } + if (collectionIt != serdeParameters.end()) { + collectionDelim = parseDelimiter(collectionIt->second); + } + if (mapKeyIt != serdeParameters.end()) { + mapKeyDelim = parseDelimiter(mapKeyIt->second); + } + + // If escape character is specified then we use it, unless it is empty - in + // which case we default to '\\'. + // If escape character is not specified (not in the map) we turn escaping off. + // Logic is based on apache hive java code: + // https://github.com/apache/hive/blob/3f6f940af3f60cc28834268e5d7f5612e3b13c30/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySerDeParameters.java#L105-L108 + uint8_t escapeChar = '\\'; + const bool hasEscapeChar = (escapeCharIt != serdeParameters.end()); + if (hasEscapeChar) { + if (!escapeCharIt->second.empty()) { + // If delim is convertible to uint8_t then we use it as character code, + // otherwise we use the 1st character of the string. + escapeChar = folly::tryTo(escapeCharIt->second) + .value_or(escapeCharIt->second[0]); + } + } + + auto serDeOptions = hasEscapeChar + ? 
std::make_unique( + fieldDelim, collectionDelim, mapKeyDelim, escapeChar, true) + : std::make_unique( + fieldDelim, collectionDelim, mapKeyDelim); + if (nullStringIt != tableParameters.end()) { + serDeOptions->nullString = nullStringIt->second; + } + return serDeOptions; +} + +void configureReaderOptions( + dwio::common::ReaderOptions& readerOptions, + const std::shared_ptr& hiveConfig, + const ConnectorQueryCtx* connectorQueryCtx, + const std::shared_ptr& hiveTableHandle, + const std::shared_ptr& hiveSplit) { + configureReaderOptions( + readerOptions, + hiveConfig, + connectorQueryCtx, + hiveTableHandle->dataColumns(), + hiveSplit, + hiveTableHandle->tableParameters()); +} + +void configureReaderOptions( + dwio::common::ReaderOptions& readerOptions, + const std::shared_ptr& hiveConfig, + const ConnectorQueryCtx* connectorQueryCtx, + const RowTypePtr& fileSchema, + const std::shared_ptr& hiveSplit, + const std::unordered_map& tableParameters) { + auto sessionProperties = connectorQueryCtx->sessionProperties(); + readerOptions.setLoadQuantum(hiveConfig->loadQuantum()); + readerOptions.setMaxCoalesceBytes(hiveConfig->maxCoalescedBytes()); + readerOptions.setMaxCoalesceDistance(hiveConfig->maxCoalescedDistanceBytes()); + readerOptions.setFileColumnNamesReadAsLowerCase( + hiveConfig->isFileColumnNamesReadAsLowerCase(sessionProperties)); + readerOptions.setUseColumnNamesForColumnMapping( + hiveConfig->isOrcUseColumnNames(sessionProperties)); + readerOptions.setFileSchema(fileSchema); + readerOptions.setFooterEstimatedSize(hiveConfig->footerEstimatedSize()); + readerOptions.setFilePreloadThreshold(hiveConfig->filePreloadThreshold()); + readerOptions.setPrefetchRowGroups(hiveConfig->prefetchRowGroups()); + readerOptions.setNoCacheRetention( + hiveConfig->cacheNoRetention(sessionProperties)); + const auto& sessionTzName = connectorQueryCtx->sessionTimezone(); + if (!sessionTzName.empty()) { + const auto timezone = tz::locateZone(sessionTzName); + readerOptions.setSessionTimezone(timezone); + } + readerOptions.setSelectiveNimbleReaderEnabled( + connectorQueryCtx->selectiveNimbleReaderEnabled()); + + if (readerOptions.fileFormat() != dwio::common::FileFormat::UNKNOWN) { + VELOX_CHECK( + readerOptions.fileFormat() == hiveSplit->fileFormat, + "HiveDataSource received splits of different formats: {} and {}", + dwio::common::toString(readerOptions.fileFormat()), + dwio::common::toString(hiveSplit->fileFormat)); + } else { + auto serDeOptions = + parseSerdeParameters(hiveSplit->serdeParameters, tableParameters); + if (serDeOptions) { + readerOptions.setSerDeOptions(*serDeOptions); + } + + readerOptions.setFileFormat(hiveSplit->fileFormat); + } +} + +void configureRowReaderOptions( + const std::unordered_map& tableParameters, + const std::shared_ptr& scanSpec, + std::shared_ptr metadataFilter, + const RowTypePtr& rowType, + const std::shared_ptr& hiveSplit, + const std::shared_ptr& hiveConfig, + const config::ConfigBase* sessionProperties, + dwio::common::RowReaderOptions& rowReaderOptions) { + auto skipRowsIt = + tableParameters.find(dwio::common::TableParameter::kSkipHeaderLineCount); + if (skipRowsIt != tableParameters.end()) { + rowReaderOptions.setSkipRows(folly::to(skipRowsIt->second)); + } + rowReaderOptions.setScanSpec(scanSpec); + rowReaderOptions.setMetadataFilter(std::move(metadataFilter)); + rowReaderOptions.setRequestedType(rowType); + rowReaderOptions.range(hiveSplit->start, hiveSplit->length); + if (hiveConfig && sessionProperties) { + rowReaderOptions.setTimestampPrecision(static_cast( + 
hiveConfig->readTimestampUnit(sessionProperties))); + } +} + +namespace { + +bool applyPartitionFilter( + const TypePtr& type, + const std::string& partitionValue, + common::Filter* filter) { + if (type->isDate()) { + const auto result = util::fromDateString( + StringView(partitionValue), util::ParseMode::kPrestoCast); + VELOX_CHECK(!result.hasError()); + return applyFilter(*filter, result.value()); + } + + switch (type->kind()) { + case TypeKind::BIGINT: + case TypeKind::INTEGER: + case TypeKind::SMALLINT: + case TypeKind::TINYINT: { + return applyFilter(*filter, folly::to(partitionValue)); + } + case TypeKind::REAL: + case TypeKind::DOUBLE: { + return applyFilter(*filter, folly::to(partitionValue)); + } + case TypeKind::BOOLEAN: { + return applyFilter(*filter, folly::to(partitionValue)); + } + case TypeKind::VARCHAR: { + return applyFilter(*filter, partitionValue); + } + default: + VELOX_FAIL( + "Bad type {} for partition value: {}", type->kind(), partitionValue); + } +} + +} // namespace + +bool testFilters( + const common::ScanSpec* scanSpec, + const dwio::common::Reader* reader, + const std::string& filePath, + const std::unordered_map>& + partitionKeys, + const std::unordered_map>& + partitionKeysHandle) { + const auto totalRows = reader->numberOfRows(); + const auto& fileTypeWithId = reader->typeWithId(); + const auto& rowType = reader->rowType(); + for (const auto& child : scanSpec->children()) { + if (child->filter()) { + const auto& name = child->fieldName(); + auto iter = partitionKeys.find(name); + // By design, the partition key columns for Iceberg tables are included in + // the data files to facilitate partition transform and partition + // evolution, so we need to test both cases. + if (!rowType->containsChild(name) || iter != partitionKeys.end()) { + if (iter != partitionKeys.end() && iter->second.has_value()) { + const auto handlesIter = partitionKeysHandle.find(name); + VELOX_CHECK(handlesIter != partitionKeysHandle.end()); + + // This is a non-null partition key + return applyPartitionFilter( + handlesIter->second->dataType(), + iter->second.value(), + child->filter()); + } + // Column is missing, most likely due to schema evolution. Or it's a + // partition key but the partition value is NULL. 
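applyPartitionFilter above converts the textual partition value to the column's type before testing the pushed-down filter: dates via Presto-cast parsing, the integer/floating/boolean families via folly::to, and varchar as-is. A simplified, folly-free model of the dispatch; SimpleFilter (a bundle of predicates) is an invented stand-in for common::Filter, and the boolean parse is reduced to a plain string compare:

```
#include <cstdint>
#include <functional>
#include <stdexcept>
#include <string>

enum class Kind { kBigint, kDouble, kBoolean, kVarchar };

// Hypothetical stand-in for common::Filter: one predicate per value family.
struct SimpleFilter {
  std::function<bool(int64_t)> onInt;
  std::function<bool(double)> onDouble;
  std::function<bool(bool)> onBool;
  std::function<bool(const std::string&)> onString;
};

// Parse the partition value as the column's type, then test the filter,
// as applyPartitionFilter does per TypeKind.
bool testPartitionValue(
    Kind kind, const std::string& value, const SimpleFilter& f) {
  switch (kind) {
    case Kind::kBigint:
      return f.onInt(std::stoll(value));
    case Kind::kDouble:
      return f.onDouble(std::stod(value));
    case Kind::kBoolean:
      return f.onBool(value == "true");  // folly::to<bool> is more lenient
    case Kind::kVarchar:
      return f.onString(value);
  }
  throw std::invalid_argument("unsupported partition key type");
}

// Usage: does partition "year=2024" survive a filter "year > 2000"?
//   testPartitionValue(Kind::kBigint, "2024",
//       {.onInt = [](int64_t v) { return v > 2000; }});
```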
+ if (child->filter()->isDeterministic() && + !child->filter()->testNull()) { + VLOG(1) << "Skipping " << filePath + << " because the filter testNull() failed for column " + << child->fieldName(); + return false; + } + } else { + const auto& typeWithId = fileTypeWithId->childByName(name); + const auto columnStats = reader->columnStatistics(typeWithId->id()); + if (columnStats != nullptr && + !testFilter( + child->filter(), + columnStats.get(), + totalRows.value(), + typeWithId->type())) { + VLOG(1) << "Skipping " << filePath + << " based on stats and filter for column " + << child->fieldName(); + return false; + } + } + } + } + + return true; +} + +std::unique_ptr createBufferedInput( + const FileHandle& fileHandle, + const dwio::common::ReaderOptions& readerOpts, + const ConnectorQueryCtx* connectorQueryCtx, + std::shared_ptr ioStats, + folly::Executor* executor) { + if (connectorQueryCtx->cache()) { + return std::make_unique( + fileHandle.file, + dwio::common::MetricsLog::voidLog(), + fileHandle.uuid.id(), + connectorQueryCtx->cache(), + Connector::getTracker( + connectorQueryCtx->scanId(), readerOpts.loadQuantum()), + fileHandle.groupId.id(), + ioStats, + executor, + readerOpts); + } + return std::make_unique( + fileHandle.file, + dwio::common::MetricsLog::voidLog(), + fileHandle.uuid.id(), + Connector::getTracker( + connectorQueryCtx->scanId(), readerOpts.loadQuantum()), + fileHandle.groupId.id(), + std::move(ioStats), + executor, + readerOpts); +} + +namespace { + +core::CallTypedExprPtr replaceInputs( + const core::CallTypedExpr* call, + std::vector&& inputs) { + return std::make_shared( + call->type(), std::move(inputs), call->name()); +} + +bool endWith(const std::string& str, const char* suffix) { + int len = strlen(suffix); + if (str.size() < len) { + return false; + } + for (int i = 0, j = str.size() - len; i < len; ++i, ++j) { + if (str[j] != suffix[i]) { + return false; + } + } + return true; +} + +bool isNotExpr( + const core::TypedExprPtr& expr, + const core::CallTypedExpr* call, + core::ExpressionEvaluator* evaluator) { + if (!endWith(call->name(), "not")) { + return false; + } + auto exprs = evaluator->compile(expr); + VELOX_CHECK_EQ(exprs->size(), 1); + auto& compiled = exprs->expr(0); + return compiled->vectorFunction() && + compiled->vectorFunction()->getCanonicalName() == + exec::FunctionCanonicalName::kNot; +} + +double getPrestoSampleRate( + const core::TypedExprPtr& expr, + const core::CallTypedExpr* call, + core::ExpressionEvaluator* evaluator) { + if (!endWith(call->name(), "lt")) { + return -1; + } + VELOX_CHECK_EQ(call->inputs().size(), 2); + auto exprs = evaluator->compile(expr); + VELOX_CHECK_EQ(exprs->size(), 1); + auto& lt = exprs->expr(0); + if (!(lt->vectorFunction() && + lt->vectorFunction()->getCanonicalName() == + exec::FunctionCanonicalName::kLt)) { + return -1; + } + auto& rand = lt->inputs()[0]; + if (!(rand->inputs().empty() && rand->vectorFunction() && + rand->vectorFunction()->getCanonicalName() == + exec::FunctionCanonicalName::kRand)) { + return -1; + } + auto* rate = + dynamic_cast(call->inputs()[1].get()); + if (!(rate && rate->type()->kind() == TypeKind::DOUBLE)) { + return -1; + } + return std::max(0.0, std::min(1.0, rate->value().value())); +} + +} // namespace + +core::TypedExprPtr extractFiltersFromRemainingFilter( + const core::TypedExprPtr& expr, + core::ExpressionEvaluator* evaluator, + bool negated, + SubfieldFilters& filters, + double& sampleRate) { + auto* call = dynamic_cast(expr.get()); + if (call == nullptr) { + return expr; + } 
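Two asides on the helpers above. First, endWith hand-rolls what C++20 spells std::string_view::ends_with; it exists because function names arrive fully qualified, so only the trailing "not" or "lt" is meaningful. Second, getPrestoSampleRate recognizes the rand() < c conjunct presumably left by Presto's sampling rewrite; when it matches (and only in the non-negated case), the conjunct is removed from the remaining filter and c is folded multiplicatively into sampleRate, clamped to [0, 1]. A toy sketch of the folding:

```
#include <algorithm>

// Folds one recognized "rand() < c" conjunct into the running sample rate,
// clamping c the way getPrestoSampleRate does via std::max/std::min.
void foldSampleRate(double& sampleRate, double c) {
  sampleRate *= std::clamp(c, 0.0, 1.0);
}

// Two sampling conjuncts compose multiplicatively:
//   double rate = 1.0;
//   foldSampleRate(rate, 0.5);
//   foldSampleRate(rate, 0.2);  // rate == 0.1
```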
+ common::Filter* oldFilter = nullptr; + try { + common::Subfield subfield; + if (auto filter = exec::leafCallToSubfieldFilter( + *call, subfield, evaluator, negated)) { + if (auto it = filters.find(subfield); it != filters.end()) { + oldFilter = it->second.get(); + filter = filter->mergeWith(oldFilter); + } + filters.insert_or_assign(std::move(subfield), std::move(filter)); + return nullptr; + } + } catch (const VeloxException&) { + LOG(WARNING) << "Unexpected failure when extracting filter for: " + << expr->toString(); + if (oldFilter) { + LOG(WARNING) << "Merging with " << oldFilter->toString(); + } + } + + if (isNotExpr(expr, call, evaluator)) { + auto inner = extractFiltersFromRemainingFilter( + call->inputs()[0], evaluator, !negated, filters, sampleRate); + return inner ? replaceInputs(call, {inner}) : nullptr; + } + + if ((call->name() == "and" && !negated) || + (call->name() == "or" && negated)) { + auto lhs = extractFiltersFromRemainingFilter( + call->inputs()[0], evaluator, negated, filters, sampleRate); + auto rhs = extractFiltersFromRemainingFilter( + call->inputs()[1], evaluator, negated, filters, sampleRate); + if (!lhs) { + return rhs; + } + if (!rhs) { + return lhs; + } + return replaceInputs(call, {lhs, rhs}); + } + if (!negated) { + double rate = getPrestoSampleRate(expr, call, evaluator); + if (rate != -1) { + sampleRate *= rate; + return nullptr; + } + } + return expr; +} + +namespace { + +#ifdef VELOX_ENABLE_PARQUET +std::optional getTimestampUnit( + const config::ConfigBase& config, + const char* configKey) { + if (const auto unit = config.get(configKey)) { + VELOX_CHECK( + unit == 0 /*second*/ || unit == 3 /*milli*/ || unit == 6 /*micro*/ || + unit == 9 /*nano*/, + "Invalid timestamp unit: {}", + unit.value()); + return std::optional(static_cast(unit.value())); + } + return std::nullopt; +} + +std::optional getTimestampTimeZone( + const config::ConfigBase& config, + const char* configKey) { + if (const auto timezone = config.get(configKey)) { + return timezone.value(); + } + return std::nullopt; +} + +void updateParquetWriterOptions( + const std::shared_ptr& hiveConfig, + const config::ConfigBase* sessionProperties, + std::shared_ptr& writerOptions) { + auto parquetWriterOptions = + std::dynamic_pointer_cast(writerOptions); + VELOX_CHECK_NOT_NULL( + parquetWriterOptions, + "Parquet writer expected a Parquet WriterOptions object."); + + if (!parquetWriterOptions->parquetWriteTimestampUnit) { + parquetWriterOptions->parquetWriteTimestampUnit = + getTimestampUnit( + *sessionProperties, + parquet::WriterOptions::kParquetSessionWriteTimestampUnit) + .has_value() + ? getTimestampUnit( + *sessionProperties, + parquet::WriterOptions::kParquetSessionWriteTimestampUnit) + : getTimestampUnit( + *hiveConfig->config(), + parquet::WriterOptions::kParquetSessionWriteTimestampUnit); + } + + if (!parquetWriterOptions->parquetWriteTimestampTimeZone) { + parquetWriterOptions->parquetWriteTimestampTimeZone = + getTimestampTimeZone( + *sessionProperties, core::QueryConfig::kSessionTimezone) + .has_value() + ? 
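The conjunct extraction above is easiest to see as a De Morgan-aware tree walk: a leaf that converts to a subfield filter is absorbed and dropped; "not" flips the negation flag; and an AND node may be split into independently extractable children only when not negated, which under negation is instead true of OR. A stripped-down model over a toy expression type (Expr and its fields are invented for illustration):

```
#include <memory>
#include <string>
#include <vector>

// Toy expression node: either a leaf that may convert to a pushed-down
// filter, or a call ("and" / "or" / "not") over children.
struct Expr {
  std::string name;                          // "leaf", "and", "or", "not"
  bool convertible = false;                  // leaf converts to a filter?
  std::vector<std::shared_ptr<Expr>> inputs;
};

using ExprPtr = std::shared_ptr<Expr>;

// Returns the residual expression (nullptr if fully absorbed), mirroring
// extractFiltersFromRemainingFilter's and/or/not handling.
ExprPtr extract(const ExprPtr& e, bool negated, int& numExtracted) {
  if (e->name == "leaf") {
    if (e->convertible) {
      ++numExtracted;  // stands in for filters.insert_or_assign(...)
      return nullptr;
    }
    return e;
  }
  if (e->name == "not") {
    auto inner = extract(e->inputs[0], !negated, numExtracted);
    if (!inner) {
      return nullptr;
    }
    auto copy = std::make_shared<Expr>(*e);
    copy->inputs = {inner};
    return copy;
  }
  // AND splits into independently extractable conjuncts only when not
  // negated; under negation the same holds for OR (De Morgan).
  if ((e->name == "and" && !negated) || (e->name == "or" && negated)) {
    auto lhs = extract(e->inputs[0], negated, numExtracted);
    auto rhs = extract(e->inputs[1], negated, numExtracted);
    if (!lhs) {
      return rhs;
    }
    if (!rhs) {
      return lhs;
    }
    auto copy = std::make_shared<Expr>(*e);
    copy->inputs = {lhs, rhs};
    return copy;
  }
  return e;  // anything else stays in the remaining filter
}
```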
getTimestampTimeZone( + *sessionProperties, core::QueryConfig::kSessionTimezone) + : getTimestampTimeZone( + *hiveConfig->config(), core::QueryConfig::kSessionTimezone); + } + + writerOptions = std::move(parquetWriterOptions); +} +#endif + +void updateDWRFWriterOptions( + const std::shared_ptr& hiveConfig, + const config::ConfigBase* sessionProperties, + std::shared_ptr& writerOptions) { + auto dwrfWriterOptions = + std::dynamic_pointer_cast(writerOptions); + VELOX_CHECK_NOT_NULL( + dwrfWriterOptions, "DWRF writer expected a DWRF WriterOptions object."); + std::map configs; + + if (writerOptions->compressionKind.has_value()) { + configs.emplace( + dwrf::Config::COMPRESSION.key, + std::to_string(writerOptions->compressionKind.value())); + } + + configs.emplace( + dwrf::Config::STRIPE_SIZE.key, + std::to_string(hiveConfig->orcWriterMaxStripeSize(sessionProperties))); + + configs.emplace( + dwrf::Config::MAX_DICTIONARY_SIZE.key, + std::to_string( + hiveConfig->orcWriterMaxDictionaryMemory(sessionProperties))); + + configs.emplace( + dwrf::Config::INTEGER_DICTIONARY_ENCODING_ENABLED.key, + std::to_string(hiveConfig->isOrcWriterIntegerDictionaryEncodingEnabled( + sessionProperties))); + configs.emplace( + dwrf::Config::STRING_DICTIONARY_ENCODING_ENABLED.key, + std::to_string(hiveConfig->isOrcWriterStringDictionaryEncodingEnabled( + sessionProperties))); + + configs.emplace( + dwrf::Config::COMPRESSION_BLOCK_SIZE_MIN.key, + std::to_string( + hiveConfig->orcWriterMinCompressionSize(sessionProperties))); + + configs.emplace( + dwrf::Config::LINEAR_STRIPE_SIZE_HEURISTICS.key, + std::to_string( + hiveConfig->orcWriterLinearStripeSizeHeuristics(sessionProperties))); + + configs.emplace( + dwrf::Config::ZLIB_COMPRESSION_LEVEL.key, + std::to_string( + hiveConfig->orcWriterZLIBCompressionLevel(sessionProperties))); + + configs.emplace( + dwrf::Config::ZSTD_COMPRESSION_LEVEL.key, + std::to_string( + hiveConfig->orcWriterZSTDCompressionLevel(sessionProperties))); + + dwrfWriterOptions->config = dwrf::Config::fromMap(configs); + writerOptions = std::move(dwrfWriterOptions); +} + +} // namespace + +void updateWriterOptionsFromHiveConfig( + dwio::common::FileFormat fileFormat, + const std::shared_ptr& hiveConfig, + const config::ConfigBase* sessionProperties, + std::shared_ptr& writerOptions) { + switch (fileFormat) { + case dwio::common::FileFormat::DWRF: + updateDWRFWriterOptions(hiveConfig, sessionProperties, writerOptions); + break; + case dwio::common::FileFormat::PARQUET: +#ifdef VELOX_ENABLE_PARQUET + updateParquetWriterOptions(hiveConfig, sessionProperties, writerOptions); +#endif + break; + case dwio::common::FileFormat::NIMBLE: + // No-op for now. + break; + default: + VELOX_UNSUPPORTED("{}", fileFormat); + } +} + +} // namespace facebook::velox::connector::hive diff --git a/velox/connectors/hive/HiveConnectorUtil.h b/velox/connectors/hive/HiveConnectorUtil.h new file mode 100644 index 0000000000000..3b5f25ad82ce0 --- /dev/null +++ b/velox/connectors/hive/HiveConnectorUtil.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
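Both writer-option paths above apply the same precedence: a value already set on the WriterOptions wins, then the session property, then the connector config. (As written, the Parquet path calls getTimestampUnit on the session properties twice, once for has_value and once for the value; hoisting the result into a local would avoid the duplicate lookup.) The pattern, in miniature:

```
#include <cstdint>
#include <optional>

// Precedence used when filling writer options: explicit value, then session
// property, then connector config default.
template <typename T>
std::optional<T> resolveOption(
    std::optional<T> explicitValue,
    std::optional<T> sessionValue,
    std::optional<T> configValue) {
  if (explicitValue) {
    return explicitValue;
  }
  if (sessionValue) {
    return sessionValue;
  }
  return configValue;
}

// Example: a session timestamp unit of 6 (micros) overrides a config
// default of 3 (millis):
//   resolveOption<uint8_t>(std::nullopt, uint8_t{6}, uint8_t{3});  // -> 6
```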
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include + +#include "velox/connectors/Connector.h" +#include "velox/connectors/hive/FileHandle.h" +#include "velox/dwio/common/BufferedInput.h" +#include "velox/dwio/common/Reader.h" + +namespace facebook::velox::connector::hive { + +class HiveColumnHandle; +class HiveTableHandle; +class HiveConfig; +struct HiveConnectorSplit; + +using SubfieldFilters = + std::unordered_map>; + +constexpr const char* kPath = "$path"; +constexpr const char* kBucket = "$bucket"; + +const std::string& getColumnName(const common::Subfield& subfield); + +void checkColumnNameLowerCase(const std::shared_ptr& type); + +void checkColumnNameLowerCase( + const SubfieldFilters& filters, + const std::unordered_map>& + infoColumns); + +void checkColumnNameLowerCase(const core::TypedExprPtr& typeExpr); + +std::shared_ptr makeScanSpec( + const RowTypePtr& rowType, + const folly::F14FastMap>& + outputSubfields, + const SubfieldFilters& filters, + const RowTypePtr& dataColumns, + const std::unordered_map>& + partitionKeys, + const std::unordered_map>& + infoColumns, + const std::shared_ptr& rowIndexColumn, + memory::MemoryPool* pool); + +void configureReaderOptions( + dwio::common::ReaderOptions& readerOptions, + const std::shared_ptr& config, + const ConnectorQueryCtx* connectorQueryCtx, + const std::shared_ptr& hiveTableHandle, + const std::shared_ptr& hiveSplit); + +void configureReaderOptions( + dwio::common::ReaderOptions& readerOptions, + const std::shared_ptr& hiveConfig, + const ConnectorQueryCtx* connectorQueryCtx, + const RowTypePtr& fileSchema, + const std::shared_ptr& hiveSplit, + const std::unordered_map& tableParameters = {}); + +void configureRowReaderOptions( + const std::unordered_map& tableParameters, + const std::shared_ptr& scanSpec, + std::shared_ptr metadataFilter, + const RowTypePtr& rowType, + const std::shared_ptr& hiveSplit, + const std::shared_ptr& hiveConfig, + const config::ConfigBase* sessionProperties, + dwio::common::RowReaderOptions& rowReaderOptions); + +bool testFilters( + const common::ScanSpec* scanSpec, + const dwio::common::Reader* reader, + const std::string& filePath, + const std::unordered_map>& + partitionKey, + const std::unordered_map>& + partitionKeysHandle); + +std::unique_ptr createBufferedInput( + const FileHandle& fileHandle, + const dwio::common::ReaderOptions& readerOpts, + const ConnectorQueryCtx* connectorQueryCtx, + std::shared_ptr ioStats, + folly::Executor* executor); + +core::TypedExprPtr extractFiltersFromRemainingFilter( + const core::TypedExprPtr& expr, + core::ExpressionEvaluator* evaluator, + bool negated, + SubfieldFilters& filters, + double& sampleRate); + +/// Updates the file format's WriteOptions based on the HiveConfig. 
+void updateWriterOptionsFromHiveConfig( + dwio::common::FileFormat fileFormat, + const std::shared_ptr& hiveConfig, + const config::ConfigBase* sessionProperties, + std::shared_ptr& writerOptions); + +} // namespace facebook::velox::connector::hive diff --git a/velox/connectors/hive/HiveDataSink.cpp b/velox/connectors/hive/HiveDataSink.cpp index 4494b2f2ccb70..4c75432b6400e 100644 --- a/velox/connectors/hive/HiveDataSink.cpp +++ b/velox/connectors/hive/HiveDataSink.cpp @@ -16,24 +16,63 @@ #include "velox/connectors/hive/HiveDataSink.h" +#include "velox/common/base/Counters.h" #include "velox/common/base/Fs.h" +#include "velox/common/base/StatsReporter.h" +#include "velox/common/testutil/TestValue.h" #include "velox/connectors/hive/HiveConfig.h" +#include "velox/connectors/hive/HiveConnectorUtil.h" #include "velox/connectors/hive/HivePartitionFunction.h" +#include "velox/connectors/hive/TableHandle.h" #include "velox/core/ITypedExpr.h" #include "velox/dwio/common/SortingWriter.h" -#include "velox/dwio/dwrf/writer/Writer.h" -#include "velox/exec/SortBuffer.h" - -#include "velox/connectors/hive/TableHandle.h" #include "velox/exec/OperatorUtils.h" +#include "velox/exec/SortBuffer.h" #include #include #include +using facebook::velox::common::testutil::TestValue; + namespace facebook::velox::connector::hive { namespace { +// Returns the type of non-partition data columns. +RowTypePtr getNonPartitionTypes( + const std::vector& dataCols, + const RowTypePtr& inputType) { + std::vector childNames; + std::vector childTypes; + const auto& dataSize = dataCols.size(); + childNames.reserve(dataSize); + childTypes.reserve(dataSize); + for (int dataCol : dataCols) { + childNames.push_back(inputType->nameOf(dataCol)); + childTypes.push_back(inputType->childAt(dataCol)); + } + + return ROW(std::move(childNames), std::move(childTypes)); +} + +// Filters out partition columns if there is any. +RowVectorPtr makeDataInput( + const std::vector& dataCols, + const RowVectorPtr& input) { + std::vector childVectors; + childVectors.reserve(dataCols.size()); + for (int dataCol : dataCols) { + childVectors.push_back(input->childAt(dataCol)); + } + + return std::make_shared( + input->pool(), + getNonPartitionTypes(dataCols, asRowType(input->type())), + input->nulls(), + input->size(), + std::move(childVectors), + input->getNullCount()); +} // Returns a subset of column indices corresponding to partition keys. std::vector getPartitionChannels( @@ -50,6 +89,23 @@ std::vector getPartitionChannels( return channels; } +// Returns the column indices of non-partition data columns. 
+std::vector getNonPartitionChannels( + const std::vector& partitionChannels, + const column_index_t childrenSize) { + std::vector dataChannels; + dataChannels.reserve(childrenSize - partitionChannels.size()); + + for (column_index_t i = 0; i < childrenSize; i++) { + if (std::find(partitionChannels.cbegin(), partitionChannels.cend(), i) == + partitionChannels.cend()) { + dataChannels.push_back(i); + } + } + + return dataChannels; +} + std::string makePartitionDirectory( const std::string& tableDirectory, const std::optional& partitionSubdirectory) { @@ -113,6 +169,21 @@ std::string computeBucketedFileName( return fmt::format( "0{:0>{}}_0_{}", bucketValueStr, kMaxBucketCountPadding, queryId); } + +std::shared_ptr createSinkPool( + const std::shared_ptr& writerPool) { + return writerPool->addLeafChild(fmt::format("{}.sink", writerPool->name())); +} + +std::shared_ptr createSortPool( + const std::shared_ptr& writerPool) { + return writerPool->addLeafChild(fmt::format("{}.sort", writerPool->name())); +} + +#define WRITER_NON_RECLAIMABLE_SECTION_GUARD(index) \ + memory::NonReclaimableSectionGuard nonReclaimableGuard( \ + writerInfo_[(index)]->nonReclaimableSectionHolder.get()) + } // namespace const HiveWriterId& HiveWriterId::unpartitionedId() { @@ -121,14 +192,22 @@ const HiveWriterId& HiveWriterId::unpartitionedId() { } std::string HiveWriterId::toString() const { - if (!partitionId.has_value()) { - return "UNPARTITIONED"; + if (partitionId.has_value() && bucketId.has_value()) { + return fmt::format("part[{}.{}]", partitionId.value(), bucketId.value()); } - if (bucketId.has_value()) { - return fmt::format( - "PARTITIONED[{}.{}]", partitionId.value(), bucketId.value()); + + if (partitionId.has_value() && !bucketId.has_value()) { + return fmt::format("part[{}]", partitionId.value()); } - return fmt::format("PARTITIONED[{}]", partitionId.value()); + + // This WriterId is used to add an identifier in the MemoryPools. This could + // indicate unpart, but the bucket number needs to be disambiguated. So + // creating a new label using bucket. + if (!partitionId.has_value() && bucketId.has_value()) { + return fmt::format("bucket[{}]", bucketId.value()); + } + + return "unpart"; } const std::string LocationHandle::tableTypeName( @@ -275,22 +354,28 @@ HiveDataSink::HiveDataSink( std::shared_ptr insertTableHandle, const ConnectorQueryCtx* connectorQueryCtx, CommitStrategy commitStrategy, - const std::shared_ptr& connectorProperties) + const std::shared_ptr& hiveConfig) : inputType_(std::move(inputType)), insertTableHandle_(std::move(insertTableHandle)), connectorQueryCtx_(connectorQueryCtx), commitStrategy_(commitStrategy), - connectorProperties_(connectorProperties), - maxOpenWriters_( - HiveConfig::maxPartitionsPerWriters(connectorQueryCtx_->config())), + hiveConfig_(hiveConfig), + updateMode_(getUpdateMode()), + maxOpenWriters_(hiveConfig_->maxPartitionsPerWriters( + connectorQueryCtx->sessionProperties())), partitionChannels_(getPartitionChannels(insertTableHandle_)), partitionIdGenerator_( - !partitionChannels_.empty() ? std::make_unique( - inputType_, - partitionChannels_, - maxOpenWriters_, - connectorQueryCtx_->memoryPool()) - : nullptr), + !partitionChannels_.empty() + ? 
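With bucketing decoupled from partitioning, HiveWriterId::toString() above now has four shapes, and the string matters beyond logging: it is embedded into per-writer memory pool names, so memory reports can be attributed to a specific partition/bucket writer. A standalone rendering of the four label forms:

```
#include <cstdint>
#include <optional>
#include <string>

// Reproduces the four label forms HiveWriterId::toString() emits; these
// labels become part of the per-writer memory pool names.
std::string writerLabel(
    std::optional<uint32_t> partitionId, std::optional<uint32_t> bucketId) {
  if (partitionId && bucketId) {
    return "part[" + std::to_string(*partitionId) + "." +
        std::to_string(*bucketId) + "]";
  }
  if (partitionId) {
    return "part[" + std::to_string(*partitionId) + "]";
  }
  if (bucketId) {
    return "bucket[" + std::to_string(*bucketId) + "]";
  }
  return "unpart";
}
```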
std::make_unique( + inputType_, + partitionChannels_, + maxOpenWriters_, + connectorQueryCtx_->memoryPool(), + hiveConfig_->isPartitionPathAsLowerCase( + connectorQueryCtx->sessionProperties())) + : nullptr), + dataChannels_( + getNonPartitionChannels(partitionChannels_, inputType_->size())), bucketCount_( insertTableHandle_->bucketProperty() == nullptr ? 0 @@ -302,9 +387,7 @@ HiveDataSink::HiveDataSink( : nullptr), writerFactory_(dwio::common::getWriterFactory( insertTableHandle_->tableStorageFormat())), - spillConfig_(connectorQueryCtx->getSpillConfig()) { - VELOX_USER_CHECK( - !isBucketed() || isPartitioned(), "A bucket table must be partitioned"); + spillConfig_(connectorQueryCtx->spillConfig()) { if (isBucketed()) { VELOX_USER_CHECK_LT( bucketCount_, maxBucketCount(), "bucketCount exceeds the limit"); @@ -323,30 +406,39 @@ HiveDataSink::HiveDataSink( sortColumnIndices_.reserve(sortedProperty.size()); sortCompareFlags_.reserve(sortedProperty.size()); for (int i = 0; i < sortedProperty.size(); ++i) { - sortColumnIndices_.push_back( - inputType_->getChildIdx(sortedProperty.at(i)->sortColumn())); - sortCompareFlags_.push_back( - {sortedProperty.at(i)->sortOrder().isNullsFirst(), - sortedProperty.at(i)->sortOrder().isAscending(), - false, - CompareFlags::NullHandlingMode::NoStop}); + auto columnIndex = + getNonPartitionTypes(dataChannels_, inputType_) + ->getChildIdxIfExists(sortedProperty.at(i)->sortColumn()); + if (columnIndex.has_value()) { + sortColumnIndices_.push_back(columnIndex.value()); + sortCompareFlags_.push_back( + {sortedProperty.at(i)->sortOrder().isNullsFirst(), + sortedProperty.at(i)->sortOrder().isAscending(), + false, + CompareFlags::NullHandlingMode::kNullAsValue}); + } } } } +bool HiveDataSink::canReclaim() const { + // Currently, we only support memory reclaim on dwrf file writer. + return (spillConfig_ != nullptr) && + (insertTableHandle_->tableStorageFormat() == + dwio::common::FileFormat::DWRF); +} + void HiveDataSink::appendData(RowVectorPtr input) { - checkNotAborted(); - checkNotClosed(); + checkRunning(); - // Write to unpartitioned table. - if (!isPartitioned()) { + // Write to unpartitioned (and unbucketed) table. + if (!isPartitioned() && !isBucketed()) { const auto index = ensureWriter(HiveWriterId::unpartitionedId()); - writers_[index]->write(input); - writerInfo_[index]->numWrittenRows += input->size(); + write(index, input); return; } - // Write to partitioned table. + // Compute partition and bucket numbers. computePartitionAndBucketIds(input); // Lazy load all the input columns. @@ -354,12 +446,11 @@ void HiveDataSink::appendData(RowVectorPtr input) { input->childAt(i)->loadedVector(); } - // All inputs belong to a single non-bucketed partition. The partition id must - // be zero. + // All inputs belong to a single non-bucketed partition. The partition id + // must be zero. if (!isBucketed() && partitionIdGenerator_->numPartitions() == 1) { const auto index = ensureWriter(HiveWriterId{0}); - writers_[index]->write(input); - writerInfo_[index]->numWrittenRows += input->size(); + write(index, input); return; } @@ -374,44 +465,139 @@ void HiveDataSink::appendData(RowVectorPtr input) { RowVectorPtr writerInput = partitionSize == input->size() ? 
input : exec::wrap(partitionSize, partitionRows_[index], input); - writers_[index]->write(writerInput); - writerInfo_[index]->numWrittenRows += partitionSize; + write(index, writerInput); + } +} + +void HiveDataSink::write(size_t index, RowVectorPtr input) { + WRITER_NON_RECLAIMABLE_SECTION_GUARD(index); + auto dataInput = makeDataInput(dataChannels_, input); + + writers_[index]->write(dataInput); + writerInfo_[index]->numWrittenRows += dataInput->size(); +} + +std::string HiveDataSink::stateString(State state) { + switch (state) { + case State::kRunning: + return "RUNNING"; + case State::kClosed: + return "CLOSED"; + case State::kAborted: + return "ABORTED"; + default: + VELOX_UNREACHABLE("BAD STATE: {}", static_cast(state)); } } void HiveDataSink::computePartitionAndBucketIds(const RowVectorPtr& input) { - VELOX_CHECK(isPartitioned()); - partitionIdGenerator_->run(input, partitionIds_); + VELOX_CHECK(isPartitioned() || isBucketed()); + if (isPartitioned()) { + if (!hiveConfig_->allowNullPartitionKeys( + connectorQueryCtx_->sessionProperties())) { + // Check that there are no nulls in the partition keys. + for (auto& partitionIdx : partitionChannels_) { + auto col = input->childAt(partitionIdx); + if (col->mayHaveNulls()) { + for (auto i = 0; i < col->size(); ++i) { + VELOX_USER_CHECK( + !col->isNullAt(i), + "Partition key must not be null: {}", + input->type()->asRow().nameOf(partitionIdx)); + } + } + } + } + partitionIdGenerator_->run(input, partitionIds_); + } + if (isBucketed()) { bucketFunction_->partition(*input, bucketIds_); } } -int64_t HiveDataSink::getCompletedBytes() const { - checkNotAborted(); +DataSink::Stats HiveDataSink::stats() const { + Stats stats; + if (state_ == State::kAborted) { + return stats; + } - int64_t completedBytes{0}; + int64_t numWrittenBytes{0}; + int64_t writeIOTimeUs{0}; for (const auto& ioStats : ioStats_) { - completedBytes += ioStats->rawBytesWritten(); + numWrittenBytes += ioStats->rawBytesWritten(); + writeIOTimeUs += ioStats->writeIOTimeUs(); } - return completedBytes; + stats.numWrittenBytes = numWrittenBytes; + stats.writeIOTimeUs = writeIOTimeUs; + + if (state_ != State::kClosed) { + return stats; + } + + stats.numWrittenFiles = writers_.size(); + for (int i = 0; i < writerInfo_.size(); ++i) { + const auto& info = writerInfo_.at(i); + VELOX_CHECK_NOT_NULL(info); + const auto spillStats = info->spillStats->rlock(); + if (!spillStats->empty()) { + stats.spillStats += *spillStats; + } + } + return stats; } -int32_t HiveDataSink::numWrittenFiles() const { - return writers_.size(); +std::shared_ptr HiveDataSink::createWriterPool( + const HiveWriterId& writerId) { + auto* connectorPool = connectorQueryCtx_->connectorMemoryPool(); + return connectorPool->addAggregateChild( + fmt::format("{}.{}", connectorPool->name(), writerId.toString())); } -std::vector HiveDataSink::close(bool success) { - closeInternal(!success); - if (!success) { - VELOX_CHECK(aborted_); - return {}; +void HiveDataSink::setMemoryReclaimers( + HiveWriterInfo* writerInfo, + io::IoStatistics* ioStats) { + auto* connectorPool = connectorQueryCtx_->connectorMemoryPool(); + if (connectorPool->reclaimer() == nullptr) { + return; + } + writerInfo->writerPool->setReclaimer( + WriterReclaimer::create(this, writerInfo, ioStats)); + writerInfo->sinkPool->setReclaimer(exec::MemoryReclaimer::create()); + // NOTE: we set the memory reclaimer for sort pool when we construct the sort + // writer. 
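When allowNullPartitionKeys() is off, computePartitionAndBucketIds above fails fast on the first null partition key and names the offending column; columns whose vectors report mayHaveNulls() == false are skipped without a row scan. A minimal model of the check, where Column is an invented stand-in for a decoded vector:

```
#include <cstddef>
#include <stdexcept>
#include <string>
#include <vector>

// Minimal model of the null partition-key check: nullable columns are
// scanned row by row and the first null raises a user error naming the
// offending column. (The real code first short-circuits on mayHaveNulls().)
struct Column {
  std::string name;
  std::vector<bool> nulls;  // true = null at that row
};

void checkNoNullPartitionKeys(const std::vector<Column>& partitionKeys) {
  for (const auto& col : partitionKeys) {
    for (size_t i = 0; i < col.nulls.size(); ++i) {
      if (col.nulls[i]) {
        throw std::invalid_argument(
            "Partition key must not be null: " + col.name);
      }
    }
  }
}
```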
+} + +void HiveDataSink::setState(State newState) { + checkStateTransition(state_, newState); + state_ = newState; +} + +/// Validates the state transition from 'oldState' to 'newState'. +void HiveDataSink::checkStateTransition(State oldState, State newState) { + switch (oldState) { + case State::kRunning: + if (newState == State::kAborted || newState == State::kClosed) { + return; + } + break; + case State::kAborted: + [[fallthrough]]; + case State::kClosed: + [[fallthrough]]; + default: + break; } - VELOX_CHECK(closed_); + VELOX_FAIL("Unexpected state transition from {} to {}", oldState, newState); +} + +std::vector HiveDataSink::close() { + checkRunning(); + state_ = State::kClosed; + closeInternal(); std::vector partitionUpdates; partitionUpdates.reserve(writerInfo_.size()); - for (int i = 0; i < writerInfo_.size(); ++i) { const auto& info = writerInfo_.at(i); VELOX_CHECK_NOT_NULL(info); @@ -441,27 +627,27 @@ std::vector HiveDataSink::close(bool success) { return partitionUpdates; } -void HiveDataSink::closeInternal(bool abort) { - if (closedOrAborted()) { - if (abort) { - // We can't call abort on a closed data sink. - VELOX_CHECK(aborted_, "Can't abort a closed hive data sink"); - } else { - // We can't call close on an aborted data sink. - VELOX_CHECK(closed_, "Can't close an aborted hive data sink"); - } - return; - } +void HiveDataSink::abort() { + checkRunning(); + state_ = State::kAborted; + closeInternal(); +} + +void HiveDataSink::closeInternal() { + VELOX_CHECK_NE(state_, State::kRunning); - if (!abort) { - closed_ = true; - for (const auto& writer : writers_) { - writer->close(); + TestValue::adjust( + "facebook::velox::connector::hive::HiveDataSink::closeInternal", this); + + if (state_ == State::kClosed) { + for (int i = 0; i < writers_.size(); ++i) { + WRITER_NON_RECLAIMABLE_SECTION_GUARD(i); + writers_[i]->close(); } } else { - aborted_ = true; - for (const auto& writer : writers_) { - writer->abort(); + for (int i = 0; i < writers_.size(); ++i) { + WRITER_NON_RECLAIMABLE_SECTION_GUARD(i); + writers_[i]->abort(); } } } @@ -492,23 +678,84 @@ uint32_t HiveDataSink::appendWriter(const HiveWriterId& id) { auto writerParameters = getWriterParameters(partitionName, id.bucketId); const auto writePath = fs::path(writerParameters.writeDirectory()) / writerParameters.writeFileName(); - writerInfo_.emplace_back( - std::make_shared(std::move(writerParameters))); - - dwio::common::WriterOptions options; - options.schema = inputType_; - options.memoryPool = connectorQueryCtx_->connectorMemoryPool(); - options.compressionKind = insertTableHandle_->compressionKind(); - options.setMemoryReclaimer = connectorQueryCtx_->setMemoryReclaimer(); + auto writerPool = createWriterPool(id); + auto sinkPool = createSinkPool(writerPool); + std::shared_ptr sortPool{nullptr}; + if (sortWrite()) { + sortPool = createSortPool(writerPool); + } + writerInfo_.emplace_back(std::make_shared( + std::move(writerParameters), + std::move(writerPool), + std::move(sinkPool), + std::move(sortPool))); ioStats_.emplace_back(std::make_shared()); + setMemoryReclaimers(writerInfo_.back().get(), ioStats_.back().get()); + + // Take the writer options provided by the user as a starting point, or + // allocate a new one. + auto options = insertTableHandle_->writerOptions(); + if (!options) { + options = writerFactory_->createWriterOptions(); + } + + const auto* connectorSessionProperties = + connectorQueryCtx_->sessionProperties(); + + // Only overwrite options in case they were not already provided. 
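The sink now runs an explicit three-state machine: kRunning is the only state that accepts writes, and the only legal transitions are kRunning to kClosed (via close()) and kRunning to kAborted (via abort()); both paths then funnel into closeInternal(), which closes or aborts every writer. Note that close() and abort() guard with checkRunning() and assign state_ directly, so checkStateTransition() is only exercised through setState(). The transition rule in isolation:

```
#include <stdexcept>

enum class State { kRunning, kAborted, kClosed };

// Mirrors HiveDataSink::checkStateTransition: only Running may move, and
// only to Aborted or Closed; anything else is a hard failure.
void checkStateTransition(State oldState, State newState) {
  if (oldState == State::kRunning &&
      (newState == State::kAborted || newState == State::kClosed)) {
    return;
  }
  throw std::logic_error("unexpected hive data sink state transition");
}
```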
+ if (options->schema == nullptr) { + options->schema = getNonPartitionTypes(dataChannels_, inputType_); + } + + if (options->memoryPool == nullptr) { + options->memoryPool = writerInfo_.back()->writerPool.get(); + } + + if (!options->compressionKind) { + options->compressionKind = insertTableHandle_->compressionKind(); + } + + if (options->spillConfig == nullptr && canReclaim()) { + options->spillConfig = spillConfig_; + } + + if (options->nonReclaimableSection == nullptr) { + options->nonReclaimableSection = + writerInfo_.back()->nonReclaimableSectionHolder.get(); + } + + if (options->memoryReclaimerFactory == nullptr || + options->memoryReclaimerFactory() == nullptr) { + options->memoryReclaimerFactory = []() { + return exec::MemoryReclaimer::create(); + }; + } + + if (options->serdeParameters.empty()) { + options->serdeParameters = std::map( + insertTableHandle_->serdeParameters().begin(), + insertTableHandle_->serdeParameters().end()); + } + + updateWriterOptionsFromHiveConfig( + insertTableHandle_->tableStorageFormat(), + hiveConfig_, + connectorSessionProperties, + options); + + // Prevents the memory allocation during the writer creation. + WRITER_NON_RECLAIMABLE_SECTION_GUARD(writerInfo_.size() - 1); auto writer = writerFactory_->createWriter( dwio::common::FileSink::create( writePath, - {.bufferWrite = false, - .connectorProperties = connectorProperties_, - .pool = connectorQueryCtx_->memoryPool(), - .metricLogger = dwio::common::MetricsLog::voidLog(), - .stats = ioStats_.back().get()}), + { + .bufferWrite = false, + .connectorProperties = hiveConfig_->config(), + .fileCreateConfig = hiveConfig_->writeFileCreateConfig(), + .pool = writerInfo_.back()->sinkPool.get(), + .metricLogger = dwio::common::MetricsLog::voidLog(), + .stats = ioStats_.back().get(), + }), options); writer = maybeCreateBucketSortWriter(std::move(writer)); writers_.emplace_back(std::move(writer)); @@ -524,36 +771,57 @@ uint32_t HiveDataSink::appendWriter(const HiveWriterId& id) { std::unique_ptr HiveDataSink::maybeCreateBucketSortWriter( std::unique_ptr writer) { - if (sortColumnIndices_.empty()) { + if (!sortWrite()) { return writer; } + auto* sortPool = writerInfo_.back()->sortPool.get(); + VELOX_CHECK_NOT_NULL(sortPool); auto sortBuffer = std::make_unique( - inputType_, + getNonPartitionTypes(dataChannels_, inputType_), sortColumnIndices_, sortCompareFlags_, - 1000, // todo batch size - connectorQueryCtx_->memoryPool(), - &nonReclaimableSection_, - &numSpillRuns_, - spillConfig_); + sortPool, + writerInfo_.back()->nonReclaimableSectionHolder.get(), + connectorQueryCtx_->prefixSortConfig(), + spillConfig_, + writerInfo_.back()->spillStats.get()); return std::make_unique( - std::move(writer), std::move(sortBuffer)); + std::move(writer), + std::move(sortBuffer), + hiveConfig_->sortWriterMaxOutputRows( + connectorQueryCtx_->sessionProperties()), + hiveConfig_->sortWriterMaxOutputBytes( + connectorQueryCtx_->sessionProperties())); } -void HiveDataSink::splitInputRowsAndEnsureWriters() { - VELOX_CHECK(isPartitioned()); +HiveWriterId HiveDataSink::getWriterId(size_t row) const { + std::optional partitionId; + if (isPartitioned()) { + VELOX_CHECK_LT(partitionIds_[row], std::numeric_limits::max()); + partitionId = static_cast(partitionIds_[row]); + } + + std::optional bucketId; if (isBucketed()) { + bucketId = bucketIds_[row]; + } + return HiveWriterId{partitionId, bucketId}; +} + +void HiveDataSink::splitInputRowsAndEnsureWriters() { + VELOX_CHECK(isPartitioned() || isBucketed()); + if (isBucketed() && 
isPartitioned()) { VELOX_CHECK_EQ(bucketIds_.size(), partitionIds_.size()); } + std::fill(partitionSizes_.begin(), partitionSizes_.end(), 0); - const auto numRows = partitionIds_.size(); + const auto numRows = + isPartitioned() ? partitionIds_.size() : bucketIds_.size(); for (auto row = 0; row < numRows; ++row) { - VELOX_CHECK_LT(partitionIds_[row], std::numeric_limits::max()); - const uint32_t partitionId = static_cast(partitionIds_[row]); - const auto id = isBucketed() ? HiveWriterId{partitionId, bucketIds_[row]} - : HiveWriterId{partitionId}; - const uint32_t index = ensureWriter(id); + auto id = getWriterId(row); + uint32_t index = ensureWriter(id); + VELOX_DCHECK_LT(index, partitionSizes_.size()); VELOX_DCHECK_EQ(partitionSizes_.size(), partitionRows_.size()); VELOX_DCHECK_EQ(partitionRows_.size(), rawPartitionRows_.size()); @@ -579,12 +847,10 @@ void HiveDataSink::splitInputRowsAndEnsureWriters() { HiveWriterParameters HiveDataSink::getWriterParameters( const std::optional& partition, std::optional bucketId) const { - const auto updateMode = getUpdateMode(); - auto [targetFileName, writeFileName] = getWriterFileNames(bucketId); return HiveWriterParameters{ - updateMode, + updateMode_, partition, targetFileName, makePartitionDirectory( @@ -596,15 +862,17 @@ HiveWriterParameters HiveDataSink::getWriterParameters( std::pair HiveDataSink::getWriterFileNames( std::optional bucketId) const { - std::string targetFileName; + auto targetFileName = insertTableHandle_->locationHandle()->targetFileName(); + const bool generateFileName = targetFileName.empty(); if (bucketId.has_value()) { + VELOX_CHECK(generateFileName); // TODO: add hive.file_renaming_enabled support. targetFileName = computeBucketedFileName( connectorQueryCtx_->queryId(), bucketId.value()); - } else { - // targetFileName includes planNodeId and Uuid. As a result, different table - // writers run by the same task driver or the same table writer run in - // different task tries would have different targetFileNames. + } else if (generateFileName) { + // targetFileName includes planNodeId and Uuid. As a result, different + // table writers run by the same task driver or the same table writer + // run in different task tries would have different targetFileNames. targetFileName = fmt::format( "{}_{}_{}_{}", connectorQueryCtx_->taskId(), @@ -612,17 +880,25 @@ std::pair HiveDataSink::getWriterFileNames( connectorQueryCtx_->planNodeId(), makeUuid()); } + VELOX_CHECK(!targetFileName.empty()); const std::string writeFileName = isCommitRequired() ? 
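splitInputRowsAndEnsureWriters above routes each input row to a writer keyed by its (optional partition id, optional bucket id) pair, creating writers on demand and collecting each writer's row indices so a single wrapped vector can be written per writer. A simplified sketch of that routing; WriterKey, the parallel id vectors, and the ensureWriter callback are illustrative stand-ins:

```
#include <cstdint>
#include <functional>
#include <optional>
#include <vector>

// Illustrative writer key: either id may be absent, matching HiveWriterId.
struct WriterKey {
  std::optional<uint32_t> partitionId;
  std::optional<uint32_t> bucketId;
};

// Groups row indices per writer. 'ensureWriter' stands in for
// HiveDataSink::ensureWriter, returning a stable writer index per key.
std::vector<std::vector<int32_t>> splitRows(
    const std::vector<std::optional<uint32_t>>& partitionIds,
    const std::vector<std::optional<uint32_t>>& bucketIds,
    const std::function<uint32_t(const WriterKey&)>& ensureWriter) {
  std::vector<std::vector<int32_t>> rowsPerWriter;
  for (size_t row = 0; row < partitionIds.size(); ++row) {
    const uint32_t index = ensureWriter({partitionIds[row], bucketIds[row]});
    if (index >= rowsPerWriter.size()) {
      rowsPerWriter.resize(index + 1);
    }
    rowsPerWriter[index].push_back(static_cast<int32_t>(row));
  }
  return rowsPerWriter;
}
```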
fmt::format(".tmp.velox.{}_{}", targetFileName, makeUuid()) : targetFileName; + if (generateFileName && + insertTableHandle_->tableStorageFormat() == + dwio::common::FileFormat::PARQUET) { + return { + fmt::format("{}{}", targetFileName, ".parquet"), + fmt::format("{}{}", writeFileName, ".parquet")}; + } return {targetFileName, writeFileName}; } HiveWriterParameters::UpdateMode HiveDataSink::getUpdateMode() const { - if (insertTableHandle_->isInsertTable()) { + if (insertTableHandle_->isExistingTable()) { if (insertTableHandle_->isPartitioned()) { - const auto insertBehavior = HiveConfig::insertExistingPartitionsBehavior( - connectorQueryCtx_->config()); + const auto insertBehavior = hiveConfig_->insertExistingPartitionsBehavior( + connectorQueryCtx_->sessionProperties()); switch (insertBehavior) { case HiveConfig::InsertExistingPartitionsBehavior::kOverwrite: return HiveWriterParameters::UpdateMode::kOverwrite; @@ -638,7 +914,7 @@ HiveWriterParameters::UpdateMode HiveDataSink::getUpdateMode() const { if (insertTableHandle_->isBucketed()) { VELOX_USER_FAIL("Cannot insert into bucketed unpartitioned Hive table"); } - if (HiveConfig::immutablePartitions(connectorProperties_.get())) { + if (hiveConfig_->immutablePartitions()) { VELOX_USER_FAIL("Unpartitioned Hive tables are immutable."); } return HiveWriterParameters::UpdateMode::kAppend; @@ -663,7 +939,7 @@ bool HiveInsertTableHandle::isBucketed() const { return bucketProperty() != nullptr; } -bool HiveInsertTableHandle::isInsertTable() const { +bool HiveInsertTableHandle::isExistingTable() const { return locationHandle_->tableType() == LocationHandle::TableType::kExisting; } @@ -696,11 +972,22 @@ void HiveInsertTableHandle::registerSerDe() { std::string HiveInsertTableHandle::toString() const { std::ostringstream out; - out << "HiveInsertTableHandle [inputColumns: ["; + out << "HiveInsertTableHandle [" + << dwio::common::toString(tableStorageFormat_); + if (compressionKind_.has_value()) { + out << " " << common::compressionKindToString(compressionKind_.value()); + } else { + out << " none"; + } + out << "], [inputColumns: ["; for (const auto& i : inputColumns_) { out << " " << i->toString(); } - out << " ], locationHandle: " << locationHandle_->toString() << "]"; + out << " ], locationHandle: " << locationHandle_->toString(); + if (bucketProperty_) { + out << ", bucketProperty: " << bucketProperty_->toString(); + } + out << "]"; return out.str(); } @@ -733,4 +1020,69 @@ LocationHandlePtr LocationHandle::create(const folly::dynamic& obj) { return std::make_shared(targetPath, writePath, tableType); } +std::unique_ptr HiveDataSink::WriterReclaimer::create( + HiveDataSink* dataSink, + HiveWriterInfo* writerInfo, + io::IoStatistics* ioStats) { + return std::unique_ptr( + new HiveDataSink::WriterReclaimer(dataSink, writerInfo, ioStats)); +} + +bool HiveDataSink::WriterReclaimer::reclaimableBytes( + const memory::MemoryPool& pool, + uint64_t& reclaimableBytes) const { + VELOX_CHECK_EQ(pool.name(), writerInfo_->writerPool->name()); + reclaimableBytes = 0; + if (!dataSink_->canReclaim()) { + return false; + } + return exec::MemoryReclaimer::reclaimableBytes(pool, reclaimableBytes); +} + +uint64_t HiveDataSink::WriterReclaimer::reclaim( + memory::MemoryPool* pool, + uint64_t targetBytes, + uint64_t maxWaitMs, + memory::MemoryReclaimer::Stats& stats) { + VELOX_CHECK_EQ(pool->name(), writerInfo_->writerPool->name()); + if (!dataSink_->canReclaim()) { + return 0; + } + + if (*writerInfo_->nonReclaimableSectionHolder.get()) { + 
RECORD_METRIC_VALUE(kMetricMemoryNonReclaimableCount); + LOG(WARNING) << "Can't reclaim from hive writer pool " << pool->name() + << " which is under non-reclaimable section, " + << " reserved memory: " + << succinctBytes(pool->reservedBytes()); + ++stats.numNonReclaimableAttempts; + return 0; + } + + const uint64_t memoryUsageBeforeReclaim = pool->reservedBytes(); + const std::string memoryUsageTreeBeforeReclaim = pool->treeMemoryUsage(); + const auto writtenBytesBeforeReclaim = ioStats_->rawBytesWritten(); + const auto reclaimedBytes = + exec::MemoryReclaimer::reclaim(pool, targetBytes, maxWaitMs, stats); + const auto earlyFlushedRawBytes = + ioStats_->rawBytesWritten() - writtenBytesBeforeReclaim; + addThreadLocalRuntimeStat( + kEarlyFlushedRawBytes, + RuntimeCounter(earlyFlushedRawBytes, RuntimeCounter::Unit::kBytes)); + if (earlyFlushedRawBytes > 0) { + RECORD_METRIC_VALUE( + kMetricFileWriterEarlyFlushedRawBytes, earlyFlushedRawBytes); + } + const uint64_t memoryUsageAfterReclaim = pool->reservedBytes(); + if (memoryUsageAfterReclaim > memoryUsageBeforeReclaim) { + VELOX_FAIL( + "Unexpected memory growth after memory reclaim from {}, the memory usage before reclaim: {}, after reclaim: {}\nThe memory tree usage before reclaim:\n{}\nThe memory tree usage after reclaim:\n{}", + pool->name(), + succinctBytes(memoryUsageBeforeReclaim), + succinctBytes(memoryUsageAfterReclaim), + memoryUsageTreeBeforeReclaim, + pool->treeMemoryUsage()); + } + return reclaimedBytes; +} } // namespace facebook::velox::connector::hive diff --git a/velox/connectors/hive/HiveDataSink.h b/velox/connectors/hive/HiveDataSink.h index a18e6c2d87024..1b3d3bd464f49 100644 --- a/velox/connectors/hive/HiveDataSink.h +++ b/velox/connectors/hive/HiveDataSink.h @@ -17,16 +17,19 @@ #include "velox/common/compression/Compression.h" #include "velox/connectors/Connector.h" +#include "velox/connectors/hive/HiveConfig.h" #include "velox/connectors/hive/PartitionIdGenerator.h" #include "velox/dwio/common/Options.h" #include "velox/dwio/common/Writer.h" #include "velox/dwio/common/WriterFactory.h" +#include "velox/exec/MemoryReclaimer.h" namespace facebook::velox::dwrf { class Writer; } namespace facebook::velox::connector::hive { + class HiveColumnHandle; class LocationHandle; @@ -45,8 +48,10 @@ class LocationHandle : public ISerializable { LocationHandle( std::string targetPath, std::string writePath, - TableType tableType) + TableType tableType, + std::string targetFileName = "") : targetPath_(std::move(targetPath)), + targetFileName_(std::move(targetFileName)), writePath_(std::move(writePath)), tableType_(tableType) {} @@ -54,6 +59,10 @@ class LocationHandle : public ISerializable { return targetPath_; } + const std::string& targetFileName() const { + return targetFileName_; + } + const std::string& writePath() const { return writePath_; } @@ -77,6 +86,8 @@ class LocationHandle : public ISerializable { private: // Target directory path. const std::string targetPath_; + // If non-empty, use this name instead of generating our own. + const std::string targetFileName_; // Staging directory path. const std::string writePath_; // Whether the table to be written is new, already existing or temporary. @@ -184,9 +195,7 @@ FOLLY_ALWAYS_INLINE std::ostream& operator<<( class HiveInsertTableHandle; using HiveInsertTableHandlePtr = std::shared_ptr; -/** - * Represents a request for Hive write. - */ +/// Represents a request for Hive write. 
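WriterReclaimer::reclaim above measures rawBytesWritten before and after the reclaim to report how many bytes were flushed early, and it refuses to touch a writer that is inside a non-reclaimable section; every write/close/abort in this file is wrapped in WRITER_NON_RECLAIMABLE_SECTION_GUARD for exactly that reason. A minimal RAII guard of the same shape (the real memory::NonReclaimableSectionGuard presumably also saves and restores the prior value, as below):

```
#include <atomic>

// Minimal sketch of a non-reclaimable section guard: marks the flag for the
// duration of a writer operation and restores the prior value on exit.
class NonReclaimableGuard {
 public:
  explicit NonReclaimableGuard(std::atomic<bool>& flag)
      : flag_(flag), saved_(flag.exchange(true)) {}

  ~NonReclaimableGuard() {
    flag_.store(saved_);
  }

 private:
  std::atomic<bool>& flag_;
  const bool saved_;
};

// A reclaimer checks the flag before touching the writer:
//   if (flag.load()) { /* count a non-reclaimable attempt, return 0 */ }
```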
class HiveInsertTableHandle : public ConnectorInsertTableHandle { public: HiveInsertTableHandle( @@ -195,16 +204,21 @@ class HiveInsertTableHandle : public ConnectorInsertTableHandle { dwio::common::FileFormat tableStorageFormat = dwio::common::FileFormat::DWRF, std::shared_ptr bucketProperty = nullptr, - std::optional compressionKind = {}) + std::optional compressionKind = {}, + const std::unordered_map& serdeParameters = {}, + const std::shared_ptr& writerOptions = + nullptr) : inputColumns_(std::move(inputColumns)), locationHandle_(std::move(locationHandle)), tableStorageFormat_(tableStorageFormat), bucketProperty_(std::move(bucketProperty)), - compressionKind_(compressionKind) { + compressionKind_(compressionKind), + serdeParameters_(serdeParameters), + writerOptions_(writerOptions) { if (compressionKind.has_value()) { VELOX_CHECK( compressionKind.value() != common::CompressionKind_MAX, - "Unsupported compression type: CompressionKind_MAX") + "Unsupported compression type: CompressionKind_MAX"); } } @@ -227,6 +241,14 @@ class HiveInsertTableHandle : public ConnectorInsertTableHandle { return tableStorageFormat_; } + const std::unordered_map& serdeParameters() const { + return serdeParameters_; + } + + const std::shared_ptr& writerOptions() const { + return writerOptions_; + } + bool supportsMultiThreading() const override { return true; } @@ -237,7 +259,7 @@ class HiveInsertTableHandle : public ConnectorInsertTableHandle { const HiveBucketProperty* bucketProperty() const; - bool isInsertTable() const; + bool isExistingTable() const; folly::dynamic serialize() const override; @@ -245,7 +267,7 @@ class HiveInsertTableHandle : public ConnectorInsertTableHandle { static void registerSerDe(); - std::string toString() const; + std::string toString() const override; private: const std::vector> inputColumns_; @@ -253,6 +275,8 @@ class HiveInsertTableHandle : public ConnectorInsertTableHandle { const dwio::common::FileFormat tableStorageFormat_; const std::shared_ptr bucketProperty_; const std::optional compressionKind_; + const std::unordered_map serdeParameters_; + const std::shared_ptr writerOptions_; }; /// Parameters for Hive writers. @@ -339,10 +363,26 @@ class HiveWriterParameters { }; struct HiveWriterInfo { - explicit HiveWriterInfo(HiveWriterParameters parameters) - : writerParameters(std::move(parameters)) {} + HiveWriterInfo( + HiveWriterParameters parameters, + std::shared_ptr _writerPool, + std::shared_ptr _sinkPool, + std::shared_ptr _sortPool) + : writerParameters(std::move(parameters)), + nonReclaimableSectionHolder(new tsan_atomic(false)), + spillStats(std::make_unique>()), + writerPool(std::move(_writerPool)), + sinkPool(std::move(_sinkPool)), + sortPool(std::move(_sortPool)) {} const HiveWriterParameters writerParameters; + const std::unique_ptr> nonReclaimableSectionHolder; + /// Collects the spill stats from sort writer if the spilling has been + /// triggered. + const std::unique_ptr> spillStats; + const std::shared_ptr writerPool; + const std::shared_ptr sinkPool; + const std::shared_ptr sortPool; int64_t numWrittenRows = 0; }; @@ -353,13 +393,13 @@ struct HiveWriterId { HiveWriterId() = default; - explicit HiveWriterId(uint32_t _partitionId) - : HiveWriterId(_partitionId, std::nullopt) {} - - HiveWriterId(uint32_t _partitionId, std::optional _bucketId) + HiveWriterId( + std::optional _partitionId, + std::optional _bucketId = std::nullopt) : partitionId(_partitionId), bucketId(_bucketId) {} - /// Returns the special writer id for the un-partitioned table. 
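HiveWriterInfo below ties each writer to its own slice of the memory pool tree: an aggregate child of the connector pool named after the writer id, with leaf children for the file sink and, when sort-writing, the sort buffer. Based on the createWriterPool/createSinkPool/createSortPool naming in this patch, the hierarchy for a partitioned bucketed writer would look roughly like:

```
connector pool
└── <connector>.part[3.7]          aggregate; reclaimer = WriterReclaimer
    ├── <connector>.part[3.7].sink leaf; reclaimer = exec::MemoryReclaimer
    └── <connector>.part[3.7].sort leaf; reclaimer set by the sort writer
```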
+ /// Returns the special writer id for the un-partitioned (and non-bucketed) + /// table. static const HiveWriterId& unpartitionedId(); std::string toString() const; @@ -386,12 +426,15 @@ struct HiveWriterIdEq { class HiveDataSink : public DataSink { public: + /// The list of runtime stats reported by hive data sink + static constexpr const char* kEarlyFlushedRawBytes = "earlyFlushedRawBytes"; + HiveDataSink( RowTypePtr inputType, std::shared_ptr insertTableHandle, const ConnectorQueryCtx* connectorQueryCtx, CommitStrategy commitStrategy, - const std::shared_ptr& connectorProperties); + const std::shared_ptr& hiveConfig); static uint32_t maxBucketCount() { static const uint32_t kMaxBucketCount = 100'000; @@ -400,13 +443,65 @@ class HiveDataSink : public DataSink { void appendData(RowVectorPtr input) override; - int64_t getCompletedBytes() const override; + Stats stats() const override; - int32_t numWrittenFiles() const override; + std::vector close() override; - std::vector close(bool success) override; + void abort() override; + + bool canReclaim() const; private: + enum class State { kRunning = 0, kAborted = 1, kClosed = 2 }; + friend struct fmt::formatter< + facebook::velox::connector::hive::HiveDataSink::State>; + + static std::string stateString(State state); + + // Validates the state transition from 'oldState' to 'newState'. + void checkStateTransition(State oldState, State newState); + void setState(State newState); + + class WriterReclaimer : public exec::MemoryReclaimer { + public: + static std::unique_ptr create( + HiveDataSink* dataSink, + HiveWriterInfo* writerInfo, + io::IoStatistics* ioStats); + + bool reclaimableBytes( + const memory::MemoryPool& pool, + uint64_t& reclaimableBytes) const override; + + uint64_t reclaim( + memory::MemoryPool* pool, + uint64_t targetBytes, + uint64_t maxWaitMs, + memory::MemoryReclaimer::Stats& stats) override; + + private: + WriterReclaimer( + HiveDataSink* dataSink, + HiveWriterInfo* writerInfo, + io::IoStatistics* ioStats) + : exec::MemoryReclaimer(), + dataSink_(dataSink), + writerInfo_(writerInfo), + ioStats_(ioStats) { + VELOX_CHECK_NOT_NULL(dataSink_); + VELOX_CHECK_NOT_NULL(writerInfo_); + VELOX_CHECK_NOT_NULL(ioStats_); + } + + HiveDataSink* const dataSink_; + HiveWriterInfo* const writerInfo_; + io::IoStatistics* const ioStats_; + }; + + FOLLY_ALWAYS_INLINE bool sortWrite() const { + return !sortColumnIndices_.empty(); + } + // Returns true if the table is partitioned. FOLLY_ALWAYS_INLINE bool isPartitioned() const { return partitionIdGenerator_ != nullptr; @@ -421,9 +516,20 @@ class HiveDataSink : public DataSink { return commitStrategy_ != CommitStrategy::kNoCommit; } + std::shared_ptr createWriterPool( + const HiveWriterId& writerId); + + void setMemoryReclaimers( + HiveWriterInfo* writerInfo, + io::IoStatistics* ioStats); + // Compute the partition id and bucket id for each row in 'input'. void computePartitionAndBucketIds(const RowVectorPtr& input); + // Get the HiveWriter corresponding to the row + // from partitionIds and bucketIds. + FOLLY_ALWAYS_INLINE HiveWriterId getWriterId(size_t row) const; + // Computes the number of input rows as well as the actual input row indices // to each corresponding (bucketed) partition based on the partition and // bucket ids calculated by 'computePartitionAndBucketIds'. 
The function also @@ -458,29 +564,26 @@ class HiveDataSink : public DataSink { HiveWriterParameters::UpdateMode getUpdateMode() const; - FOLLY_ALWAYS_INLINE bool closedOrAborted() const { - VELOX_CHECK(!(closed_ && aborted_)); - return closed_ || aborted_; + FOLLY_ALWAYS_INLINE void checkRunning() const { + VELOX_CHECK_EQ(state_, State::kRunning, "Hive data sink is not running"); } - FOLLY_ALWAYS_INLINE void checkNotClosed() const { - VELOX_CHECK(!closed_, "Hive data sink has been closed"); - } + // Invoked to write 'input' to the specified file writer. + void write(size_t index, RowVectorPtr input); - FOLLY_ALWAYS_INLINE void checkNotAborted() const { - VELOX_CHECK(!aborted_, "Hive data sink hash been aborted"); - } - - void closeInternal(bool abort); + void closeInternal(); const RowTypePtr inputType_; const std::shared_ptr insertTableHandle_; const ConnectorQueryCtx* const connectorQueryCtx_; const CommitStrategy commitStrategy_; - const std::shared_ptr connectorProperties_; + const std::shared_ptr hiveConfig_; + const HiveWriterParameters::UpdateMode updateMode_; const uint32_t maxOpenWriters_; const std::vector partitionChannels_; const std::unique_ptr partitionIdGenerator_; + // Indices of dataChannel are stored in ascending order + const std::vector dataChannels_; const int32_t bucketCount_{0}; const std::unique_ptr bucketFunction_; const std::shared_ptr writerFactory_; @@ -489,10 +592,8 @@ class HiveDataSink : public DataSink { std::vector sortColumnIndices_; std::vector sortCompareFlags_; - bool closed_{false}; - bool aborted_{false}; + State state_{State::kRunning}; - uint32_t numSpillRuns_{0}; tsan_atomic nonReclaimableSection_{false}; // The map from writer id to the writer index in 'writers_' and 'writerInfo_'. @@ -519,3 +620,24 @@ class HiveDataSink : public DataSink { }; } // namespace facebook::velox::connector::hive + +template <> +struct fmt::formatter + : formatter { + auto format( + facebook::velox::connector::hive::HiveDataSink::State s, + format_context& ctx) const { + return formatter::format(static_cast(s), ctx); + } +}; + +template <> +struct fmt::formatter< + facebook::velox::connector::hive::LocationHandle::TableType> + : formatter { + auto format( + facebook::velox::connector::hive::LocationHandle::TableType s, + format_context& ctx) const { + return formatter::format(static_cast(s), ctx); + } +}; diff --git a/velox/connectors/hive/HiveDataSource.cpp b/velox/connectors/hive/HiveDataSource.cpp index e120f63759821..3f41f23b2f530 100644 --- a/velox/connectors/hive/HiveDataSource.cpp +++ b/velox/connectors/hive/HiveDataSource.cpp @@ -16,14 +16,18 @@ #include "velox/connectors/hive/HiveDataSource.h" +#include #include #include -#include "velox/dwio/common/CachedBufferedInput.h" +#include "velox/common/testutil/TestValue.h" +#include "velox/connectors/hive/HiveConfig.h" +#include "velox/connectors/hive/HiveConnectorUtil.h" #include "velox/dwio/common/ReaderFactory.h" -#include "velox/expression/ExprToSubfieldFilter.h" #include "velox/expression/FieldReference.h" +using facebook::velox::common::testutil::TestValue; + namespace facebook::velox::connector::hive { class HiveTableHandle; @@ -31,321 +35,28 @@ class HiveColumnHandle; namespace { -struct SubfieldSpec { - const common::Subfield* subfield; - bool filterOnly; -}; - -template -void deduplicate(std::vector& values) { - std::sort(values.begin(), values.end()); - values.erase(std::unique(values.begin(), values.end()), values.end()); -} - -// Floating point map key subscripts are truncated toward 0 in Presto. 
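The fmt::formatter specializations at the end of the header exist because newer fmt releases no longer format scoped enums implicitly; forwarding to formatter<int> lets VELOX_CHECK_EQ and log statements print the numeric value. The same pattern in a self-contained form:

```
#include <fmt/format.h>

enum class State { kRunning = 0, kAborted = 1, kClosed = 2 };

// Same pattern as the specializations in HiveDataSink.h: delegate to
// formatter<int> so the scoped enum prints as its underlying value.
template <>
struct fmt::formatter<State> : fmt::formatter<int> {
  auto format(State s, format_context& ctx) const {
    return fmt::formatter<int>::format(static_cast<int>(s), ctx);
  }
};

// fmt::format("{}", State::kClosed) == "2"
```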
For -// example given `a' as a map with floating point key, if user queries a[0.99], -// Presto coordinator will generate a required subfield a[0]; for a[-1.99] it -// will generate a[-1]; for anything larger than 9223372036854775807, it -// generates a[9223372036854775807]; for anything smaller than -// -9223372036854775808 it generates a[-9223372036854775808]. -template -std::unique_ptr makeFloatingPointMapKeyFilter( - const std::vector& subscripts) { - std::vector> filters; - for (auto subscript : subscripts) { - T lower = subscript; - T upper = subscript; - bool lowerUnbounded = subscript == std::numeric_limits::min(); - bool upperUnbounded = subscript == std::numeric_limits::max(); - bool lowerExclusive = false; - bool upperExclusive = false; - if (lower <= 0 && !lowerUnbounded) { - if (lower > subscript - 1) { - lower = subscript - 1; - } else { - lower = std::nextafter(lower, -std::numeric_limits::infinity()); - } - lowerExclusive = true; - } - if (upper >= 0 && !upperUnbounded) { - if (upper < subscript + 1) { - upper = subscript + 1; - } else { - upper = std::nextafter(upper, std::numeric_limits::infinity()); - } - upperExclusive = true; - } - if (lowerUnbounded && upperUnbounded) { - continue; - } - filters.push_back(std::make_unique>( - lower, - lowerUnbounded, - lowerExclusive, - upper, - upperUnbounded, - upperExclusive, - false)); - } - if (filters.size() == 1) { - return std::move(filters[0]); - } - return std::make_unique(std::move(filters), false, false); -} - -// Recursively add subfields to scan spec. -void addSubfields( - const Type& type, - std::vector& subfields, - int level, - memory::MemoryPool* pool, - common::ScanSpec& spec) { - int newSize = 0; - for (int i = 0; i < subfields.size(); ++i) { - if (level < subfields[i].subfield->path().size()) { - subfields[newSize++] = subfields[i]; - } else if (!subfields[i].filterOnly) { - spec.addAllChildFields(type); - return; - } - } - subfields.resize(newSize); - switch (type.kind()) { - case TypeKind::ROW: { - folly::F14FastMap> required; - for (auto& subfield : subfields) { - auto* element = subfield.subfield->path()[level].get(); - auto* nestedField = - dynamic_cast(element); - VELOX_CHECK( - nestedField, - "Unsupported for row subfields pruning: {}", - element->toString()); - required[nestedField->name()].push_back(subfield); - } - auto& rowType = type.asRow(); - for (int i = 0; i < rowType.size(); ++i) { - auto& childName = rowType.nameOf(i); - auto& childType = rowType.childAt(i); - auto* child = spec.addField(childName, i); - auto it = required.find(childName); - if (it == required.end()) { - child->setConstantValue( - BaseVector::createNullConstant(childType, 1, pool)); - } else { - addSubfields(*childType, it->second, level + 1, pool, *child); - } - } - break; - } - case TypeKind::MAP: { - auto& keyType = type.childAt(0); - auto* keys = spec.addMapKeyFieldRecursively(*keyType); - addSubfields( - *type.childAt(1), - subfields, - level + 1, - pool, - *spec.addMapValueField()); - if (subfields.empty()) { - return; - } - bool stringKey = keyType->isVarchar() || keyType->isVarbinary(); - std::vector stringSubscripts; - std::vector longSubscripts; - for (auto& subfield : subfields) { - auto* element = subfield.subfield->path()[level].get(); - if (dynamic_cast(element)) { - return; - } - if (stringKey) { - auto* subscript = - dynamic_cast(element); - VELOX_CHECK( - subscript, - "Unsupported for string map pruning: {}", - element->toString()); - stringSubscripts.push_back(subscript->index()); - } else { - auto* subscript = 
- dynamic_cast(element); - VELOX_CHECK( - subscript, - "Unsupported for long map pruning: {}", - element->toString()); - longSubscripts.push_back(subscript->index()); - } - } - std::unique_ptr filter; - if (stringKey) { - deduplicate(stringSubscripts); - filter = std::make_unique(stringSubscripts, false); - } else { - deduplicate(longSubscripts); - if (keyType->isReal()) { - filter = makeFloatingPointMapKeyFilter(longSubscripts); - } else if (keyType->isDouble()) { - filter = makeFloatingPointMapKeyFilter(longSubscripts); - } else { - filter = common::createBigintValues(longSubscripts, false); - } - } - keys->setFilter(std::move(filter)); - break; - } - case TypeKind::ARRAY: { - addSubfields( - *type.childAt(0), - subfields, - level + 1, - pool, - *spec.addArrayElementField()); - if (subfields.empty()) { - return; - } - constexpr long kMaxIndex = std::numeric_limits::max(); - long maxIndex = -1; - for (auto& subfield : subfields) { - auto* element = subfield.subfield->path()[level].get(); - if (dynamic_cast(element)) { - return; - } - auto* subscript = - dynamic_cast(element); - VELOX_CHECK( - subscript, - "Unsupported for array pruning: {}", - element->toString()); - maxIndex = std::max(maxIndex, std::min(kMaxIndex, subscript->index())); - } - spec.setMaxArrayElementsCount(maxIndex); - break; - } - default: - break; - } -} - -core::CallTypedExprPtr replaceInputs( - const core::CallTypedExpr* call, - std::vector&& inputs) { - return std::make_shared( - call->type(), std::move(inputs), call->name()); +bool isMember( + const std::vector& fields, + const exec::FieldReference& field) { + return std::find(fields.begin(), fields.end(), &field) != fields.end(); } -void checkColumnNameLowerCase(const std::shared_ptr& type) { - switch (type->kind()) { - case TypeKind::ARRAY: - checkColumnNameLowerCase(type->asArray().elementType()); - break; - case TypeKind::MAP: { - checkColumnNameLowerCase(type->asMap().keyType()); - checkColumnNameLowerCase(type->asMap().valueType()); - - } break; - case TypeKind::ROW: { - for (auto& outputName : type->asRow().names()) { - VELOX_CHECK( - !std::any_of(outputName.begin(), outputName.end(), isupper)); - } - for (auto& childType : type->asRow().children()) { - checkColumnNameLowerCase(childType); - } - } break; - default: - VLOG(1) << "No need to check type lowercase mode" << type->toString(); +bool shouldEagerlyMaterialize( + const exec::Expr& remainingFilter, + const exec::FieldReference& field) { + if (!remainingFilter.evaluatesArgumentsOnNonIncreasingSelection()) { + return true; } -} - -void checkColumnNameLowerCase(const SubfieldFilters& filters) { - for (auto& pair : filters) { - if (auto name = pair.first.toString(); name == kPath || name == kBucket) { - continue; + for (auto& input : remainingFilter.inputs()) { + if (isMember(input->distinctFields(), field) && input->hasConditionals()) { + return true; } - auto& path = pair.first.path(); - - for (int i = 0; i < path.size(); ++i) { - auto nestedField = - dynamic_cast(path[i].get()); - if (nestedField == nullptr) { - continue; - } - VELOX_CHECK(!std::any_of( - nestedField->name().begin(), nestedField->name().end(), isupper)); - } - } -} - -void checkColumnNameLowerCase(const core::TypedExprPtr& typeExpr) { - if (typeExpr == nullptr) { - return; - } - checkColumnNameLowerCase(typeExpr->type()); - for (auto& type : typeExpr->inputs()) { - checkColumnNameLowerCase(type); } -} - -const std::string& getColumnName(const common::Subfield& subfield) { - VELOX_CHECK_GT(subfield.path().size(), 0); - auto* field = 
dynamic_cast( - subfield.path()[0].get()); - VELOX_CHECK(field); - return field->name(); + return false; } } // namespace -core::TypedExprPtr HiveDataSource::extractFiltersFromRemainingFilter( - const core::TypedExprPtr& expr, - core::ExpressionEvaluator* evaluator, - bool negated, - SubfieldFilters& filters) { - auto* call = dynamic_cast(expr.get()); - if (!call) { - return expr; - } - common::Filter* oldFilter = nullptr; - try { - common::Subfield subfield; - if (auto filter = exec::leafCallToSubfieldFilter( - *call, subfield, evaluator, negated)) { - if (auto it = filters.find(subfield); it != filters.end()) { - oldFilter = it->second.get(); - filter = filter->mergeWith(oldFilter); - } - filters.insert_or_assign(std::move(subfield), std::move(filter)); - return nullptr; - } - } catch (const VeloxException&) { - LOG(WARNING) << "Unexpected failure when extracting filter for: " - << expr->toString(); - if (oldFilter) { - LOG(WARNING) << "Merging with " << oldFilter->toString(); - } - } - if (call->name() == "not") { - auto inner = extractFiltersFromRemainingFilter( - call->inputs()[0], evaluator, !negated, filters); - return inner ? replaceInputs(call, {inner}) : nullptr; - } - if ((call->name() == "and" && !negated) || - (call->name() == "or" && negated)) { - auto lhs = extractFiltersFromRemainingFilter( - call->inputs()[0], evaluator, negated, filters); - auto rhs = extractFiltersFromRemainingFilter( - call->inputs()[1], evaluator, negated, filters); - if (!lhs) { - return rhs; - } - if (!rhs) { - return lhs; - } - return replaceInputs(call, {lhs, rhs}); - } - return expr; -} - HiveDataSource::HiveDataSource( const RowTypePtr& outputType, const std::shared_ptr& tableHandle, @@ -353,37 +64,41 @@ HiveDataSource::HiveDataSource( std::string, std::shared_ptr>& columnHandles, FileHandleFactory* fileHandleFactory, - core::ExpressionEvaluator* expressionEvaluator, - cache::AsyncDataCache* cache, - const std::string& scanId, folly::Executor* executor, - const dwio::common::ReaderOptions& options) + const ConnectorQueryCtx* connectorQueryCtx, + const std::shared_ptr& hiveConfig) : fileHandleFactory_(fileHandleFactory), - readerOpts_(options), - pool_(&options.getMemoryPool()), + executor_(executor), + connectorQueryCtx_(connectorQueryCtx), + hiveConfig_(hiveConfig), + pool_(connectorQueryCtx->memoryPool()), outputType_(outputType), - expressionEvaluator_(expressionEvaluator), - cache_(cache), - scanId_(scanId), - executor_(executor) { + expressionEvaluator_(connectorQueryCtx->expressionEvaluator()) { // Column handled keyed on the column alias, the name used in the query. 
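// The extractFiltersFromRemainingFilter removed above is relocated by this
// diff (HiveConnectorUtil.h is now included and the call gains a sampleRate
// argument). Its core trick is walking and/or/not with a 'negated' flag so
// that, by De Morgan, a negated OR splits like an AND. A minimal,
// self-contained sketch of that recursion; the toy Expr type and tryPushDown
// below are invented stand-ins for core::CallTypedExpr and
// exec::leafCallToSubfieldFilter, not Velox code.
#include <memory>
#include <string>
#include <vector>

struct Expr {
  std::string name;  // "and", "or", "not", or a leaf call.
  std::vector<std::shared_ptr<Expr>> inputs;
};
using ExprPtr = std::shared_ptr<Expr>;

// Stands in for leafCallToSubfieldFilter: true if the (possibly negated)
// leaf converts to a pushed-down subfield filter.
bool tryPushDown(const Expr& e, bool negated) {
  return e.name.rfind("pushable", 0) == 0;
}

// Returns the residual expression, or nullptr when fully pushed down.
ExprPtr extract(const ExprPtr& expr, bool negated) {
  if (expr->name == "not") {
    auto inner = extract(expr->inputs[0], !negated);
    if (!inner) {
      return nullptr;
    }
    auto copy = std::make_shared<Expr>(*expr);
    copy->inputs = {inner};
    return copy;
  }
  // AND conjuncts extract independently; under negation an OR behaves like
  // an AND (De Morgan), so its branches extract independently too.
  if ((expr->name == "and" && !negated) || (expr->name == "or" && negated)) {
    auto lhs = extract(expr->inputs[0], negated);
    auto rhs = extract(expr->inputs[1], negated);
    if (!lhs) {
      return rhs;
    }
    if (!rhs) {
      return lhs;
    }
    auto copy = std::make_shared<Expr>(*expr);
    copy->inputs = {lhs, rhs};
    return copy;
  }
  return tryPushDown(*expr, negated) ? nullptr : expr;
}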
for (const auto& [canonicalizedName, columnHandle] : columnHandles) { auto handle = std::dynamic_pointer_cast(columnHandle); - VELOX_CHECK( - handle != nullptr, + VELOX_CHECK_NOT_NULL( + handle, "ColumnHandle must be an instance of HiveColumnHandle for {}", canonicalizedName); if (handle->columnType() == HiveColumnHandle::ColumnType::kPartitionKey) { partitionKeys_.emplace(handle->name(), handle); } + + if (handle->columnType() == HiveColumnHandle::ColumnType::kSynthesized) { + infoColumns_.emplace(handle->name(), handle); + } + + if (handle->columnType() == HiveColumnHandle::ColumnType::kRowIndex) { + VELOX_CHECK_NULL(rowIndexColumn_); + rowIndexColumn_ = handle; + } } - std::vector readerRowNames; - auto readerRowTypes = outputType_->children(); - folly::F14FastMap> - subfields; - for (auto& outputName : outputType_->names()) { + std::vector readColumnNames; + auto readColumnTypes = outputType_->children(); + for (const auto& outputName : outputType_->names()) { auto it = columnHandles.find(outputName); VELOX_CHECK( it != columnHandles.end(), @@ -391,77 +106,61 @@ HiveDataSource::HiveDataSource( outputName); auto* handle = static_cast(it->second.get()); - readerRowNames.push_back(handle->name()); + readColumnNames.push_back(handle->name()); for (auto& subfield : handle->requiredSubfields()) { VELOX_USER_CHECK_EQ( getColumnName(subfield), handle->name(), "Required subfield does not match column name"); - subfields[handle->name()].push_back(&subfield); + subfields_[handle->name()].push_back(&subfield); } } hiveTableHandle_ = std::dynamic_pointer_cast(tableHandle); - VELOX_CHECK( - hiveTableHandle_ != nullptr, - "TableHandle must be an instance of HiveTableHandle"); - if (readerOpts_.isFileColumnNamesReadAsLowerCase()) { + VELOX_CHECK_NOT_NULL( + hiveTableHandle_, "TableHandle must be an instance of HiveTableHandle"); + if (hiveConfig_->isFileColumnNamesReadAsLowerCase( + connectorQueryCtx->sessionProperties())) { checkColumnNameLowerCase(outputType_); - checkColumnNameLowerCase(hiveTableHandle_->subfieldFilters()); + checkColumnNameLowerCase(hiveTableHandle_->subfieldFilters(), infoColumns_); checkColumnNameLowerCase(hiveTableHandle_->remainingFilter()); } - SubfieldFilters filters; - core::TypedExprPtr remainingFilter; - if (hiveTableHandle_->isFilterPushdownEnabled()) { - for (auto& [k, v] : hiveTableHandle_->subfieldFilters()) { - filters.emplace(k.clone(), v->clone()); - } - remainingFilter = extractFiltersFromRemainingFilter( - hiveTableHandle_->remainingFilter(), - expressionEvaluator_, - false, - filters); - } else { - for (auto& [field, _] : hiveTableHandle_->subfieldFilters()) { - VELOX_USER_CHECK_EQ( - field.path().size(), - 1, - "Unexpected filter on table {}, field {}", - hiveTableHandle_->tableName(), - field.toString()); - auto* nestedField = dynamic_cast( - field.path()[0].get()); - VELOX_USER_CHECK_NOT_NULL( - nestedField, - "Unexpected filter on table {}, field {}", - hiveTableHandle_->tableName(), - field.toString()); - VELOX_USER_CHECK_GT( - partitionKeys_.count(nestedField->name()), - 0, - "Unexpected filter on table {}, field {}", - hiveTableHandle_->tableName(), - field.toString()); - } - remainingFilter = hiveTableHandle_->remainingFilter(); + for (const auto& [k, v] : hiveTableHandle_->subfieldFilters()) { + filters_.emplace(k.clone(), v->clone()); + } + double sampleRate = 1; + auto remainingFilter = extractFiltersFromRemainingFilter( + hiveTableHandle_->remainingFilter(), + expressionEvaluator_, + false, + filters_, + sampleRate); + if (sampleRate != 1) { + 
randomSkip_ = std::make_shared(sampleRate); } std::vector remainingFilterSubfields; if (remainingFilter) { remainingFilterExprSet_ = expressionEvaluator_->compile(remainingFilter); auto& remainingFilterExpr = remainingFilterExprSet_->expr(0); - folly::F14FastSet columnNames( - readerRowNames.begin(), readerRowNames.end()); + folly::F14FastMap columnNames; + for (int i = 0; i < readColumnNames.size(); ++i) { + columnNames[readColumnNames[i]] = i; + } for (auto& input : remainingFilterExpr->distinctFields()) { - if (columnNames.count(input->field()) > 0) { + auto it = columnNames.find(input->field()); + if (it != columnNames.end()) { + if (shouldEagerlyMaterialize(*remainingFilterExpr, *input)) { + multiReferencedFields_.push_back(it->second); + } continue; } // Remaining filter may reference columns that are not used otherwise, // e.g. are not being projected out and are not used in range filters. // Make sure to add these columns to readerOutputType_. - readerRowNames.push_back(input->field()); - readerRowTypes.push_back(input->type()); + readColumnNames.push_back(input->field()); + readColumnTypes.push_back(input->type()); } remainingFilterSubfields = remainingFilterExpr->extractSubfields(); if (VLOG_IS_ON(1)) { @@ -470,129 +169,176 @@ HiveDataSource::HiveDataSource( fmt::join(remainingFilterSubfields, ", ")); } for (auto& subfield : remainingFilterSubfields) { - auto& name = getColumnName(subfield); - auto it = subfields.find(name); - if (it != subfields.end()) { + const auto& name = getColumnName(subfield); + auto it = subfields_.find(name); + if (it != subfields_.end()) { // Only subfields of the column are projected out. it->second.push_back(&subfield); } else if (columnNames.count(name) == 0) { // Column appears only in remaining filter. - subfields[name].push_back(&subfield); + subfields_[name].push_back(&subfield); } } } - readerOutputType_ = ROW(std::move(readerRowNames), std::move(readerRowTypes)); + readerOutputType_ = + ROW(std::move(readColumnNames), std::move(readColumnTypes)); scanSpec_ = makeScanSpec( readerOutputType_, - subfields, - filters, + subfields_, + filters_, hiveTableHandle_->dataColumns(), + partitionKeys_, + infoColumns_, + rowIndexColumn_, pool_); if (remainingFilter) { metadataFilter_ = std::make_shared( *scanSpec_, *remainingFilter, expressionEvaluator_); } - readerOpts_.setFileSchema(hiveTableHandle_->dataColumns()); ioStats_ = std::make_shared(); } -inline uint8_t parseDelimiter(const std::string& delim) { - for (char const& ch : delim) { - if (!std::isdigit(ch)) { - return delim[0]; - } - } - return stoi(delim); -} - -void HiveDataSource::parseSerdeParameters( - const std::unordered_map& serdeParameters) { - auto fieldIt = serdeParameters.find(dwio::common::SerDeOptions::kFieldDelim); - if (fieldIt == serdeParameters.end()) { - fieldIt = serdeParameters.find("serialization.format"); - } - auto collectionIt = - serdeParameters.find(dwio::common::SerDeOptions::kCollectionDelim); - if (collectionIt == serdeParameters.end()) { - // For collection delimiter, Hive 1.x, 2.x uses "colelction.delim", but - // Hive 3.x uses "collection.delim". 
- // See: https://issues.apache.org/jira/browse/HIVE-16922) - collectionIt = serdeParameters.find("colelction.delim"); - } - auto mapKeyIt = - serdeParameters.find(dwio::common::SerDeOptions::kMapKeyDelim); - - if (fieldIt == serdeParameters.end() && - collectionIt == serdeParameters.end() && - mapKeyIt == serdeParameters.end()) { - return; - } - - uint8_t fieldDelim = '\1'; - uint8_t collectionDelim = '\2'; - uint8_t mapKeyDelim = '\3'; - if (fieldIt != serdeParameters.end()) { - fieldDelim = parseDelimiter(fieldIt->second); - } - if (collectionIt != serdeParameters.end()) { - collectionDelim = parseDelimiter(collectionIt->second); - } - if (mapKeyIt != serdeParameters.end()) { - mapKeyDelim = parseDelimiter(mapKeyIt->second); - } - dwio::common::SerDeOptions serDeOptions( - fieldDelim, collectionDelim, mapKeyDelim); - readerOpts_.setSerDeOptions(serDeOptions); -} - std::unique_ptr HiveDataSource::createSplitReader() { return SplitReader::create( - split_, readerOutputType_, partitionKeys_, scanSpec_, pool_); + split_, + hiveTableHandle_, + &partitionKeys_, + connectorQueryCtx_, + hiveConfig_, + readerOutputType_, + ioStats_, + fileHandleFactory_, + executor_, + scanSpec_); +} + +std::unique_ptr HiveDataSource::setupBucketConversion() { + VELOX_CHECK_NE( + split_->bucketConversion->tableBucketCount, + split_->bucketConversion->partitionBucketCount); + VELOX_CHECK(split_->tableBucketNumber.has_value()); + VELOX_CHECK_NOT_NULL(hiveTableHandle_->dataColumns()); + ++numBucketConversion_; + bool rebuildScanSpec = false; + std::vector names; + std::vector types; + std::vector bucketChannels; + for (auto& handle : split_->bucketConversion->bucketColumnHandles) { + VELOX_CHECK(handle->columnType() == HiveColumnHandle::ColumnType::kRegular); + if (subfields_.erase(handle->name()) > 0) { + rebuildScanSpec = true; + } + auto index = readerOutputType_->getChildIdxIfExists(handle->name()); + if (!index.has_value()) { + if (names.empty()) { + names = readerOutputType_->names(); + types = readerOutputType_->children(); + } + index = names.size(); + names.push_back(handle->name()); + types.push_back( + hiveTableHandle_->dataColumns()->findChild(handle->name())); + rebuildScanSpec = true; + } + bucketChannels.push_back(*index); + } + if (!names.empty()) { + readerOutputType_ = ROW(std::move(names), std::move(types)); + } + if (rebuildScanSpec) { + auto newScanSpec = makeScanSpec( + readerOutputType_, + subfields_, + filters_, + hiveTableHandle_->dataColumns(), + partitionKeys_, + infoColumns_, + rowIndexColumn_, + pool_); + newScanSpec->moveAdaptationFrom(*scanSpec_); + scanSpec_ = std::move(newScanSpec); + } + return std::make_unique( + split_->bucketConversion->tableBucketCount, std::move(bucketChannels)); } void HiveDataSource::addSplit(std::shared_ptr split) { - VELOX_CHECK( - split_ == nullptr, + VELOX_CHECK_NULL( + split_, "Previous split has not been processed yet. 
Call next to process the split."); split_ = std::dynamic_pointer_cast(split); - VELOX_CHECK(split_, "Wrong type of split"); + VELOX_CHECK_NOT_NULL(split_, "Wrong type of split"); VLOG(1) << "Adding split " << split_->toString(); - if (readerOpts_.getFileFormat() != dwio::common::FileFormat::UNKNOWN) { - VELOX_CHECK( - readerOpts_.getFileFormat() == split_->fileFormat, - "HiveDataSource received splits of different formats: {} and {}", - toString(readerOpts_.getFileFormat()), - toString(split_->fileFormat)); + if (splitReader_) { + splitReader_.reset(); + } + + if (split_->bucketConversion.has_value()) { + partitionFunction_ = setupBucketConversion(); } else { - parseSerdeParameters(split_->serdeParameters); - readerOpts_.setFileFormat(split_->fileFormat); + partitionFunction_.reset(); } - auto fileHandle = fileHandleFactory_->generate(split_->filePath).second; - auto input = createBufferedInput(*fileHandle, readerOpts_); + splitReader_ = createSplitReader(); + // Split reader subclasses may need to use the reader options in prepareSplit + // so we initialize it beforehand. + splitReader_->configureReaderOptions(randomSkip_); + splitReader_->prepareSplit(metadataFilter_, runtimeStats_, rowIndexColumn_); +} - if (splitReader_) { - splitReader_.reset(); +vector_size_t HiveDataSource::applyBucketConversion( + const RowVectorPtr& rowVector, + BufferPtr& indices) { + partitions_.clear(); + partitionFunction_->partition(*rowVector, partitions_); + const auto bucketToKeep = *split_->tableBucketNumber; + const auto partitionBucketCount = + split_->bucketConversion->partitionBucketCount; + for (vector_size_t i = 0; i < rowVector->size(); ++i) { + VELOX_CHECK_EQ((partitions_[i] - bucketToKeep) % partitionBucketCount, 0); + } + + if (remainingFilterExprSet_) { + for (vector_size_t i = 0; i < rowVector->size(); ++i) { + if (partitions_[i] != bucketToKeep) { + filterRows_.setValid(i, false); + } + } + filterRows_.updateBounds(); + return filterRows_.countSelected(); + } + vector_size_t size = 0; + for (vector_size_t i = 0; i < rowVector->size(); ++i) { + size += partitions_[i] == bucketToKeep; + } + if (size == 0) { + return 0; + } + indices = allocateIndices(size, pool_); + size = 0; + auto* rawIndices = indices->asMutable(); + for (vector_size_t i = 0; i < rowVector->size(); ++i) { + if (partitions_[i] == bucketToKeep) { + rawIndices[size++] = i; + } } - splitReader_ = createSplitReader(); - splitReader_->prepareSplit( - hiveTableHandle_, - readerOpts_, - std::move(input), - metadataFilter_, - runtimeStats_); + return size; } std::optional HiveDataSource::next( uint64_t size, velox::ContinueFuture& /*future*/) { VELOX_CHECK(split_ != nullptr, "No split to process. Call addSplit first."); + VELOX_CHECK_NOT_NULL(splitReader_, "No split reader present"); - if (splitReader_ && splitReader_->emptySplit()) { + TestValue::adjust( + "facebook::velox::connector::hive::HiveDataSource::next", this); + + if (splitReader_->emptySplit()) { resetSplit(); return nullptr; } @@ -601,67 +347,76 @@ std::optional HiveDataSource::next( output_ = BaseVector::create(readerOutputType_, 0, pool_); } - // TODO Check if remaining filter has a conjunct that doesn't depend on - // any column, e.g. rand() < 0.1. Evaluate that conjunct first, then scan - // only rows that passed. 
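// The TODO removed above is effectively resolved elsewhere in this diff: a
// conjunct that depends on no column (e.g. rand() < 0.1) is extracted into
// 'sampleRate', kept in randomSkip_, and pushed into the reader through
// ReaderOptions::setRandomSkip and dwio::common::Mutation, so sampled-out
// rows are never materialized. A toy sketch of the per-row decision only;
// class and function names here are invented for illustration.
#include <cstdint>
#include <random>

class BernoulliSkip {
 public:
  explicit BernoulliSkip(double sampleRate) : dist_(sampleRate) {}

  // Decide, row by row, whether the scanner should emit the row at all.
  bool keep() {
    return dist_(rng_);
  }

 private:
  std::mt19937 rng_{std::random_device{}()};
  std::bernoulli_distribution dist_;
};

// Usage inside a toy scan loop: rows that are skipped are never decoded.
uint64_t scanWithSampling(uint64_t numRows, BernoulliSkip& skip) {
  uint64_t emitted = 0;
  for (uint64_t i = 0; i < numRows; ++i) {
    if (skip.keep()) {
      ++emitted;  // In the real reader this row is decoded and returned.
    }
  }
  return emitted;
}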
- - auto rowsScanned = splitReader_->next(size, output_); + const auto rowsScanned = splitReader_->next(size, output_); completedRows_ += rowsScanned; + if (rowsScanned == 0) { + splitReader_->updateRuntimeStats(runtimeStats_); + resetSplit(); + return nullptr; + } - if (rowsScanned) { - VELOX_CHECK( - !output_->mayHaveNulls(), "Top-level row vector cannot have nulls"); - auto rowsRemaining = output_->size(); + VELOX_CHECK( + !output_->mayHaveNulls(), "Top-level row vector cannot have nulls"); + auto rowsRemaining = output_->size(); + if (rowsRemaining == 0) { + // no rows passed the pushed down filters. + return getEmptyOutput(); + } + + auto rowVector = std::dynamic_pointer_cast(output_); + + // In case there is a remaining filter that excludes some but not all + // rows, collect the indices of the passing rows. If there is no filter, + // or it passes on all rows, leave this as null and let exec::wrap skip + // wrapping the results. + BufferPtr remainingIndices; + if (remainingFilterExprSet_) { + if (numBucketConversion_ > 0) { + filterRows_.resizeFill(rowVector->size()); + } else { + filterRows_.resize(rowVector->size()); + } + } + if (partitionFunction_) { + rowsRemaining = applyBucketConversion(rowVector, remainingIndices); if (rowsRemaining == 0) { - // no rows passed the pushed down filters. return getEmptyOutput(); } + } - auto rowVector = std::dynamic_pointer_cast(output_); - - // In case there is a remaining filter that excludes some but not all - // rows, collect the indices of the passing rows. If there is no filter, - // or it passes on all rows, leave this as null and let exec::wrap skip - // wrapping the results. - BufferPtr remainingIndices; - if (remainingFilterExprSet_) { - rowsRemaining = evaluateRemainingFilter(rowVector); - VELOX_CHECK_LE(rowsRemaining, rowsScanned); - if (rowsRemaining == 0) { - // No rows passed the remaining filter. - return getEmptyOutput(); - } - - if (rowsRemaining < rowVector->size()) { - // Some, but not all rows passed the remaining filter. - remainingIndices = filterEvalCtx_.selectedIndices; - } + if (remainingFilterExprSet_) { + rowsRemaining = evaluateRemainingFilter(rowVector); + VELOX_CHECK_LE(rowsRemaining, rowsScanned); + if (rowsRemaining == 0) { + // No rows passed the remaining filter. + return getEmptyOutput(); } - if (outputType_->size() == 0) { - return exec::wrap(rowsRemaining, remainingIndices, rowVector); + if (rowsRemaining < rowVector->size()) { + // Some, but not all rows passed the remaining filter. + remainingIndices = filterEvalCtx_.selectedIndices; } + } - std::vector outputColumns; - outputColumns.reserve(outputType_->size()); - for (int i = 0; i < outputType_->size(); i++) { - auto& child = rowVector->childAt(i); - if (remainingIndices) { - // Disable dictionary values caching in expression eval so that we - // don't need to reallocate the result for every batch. - child->disableMemo(); - } - outputColumns.emplace_back( - exec::wrapChild(rowsRemaining, remainingIndices, child)); - } + if (outputType_->size() == 0) { + return exec::wrap(rowsRemaining, remainingIndices, rowVector); + } - return std::make_shared( - pool_, outputType_, BufferPtr(nullptr), rowsRemaining, outputColumns); + std::vector outputColumns; + outputColumns.reserve(outputType_->size()); + for (int i = 0; i < outputType_->size(); ++i) { + auto& child = rowVector->childAt(i); + if (remainingIndices) { + // Disable dictionary values caching in expression eval so that we + // don't need to reallocate the result for every batch. 
+ child->disableMemo(); + } + outputColumns.emplace_back( + exec::wrapChild(rowsRemaining, remainingIndices, child)); } - splitReader_->updateRuntimeStats(runtimeStats_); - resetSplit(); - return nullptr; + return std::make_shared( + pool_, outputType_, BufferPtr(nullptr), rowsRemaining, outputColumns); } void HiveDataSource::addDynamicFilter( @@ -695,34 +450,46 @@ std::unordered_map HiveDataSource::runtimeStats() { {"totalScanTime", RuntimeCounter( ioStats_->totalScanTime(), RuntimeCounter::Unit::kNanos)}, - {"ioWaitNanos", + {"totalRemainingFilterTime", + RuntimeCounter( + totalRemainingFilterTime_.load(std::memory_order_relaxed), + RuntimeCounter::Unit::kNanos)}, + {"ioWaitWallNanos", RuntimeCounter( ioStats_->queryThreadIoLatency().sum() * 1000, RuntimeCounter::Unit::kNanos)}, + {"maxSingleIoWaitWallNanos", + RuntimeCounter( + ioStats_->queryThreadIoLatency().max() * 1000, + RuntimeCounter::Unit::kNanos)}, {"overreadBytes", RuntimeCounter( - ioStats_->rawOverreadBytes(), RuntimeCounter::Unit::kBytes)}, - {"queryThreadIoLatency", - RuntimeCounter(ioStats_->queryThreadIoLatency().count())}}); + ioStats_->rawOverreadBytes(), RuntimeCounter::Unit::kBytes)}}); + if (numBucketConversion_ > 0) { + res.insert({"numBucketConversion", RuntimeCounter(numBucketConversion_)}); + } return res; } void HiveDataSource::setFromDataSource( std::unique_ptr sourceUnique) { auto source = dynamic_cast(sourceUnique.get()); - VELOX_CHECK(source, "Bad DataSource type"); + VELOX_CHECK_NOT_NULL(source, "Bad DataSource type"); split_ = std::move(source->split_); - if (source->splitReader_ && source->splitReader_->emptySplit()) { - return; - } + runtimeStats_.skippedSplits += source->runtimeStats_.skippedSplits; + runtimeStats_.skippedSplitBytes += source->runtimeStats_.skippedSplitBytes; + readerOutputType_ = std::move(source->readerOutputType_); source->scanSpec_->moveAdaptationFrom(*scanSpec_); scanSpec_ = std::move(source->scanSpec_); splitReader_ = std::move(source->splitReader_); + splitReader_->setConnectorQueryCtx(connectorQueryCtx_); // New io will be accounted on the stats of 'source'. Add the existing // balance to that. source->ioStats_->merge(*ioStats_); ioStats_ = std::move(source->ioStats_); + numBucketConversion_ += source->numBucketConversion_; + partitionFunction_ = std::move(source->partitionFunction_); } int64_t HiveDataSource::estimatedRowSize() { @@ -732,109 +499,26 @@ int64_t HiveDataSource::estimatedRowSize() { return splitReader_->estimatedRowSize(); } -std::shared_ptr HiveDataSource::makeScanSpec( - const RowTypePtr& rowType, - const folly::F14FastMap>& - outputSubfields, - const SubfieldFilters& filters, - const RowTypePtr& dataColumns, - memory::MemoryPool* pool) { - auto spec = std::make_shared("root"); - folly::F14FastMap> - filterSubfields; - std::vector subfieldSpecs; - for (auto& [subfield, _] : filters) { - if (auto name = subfield.toString(); name != kPath && name != kBucket) { - filterSubfields[getColumnName(subfield)].push_back(&subfield); - } - } - - // Process columns that will be projected out. 
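// makeScanSpec, removed below (this diff relocates it and its call gains
// partitionKeys/infoColumns/rowIndexColumn arguments), builds one per-column
// spec tree from two sources: columns projected out by the query and columns
// that only carry filters, while dropping filters on the synthesized $path
// and $bucket columns. A condensed toy version of that shape; the types are
// invented, not Velox's common::ScanSpec.
#include <map>
#include <string>
#include <vector>

struct FieldSpec {
  std::string name;
  bool projected = false;
  std::string filter;  // Empty when no filter is pushed into the reader.
};

std::vector<FieldSpec> makeToySpec(
    const std::vector<std::string>& projected,
    const std::map<std::string, std::string>& filters) {
  std::vector<FieldSpec> spec;
  for (const auto& name : projected) {
    spec.push_back({name, /*projected=*/true, ""});
  }
  for (const auto& [name, filter] : filters) {
    // $path and $bucket are synthesized constants; a filter on them cannot
    // run inside the selective reader, so it is dropped here (mirroring the
    // comment in the removed code).
    if (name == "$path" || name == "$bucket") {
      continue;
    }
    bool found = false;
    for (auto& field : spec) {
      if (field.name == name) {
        field.filter = filter;
        found = true;
      }
    }
    if (!found) {
      // Filter-only column: read it for filtering, do not project it out.
      spec.push_back({name, /*projected=*/false, filter});
    }
  }
  return spec;
}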
- for (int i = 0; i < rowType->size(); ++i) { - auto& name = rowType->nameOf(i); - auto& type = rowType->childAt(i); - auto it = outputSubfields.find(name); - if (it == outputSubfields.end()) { - spec->addFieldRecursively(name, *type, i); - filterSubfields.erase(name); - continue; - } - for (auto* subfield : it->second) { - subfieldSpecs.push_back({subfield, false}); - } - it = filterSubfields.find(name); - if (it != filterSubfields.end()) { - for (auto* subfield : it->second) { - subfieldSpecs.push_back({subfield, true}); - } - filterSubfields.erase(it); - } - addSubfields(*type, subfieldSpecs, 1, pool, *spec->addField(name, i)); - subfieldSpecs.clear(); - } - - // Now process the columns that will not be projected out. - if (!filterSubfields.empty()) { - VELOX_CHECK_NOT_NULL(dataColumns); - for (auto& [fieldName, subfields] : filterSubfields) { - for (auto* subfield : subfields) { - subfieldSpecs.push_back({subfield, true}); - } - auto& type = dataColumns->findChild(fieldName); - auto* fieldSpec = spec->getOrCreateChild(common::Subfield(fieldName)); - addSubfields(*type, subfieldSpecs, 1, pool, *fieldSpec); - subfieldSpecs.clear(); - } - } - - for (auto& pair : filters) { - // SelectiveColumnReader doesn't support constant columns with filters, - // hence, we can't have a filter for a $path or $bucket column. - // - // Unfortunately, Presto happens to specify a filter for $path or - // $bucket column. This filter is redundant and needs to be removed. - // TODO Remove this check when Presto is fixed to not specify a filter - // on $path and $bucket column. - if (auto name = pair.first.toString(); name == kPath || name == kBucket) { - continue; - } - auto fieldSpec = spec->getOrCreateChild(pair.first); - fieldSpec->addFilter(*pair.second); - } - - return spec; -} - -std::unique_ptr -HiveDataSource::createBufferedInput( - const FileHandle& fileHandle, - const dwio::common::ReaderOptions& readerOpts) { - if (cache_) { - return std::make_unique( - fileHandle.file, - dwio::common::MetricsLog::voidLog(), - fileHandle.uuid.id(), - cache_, - Connector::getTracker(scanId_, readerOpts.loadQuantum()), - fileHandle.groupId.id(), - ioStats_, - executor_, - readerOpts); - } - return std::make_unique( - fileHandle.file, - readerOpts.getMemoryPool(), - dwio::common::MetricsLog::voidLog(), - ioStats_.get()); -} - vector_size_t HiveDataSource::evaluateRemainingFilter(RowVectorPtr& rowVector) { - filterRows_.resize(output_->size()); - - expressionEvaluator_->evaluate( - remainingFilterExprSet_.get(), filterRows_, *rowVector, filterResult_); - return exec::processFilterResults( - filterResult_, filterRows_, filterEvalCtx_, pool_); + for (auto fieldIndex : multiReferencedFields_) { + LazyVector::ensureLoadedRows( + rowVector->childAt(fieldIndex), + filterRows_, + filterLazyDecoded_, + filterLazyBaseRows_); + } + uint64_t filterTimeUs{0}; + vector_size_t rowsRemaining{0}; + { + MicrosecondTimer timer(&filterTimeUs); + expressionEvaluator_->evaluate( + remainingFilterExprSet_.get(), filterRows_, *rowVector, filterResult_); + rowsRemaining = exec::processFilterResults( + filterResult_, filterRows_, filterEvalCtx_, pool_); + } + totalRemainingFilterTime_.fetch_add( + filterTimeUs * 1000, std::memory_order_relaxed); + return rowsRemaining; } void HiveDataSource::resetSplit() { @@ -843,4 +527,31 @@ void HiveDataSource::resetSplit() { // Keep readers around to hold adaptation. 
} +HiveDataSource::WaveDelegateHookFunction HiveDataSource::waveDelegateHook_; + +std::shared_ptr HiveDataSource::toWaveDataSource() { + VELOX_CHECK_NOT_NULL(waveDelegateHook_); + if (!waveDataSource_) { + waveDataSource_ = waveDelegateHook_( + hiveTableHandle_, + scanSpec_, + readerOutputType_, + &partitionKeys_, + fileHandleFactory_, + executor_, + connectorQueryCtx_, + hiveConfig_, + ioStats_, + remainingFilterExprSet_.get(), + metadataFilter_); + } + return waveDataSource_; +} + +// static +void HiveDataSource::registerWaveDelegateHook(WaveDelegateHookFunction hook) { + waveDelegateHook_ = hook; +} +std::shared_ptr toWaveDataSource(); + } // namespace facebook::velox::connector::hive diff --git a/velox/connectors/hive/HiveDataSource.h b/velox/connectors/hive/HiveDataSource.h index ab8eeb452f254..b7ca9bcb63444 100644 --- a/velox/connectors/hive/HiveDataSource.h +++ b/velox/connectors/hive/HiveDataSource.h @@ -15,20 +15,25 @@ */ #pragma once +#include "velox/common/base/RandomUtil.h" #include "velox/common/io/IoStatistics.h" #include "velox/connectors/Connector.h" #include "velox/connectors/hive/FileHandle.h" #include "velox/connectors/hive/HiveConnectorSplit.h" +#include "velox/connectors/hive/HivePartitionFunction.h" #include "velox/connectors/hive/SplitReader.h" #include "velox/connectors/hive/TableHandle.h" -#include "velox/dwio/common/BufferedInput.h" -#include "velox/dwio/common/Reader.h" -#include "velox/dwio/common/ScanSpec.h" +#include "velox/dwio/common/Statistics.h" #include "velox/exec/OperatorUtils.h" #include "velox/expression/Expr.h" namespace facebook::velox::connector::hive { +class HiveConfig; + +using SubfieldFilters = + std::unordered_map>; + class HiveDataSource : public DataSource { public: HiveDataSource( @@ -38,11 +43,9 @@ class HiveDataSource : public DataSource { std::string, std::shared_ptr>& columnHandles, FileHandleFactory* fileHandleFactory, - core::ExpressionEvaluator* expressionEvaluator, - cache::AsyncDataCache* cache, - const std::string& scanId, folly::Executor* executor, - const dwio::common::ReaderOptions& options); + const ConnectorQueryCtx* connectorQueryCtx, + const std::shared_ptr& hiveConfig); void addSplit(std::shared_ptr split) override; @@ -71,37 +74,43 @@ class HiveDataSource : public DataSource { int64_t estimatedRowSize() override; - // Internal API, made public to be accessible in unit tests. Do not use in - // other places. - static std::shared_ptr makeScanSpec( - const RowTypePtr& rowType, - const folly::F14FastMap< - std::string, - std::vector>& outputSubfields, - const SubfieldFilters& filters, - const RowTypePtr& dataColumns, - memory::MemoryPool* pool); - - // Internal API, made public to be accessible in unit tests. Do not use in - // other places. 
- static core::TypedExprPtr extractFiltersFromRemainingFilter( - const core::TypedExprPtr& expr, - core::ExpressionEvaluator* evaluator, - bool negated, - SubfieldFilters& filters); + std::shared_ptr toWaveDataSource() override; + + using WaveDelegateHookFunction = + std::function( + const std::shared_ptr& hiveTableHandle, + const std::shared_ptr& scanSpec, + const RowTypePtr& readerOutputType, + std::unordered_map>* + partitionKeys, + FileHandleFactory* fileHandleFactory, + folly::Executor* executor, + const ConnectorQueryCtx* connectorQueryCtx, + const std::shared_ptr& hiveConfig, + const std::shared_ptr& ioStats, + const exec::ExprSet* remainingFilter, + std::shared_ptr metadataFilter)>; + + static WaveDelegateHookFunction waveDelegateHook_; + + static void registerWaveDelegateHook(WaveDelegateHookFunction hook); + + const ConnectorQueryCtx* testingConnectorQueryCtx() const { + return connectorQueryCtx_; + } protected: virtual std::unique_ptr createSplitReader(); - std::unique_ptr createBufferedInput( - const FileHandle&, - const dwio::common::ReaderOptions&); + FileHandleFactory* const fileHandleFactory_; + folly::Executor* const executor_; + const ConnectorQueryCtx* const connectorQueryCtx_; + const std::shared_ptr hiveConfig_; + memory::MemoryPool* const pool_; std::shared_ptr split_; - FileHandleFactory* fileHandleFactory_; - dwio::common::ReaderOptions readerOpts_; + std::shared_ptr hiveTableHandle_; std::shared_ptr scanSpec_; - memory::MemoryPool* pool_; VectorPtr output_; std::unique_ptr splitReader_; @@ -115,7 +124,15 @@ class HiveDataSource : public DataSource { std::unordered_map> partitionKeys_; + std::shared_ptr ioStats_; + std::shared_ptr rowIndexColumn_; + private: + std::unique_ptr setupBucketConversion(); + vector_size_t applyBucketConversion( + const RowVectorPtr& rowVector, + BufferPtr& indices); + // Evaluates remainingFilter_ on the specified vector. Returns number of rows // passed. Populates filterEvalCtx_.selectedIndices and selectedBits if only // some rows passed the filter. If none or all rows passed @@ -126,9 +143,6 @@ class HiveDataSource : public DataSource { // hold adaptation. void resetSplit(); - void parseSerdeParameters( - const std::unordered_map& serdeParameters); - const RowVectorPtr& getEmptyOutput() { if (!emptyOutput_) { emptyOutput_ = RowVector::createEmpty(outputType_, pool_); @@ -136,26 +150,42 @@ class HiveDataSource : public DataSource { return emptyOutput_; } - std::shared_ptr hiveTableHandle_; - - // The row type for the data source output, not including filter only columns + // The row type for the data source output, not including filter-only columns const RowTypePtr outputType_; - std::shared_ptr ioStats_; + core::ExpressionEvaluator* const expressionEvaluator_; + + // Column handles for the Split info columns keyed on their column names. + std::unordered_map> + infoColumns_; + folly::F14FastMap> + subfields_; + SubfieldFilters filters_; std::shared_ptr metadataFilter_; std::unique_ptr remainingFilterExprSet_; RowVectorPtr emptyOutput_; dwio::common::RuntimeStatistics runtimeStats_; - core::ExpressionEvaluator* expressionEvaluator_; + std::atomic totalRemainingFilterTime_{0}; uint64_t completedRows_ = 0; + // Field indices referenced in both remaining filter and output type. These + // columns need to be materialized eagerly to avoid missing values in output. 
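// Why the eager materialization mentioned just below matters: when the
// remaining filter evaluates a shared field only under a conditional (see
// shouldEagerlyMaterialize earlier in this diff), the lazy column may end up
// loaded for only a subset of rows; if that field is also projected, the
// untouched rows would be missing from the output. Hence
// LazyVector::ensureLoadedRows runs over all filter rows first. A toy model
// of the hazard, with invented types.
#include <cassert>
#include <functional>
#include <vector>

// A toy lazy column: values exist only for rows that were loaded.
struct LazyColumn {
  std::function<int(int)> loader;
  std::vector<bool> loaded;
  std::vector<int> values;

  LazyColumn(int size, std::function<int(int)> f)
      : loader(std::move(f)), loaded(size, false), values(size, 0) {}

  void loadRows(const std::vector<int>& rows) {
    for (int row : rows) {
      if (!loaded[row]) {
        values[row] = loader(row);
        loaded[row] = true;
      }
    }
  }
};

int main() {
  LazyColumn col(4, [](int row) { return row * 10; });
  // A filter like "if (b > 0) then f(a) else true" may load 'a' only where
  // b > 0 ...
  col.loadRows({1, 3});
  // ... but the output needs 'a' on every surviving row. Eager
  // materialization loads all candidate rows up front instead:
  col.loadRows({0, 1, 2, 3});
  for (int row = 0; row < 4; ++row) {
    assert(col.loaded[row]);
  }
  return 0;
}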
+ std::vector multiReferencedFields_; + + std::shared_ptr randomSkip_; + + int64_t numBucketConversion_ = 0; + std::unique_ptr partitionFunction_; + std::vector partitions_; + // Reusable memory for remaining filter evaluation. VectorPtr filterResult_; SelectivityVector filterRows_; + DecodedVector filterLazyDecoded_; + SelectivityVector filterLazyBaseRows_; exec::FilterEvalCtx filterEvalCtx_; - cache::AsyncDataCache* const cache_{nullptr}; - const std::string& scanId_; - folly::Executor* executor_; + // Remembers the WaveDataSource. Successive calls to toWaveDataSource() will + // return the same. + std::shared_ptr waveDataSource_; }; - } // namespace facebook::velox::connector::hive diff --git a/velox/connectors/hive/HivePartitionFunction.cpp b/velox/connectors/hive/HivePartitionFunction.cpp index 117b4992b731b..8f583164c1a6d 100644 --- a/velox/connectors/hive/HivePartitionFunction.cpp +++ b/velox/connectors/hive/HivePartitionFunction.cpp @@ -46,63 +46,68 @@ int32_t hashTimestamp(const Timestamp& ts) { } template -inline uint32_t hashOne( - const typename TypeTraits::NativeType& /* value */) { +inline uint32_t hashOne(typename TypeTraits::NativeType /* value */) { VELOX_UNSUPPORTED( "Hive partitioning function doesn't support {} type", TypeTraits::name); + return 0; // Make compiler happy. } template <> -inline uint32_t hashOne(const bool& value) { +inline uint32_t hashOne(bool value) { return value ? 1 : 0; } template <> -inline uint32_t hashOne(const int8_t& value) { +inline uint32_t hashOne(int8_t value) { return static_cast(value); } template <> -inline uint32_t hashOne(const int16_t& value) { +inline uint32_t hashOne(int16_t value) { return static_cast(value); } template <> -inline uint32_t hashOne(const int32_t& value) { +inline uint32_t hashOne(int32_t value) { return static_cast(value); } template <> -inline uint32_t hashOne(const float& value) { +inline uint32_t hashOne(float value) { return static_cast(*reinterpret_cast(&value)); } template <> -inline uint32_t hashOne(const int64_t& value) { +inline uint32_t hashOne(int64_t value) { return hashInt64(value); } template <> -inline uint32_t hashOne(const double& value) { +inline uint32_t hashOne(double value) { return hashInt64(*reinterpret_cast(&value)); } template <> -inline uint32_t hashOne(const StringView& value) { +inline uint32_t hashOne(StringView value) { return hashBytes(value, 0); } template <> -inline uint32_t hashOne(const StringView& value) { +inline uint32_t hashOne(StringView value) { return hashBytes(value, 0); } template <> -inline uint32_t hashOne(const Timestamp& value) { +inline uint32_t hashOne(Timestamp value) { return hashTimestamp(value); } +template <> +inline uint32_t hashOne(UnknownValue /*value*/) { + VELOX_FAIL("Unknown values cannot be non-NULL"); +} + template void hashPrimitive( const DecodedVector& values, @@ -245,6 +250,26 @@ void HivePartitionFunction::hashTyped( hashPrimitive(values, rows, mix, hashes); } +template <> +void HivePartitionFunction::hashTyped( + const DecodedVector& values, + const SelectivityVector& rows, + bool mix, + std::vector& hashes, + size_t /* poolIndex */) { + hashPrimitive(values, rows, mix, hashes); +} + +template <> +void HivePartitionFunction::hashTyped( + const DecodedVector& /*values*/, + const SelectivityVector& /*rows*/, + bool /*mix*/, + std::vector& /*hashes*/, + size_t /* poolIndex */) { + VELOX_UNSUPPORTED("Hive partitioning function doesn't support OPAQUE type"); +} + template <> void HivePartitionFunction::hashTyped( const DecodedVector& values, @@ -426,7 
+451,7 @@ void HivePartitionFunction::hash( // gets implemented, this function will need to change // significantly. - VELOX_DYNAMIC_TYPE_DISPATCH( + VELOX_DYNAMIC_TYPE_DISPATCH_ALL( hashTyped, typeKind, values, rows, mix, hashes, poolIndex); } diff --git a/velox/connectors/hive/HivePartitionUtil.cpp b/velox/connectors/hive/HivePartitionUtil.cpp index 3adf9d38a82b8..cbc53c79b5eaf 100644 --- a/velox/connectors/hive/HivePartitionUtil.cpp +++ b/velox/connectors/hive/HivePartitionUtil.cpp @@ -51,13 +51,23 @@ template std::pair makePartitionKeyValueString( const BaseVector* partitionVector, vector_size_t row, - const std::string& name) { + const std::string& name, + bool isDate) { using T = typename TypeTraits::NativeType; + if (partitionVector->as>()->isNullAt(row)) { + return std::make_pair(name, ""); + } + if (isDate) { + return std::make_pair( + name, + DATE()->toString( + partitionVector->as>()->valueAt(row))); + } return std::make_pair( name, makePartitionValueString( partitionVector->as>()->valueAt(row))); -}; +} } // namespace @@ -66,21 +76,13 @@ std::vector> extractPartitionKeyValues( vector_size_t row) { std::vector> partitionKeyValues; for (auto i = 0; i < partitionsVector->childrenSize(); i++) { - if (partitionsVector->childAt(i)->type()->isDate()) { - auto partitionVector = partitionsVector->childAt(i)->loadedVector(); - auto partitionName = asRowType(partitionsVector->type())->nameOf(i); - partitionKeyValues.push_back( - {partitionName, - DATE()->toString( - partitionVector->as>()->valueAt(row))}); - } else { - partitionKeyValues.push_back(PARTITION_TYPE_DISPATCH( - makePartitionKeyValueString, - partitionsVector->childAt(i)->typeKind(), - partitionsVector->childAt(i)->loadedVector(), - row, - asRowType(partitionsVector->type())->nameOf(i))); - } + partitionKeyValues.push_back(PARTITION_TYPE_DISPATCH( + makePartitionKeyValueString, + partitionsVector->childAt(i)->typeKind(), + partitionsVector->childAt(i)->loadedVector(), + row, + asRowType(partitionsVector->type())->nameOf(i), + partitionsVector->childAt(i)->type()->isDate())); } return partitionKeyValues; } diff --git a/velox/connectors/hive/PartitionIdGenerator.cpp b/velox/connectors/hive/PartitionIdGenerator.cpp index e795a08962b18..deec8dc5b005c 100644 --- a/velox/connectors/hive/PartitionIdGenerator.cpp +++ b/velox/connectors/hive/PartitionIdGenerator.cpp @@ -27,9 +27,11 @@ PartitionIdGenerator::PartitionIdGenerator( const RowTypePtr& inputType, std::vector partitionChannels, uint32_t maxPartitions, - memory::MemoryPool* pool) + memory::MemoryPool* pool, + bool partitionPathAsLowerCase) : partitionChannels_(std::move(partitionChannels)), - maxPartitions_(maxPartitions) { + maxPartitions_(maxPartitions), + partitionPathAsLowerCase_(partitionPathAsLowerCase) { VELOX_USER_CHECK( !partitionChannels_.empty(), "There must be at least one partition key."); for (auto channel : partitionChannels_) { @@ -64,8 +66,6 @@ void PartitionIdGenerator::run( const auto numRows = input->size(); result.resize(numRows); - // TODO Check that there are no nulls in the partition keys. - // Compute value IDs using VectorHashers and store these in 'result'. 
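// The hashOne specializations above implement Hive-compatible hashing:
// booleans map to 0/1, narrow integers widen, floats contribute their bit
// pattern, BIGINT xor-folds its halves, and strings byte-fold with
// multiplier 31. A compact sketch of the scheme; the bodies of hashInt64 and
// hashBytes are not shown in this diff, so the versions below follow Hive's
// Java implementation and should be read as an assumption, not Velox code.
#include <cstdint>
#include <string>
#include <vector>

// Hive's hash for BIGINT: xor of the two 32-bit halves.
int32_t hashInt64Sketch(int64_t value) {
  return static_cast<int32_t>(
      (static_cast<uint64_t>(value) >> 32) ^ static_cast<uint64_t>(value));
}

// Hive's hash for strings: bytes folded with multiplier 31, from a seed.
int32_t hashBytesSketch(const std::string& bytes, int32_t seed) {
  uint32_t result = static_cast<uint32_t>(seed);
  for (char c : bytes) {
    // Sign-extend each byte like Java, wrap like Java int arithmetic.
    result = result * 31u + static_cast<uint32_t>(static_cast<int8_t>(c));
  }
  return static_cast<int32_t>(result);
}

// Combine per-column hashes ("mix") and pick the bucket.
uint32_t hiveBucketSketch(
    const std::vector<int32_t>& columnHashes,
    uint32_t bucketCount) {
  uint32_t hash = 0;
  for (int32_t h : columnHashes) {
    hash = hash * 31u + static_cast<uint32_t>(h);
  }
  return (hash & 0x7fffffffu) % bucketCount;
}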
  computeValueIds(input, result);
@@ -98,7 +98,8 @@
std::string PartitionIdGenerator::partitionName(uint64_t partitionId) const {
  return FileUtils::makePartName(
-     extractPartitionKeyValues(partitionValues_, partitionId));
+     extractPartitionKeyValues(partitionValues_, partitionId),
+     partitionPathAsLowerCase_);
}

void PartitionIdGenerator::computeValueIds(
@@ -109,6 +110,9 @@
  bool rehash = false;
  for (auto& hasher : hashers_) {
+   // NOTE: for boolean column type, computeValueIds() always returns true and
+   // this might cause problem in case of multiple boolean partition columns as
+   // we might not set the multiplier properly.
    auto partitionVector = input->childAt(hasher->channel())->loadedVector();
    hasher->decode(*partitionVector, allRows_);
    if (!hasher->computeValueIds(allRows_, valueIds)) {
@@ -116,12 +120,13 @@
    }
  }

- if (!rehash) {
+ if (!rehash && hasMultiplierSet_) {
    return;
  }

  uint64_t multiplier = 1;
  for (auto& hasher : hashers_) {
+   hasMultiplierSet_ = true;
    multiplier = hasher->typeKind() == TypeKind::BOOLEAN
        ? hasher->enableValueRange(multiplier, 50)
        : hasher->enableValueIds(multiplier, 50);
diff --git a/velox/connectors/hive/PartitionIdGenerator.h b/velox/connectors/hive/PartitionIdGenerator.h
index 1a4844398b0e4..01b638c0f3ad4 100644
--- a/velox/connectors/hive/PartitionIdGenerator.h
+++ b/velox/connectors/hive/PartitionIdGenerator.h
@@ -20,8 +20,7 @@
namespace facebook::velox::connector::hive {

/// Generate sequential integer IDs for distinct partition values, which could
-/// be used as vector index. Only single partition key is supported at the
-/// moment.
+/// be used as vector index.
class PartitionIdGenerator {
 public:
  /// @param inputType RowType of the input.
@@ -30,11 +29,14 @@
  /// @param maxPartitions The max number of distinct partitions.
  /// @param pool Memory pool. Used to allocate memory for storing unique
  /// partition key values.
+ /// @param partitionPathAsLowerCase Used to control whether the partition path
+ /// need to convert to lower case.
  PartitionIdGenerator(
      const RowTypePtr& inputType,
      std::vector<column_index_t> partitionChannels,
      uint32_t maxPartitions,
-     memory::MemoryPool* pool);
+     memory::MemoryPool* pool,
+     bool partitionPathAsLowerCase);

  /// Generate sequential partition IDs for input vector.
  /// @param input Input RowVector.
@@ -77,7 +79,10 @@
  const uint32_t maxPartitions_;

+ const bool partitionPathAsLowerCase_;
+
  std::vector<std::unique_ptr<exec::VectorHasher>> hashers_;
+ bool hasMultiplierSet_ = false;

  // A mapping from value ID produced by VectorHashers to a partition ID.
std::unordered_map partitionIds_; diff --git a/velox/connectors/hive/SplitReader.cpp b/velox/connectors/hive/SplitReader.cpp index a42b64e37141b..dcdb9441fc3d5 100644 --- a/velox/connectors/hive/SplitReader.cpp +++ b/velox/connectors/hive/SplitReader.cpp @@ -16,234 +16,153 @@ #include "velox/connectors/hive/SplitReader.h" +#include "velox/common/caching/CacheTTLController.h" +#include "velox/connectors/hive/HiveConfig.h" #include "velox/connectors/hive/HiveConnectorSplit.h" +#include "velox/connectors/hive/HiveConnectorUtil.h" #include "velox/connectors/hive/TableHandle.h" +#include "velox/connectors/hive/iceberg/IcebergSplitReader.h" +#include "velox/dwio/common/CachedBufferedInput.h" #include "velox/dwio/common/ReaderFactory.h" +#include "velox/type/TimestampConversion.h" namespace facebook::velox::connector::hive { - namespace { -bool applyPartitionFilter( - TypeKind kind, - const std::string& partitionValue, - common::Filter* filter) { - switch (kind) { - case TypeKind::BIGINT: - case TypeKind::INTEGER: - case TypeKind::SMALLINT: - case TypeKind::TINYINT: { - return applyFilter(*filter, folly::to(partitionValue)); - } - case TypeKind::REAL: - case TypeKind::DOUBLE: { - return applyFilter(*filter, folly::to(partitionValue)); - } - case TypeKind::BOOLEAN: { - return applyFilter(*filter, folly::to(partitionValue)); - } - case TypeKind::VARCHAR: { - return applyFilter(*filter, partitionValue); - } - default: - VELOX_FAIL("Bad type {} for partition value: {}", kind, partitionValue); - break; +template +VectorPtr newConstantFromString( + const TypePtr& type, + const std::optional& value, + vector_size_t size, + velox::memory::MemoryPool* pool) { + using T = typename TypeTraits::NativeType; + if (!value.has_value()) { + return std::make_shared>(pool, size, true, type, T()); } -} -bool testFilters( - common::ScanSpec* scanSpec, - dwio::common::Reader* reader, - const std::string& filePath, - const std::unordered_map>& - partitionKey, - std::unordered_map>& - partitionKeysHandle) { - auto totalRows = reader->numberOfRows(); - const auto& fileTypeWithId = reader->typeWithId(); - const auto& rowType = reader->rowType(); - for (const auto& child : scanSpec->children()) { - if (child->filter()) { - const auto& name = child->fieldName(); - if (!rowType->containsChild(name)) { - // If missing column is partition key. - auto iter = partitionKey.find(name); - if (iter != partitionKey.end() && iter->second.has_value()) { - return applyPartitionFilter( - partitionKeysHandle[name]->dataType()->kind(), - iter->second.value(), - child->filter()); - } - // Column is missing. Most likely due to schema evolution. 
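// testFilters, removed in this region (this diff moves it out of
// SplitReader), is the split-skipping gate: each pushed-down filter is
// checked against per-column file statistics (or against the partition value
// for partition-key columns), and a split whose stats cannot satisfy a
// filter is skipped wholesale, feeding runtimeStats.skippedSplits. A reduced
// sketch of the min/max overlap test at its core, with toy types.
#include <cstdint>
#include <optional>

struct RangeFilter {
  int64_t min;
  int64_t max;
  bool allowsNull;
};

struct ColumnStats {
  std::optional<int64_t> min;
  std::optional<int64_t> max;
  bool hasNulls;
};

// True if some row in the file *may* pass the filter; false lets the whole
// split be skipped without reading any data.
bool mayPass(const RangeFilter& filter, const ColumnStats& stats) {
  if (stats.hasNulls && filter.allowsNull) {
    return true;
  }
  if (!stats.min || !stats.max) {
    return true;  // No stats: nothing can be proven, must read the file.
  }
  return *stats.max >= filter.min && *stats.min <= filter.max;
}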
- if (child->filter()->isDeterministic() && - !child->filter()->testNull()) { - return false; - } - } else { - const auto& typeWithId = fileTypeWithId->childByName(name); - auto columnStats = reader->columnStatistics(typeWithId->id()); - if (columnStats != nullptr && - !testFilter( - child->filter(), - columnStats.get(), - totalRows.value(), - typeWithId->type())) { - VLOG(1) << "Skipping " << filePath - << " based on stats and filter for column " - << child->fieldName(); - return false; - } - } - } + if (type->isDate()) { + auto days = DATE()->toDays((folly::StringPiece)value.value()); + return std::make_shared>( + pool, size, false, type, std::move(days)); } - return true; -} - -template -velox::variant convertFromString(const std::optional& value) { - if (value.has_value()) { - if constexpr (ToKind == TypeKind::VARCHAR) { - return velox::variant(value.value()); - } - if constexpr (ToKind == TypeKind::VARBINARY) { - return velox::variant::binary((value.value())); + if constexpr (std::is_same_v) { + return std::make_shared>( + pool, size, false, type, StringView(value.value())); + } else { + auto copy = velox::util::Converter::tryCast(value.value()) + .thenOrThrow(folly::identity, [&](const Status& status) { + VELOX_USER_FAIL("{}", status.message()); + }); + if constexpr (kind == TypeKind::TIMESTAMP) { + copy.toGMT(Timestamp::defaultTimezone()); } - auto result = velox::util::Converter::cast(value.value()); - - return velox::variant(result); + return std::make_shared>( + pool, size, false, type, std::move(copy)); } - return velox::variant(ToKind); } - } // namespace std::unique_ptr SplitReader::create( - std::shared_ptr hiveSplit, - const RowTypePtr readerOutputType, - std::unordered_map>& + const std::shared_ptr& hiveSplit, + const std::shared_ptr& hiveTableHandle, + const std::unordered_map>* partitionKeys, - std::shared_ptr scanSpec, - memory::MemoryPool* pool) { + const ConnectorQueryCtx* connectorQueryCtx, + const std::shared_ptr& hiveConfig, + const RowTypePtr& readerOutputType, + const std::shared_ptr& ioStats, + FileHandleFactory* fileHandleFactory, + folly::Executor* executor, + const std::shared_ptr& scanSpec) { // Create the SplitReader based on hiveSplit->customSplitInfo["table_format"] - return std::make_unique( - hiveSplit, readerOutputType, partitionKeys, scanSpec, pool); + if (hiveSplit->customSplitInfo.count("table_format") > 0 && + hiveSplit->customSplitInfo["table_format"] == "hive-iceberg") { + return std::make_unique( + hiveSplit, + hiveTableHandle, + partitionKeys, + connectorQueryCtx, + hiveConfig, + readerOutputType, + ioStats, + fileHandleFactory, + executor, + scanSpec); + } else { + return std::unique_ptr(new SplitReader( + hiveSplit, + hiveTableHandle, + partitionKeys, + connectorQueryCtx, + hiveConfig, + readerOutputType, + ioStats, + fileHandleFactory, + executor, + scanSpec)); + } } SplitReader::SplitReader( - std::shared_ptr hiveSplit, - const RowTypePtr readerOutputType, - std::unordered_map>& + const std::shared_ptr& hiveSplit, + const std::shared_ptr& hiveTableHandle, + const std::unordered_map>* partitionKeys, - std::shared_ptr scanSpec, - memory::MemoryPool* pool) - : hiveSplit_(std::move(hiveSplit)), - readerOutputType_(readerOutputType), + const ConnectorQueryCtx* connectorQueryCtx, + const std::shared_ptr& hiveConfig, + const RowTypePtr& readerOutputType, + const std::shared_ptr& ioStats, + FileHandleFactory* fileHandleFactory, + folly::Executor* executor, + const std::shared_ptr& scanSpec) + : hiveSplit_(hiveSplit), + 
hiveTableHandle_(hiveTableHandle), partitionKeys_(partitionKeys), - scanSpec_(std::move(scanSpec)), - pool_(pool) {} + connectorQueryCtx_(connectorQueryCtx), + hiveConfig_(hiveConfig), + readerOutputType_(readerOutputType), + ioStats_(ioStats), + fileHandleFactory_(fileHandleFactory), + executor_(executor), + pool_(connectorQueryCtx->memoryPool()), + scanSpec_(scanSpec), + baseReaderOpts_(connectorQueryCtx->memoryPool()), + emptySplit_(false) {} + +void SplitReader::configureReaderOptions( + std::shared_ptr randomSkip) { + hive::configureReaderOptions( + baseReaderOpts_, + hiveConfig_, + connectorQueryCtx_, + hiveTableHandle_, + hiveSplit_); + baseReaderOpts_.setRandomSkip(std::move(randomSkip)); + baseReaderOpts_.setScanSpec(scanSpec_); +} void SplitReader::prepareSplit( - const std::shared_ptr& hiveTableHandle, - const dwio::common::ReaderOptions& readerOptions, - std::unique_ptr baseFileInput, std::shared_ptr metadataFilter, - dwio::common::RuntimeStatistics& runtimeStats) { - baseReader_ = dwio::common::getReaderFactory(readerOptions.getFileFormat()) - ->createReader(std::move(baseFileInput), readerOptions); - - // Note that this doesn't apply to Hudi tables. - emptySplit_ = false; - if (baseReader_->numberOfRows() == 0) { - emptySplit_ = true; - return; - } + dwio::common::RuntimeStatistics& runtimeStats, + const std::shared_ptr& rowIndexColumn) { + createReader(std::move(metadataFilter), rowIndexColumn); - // Check filters and see if the whole split can be skipped. Note that this - // doesn't apply to Hudi tables. - if (!testFilters( - scanSpec_.get(), - baseReader_.get(), - hiveSplit_->filePath, - hiveSplit_->partitionKeys, - partitionKeys_)) { - emptySplit_ = true; - ++runtimeStats.skippedSplits; - runtimeStats.skippedSplitBytes += hiveSplit_->length; + if (checkIfSplitIsEmpty(runtimeStats)) { + VELOX_CHECK(emptySplit_); return; } - auto& fileType = baseReader_->rowType(); - auto columnTypes = adaptColumns(fileType, readerOptions.getFileSchema()); - - auto skipRowsIt = hiveTableHandle->tableParameters().find( - dwio::common::TableParameter::kSkipHeaderLineCount); - if (skipRowsIt != hiveTableHandle->tableParameters().end()) { - rowReaderOpts_.setSkipRows(folly::to(skipRowsIt->second)); - } - - rowReaderOpts_.setScanSpec(scanSpec_); - rowReaderOpts_.setMetadataFilter(metadataFilter); - configureRowReaderOptions( - rowReaderOpts_, - ROW(std::vector(fileType->names()), std::move(columnTypes))); - // NOTE: we firstly reset the finished 'baseRowReader_' of previous split - // before setting up for the next one to avoid doubling the peak memory usage. - baseRowReader_.reset(); - baseRowReader_ = baseReader_->createRowReader(rowReaderOpts_); + createRowReader(); } -std::vector SplitReader::adaptColumns( - const RowTypePtr& fileType, - const std::shared_ptr& tableSchema) { - // Keep track of schema types for columns in file, used by ColumnSelector. 
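// SplitReader::next above only builds a dwio::common::Mutation when a random
// skip tracker is configured, so the common, unsampled path stays free of
// that bookkeeping. A toy rendering of the dispatch; all types below are
// invented stand-ins for the dwio reader pieces.
#include <cstdint>

struct RandomSkipTracker {};  // Stand-in for the sampling state.

struct Mutation {
  RandomSkipTracker* randomSkip = nullptr;  // Non-null enables sampling.
};

struct RowReader {
  uint64_t next(uint64_t size, Mutation* mutation = nullptr) {
    (void)mutation;  // A real reader drops sampled-out rows while decoding.
    return size;
  }
};

uint64_t nextBatch(RowReader& reader, uint64_t size, RandomSkipTracker* skip) {
  if (skip == nullptr) {
    return reader.next(size);  // Fast path: no sampling state to thread in.
  }
  Mutation mutation;
  mutation.randomSkip = skip;
  return reader.next(size, &mutation);
}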
- std::vector columnTypes = fileType->children(); - - auto& childrenSpecs = scanSpec_->children(); - for (size_t i = 0; i < childrenSpecs.size(); ++i) { - auto* childSpec = childrenSpecs[i].get(); - const std::string& fieldName = childSpec->fieldName(); - - auto iter = hiveSplit_->partitionKeys.find(fieldName); - if (iter != hiveSplit_->partitionKeys.end()) { - setPartitionValue(childSpec, fieldName, iter->second); - } else if (fieldName == kPath) { - setConstantValue( - childSpec, VARCHAR(), velox::variant(hiveSplit_->filePath)); - } else if (fieldName == kBucket) { - if (hiveSplit_->tableBucketNumber.has_value()) { - setConstantValue( - childSpec, - INTEGER(), - velox::variant(hiveSplit_->tableBucketNumber.value())); - } - } else { - auto fileTypeIdx = fileType->getChildIdxIfExists(fieldName); - if (!fileTypeIdx.has_value()) { - // Column is missing. Most likely due to schema evolution. - VELOX_CHECK(tableSchema); - setNullConstantValue(childSpec, tableSchema->findChild(fieldName)); - } else { - // Column no longer missing, reset constant value set on the spec. - childSpec->setConstantValue(nullptr); - auto outputTypeIdx = readerOutputType_->getChildIdxIfExists(fieldName); - if (outputTypeIdx.has_value()) { - // We know the fieldName exists in the file, make the type at that - // position match what we expect in the output. - columnTypes[fileTypeIdx.value()] = - readerOutputType_->childAt(*outputTypeIdx); - } - } - } +uint64_t SplitReader::next(uint64_t size, VectorPtr& output) { + if (!baseReaderOpts_.randomSkip()) { + return baseRowReader_->next(size, output); } - - scanSpec_->resetCachedValues(false); - - return columnTypes; -} - -uint64_t SplitReader::next(int64_t size, VectorPtr& output) { - return baseRowReader_->next(size, output); + dwio::common::Mutation mutation; + mutation.randomSkip = baseReaderOpts_.randomSkip().get(); + return baseRowReader_->next(size, output, &mutation); } void SplitReader::resetFilterCaches() { @@ -265,11 +184,8 @@ int64_t SplitReader::estimatedRowSize() const { return DataSource::kUnknownRowSize; } - auto size = baseRowReader_->estimatedRowSize(); - if (size.has_value()) { - return size.value(); - } - return DataSource::kUnknownRowSize; + const auto size = baseRowReader_->estimatedRowSize(); + return size.value_or(DataSource::kUnknownRowSize); } void SplitReader::updateRuntimeStats( @@ -283,61 +199,17 @@ bool SplitReader::allPrefetchIssued() const { return baseRowReader_ && baseRowReader_->allPrefetchIssued(); } -void SplitReader::setConstantValue( - common::ScanSpec* spec, - const TypePtr& type, - const velox::variant& value) const { - spec->setConstantValue(BaseVector::createConstant(type, value, 1, pool_)); -} - -void SplitReader::setNullConstantValue( - common::ScanSpec* spec, - const TypePtr& type) const { - spec->setConstantValue(BaseVector::createNullConstant(type, 1, pool_)); -} - -void SplitReader::setPartitionValue( - common::ScanSpec* spec, - const std::string& partitionKey, - const std::optional& value) const { - auto it = partitionKeys_.find(partitionKey); - VELOX_CHECK( - it != partitionKeys_.end(), - "ColumnHandle is missing for partition key {}", - partitionKey); - auto constValue = VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH( - convertFromString, it->second->dataType()->kind(), value); - setConstantValue(spec, it->second->dataType(), constValue); -} - -void SplitReader::configureRowReaderOptions( - dwio::common::RowReaderOptions& options, - const RowTypePtr& rowType) { - std::vector columnNames; - for (auto& spec : scanSpec_->children()) { - if 
(!spec->isConstant()) { - columnNames.push_back(spec->fieldName()); - } - } - std::shared_ptr cs; - if (columnNames.empty()) { - static const RowTypePtr kEmpty{ROW({}, {})}; - cs = std::make_shared(kEmpty); - } else { - cs = std::make_shared(rowType, columnNames); - } - options.select(cs).range(hiveSplit_->start, hiveSplit_->length); +void SplitReader::setConnectorQueryCtx( + const ConnectorQueryCtx* connectorQueryCtx) { + connectorQueryCtx_ = connectorQueryCtx; } std::string SplitReader::toString() const { std::string partitionKeys; std::for_each( - partitionKeys_.begin(), - partitionKeys_.end(), - [&](std::pair< - const std::string, - std::shared_ptr> - column) { partitionKeys += " " + column.second->toString(); }); + partitionKeys_->begin(), partitionKeys_->end(), [&](const auto& column) { + partitionKeys += " " + column.second->toString(); + }); return fmt::format( "SplitReader: hiveSplit_{} scanSpec_{} readerOutputType_{} partitionKeys_{} reader{} rowReader{}", hiveSplit_->toString(), @@ -348,4 +220,205 @@ std::string SplitReader::toString() const { static_cast(baseRowReader_.get())); } +void SplitReader::createReader( + std::shared_ptr metadataFilter, + const std::shared_ptr& rowIndexColumn) { + VELOX_CHECK_NE( + baseReaderOpts_.fileFormat(), dwio::common::FileFormat::UNKNOWN); + + FileHandleCachedPtr fileHandleCachePtr; + try { + fileHandleCachePtr = fileHandleFactory_->generate( + hiveSplit_->filePath, + hiveSplit_->properties.has_value() ? &*hiveSplit_->properties + : nullptr); + VELOX_CHECK_NOT_NULL(fileHandleCachePtr.get()); + } catch (const VeloxRuntimeError& e) { + if (e.errorCode() == error_code::kFileNotFound && + hiveConfig_->ignoreMissingFiles( + connectorQueryCtx_->sessionProperties())) { + emptySplit_ = true; + return; + } + throw; + } + + // Here we keep adding new entries to CacheTTLController when new fileHandles + // are generated, if CacheTTLController was created. Creator of + // CacheTTLController needs to make sure a size control strategy was available + // such as removing aged out entries. + if (auto* cacheTTLController = cache::CacheTTLController::getInstance()) { + cacheTTLController->addOpenFileInfo(fileHandleCachePtr->uuid.id()); + } + auto baseFileInput = createBufferedInput( + *fileHandleCachePtr, + baseReaderOpts_, + connectorQueryCtx_, + ioStats_, + executor_); + + baseReader_ = dwio::common::getReaderFactory(baseReaderOpts_.fileFormat()) + ->createReader(std::move(baseFileInput), baseReaderOpts_); + + auto& fileType = baseReader_->rowType(); + auto columnTypes = adaptColumns(fileType, baseReaderOpts_.fileSchema()); + auto columnNames = fileType->names(); + if (rowIndexColumn != nullptr) { + bool isExplicit = scanSpec_->childByName(rowIndexColumn->name()) != nullptr; + setRowIndexColumn(rowIndexColumn, isExplicit); + } + configureRowReaderOptions( + hiveTableHandle_->tableParameters(), + scanSpec_, + std::move(metadataFilter), + ROW(std::move(columnNames), std::move(columnTypes)), + hiveSplit_, + hiveConfig_, + connectorQueryCtx_->sessionProperties(), + baseRowReaderOpts_); +} + +bool SplitReader::checkIfSplitIsEmpty( + dwio::common::RuntimeStatistics& runtimeStats) { + // emptySplit_ may already be set if the data file is not found. In this case + // we don't need to test further. + if (emptySplit_) { + return true; + } + + if (!baseReader_ || baseReader_->numberOfRows() == 0) { + emptySplit_ = true; + } else { + // Check filters and see if the whole split can be skipped. Note that this + // doesn't apply to Hudi tables. 
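+    // As a hypothetical illustration: if the scan spec carries a filter such
+    // as `c0 > 100` while the file's column statistics report max(c0) == 50,
+    // testFilters() below returns false and the whole split is skipped
+    // without reading any data. (Filter and stats values are made up.)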
+ if (!testFilters( + scanSpec_.get(), + baseReader_.get(), + hiveSplit_->filePath, + hiveSplit_->partitionKeys, + *partitionKeys_)) { + ++runtimeStats.skippedSplits; + runtimeStats.skippedSplitBytes += hiveSplit_->length; + emptySplit_ = true; + } + } + + return emptySplit_; +} + +void SplitReader::createRowReader() { + VELOX_CHECK_NULL(baseRowReader_); + baseRowReader_ = baseReader_->createRowReader(baseRowReaderOpts_); +} + +void SplitReader::setRowIndexColumn( + const std::shared_ptr& rowIndexColumn, + bool isExplicit) { + dwio::common::RowNumberColumnInfo rowNumberColumnInfo; + rowNumberColumnInfo.insertPosition = + readerOutputType_->getChildIdx(rowIndexColumn->name()); + rowNumberColumnInfo.name = rowIndexColumn->name(); + rowNumberColumnInfo.isExplicit = isExplicit; + baseRowReaderOpts_.setRowNumberColumnInfo(std::move(rowNumberColumnInfo)); +} + +std::vector SplitReader::adaptColumns( + const RowTypePtr& fileType, + const std::shared_ptr& tableSchema) { + // Keep track of schema types for columns in file, used by ColumnSelector. + std::vector columnTypes = fileType->children(); + + auto& childrenSpecs = scanSpec_->children(); + for (size_t i = 0; i < childrenSpecs.size(); ++i) { + auto* childSpec = childrenSpecs[i].get(); + const std::string& fieldName = childSpec->fieldName(); + + if (auto it = hiveSplit_->partitionKeys.find(fieldName); + it != hiveSplit_->partitionKeys.end()) { + setPartitionValue(childSpec, fieldName, it->second); + } else if (fieldName == kPath) { + auto constantVec = std::make_shared>( + connectorQueryCtx_->memoryPool(), + 1, + false, + VARCHAR(), + StringView(hiveSplit_->filePath)); + childSpec->setConstantValue(constantVec); + } else if (fieldName == kBucket) { + if (hiveSplit_->tableBucketNumber.has_value()) { + int32_t bucket = hiveSplit_->tableBucketNumber.value(); + auto constantVec = std::make_shared>( + connectorQueryCtx_->memoryPool(), + 1, + false, + INTEGER(), + std::move(bucket)); + childSpec->setConstantValue(constantVec); + } + } else if (auto iter = hiveSplit_->infoColumns.find(fieldName); + iter != hiveSplit_->infoColumns.end()) { + auto infoColumnType = + readerOutputType_->childAt(readerOutputType_->getChildIdx(fieldName)); + auto constant = VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH_ALL( + newConstantFromString, + infoColumnType->kind(), + infoColumnType, + iter->second, + 1, + connectorQueryCtx_->memoryPool()); + childSpec->setConstantValue(constant); + } else if (!childSpec->isExplicitRowNumber()) { + auto fileTypeIdx = fileType->getChildIdxIfExists(fieldName); + if (!fileTypeIdx.has_value()) { + // Column is missing. Most likely due to schema evolution. + VELOX_CHECK(tableSchema); + childSpec->setConstantValue(BaseVector::createNullConstant( + tableSchema->findChild(fieldName), + 1, + connectorQueryCtx_->memoryPool())); + } else { + // Column no longer missing, reset constant value set on the spec. + childSpec->setConstantValue(nullptr); + auto outputTypeIdx = readerOutputType_->getChildIdxIfExists(fieldName); + if (outputTypeIdx.has_value()) { + auto& outputType = readerOutputType_->childAt(*outputTypeIdx); + auto& columnType = columnTypes[*fileTypeIdx]; + if (childSpec->isFlatMapAsStruct()) { + // Flat map column read as struct. Leave the schema type as MAP. + VELOX_CHECK(outputType->isRow() && columnType->isMap()); + } else { + // We know the fieldName exists in the file, make the type at that + // position match what we expect in the output. 
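+            // (Schema evolution example, illustrative only: the file may
+            // store this column as INTEGER while readerOutputType_ expects
+            // BIGINT; using the output type here lets the reader coerce the
+            // values to the expected type.)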
+            columnType = outputType;
+          }
+        }
+      }
+    }
+  }
+
+  scanSpec_->resetCachedValues(false);
+
+  return columnTypes;
+}
+
+void SplitReader::setPartitionValue(
+    common::ScanSpec* spec,
+    const std::string& partitionKey,
+    const std::optional<std::string>& value) const {
+  auto it = partitionKeys_->find(partitionKey);
+  VELOX_CHECK(
+      it != partitionKeys_->end(),
+      "ColumnHandle is missing for partition key {}",
+      partitionKey);
+  auto type = it->second->dataType();
+  auto constant = VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH_ALL(
+      newConstantFromString,
+      type->kind(),
+      type,
+      value,
+      1,
+      connectorQueryCtx_->memoryPool());
+  spec->setConstantValue(constant);
+}
+
 } // namespace facebook::velox::connector::hive
diff --git a/velox/connectors/hive/SplitReader.h b/velox/connectors/hive/SplitReader.h
index 0cac6c4f9d6a3..02509a75abc54 100644
--- a/velox/connectors/hive/SplitReader.h
+++ b/velox/connectors/hive/SplitReader.h
@@ -16,53 +16,72 @@
 
 #pragma once
 
-#include "velox/dwio/common/Reader.h"
-#include "velox/type/Type.h"
+#include "velox/common/base/RandomUtil.h"
+#include "velox/connectors/hive/FileHandle.h"
+#include "velox/dwio/common/Options.h"
+
+namespace facebook::velox {
+class BaseVector;
+class variant;
+using VectorPtr = std::shared_ptr<BaseVector>;
+} // namespace facebook::velox
+
+namespace facebook::velox::common {
+class MetadataFilter;
+class ScanSpec;
+} // namespace facebook::velox::common
+
+namespace facebook::velox::connector {
+class ConnectorQueryCtx;
+} // namespace facebook::velox::connector
 
 namespace facebook::velox::dwio::common {
-class BufferedInput;
+class Reader;
+class RowReader;
+struct RuntimeStatistics;
+} // namespace facebook::velox::dwio::common
+
+namespace facebook::velox::memory {
+class MemoryPool;
 }
 
 namespace facebook::velox::connector::hive {
 
-constexpr const char* kPath = "$path";
-constexpr const char* kBucket = "$bucket";
-
 struct HiveConnectorSplit;
 class HiveTableHandle;
 class HiveColumnHandle;
+class HiveConfig;
 
 class SplitReader {
  public:
   static std::unique_ptr<SplitReader> create(
-      std::shared_ptr<HiveConnectorSplit> hiveSplit,
-      const RowTypePtr readerOutputType,
-      std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>&
+      const std::shared_ptr<HiveConnectorSplit>& hiveSplit,
+      const std::shared_ptr<HiveTableHandle>& hiveTableHandle,
+      const std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>*
           partitionKeys,
-      std::shared_ptr<common::ScanSpec> scanSpec,
-      memory::MemoryPool* pool);
-
-  SplitReader(
-      std::shared_ptr<HiveConnectorSplit> hiveSplit,
-      const RowTypePtr readerOutputType,
-      std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>&
-          partitionKeys,
-      std::shared_ptr<common::ScanSpec> scanSpec,
-      memory::MemoryPool* pool);
+      const ConnectorQueryCtx* connectorQueryCtx,
+      const std::shared_ptr<HiveConfig>& hiveConfig,
+      const RowTypePtr& readerOutputType,
+      const std::shared_ptr<io::IoStatistics>& ioStats,
+      FileHandleFactory* fileHandleFactory,
+      folly::Executor* executor,
+      const std::shared_ptr<common::ScanSpec>& scanSpec);
 
   virtual ~SplitReader() = default;
 
+  void configureReaderOptions(
+      std::shared_ptr<random::RandomSkipTracker> randomSkip);
+
   /// This function is used by different table formats like Iceberg and Hudi to
   /// do additional preparations before reading the split, e.g. Open delete
-  /// files or log files, and add column adapatations for metadata columns
+  /// files or log files, and add column adaptations for metadata columns. It
+  /// will be called only once per incoming split.
   virtual void prepareSplit(
-      const std::shared_ptr<HiveTableHandle>& hiveTableHandle,
-      const dwio::common::ReaderOptions& readerOptions,
-      std::unique_ptr<dwio::common::BufferedInput> baseFileInput,
       std::shared_ptr<common::MetadataFilter> metadataFilter,
-      dwio::common::RuntimeStatistics& runtimeStats);
+      dwio::common::RuntimeStatistics& runtimeStats,
+      const std::shared_ptr<HiveColumnHandle>& rowIndexColumn);
 
-  virtual uint64_t next(int64_t size, VectorPtr& output);
+  virtual uint64_t next(uint64_t size, VectorPtr& output);
 
   void resetFilterCaches();
 
@@ -76,44 +95,76 @@ class SplitReader {
 
   bool allPrefetchIssued() const;
 
+  void setConnectorQueryCtx(const ConnectorQueryCtx* connectorQueryCtx);
+
   std::string toString() const;
 
  protected:
-  // Different table formats may have different meatadata columns. This function
-  // will be used to update the scanSpec for these columns.
+  SplitReader(
+      const std::shared_ptr<HiveConnectorSplit>& hiveSplit,
+      const std::shared_ptr<HiveTableHandle>& hiveTableHandle,
+      const std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>*
+          partitionKeys,
+      const ConnectorQueryCtx* connectorQueryCtx,
+      const std::shared_ptr<HiveConfig>& hiveConfig,
+      const RowTypePtr& readerOutputType,
+      const std::shared_ptr<io::IoStatistics>& ioStats,
+      FileHandleFactory* fileHandleFactory,
+      folly::Executor* executor,
+      const std::shared_ptr<common::ScanSpec>& scanSpec);
+
+  /// Create the dwio::common::Reader object baseReader_, which will be used to
+  /// read the data file's metadata and schema.
+  void createReader(
+      std::shared_ptr<common::MetadataFilter> metadataFilter,
+      const std::shared_ptr<HiveColumnHandle>& rowIndexColumn);
+
+  /// Check if the hiveSplit_ is empty. The split is considered empty when
+  /// 1) The data file is missing but the user chooses to ignore it
+  /// 2) The file does not contain any rows
+  /// 3) The data in the file does not pass the filters. The test is based on
+  ///    the file metadata and partition key values
+  /// This function needs to be called after baseReader_ is created.
+  bool checkIfSplitIsEmpty(dwio::common::RuntimeStatistics& runtimeStats);
+
+  /// Create the dwio::common::RowReader object baseRowReader_, which owns the
+  /// ColumnReaders that will be used to read the data.
+  void createRowReader();
+
+  /// Different table formats may have different metadata columns.
+  /// This function will be used to update the scanSpec for these columns.
   virtual std::vector<TypePtr> adaptColumns(
       const RowTypePtr& fileType,
       const std::shared_ptr<const velox::RowType>& tableSchema);
 
-  void setConstantValue(
-      common::ScanSpec* FOLLY_NONNULL spec,
-      const TypePtr& type,
-      const velox::variant& value) const;
-
-  void setNullConstantValue(
-      common::ScanSpec* FOLLY_NONNULL spec,
-      const TypePtr& type) const;
+  void setRowIndexColumn(
+      const std::shared_ptr<HiveColumnHandle>& rowIndexColumn,
+      bool isExplicit);
 
   void setPartitionValue(
-      common::ScanSpec* FOLLY_NONNULL spec,
+      common::ScanSpec* spec,
       const std::string& partitionKey,
       const std::optional<std::string>& value) const;
 
-  std::shared_ptr<HiveConnectorSplit> hiveSplit_;
-  RowTypePtr readerOutputType_;
-  std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>&
-      partitionKeys_;
+  std::shared_ptr<HiveConnectorSplit> hiveSplit_;
+  const std::shared_ptr<HiveTableHandle> hiveTableHandle_;
+  const std::unordered_map<
+      std::string,
+      std::shared_ptr<HiveColumnHandle>>* const partitionKeys_;
+  const ConnectorQueryCtx* connectorQueryCtx_;
+  const std::shared_ptr<HiveConfig> hiveConfig_;
+
+  const RowTypePtr readerOutputType_;
+  const std::shared_ptr<io::IoStatistics> ioStats_;
+  FileHandleFactory* const fileHandleFactory_;
+  folly::Executor* const executor_;
+  memory::MemoryPool* const pool_;
+
   std::shared_ptr<common::ScanSpec> scanSpec_;
-  memory::MemoryPool* pool_;
   std::unique_ptr<dwio::common::Reader> baseReader_;
-  dwio::common::RowReaderOptions rowReaderOpts_;
   std::unique_ptr<dwio::common::RowReader> baseRowReader_;
-
- private:
-  void configureRowReaderOptions(
-      dwio::common::RowReaderOptions& options,
-      const RowTypePtr& rowType);
-
+  dwio::common::ReaderOptions baseReaderOpts_;
+  dwio::common::RowReaderOptions baseRowReaderOpts_;
   bool emptySplit_;
 };
diff --git a/velox/connectors/hive/TableHandle.cpp b/velox/connectors/hive/TableHandle.cpp
index d03b2f57e43d0..77088ca754a46 100644
--- a/velox/connectors/hive/TableHandle.cpp
+++ b/velox/connectors/hive/TableHandle.cpp
@@ -73,8 +73,8 @@ std::string HiveColumnHandle::toString() const {
       columnTypeName(columnType_),
       dataType_->toString());
   out << " requiredSubfields: [";
-  for (const auto& s : requiredSubfields_) {
-    out << " " << s.toString();
+  for (const auto& subfield : requiredSubfields_) {
+    out << " " << subfield.toString();
   }
   out << " ]]";
   return out.str();
diff --git a/velox/connectors/hive/TableHandle.h b/velox/connectors/hive/TableHandle.h
index ee62a0892d7cc..cfc9295bd4055 100644
--- a/velox/connectors/hive/TableHandle.h
+++ b/velox/connectors/hive/TableHandle.h
@@ -28,7 +28,14 @@ using SubfieldFilters =
 
 class HiveColumnHandle : public ColumnHandle {
  public:
-  enum class ColumnType { kPartitionKey, kRegular, kSynthesized };
+  enum class ColumnType {
+    kPartitionKey,
+    kRegular,
+    kSynthesized,
+    /// A zero-based row number of type BIGINT auto-generated by the connector.
+    /// Row numbers are unique within a single file only.
+    kRowIndex
+  };
 
   /// NOTE: 'dataType' is the column type in target write table. 'hiveType' is
   /// converted type of the corresponding column in source table which might not
@@ -69,20 +76,20 @@ class HiveColumnHandle : public ColumnHandle {
     return hiveType_;
   }
 
-  // Applies to columns of complex types: arrays, maps and structs. When a
-  // query uses only some of the subfields, the engine provides the complete
-  // list of required subfields and the connector is free to prune the rest.
-  //
-  // Examples:
-  //  - SELECT a[1], b['x'], x.y FROM t
-  //  - SELECT a FROM t WHERE b['y'] > 10
-  //
-  // Pruning a struct means populating some of the members with null values.
-  //
-  // Pruning a map means dropping keys not listed in the required subfields.
-  //
-  // Pruning arrays means dropping values with indices larger than maximum
-  // required index.
+ /// Applies to columns of complex types: arrays, maps and structs. When a + /// query uses only some of the subfields, the engine provides the complete + /// list of required subfields and the connector is free to prune the rest. + /// + /// Examples: + /// - SELECT a[1], b['x'], x.y FROM t + /// - SELECT a FROM t WHERE b['y'] > 10 + /// + /// Pruning a struct means populating some of the members with null values. + /// + /// Pruning a map means dropping keys not listed in the required subfields. + /// + /// Pruning arrays means dropping values with indices larger than maximum + /// required index. const std::vector& requiredSubfields() const { return requiredSubfields_; } diff --git a/velox/connectors/hive/benchmarks/HivePartitionFunctionBenchmark.cpp b/velox/connectors/hive/benchmarks/HivePartitionFunctionBenchmark.cpp index 9d016cae7f7c0..a3eec393b000f 100644 --- a/velox/connectors/hive/benchmarks/HivePartitionFunctionBenchmark.cpp +++ b/velox/connectors/hive/benchmarks/HivePartitionFunctionBenchmark.cpp @@ -329,9 +329,9 @@ BENCHMARK_DRAW_LINE(); } // namespace int main(int argc, char** argv) { - folly::init(&argc, &argv); + folly::Init init{&argc, &argv}; gflags::ParseCommandLineFlags(&argc, &argv, true); - + memory::MemoryManager::initialize({}); benchmarkFew = std::make_unique(1'000); benchmarkMany = std::make_unique(10'000); diff --git a/velox/connectors/hive/iceberg/CMakeLists.txt b/velox/connectors/hive/iceberg/CMakeLists.txt new file mode 100644 index 0000000000000..bc78005c91bb1 --- /dev/null +++ b/velox/connectors/hive/iceberg/CMakeLists.txt @@ -0,0 +1,23 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +velox_add_library(velox_hive_iceberg_splitreader IcebergSplitReader.cpp + IcebergSplit.cpp PositionalDeleteFileReader.cpp) + +velox_link_libraries(velox_hive_iceberg_splitreader velox_connector + Folly::folly) + +if(${VELOX_BUILD_TESTING}) + add_subdirectory(tests) +endif() diff --git a/velox/connectors/hive/iceberg/IcebergDeleteFile.h b/velox/connectors/hive/iceberg/IcebergDeleteFile.h new file mode 100644 index 0000000000000..2f9206dfc2649 --- /dev/null +++ b/velox/connectors/hive/iceberg/IcebergDeleteFile.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "velox/dwio/common/Options.h"
+
+namespace facebook::velox::connector::hive::iceberg {
+
+enum class FileContent {
+  kData,
+  kPositionalDeletes,
+  kEqualityDeletes,
+};
+
+struct IcebergDeleteFile {
+  FileContent content;
+  const std::string filePath;
+  dwio::common::FileFormat fileFormat;
+  uint64_t recordCount;
+  uint64_t fileSizeInBytes;
+  // The field ids for the delete columns for equality delete files.
+  std::vector<int32_t> equalityFieldIds;
+  // The lower bounds of the in-file positions for the deleted rows, identified
+  // by each column's field id. E.g. if the deleted rows for a column with
+  // field id 1 are in the range [10, 50], where 10 and 50 are deleted row
+  // positions in the data file, then lowerBounds would contain the entry
+  // <1, "10">.
+  std::unordered_map<int32_t, std::string> lowerBounds;
+  // The upper bounds of the in-file positions for the deleted rows, identified
+  // by each column's field id. E.g. if the deleted rows for a column with
+  // field id 1 are in the range [10, 50], then upperBounds would contain the
+  // entry <1, "50">.
+  std::unordered_map<int32_t, std::string> upperBounds;
+
+  IcebergDeleteFile(
+      FileContent _content,
+      const std::string& _filePath,
+      dwio::common::FileFormat _fileFormat,
+      uint64_t _recordCount,
+      uint64_t _fileSizeInBytes,
+      std::vector<int32_t> _equalityFieldIds = {},
+      std::unordered_map<int32_t, std::string> _lowerBounds = {},
+      std::unordered_map<int32_t, std::string> _upperBounds = {})
+      : content(_content),
+        filePath(_filePath),
+        fileFormat(_fileFormat),
+        recordCount(_recordCount),
+        fileSizeInBytes(_fileSizeInBytes),
+        equalityFieldIds(_equalityFieldIds),
+        lowerBounds(_lowerBounds),
+        upperBounds(_upperBounds) {}
+};
+
+} // namespace facebook::velox::connector::hive::iceberg
diff --git a/velox/connectors/hive/iceberg/IcebergMetadataColumns.h b/velox/connectors/hive/iceberg/IcebergMetadataColumns.h
new file mode 100644
index 0000000000000..4cbf2a7862b30
--- /dev/null
+++ b/velox/connectors/hive/iceberg/IcebergMetadataColumns.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include + +#include "velox/type/Type.h" + +namespace facebook::velox::connector::hive::iceberg { + +struct IcebergMetadataColumn { + int id; + std::string name; + std::shared_ptr type; + std::string doc; + + IcebergMetadataColumn( + int _id, + const std::string& _name, + std::shared_ptr _type, + const std::string& _doc) + : id(_id), name(_name), type(_type), doc(_doc) {} + + static std::shared_ptr icebergDeleteFilePathColumn() { + return std::make_shared( + 2147483546, + "file_path", + VARCHAR(), + "Path of a file in which a deleted row is stored"); + } + + static std::shared_ptr icebergDeletePosColumn() { + return std::make_shared( + 2147483545, + "pos", + BIGINT(), + "Ordinal position of a deleted row in the data file"); + } +}; + +} // namespace facebook::velox::connector::hive::iceberg diff --git a/velox/connectors/hive/iceberg/IcebergSplit.cpp b/velox/connectors/hive/iceberg/IcebergSplit.cpp new file mode 100644 index 0000000000000..6ac3abf4e4dd0 --- /dev/null +++ b/velox/connectors/hive/iceberg/IcebergSplit.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/connectors/hive/iceberg/IcebergSplit.h" + +#include "velox/connectors/hive/iceberg/IcebergDeleteFile.h" + +namespace facebook::velox::connector::hive::iceberg { + +HiveIcebergSplit::HiveIcebergSplit( + const std::string& _connectorId, + const std::string& _filePath, + dwio::common::FileFormat _fileFormat, + uint64_t _start, + uint64_t _length, + const std::unordered_map>& + _partitionKeys, + std::optional _tableBucketNumber, + const std::unordered_map& _customSplitInfo, + const std::shared_ptr& _extraFileInfo, + const std::unordered_map& _infoColumns, + std::optional _properties) + : HiveConnectorSplit( + _connectorId, + _filePath, + _fileFormat, + _start, + _length, + _partitionKeys, + _tableBucketNumber, + _customSplitInfo, + _extraFileInfo, + {}, + 0, + _infoColumns, + _properties) { + // TODO: Deserialize _extraFileInfo to get deleteFiles; +} + +// For tests only +HiveIcebergSplit::HiveIcebergSplit( + const std::string& _connectorId, + const std::string& _filePath, + dwio::common::FileFormat _fileFormat, + uint64_t _start, + uint64_t _length, + const std::unordered_map>& + _partitionKeys, + std::optional _tableBucketNumber, + const std::unordered_map& _customSplitInfo, + const std::shared_ptr& _extraFileInfo, + std::vector _deletes, + const std::unordered_map& _infoColumns, + std::optional _properties) + : HiveConnectorSplit( + _connectorId, + _filePath, + _fileFormat, + _start, + _length, + _partitionKeys, + _tableBucketNumber, + _customSplitInfo, + _extraFileInfo, + {}, + 0, + _infoColumns, + _properties), + deleteFiles(_deletes) {} +} // namespace facebook::velox::connector::hive::iceberg diff --git a/velox/connectors/hive/iceberg/IcebergSplit.h b/velox/connectors/hive/iceberg/IcebergSplit.h new file mode 100644 index 0000000000000..6c1f4cc6bc3f3 --- /dev/null +++ 
b/velox/connectors/hive/iceberg/IcebergSplit.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include "velox/connectors/hive/HiveConnectorSplit.h" + +namespace facebook::velox::connector::hive::iceberg { + +struct IcebergDeleteFile; + +struct HiveIcebergSplit : public connector::hive::HiveConnectorSplit { + std::vector deleteFiles; + + HiveIcebergSplit( + const std::string& connectorId, + const std::string& _filePath, + dwio::common::FileFormat _fileFormat, + uint64_t _start = 0, + uint64_t _length = std::numeric_limits::max(), + const std::unordered_map>& + _partitionKeys = {}, + std::optional _tableBucketNumber = std::nullopt, + const std::unordered_map& _customSplitInfo = {}, + const std::shared_ptr& _extraFileInfo = {}, + const std::unordered_map& _infoColumns = {}, + std::optional fileProperties = std::nullopt); + + // For tests only + HiveIcebergSplit( + const std::string& connectorId, + const std::string& _filePath, + dwio::common::FileFormat _fileFormat, + uint64_t _start = 0, + uint64_t _length = std::numeric_limits::max(), + const std::unordered_map>& + _partitionKeys = {}, + std::optional _tableBucketNumber = std::nullopt, + const std::unordered_map& _customSplitInfo = {}, + const std::shared_ptr& _extraFileInfo = {}, + std::vector deletes = {}, + const std::unordered_map& _infoColumns = {}, + std::optional fileProperties = std::nullopt); +}; + +} // namespace facebook::velox::connector::hive::iceberg diff --git a/velox/connectors/hive/iceberg/IcebergSplitReader.cpp b/velox/connectors/hive/iceberg/IcebergSplitReader.cpp new file mode 100644 index 0000000000000..b7c1f6b523408 --- /dev/null +++ b/velox/connectors/hive/iceberg/IcebergSplitReader.cpp @@ -0,0 +1,155 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/connectors/hive/iceberg/IcebergSplitReader.h" + +#include "velox/connectors/hive/iceberg/IcebergDeleteFile.h" +#include "velox/connectors/hive/iceberg/IcebergSplit.h" +#include "velox/dwio/common/BufferUtil.h" + +using namespace facebook::velox::dwio::common; + +namespace facebook::velox::connector::hive::iceberg { + +IcebergSplitReader::IcebergSplitReader( + const std::shared_ptr& hiveSplit, + const std::shared_ptr& hiveTableHandle, + const std::unordered_map>* + partitionKeys, + const ConnectorQueryCtx* connectorQueryCtx, + const std::shared_ptr& hiveConfig, + const RowTypePtr& readerOutputType, + const std::shared_ptr& ioStats, + FileHandleFactory* const fileHandleFactory, + folly::Executor* executor, + const std::shared_ptr& scanSpec) + : SplitReader( + hiveSplit, + hiveTableHandle, + partitionKeys, + connectorQueryCtx, + hiveConfig, + readerOutputType, + ioStats, + fileHandleFactory, + executor, + scanSpec), + baseReadOffset_(0), + splitOffset_(0), + deleteBitmap_(nullptr), + deleteBitmapBitOffset_(0) {} + +void IcebergSplitReader::prepareSplit( + std::shared_ptr metadataFilter, + dwio::common::RuntimeStatistics& runtimeStats, + const std::shared_ptr& rowIndexColumn) { + createReader(std::move(metadataFilter), rowIndexColumn); + + if (checkIfSplitIsEmpty(runtimeStats)) { + VELOX_CHECK(emptySplit_); + return; + } + + createRowReader(); + + std::shared_ptr icebergSplit = + std::dynamic_pointer_cast(hiveSplit_); + baseReadOffset_ = 0; + splitOffset_ = baseRowReader_->nextRowNumber(); + positionalDeleteFileReaders_.clear(); + + const auto& deleteFiles = icebergSplit->deleteFiles; + for (const auto& deleteFile : deleteFiles) { + if (deleteFile.content == FileContent::kPositionalDeletes) { + if (deleteFile.recordCount > 0) { + positionalDeleteFileReaders_.push_back( + std::make_unique( + deleteFile, + hiveSplit_->filePath, + fileHandleFactory_, + connectorQueryCtx_, + executor_, + hiveConfig_, + ioStats_, + runtimeStats, + splitOffset_, + hiveSplit_->connectorId)); + } + } else { + VELOX_NYI(); + } + } +} + +uint64_t IcebergSplitReader::next(uint64_t size, VectorPtr& output) { + Mutation mutation; + mutation.randomSkip = baseReaderOpts_.randomSkip().get(); + mutation.deletedRows = nullptr; + + if (deleteBitmap_ && deleteBitmapBitOffset_ > 0) { + // There are unconsumed bits from last batch + if (deleteBitmapBitOffset_ < deleteBitmap_->size() * 8) { + bits::copyBits( + deleteBitmap_->as(), + deleteBitmapBitOffset_, + deleteBitmap_->asMutable(), + 0, + deleteBitmap_->size() * 8 - deleteBitmapBitOffset_); + + uint64_t newBitMapSizeInBytes = + deleteBitmap_->size() - deleteBitmapBitOffset_ / 8; + if (deleteBitmapBitOffset_ % 8 != 0) { + newBitMapSizeInBytes--; + } + deleteBitmap_->setSize(newBitMapSizeInBytes); + } else { + // All bits were consumed, reset to 0 for all bits + std::memset( + (void*)(deleteBitmap_->asMutable()), + 0L, + deleteBitmap_->size()); + } + } + + if (!positionalDeleteFileReaders_.empty()) { + auto numBytes = bits::nbytes(size); + dwio::common::ensureCapacity( + deleteBitmap_, numBytes, connectorQueryCtx_->memoryPool(), true, true); + + for (auto iter = positionalDeleteFileReaders_.begin(); + iter != positionalDeleteFileReaders_.end();) { + (*iter)->readDeletePositions(baseReadOffset_, size, deleteBitmap_); + + if ((*iter)->noMoreData()) { + iter = positionalDeleteFileReaders_.erase(iter); + } else { + ++iter; + } + } + } + + mutation.deletedRows = deleteBitmap_ && deleteBitmap_->size() > 0 + ? 
deleteBitmap_->as() + : nullptr; + + auto rowsScanned = baseRowReader_->next(size, output, &mutation); + baseReadOffset_ += rowsScanned; + deleteBitmapBitOffset_ = rowsScanned; + + return rowsScanned; +} + +} // namespace facebook::velox::connector::hive::iceberg diff --git a/velox/connectors/hive/iceberg/IcebergSplitReader.h b/velox/connectors/hive/iceberg/IcebergSplitReader.h new file mode 100644 index 0000000000000..b5ab7da64480a --- /dev/null +++ b/velox/connectors/hive/iceberg/IcebergSplitReader.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "velox/connectors/Connector.h" +#include "velox/connectors/hive/SplitReader.h" +#include "velox/connectors/hive/iceberg/PositionalDeleteFileReader.h" + +namespace facebook::velox::connector::hive::iceberg { + +struct IcebergDeleteFile; + +class IcebergSplitReader : public SplitReader { + public: + IcebergSplitReader( + const std::shared_ptr& hiveSplit, + const std::shared_ptr& hiveTableHandle, + const std::unordered_map>* + partitionKeys, + const ConnectorQueryCtx* connectorQueryCtx, + const std::shared_ptr& hiveConfig, + const RowTypePtr& readerOutputType, + const std::shared_ptr& ioStats, + FileHandleFactory* fileHandleFactory, + folly::Executor* executor, + const std::shared_ptr& scanSpec); + + ~IcebergSplitReader() override = default; + + void prepareSplit( + std::shared_ptr metadataFilter, + dwio::common::RuntimeStatistics& runtimeStats, + const std::shared_ptr& rowIndexColumn) override; + + uint64_t next(uint64_t size, VectorPtr& output) override; + + private: + // The read offset to the beginning of the split in number of rows for the + // current batch for the base data file + uint64_t baseReadOffset_; + // The file position for the first row in the split + uint64_t splitOffset_; + std::list> + positionalDeleteFileReaders_; + BufferPtr deleteBitmap_; + // The offset in bits of the deleteBitmap_ starting from where the bits shall + // be consumed + uint64_t deleteBitmapBitOffset_; +}; +} // namespace facebook::velox::connector::hive::iceberg diff --git a/velox/connectors/hive/iceberg/PositionalDeleteFileReader.cpp b/velox/connectors/hive/iceberg/PositionalDeleteFileReader.cpp new file mode 100644 index 0000000000000..94828d136e6bb --- /dev/null +++ b/velox/connectors/hive/iceberg/PositionalDeleteFileReader.cpp @@ -0,0 +1,267 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/connectors/hive/iceberg/PositionalDeleteFileReader.h" + +#include "velox/connectors/hive/HiveConnectorUtil.h" +#include "velox/connectors/hive/TableHandle.h" +#include "velox/connectors/hive/iceberg/IcebergDeleteFile.h" +#include "velox/connectors/hive/iceberg/IcebergMetadataColumns.h" +#include "velox/dwio/common/ReaderFactory.h" + +namespace facebook::velox::connector::hive::iceberg { + +PositionalDeleteFileReader::PositionalDeleteFileReader( + const IcebergDeleteFile& deleteFile, + const std::string& baseFilePath, + FileHandleFactory* fileHandleFactory, + const ConnectorQueryCtx* connectorQueryCtx, + folly::Executor* executor, + const std::shared_ptr& hiveConfig, + const std::shared_ptr& ioStats, + dwio::common::RuntimeStatistics& runtimeStats, + uint64_t splitOffset, + const std::string& connectorId) + : deleteFile_(deleteFile), + baseFilePath_(baseFilePath), + fileHandleFactory_(fileHandleFactory), + executor_(executor), + hiveConfig_(hiveConfig), + ioStats_(ioStats), + pool_(connectorQueryCtx->memoryPool()), + filePathColumn_(IcebergMetadataColumn::icebergDeleteFilePathColumn()), + posColumn_(IcebergMetadataColumn::icebergDeletePosColumn()), + splitOffset_(splitOffset), + deleteSplit_(nullptr), + deleteRowReader_(nullptr), + deletePositionsOutput_(nullptr), + deletePositionsOffset_(0), + totalNumRowsScanned_(0) { + VELOX_CHECK(deleteFile_.content == FileContent::kPositionalDeletes); + VELOX_CHECK(deleteFile_.recordCount); + + // TODO: check if the lowerbounds and upperbounds in deleteFile overlap with + // this batch. If not, no need to proceed. + + // Create the ScanSpec for this delete file + auto scanSpec = std::make_shared(""); + scanSpec->addField(posColumn_->name, 0); + auto* pathSpec = scanSpec->getOrCreateChild(filePathColumn_->name); + pathSpec->setFilter(std::make_unique( + std::vector({baseFilePath_}), false)); + + // Create the file schema (in RowType) and split that will be used by readers + std::vector deleteColumnNames( + {filePathColumn_->name, posColumn_->name}); + std::vector> deleteColumnTypes( + {filePathColumn_->type, posColumn_->type}); + RowTypePtr deleteFileSchema = + ROW(std::move(deleteColumnNames), std::move(deleteColumnTypes)); + + deleteSplit_ = std::make_shared( + connectorId, + deleteFile_.filePath, + deleteFile_.fileFormat, + 0, + deleteFile_.fileSizeInBytes); + + // Create the Reader and RowReader + + dwio::common::ReaderOptions deleteReaderOpts(pool_); + configureReaderOptions( + deleteReaderOpts, + hiveConfig_, + connectorQueryCtx, + deleteFileSchema, + deleteSplit_); + + auto deleteFileHandleCachePtr = + fileHandleFactory_->generate(deleteFile_.filePath); + auto deleteFileInput = createBufferedInput( + *deleteFileHandleCachePtr, + deleteReaderOpts, + connectorQueryCtx, + ioStats_, + executor_); + + auto deleteReader = + dwio::common::getReaderFactory(deleteReaderOpts.fileFormat()) + ->createReader(std::move(deleteFileInput), deleteReaderOpts); + + // Check if the whole delete file split can be skipped. This could happen when + // 1) the delete file doesn't contain the base file that is being read; 2) The + // delete file does not contain the positions in the current batch for the + // base file. 
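+  // As a hypothetical illustration: if this delete file's file_path column
+  // only ever contains "other_data_file", the filter on filePathColumn_
+  // created above can never match baseFilePath_, so the stats-based test
+  // below fails and the delete split is dropped. (The file name is made up.)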
+ if (!testFilters( + scanSpec.get(), + deleteReader.get(), + deleteSplit_->filePath, + deleteSplit_->partitionKeys, + {})) { + // We only count the number of base splits skipped as skippedSplits runtime + // statistics in Velox. Skipped delta split is only counted as skipped + // bytes. + runtimeStats.skippedSplitBytes += deleteSplit_->length; + deleteSplit_.reset(); + return; + } + + dwio::common::RowReaderOptions deleteRowReaderOpts; + configureRowReaderOptions( + {}, + scanSpec, + nullptr, + deleteFileSchema, + deleteSplit_, + nullptr, + nullptr, + deleteRowReaderOpts); + + deleteRowReader_.reset(); + deleteRowReader_ = deleteReader->createRowReader(deleteRowReaderOpts); +} + +void PositionalDeleteFileReader::readDeletePositions( + uint64_t baseReadOffset, + uint64_t size, + BufferPtr deleteBitmapBuffer) { + // We are going to read to the row number up to the end of the batch. For the + // same base file, the deleted rows are in ascending order in the same delete + // file. rowNumberUpperBound is the upperbound for the row number in this + // batch, excluding boundaries + int64_t rowNumberUpperBound = splitOffset_ + baseReadOffset + size; + + // Finish unused delete positions from last batch. Note that at this point we + // don't know how many rows the base row reader would scan yet. + if (deletePositionsOutput_ && + deletePositionsOffset_ < deletePositionsOutput_->size()) { + updateDeleteBitmap( + std::dynamic_pointer_cast(deletePositionsOutput_) + ->childAt(0), + baseReadOffset, + rowNumberUpperBound, + deleteBitmapBuffer); + + if (readFinishedForBatch(rowNumberUpperBound)) { + return; + } + } + + if (!deleteRowReader_ || !deleteSplit_) { + return; + } + + // Read the new delete positions for this batch into deletePositionsOutput_ + // and update the delete bitmap + + auto outputType = posColumn_->type; + RowTypePtr outputRowType = ROW({posColumn_->name}, {posColumn_->type}); + if (!deletePositionsOutput_) { + deletePositionsOutput_ = BaseVector::create(outputRowType, 0, pool_); + } + + do { + auto rowsScanned = deleteRowReader_->next(size, deletePositionsOutput_); + totalNumRowsScanned_ += rowsScanned; + + if (rowsScanned > 0) { + VELOX_CHECK( + !deletePositionsOutput_->mayHaveNulls(), + "Iceberg delete file pos column cannot have nulls"); + + auto numDeletedRows = deletePositionsOutput_->size(); + if (numDeletedRows > 0) { + deletePositionsOutput_->loadedVector(); + deletePositionsOffset_ = 0; + + // Convert the row numbers to set bits, up to rowNumberUpperBound. + // Beyond that the buffer of deleteBitMap is not available. + updateDeleteBitmap( + std::dynamic_pointer_cast(deletePositionsOutput_) + ->childAt(0), + baseReadOffset, + rowNumberUpperBound, + deleteBitmapBuffer); + } + } else { + // Reaching the end of the file + deleteSplit_.reset(); + break; + } + } while (!readFinishedForBatch(rowNumberUpperBound)); +} + +bool PositionalDeleteFileReader::noMoreData() { + return totalNumRowsScanned_ >= deleteFile_.recordCount && + deletePositionsOutput_ && + deletePositionsOffset_ >= deletePositionsOutput_->size(); +} + +void PositionalDeleteFileReader::updateDeleteBitmap( + VectorPtr deletePositionsVector, + uint64_t baseReadOffset, + int64_t rowNumberUpperBound, + BufferPtr deleteBitmapBuffer) { + auto deleteBitmap = deleteBitmapBuffer->asMutable(); + + // Convert the positions in file into positions relative to the start of the + // split. 
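+  // Worked example with illustrative numbers: if splitOffset_ == 20000 and
+  // baseReadOffset == 512, then offset == 20512, and a delete position of
+  // 20600 read from the delete file sets bit 20600 - 20512 == 88 in the
+  // bitmap for the current batch.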
+  const int64_t* deletePositions =
+      deletePositionsVector->as<FlatVector<int64_t>>()->rawValues();
+  int64_t offset = baseReadOffset + splitOffset_;
+
+  while (deletePositionsOffset_ < deletePositionsVector->size() &&
+         deletePositions[deletePositionsOffset_] < rowNumberUpperBound) {
+    bits::setBit(
+        deleteBitmap, deletePositions[deletePositionsOffset_] - offset);
+    deletePositionsOffset_++;
+  }
+
+  // There might be multiple delete files for a single base file. The size of
+  // the deleteBitmapBuffer should cover the largest delete position seen
+  // across all delete files.
+  deleteBitmapBuffer->setSize(std::max(
+      (uint64_t)deleteBitmapBuffer->size(),
+      deletePositionsOffset_ == 0 ||
+              (deletePositionsOffset_ < deletePositionsVector->size() &&
+               deletePositions[deletePositionsOffset_] > rowNumberUpperBound)
+          ? 0
+          : bits::nbytes(
+                deletePositions[deletePositionsOffset_ - 1] + 1 - offset)));
+}
+
+bool PositionalDeleteFileReader::readFinishedForBatch(
+    int64_t rowNumberUpperBound) {
+  VELOX_CHECK_NOT_NULL(deletePositionsOutput_);
+
+  auto deletePositionsVector =
+      std::dynamic_pointer_cast<RowVector>(deletePositionsOutput_)->childAt(0);
+  const int64_t* deletePositions =
+      deletePositionsVector->as<FlatVector<int64_t>>()->rawValues();
+
+  // We've read enough of the delete positions from this delete file when 1) it
+  // reaches the end of the file, or 2) the last read delete position is
+  // greater than the largest base file row number that is going to be read in
+  // this batch.
+  if (totalNumRowsScanned_ >= deleteFile_.recordCount ||
+      (deletePositionsVector->size() != 0 &&
+       (deletePositionsOffset_ < deletePositionsVector->size() &&
+        deletePositions[deletePositionsOffset_] >= rowNumberUpperBound))) {
+    return true;
+  }
+  return false;
+}
+
+} // namespace facebook::velox::connector::hive::iceberg
diff --git a/velox/connectors/hive/iceberg/PositionalDeleteFileReader.h b/velox/connectors/hive/iceberg/PositionalDeleteFileReader.h
new file mode 100644
index 0000000000000..ba98845eb6393
--- /dev/null
+++ b/velox/connectors/hive/iceberg/PositionalDeleteFileReader.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include +#include + +#include "velox/connectors/Connector.h" +#include "velox/connectors/hive/FileHandle.h" +#include "velox/connectors/hive/HiveConfig.h" +#include "velox/connectors/hive/HiveConnectorSplit.h" +#include "velox/dwio/common/Reader.h" + +namespace facebook::velox::connector::hive::iceberg { + +struct IcebergDeleteFile; +struct IcebergMetadataColumn; + +using SubfieldFilters = + std::unordered_map>; + +class PositionalDeleteFileReader { + public: + PositionalDeleteFileReader( + const IcebergDeleteFile& deleteFile, + const std::string& baseFilePath, + FileHandleFactory* fileHandleFactory, + const ConnectorQueryCtx* connectorQueryCtx, + folly::Executor* executor, + const std::shared_ptr& hiveConfig, + const std::shared_ptr& ioStats, + dwio::common::RuntimeStatistics& runtimeStats, + uint64_t splitOffset, + const std::string& connectorId); + + void readDeletePositions( + uint64_t baseReadOffset, + uint64_t size, + BufferPtr deleteBitmap); + + bool noMoreData(); + + private: + void updateDeleteBitmap( + VectorPtr deletePositionsVector, + uint64_t baseReadOffset, + int64_t rowNumberUpperBound, + BufferPtr deleteBitmapBuffer); + + bool readFinishedForBatch(int64_t rowNumberUpperBound); + + const IcebergDeleteFile& deleteFile_; + const std::string& baseFilePath_; + FileHandleFactory* const fileHandleFactory_; + folly::Executor* const executor_; + const std::shared_ptr hiveConfig_; + const std::shared_ptr ioStats_; + memory::MemoryPool* const pool_; + + std::shared_ptr filePathColumn_; + std::shared_ptr posColumn_; + uint64_t splitOffset_; + + std::shared_ptr deleteSplit_; + std::unique_ptr deleteRowReader_; + // The vector to hold the delete positions read from the positional delete + // file. These positions are relative to the start of the whole base data + // file. + VectorPtr deletePositionsOutput_; + // The index of deletePositionsOutput_ that indicates up to where the delete + // positions have been converted into the bitmap + uint64_t deletePositionsOffset_; + // Total number of rows read from this positional delete file reader, + // including the rows filtered out from filters on both filePathColumn_ and + // posColumn_. + uint64_t totalNumRowsScanned_; +}; + +} // namespace facebook::velox::connector::hive::iceberg diff --git a/velox/connectors/hive/iceberg/tests/CMakeLists.txt b/velox/connectors/hive/iceberg/tests/CMakeLists.txt new file mode 100644 index 0000000000000..5808b640af64c --- /dev/null +++ b/velox/connectors/hive/iceberg/tests/CMakeLists.txt @@ -0,0 +1,61 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
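+
+# A sketch of how the iceberg reader test target defined below is typically
+# run once built; the exact binary path depends on the local build type and
+# build directory layout:
+#   _build/debug/velox/connectors/hive/iceberg/tests/velox_hive_iceberg_test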
+add_library(velox_dwio_iceberg_reader_benchmark_lib + IcebergSplitReaderBenchmark.cpp) +target_link_libraries( + velox_dwio_iceberg_reader_benchmark_lib + velox_exec_test_lib + velox_exec + velox_hive_connector + Folly::folly + ${FOLLY_BENCHMARK} + ${TEST_LINK_LIBS}) + +add_executable(velox_dwio_iceberg_reader_benchmark + IcebergSplitReaderBenchmarkMain.cpp) +target_link_libraries( + velox_dwio_iceberg_reader_benchmark + velox_dwio_iceberg_reader_benchmark_lib + velox_dwio_dwrf_proto + velox_exec_test_lib + velox_exec + velox_hive_connector + Folly::folly + ${FOLLY_BENCHMARK} + ${TEST_LINK_LIBS}) + +if(NOT VELOX_DISABLE_GOOGLETEST) + + add_executable(velox_hive_iceberg_test IcebergReadTest.cpp + IcebergSplitReaderBenchmarkTest.cpp) + add_test(velox_hive_iceberg_test velox_hive_iceberg_test) + + target_link_libraries( + velox_hive_iceberg_test + velox_dwio_iceberg_reader_benchmark_lib + velox_hive_connector + velox_hive_iceberg_splitreader + velox_hive_partition_function + velox_dwio_common_exception + velox_dwio_common_test_utils + velox_dwio_dwrf_proto + velox_vector_test_lib + velox_exec + velox_exec_test_lib + Folly::folly + ${FOLLY_BENCHMARK} + GTest::gtest + GTest::gtest_main) + +endif() diff --git a/velox/connectors/hive/iceberg/tests/IcebergReadTest.cpp b/velox/connectors/hive/iceberg/tests/IcebergReadTest.cpp new file mode 100644 index 0000000000000..d79e21b733439 --- /dev/null +++ b/velox/connectors/hive/iceberg/tests/IcebergReadTest.cpp @@ -0,0 +1,662 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/file/FileSystems.h" +#include "velox/connectors/hive/HiveConnectorSplit.h" +#include "velox/connectors/hive/iceberg/IcebergDeleteFile.h" +#include "velox/connectors/hive/iceberg/IcebergMetadataColumns.h" +#include "velox/connectors/hive/iceberg/IcebergSplit.h" +#include "velox/exec/PlanNodeStats.h" +#include "velox/exec/tests/utils/HiveConnectorTestBase.h" +#include "velox/exec/tests/utils/PlanBuilder.h" + +#include + +using namespace facebook::velox::exec::test; +using namespace facebook::velox::exec; +using namespace facebook::velox::dwio::common; +using namespace facebook::velox::test; + +namespace facebook::velox::connector::hive::iceberg { + +class HiveIcebergTest : public HiveConnectorTestBase { + public: + HiveIcebergTest() + : config_{std::make_shared()} { + // Make the writers flush per batch so that we can create non-aligned + // RowGroups between the base data files and delete files + flushPolicyFactory_ = []() { + return std::make_unique([]() { return true; }); + }; + } + + /// Create 1 base data file data_file_1 with 2 RowGroups of 10000 rows each. + /// Also create 1 delete file delete_file_1 which contains delete positions + /// for data_file_1. 
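+  /// A sketch of a typical call (the delete positions are illustrative):
+  ///   assertSingleBaseFileSingleDeleteFile({0, 1, 2, 9999, 10000, 19999});
+  /// which deletes those row positions from data_file_1 and verifies the scan
+  /// results against DuckDB.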
+  void assertSingleBaseFileSingleDeleteFile(
+      const std::vector<int64_t>& deletePositionsVec) {
+    std::map<std::string, std::vector<int64_t>> rowGroupSizesForFiles = {
+        {"data_file_1", {10000, 10000}}};
+    std::unordered_map<
+        std::string,
+        std::multimap<std::string, std::vector<int64_t>>>
+        deleteFilesForBaseDatafiles = {
+            {"delete_file_1", {{"data_file_1", deletePositionsVec}}}};
+
+    assertPositionalDeletes(
+        rowGroupSizesForFiles, deleteFilesForBaseDatafiles, 0);
+  }
+
+  /// Create 3 base data files, where the first file data_file_0 has 500 rows,
+  /// the second file data_file_1 contains 2 RowGroups of 10000 rows each, and
+  /// the third file data_file_2 contains 500 rows. It creates 1 positional
+  /// delete file delete_file_1, which contains delete positions for
+  /// data_file_1.
+  void assertMultipleBaseFileSingleDeleteFile(
+      const std::vector<int64_t>& deletePositionsVec) {
+    int64_t previousFileRowCount = 500;
+    int64_t afterFileRowCount = 500;
+
+    assertPositionalDeletes(
+        {
+            {"data_file_0", {previousFileRowCount}},
+            {"data_file_1", {10000, 10000}},
+            {"data_file_2", {afterFileRowCount}},
+        },
+        {{"delete_file_1", {{"data_file_1", deletePositionsVec}}}},
+        0);
+  }
+
+  /// Create 1 base data file data_file_1 with 2 RowGroups of 10000 rows each.
+  /// Create multiple delete files named delete_file_0, delete_file_1, and so
+  /// on, each containing delete positions for data_file_1.
+  void assertSingleBaseFileMultipleDeleteFiles(
+      const std::vector<std::vector<int64_t>>& deletePositionsVecs) {
+    std::map<std::string, std::vector<int64_t>> rowGroupSizesForFiles = {
+        {"data_file_1", {10000, 10000}}};
+
+    std::unordered_map<
+        std::string,
+        std::multimap<std::string, std::vector<int64_t>>>
+        deleteFilesForBaseDatafiles;
+    for (int i = 0; i < deletePositionsVecs.size(); i++) {
+      std::string deleteFileName = fmt::format("delete_file_{}", i);
+      deleteFilesForBaseDatafiles[deleteFileName] = {
+          {"data_file_1", deletePositionsVecs[i]}};
+    }
+    assertPositionalDeletes(
+        rowGroupSizesForFiles, deleteFilesForBaseDatafiles, 0);
+  }
+
+  void assertMultipleSplits(
+      const std::vector<int64_t>& deletePositions,
+      int32_t splitCount,
+      int32_t numPrefetchSplits) {
+    std::map<std::string, std::vector<int64_t>> rowGroupSizesForFiles;
+    for (int32_t i = 0; i < splitCount; i++) {
+      std::string dataFileName = fmt::format("data_file_{}", i);
+      rowGroupSizesForFiles[dataFileName] = {rowCount};
+    }
+
+    std::unordered_map<
+        std::string,
+        std::multimap<std::string, std::vector<int64_t>>>
+        deleteFilesForBaseDatafiles;
+    for (int i = 0; i < splitCount; i++) {
+      std::string deleteFileName = fmt::format("delete_file_{}", i);
+      deleteFilesForBaseDatafiles[deleteFileName] = {
+          {fmt::format("data_file_{}", i), deletePositions}};
+    }
+
+    assertPositionalDeletes(
+        rowGroupSizesForFiles, deleteFilesForBaseDatafiles, numPrefetchSplits);
+  }
+
+  std::vector<int64_t> makeRandomIncreasingValues(int64_t begin, int64_t end) {
+    VELOX_CHECK(begin < end);
+
+    std::mt19937 gen{0};
+    std::vector<int64_t> values;
+    values.reserve(end - begin);
+    for (int i = begin; i < end; i++) {
+      if (folly::Random::rand32(0, 10, gen) > 8) {
+        values.push_back(i);
+      }
+    }
+    return values;
+  }
+
+  std::vector<int64_t> makeContinuousIncreasingValues(
+      int64_t begin,
+      int64_t end) {
+    std::vector<int64_t> values;
+    values.resize(end - begin);
+    std::iota(values.begin(), values.end(), begin);
+    return values;
+  }
+
+  /// @rowGroupSizesForFiles The key is the file name, and the value is a
+  /// vector of RowGroup sizes
+  /// @deleteFilesForBaseDatafiles The key is the delete file name, and the
+  /// value contains the information about the content of this delete file.
+  /// e.g. {
+  ///   "delete_file_1",
+  ///   {
+  ///     {"data_file_1", {1, 2, 3}},
+  ///     {"data_file_1", {4, 5, 6}},
+  ///     {"data_file_2", {0, 2, 4}}
+  ///   }
+  /// }
+  /// represents one delete file called delete_file_1, which contains delete
+  /// positions for data_file_1 and data_file_2. There are 3 RowGroups in this
+  /// delete file: the first two contain positions for data_file_1, and the
+  /// last contains positions for data_file_2.
+  void assertPositionalDeletes(
+      const std::map<std::string, std::vector<int64_t>>& rowGroupSizesForFiles,
+      const std::unordered_map<
+          std::string,
+          std::multimap<std::string, std::vector<int64_t>>>&
+          deleteFilesForBaseDatafiles,
+      int32_t numPrefetchSplits = 0) {
+    // Keep the reference to the deleteFilePath, otherwise the corresponding
+    // file will be deleted.
+    std::map<std::string, std::shared_ptr<TempFilePath>> dataFilePaths =
+        writeDataFiles(rowGroupSizesForFiles);
+    std::unordered_map<
+        std::string,
+        std::pair<int64_t, std::shared_ptr<TempFilePath>>>
+        deleteFilePaths = writePositionDeleteFiles(
+            deleteFilesForBaseDatafiles, dataFilePaths);
+
+    std::vector<std::shared_ptr<ConnectorSplit>> splits;
+
+    for (const auto& dataFile : dataFilePaths) {
+      std::string baseFileName = dataFile.first;
+      std::string baseFilePath = dataFile.second->getPath();
+
+      std::vector<IcebergDeleteFile> deleteFiles;
+
+      for (auto const& deleteFile : deleteFilesForBaseDatafiles) {
+        std::string deleteFileName = deleteFile.first;
+        std::multimap<std::string, std::vector<int64_t>> deleteFileContent =
+            deleteFile.second;
+
+        if (deleteFileContent.count(baseFileName) != 0) {
+          // If this delete file contains rows for the target base file, then
+          // add it to the split
+          auto deleteFilePath =
+              deleteFilePaths[deleteFileName].second->getPath();
+          IcebergDeleteFile deleteFile(
+              FileContent::kPositionalDeletes,
+              deleteFilePath,
+              fileFomat_,
+              deleteFilePaths[deleteFileName].first,
+              testing::internal::GetFileSize(
+                  std::fopen(deleteFilePath.c_str(), "r")));
+          deleteFiles.push_back(deleteFile);
+        }
+      }
+
+      splits.emplace_back(makeIcebergSplit(baseFilePath, deleteFiles));
+    }
+
+    std::string duckdbSql =
+        getDuckDBQuery(rowGroupSizesForFiles, deleteFilesForBaseDatafiles);
+    auto plan = tableScanNode();
+    auto task = HiveConnectorTestBase::assertQuery(
+        plan, splits, duckdbSql, numPrefetchSplits);
+
+    auto planStats = toPlanStats(task->taskStats());
+    auto scanNodeId = plan->id();
+    auto it = planStats.find(scanNodeId);
+    ASSERT_TRUE(it != planStats.end());
+    ASSERT_TRUE(it->second.peakMemoryBytes > 0);
+  }
+
+  const static int rowCount = 20000;
+
+ private:
+  std::map<std::string, std::shared_ptr<TempFilePath>> writeDataFiles(
+      std::map<std::string, std::vector<int64_t>> rowGroupSizesForFiles) {
+    std::map<std::string, std::shared_ptr<TempFilePath>> dataFilePaths;
+
+    std::vector<RowVectorPtr> dataVectorsJoined;
+    dataVectorsJoined.reserve(rowGroupSizesForFiles.size());
+
+    int64_t startingValue = 0;
+    for (auto& dataFile : rowGroupSizesForFiles) {
+      dataFilePaths[dataFile.first] = TempFilePath::create();
+
+      // We make the values continuously increasing even across base data
This is to make constructing DuckDB queries easier + std::vector dataVectors = + makeVectors(dataFile.second, startingValue); + writeToFile( + dataFilePaths[dataFile.first]->getPath(), + dataVectors, + config_, + flushPolicyFactory_); + + for (int i = 0; i < dataVectors.size(); i++) { + dataVectorsJoined.push_back(dataVectors[i]); + } + } + + createDuckDbTable(dataVectorsJoined); + return dataFilePaths; + } + + /// Input is like <"deleteFile1", <"dataFile1", {pos_RG1, pos_RG2,..}>, + /// <"dataFile2", {pos_RG1, pos_RG2,..}> + std::unordered_map< + std::string, + std::pair>> + writePositionDeleteFiles( + const std::unordered_map< + std::string, // delete file name + std::multimap< + std::string, + std::vector>>& + deleteFilesForBaseDatafiles, // + std::map> baseFilePaths) { + std::unordered_map< + std::string, + std::pair>> + deleteFilePaths; + deleteFilePaths.reserve(deleteFilesForBaseDatafiles.size()); + + for (auto& deleteFile : deleteFilesForBaseDatafiles) { + auto deleteFileName = deleteFile.first; + auto deleteFileContent = deleteFile.second; + auto deleteFilePath = TempFilePath::create(); + + std::vector deleteFileVectors; + int64_t totalPositionsInDeleteFile = 0; + + for (auto& deleteFileRowGroup : deleteFileContent) { + auto baseFileName = deleteFileRowGroup.first; + auto baseFilePath = baseFilePaths[baseFileName]->getPath(); + auto positionsInRowGroup = deleteFileRowGroup.second; + + auto filePathVector = makeFlatVector( + static_cast(positionsInRowGroup.size()), + [&](vector_size_t row) { return baseFilePath; }); + auto deletePosVector = makeFlatVector(positionsInRowGroup); + + RowVectorPtr deleteFileVector = makeRowVector( + {pathColumn_->name, posColumn_->name}, + {filePathVector, deletePosVector}); + + deleteFileVectors.push_back(deleteFileVector); + totalPositionsInDeleteFile += positionsInRowGroup.size(); + } + + writeToFile( + deleteFilePath->getPath(), + deleteFileVectors, + config_, + flushPolicyFactory_); + + deleteFilePaths[deleteFileName] = + std::make_pair(totalPositionsInDeleteFile, deleteFilePath); + } + + return deleteFilePaths; + } + + std::vector makeVectors( + std::vector vectorSizes, + int64_t& startingValue) { + std::vector vectors; + vectors.reserve(vectorSizes.size()); + + vectors.reserve(vectorSizes.size()); + for (int j = 0; j < vectorSizes.size(); j++) { + auto data = makeContinuousIncreasingValues( + startingValue, startingValue + vectorSizes[j]); + VectorPtr c0 = vectorMaker_.flatVector(data); + vectors.push_back(makeRowVector({"c0"}, {c0})); + startingValue += vectorSizes[j]; + } + + return vectors; + } + + std::shared_ptr makeIcebergSplit( + const std::string& dataFilePath, + const std::vector& deleteFiles = {}) { + std::unordered_map> partitionKeys; + std::unordered_map customSplitInfo; + customSplitInfo["table_format"] = "hive-iceberg"; + + auto file = filesystems::getFileSystem(dataFilePath, nullptr) + ->openFileForRead(dataFilePath); + const int64_t fileSize = file->size(); + + return std::make_shared( + kHiveConnectorId, + dataFilePath, + fileFomat_, + 0, + fileSize, + partitionKeys, + std::nullopt, + customSplitInfo, + nullptr, + deleteFiles); + } + + std::string getDuckDBQuery( + const std::map>& rowGroupSizesForFiles, + const std::unordered_map< + std::string, + std::multimap>>& + deleteFilesForBaseDatafiles) { + int64_t totalNumRowsInAllBaseFiles = 0; + std::map baseFileSizes; + for (auto rowGroupSizesInFile : rowGroupSizesForFiles) { + // Sum up the row counts in all RowGroups in each base file + baseFileSizes[rowGroupSizesInFile.first] += 
std::accumulate( + rowGroupSizesInFile.second.begin(), + rowGroupSizesInFile.second.end(), + 0LL); + totalNumRowsInAllBaseFiles += baseFileSizes[rowGroupSizesInFile.first]; + } + + // Group the delete vectors by baseFileName + std::map>> + deletePosVectorsForAllBaseFiles; + for (auto deleteFile : deleteFilesForBaseDatafiles) { + auto deleteFileContent = deleteFile.second; + for (auto rowGroup : deleteFileContent) { + auto baseFileName = rowGroup.first; + deletePosVectorsForAllBaseFiles[baseFileName].push_back( + rowGroup.second); + } + } + + // Flatten and deduplicate the delete position vectors in + // deletePosVectorsForAllBaseFiles from previous step, and count the total + // number of distinct delete positions for all base files + std::map> + flattenedDeletePosVectorsForAllBaseFiles; + int64_t totalNumDeletePositions = 0; + for (auto deleteVectorsForBaseFile : deletePosVectorsForAllBaseFiles) { + auto baseFileName = deleteVectorsForBaseFile.first; + auto deletePositionVectors = deleteVectorsForBaseFile.second; + std::vector deletePositionVector = + flattenAndDedup(deletePositionVectors, baseFileSizes[baseFileName]); + flattenedDeletePosVectorsForAllBaseFiles[baseFileName] = + deletePositionVector; + totalNumDeletePositions += deletePositionVector.size(); + } + + // Now build the DuckDB queries + if (totalNumDeletePositions == 0) { + return "SELECT * FROM tmp"; + } else if (totalNumDeletePositions >= totalNumRowsInAllBaseFiles) { + return "SELECT * FROM tmp WHERE 1 = 0"; + } else { + // Convert the delete positions in all base files into column values + std::vector allDeleteValues; + + int64_t numRowsInPreviousBaseFiles = 0; + for (auto baseFileSize : baseFileSizes) { + auto deletePositions = + flattenedDeletePosVectorsForAllBaseFiles[baseFileSize.first]; + + if (numRowsInPreviousBaseFiles > 0) { + for (int64_t& deleteValue : deletePositions) { + deleteValue += numRowsInPreviousBaseFiles; + } + } + + allDeleteValues.insert( + allDeleteValues.end(), + deletePositions.begin(), + deletePositions.end()); + + numRowsInPreviousBaseFiles += baseFileSize.second; + } + + return fmt::format( + "SELECT * FROM tmp WHERE c0 NOT IN ({})", + makeNotInList(allDeleteValues)); + } + } + + std::vector flattenAndDedup( + const std::vector>& deletePositionVectors, + int64_t baseFileSize) { + std::vector deletePositionVector; + for (auto vec : deletePositionVectors) { + for (auto pos : vec) { + if (pos >= 0 && pos < baseFileSize) { + deletePositionVector.push_back(pos); + } + } + } + + std::sort(deletePositionVector.begin(), deletePositionVector.end()); + auto last = + std::unique(deletePositionVector.begin(), deletePositionVector.end()); + deletePositionVector.erase(last, deletePositionVector.end()); + + return deletePositionVector; + } + + std::string makeNotInList(const std::vector& deletePositionVector) { + if (deletePositionVector.empty()) { + return ""; + } + + return std::accumulate( + deletePositionVector.begin() + 1, + deletePositionVector.end(), + std::to_string(deletePositionVector[0]), + [](const std::string& a, int64_t b) { + return a + ", " + std::to_string(b); + }); + } + + core::PlanNodePtr tableScanNode() { + return PlanBuilder(pool_.get()).tableScan(rowType_).planNode(); + } + + dwio::common::FileFormat fileFomat_{dwio::common::FileFormat::DWRF}; + std::shared_ptr config_; + std::function()> flushPolicyFactory_; + + RowTypePtr rowType_{ROW({"c0"}, {BIGINT()})}; + std::shared_ptr pathColumn_ = + IcebergMetadataColumn::icebergDeleteFilePathColumn(); + std::shared_ptr posColumn_ = + 
IcebergMetadataColumn::icebergDeletePosColumn();
+};
+
+/// This test creates one single data file and one delete file. The parameter
+/// passed to assertSingleBaseFileSingleDeleteFile is the delete positions.
+TEST_F(HiveIcebergTest, singleBaseFileSinglePositionalDeleteFile) {
+  folly::SingletonVault::singleton()->registrationComplete();
+
+  assertSingleBaseFileSingleDeleteFile({{0, 1, 2, 3}});
+  // Delete the first and last row in each batch (10000 rows per batch)
+  assertSingleBaseFileSingleDeleteFile({{0, 9999, 10000, 19999}});
+  // Delete several rows in the second batch (10000 rows per batch)
+  assertSingleBaseFileSingleDeleteFile({{10000, 10002, 19999}});
+  // Delete random rows
+  assertSingleBaseFileSingleDeleteFile({makeRandomIncreasingValues(0, 20000)});
+  // Delete 0 rows
+  assertSingleBaseFileSingleDeleteFile({});
+  // Delete all rows
+  assertSingleBaseFileSingleDeleteFile(
+      {makeContinuousIncreasingValues(0, 20000)});
+  // Delete rows that don't exist
+  assertSingleBaseFileSingleDeleteFile({{20000, 29999}});
+}
+
+/// This test creates 3 base data files; only the middle one has corresponding
+/// delete positions. The parameter passed to
+/// assertMultipleBaseFileSingleDeleteFile is the delete positions for the
+/// middle base file.
+TEST_F(HiveIcebergTest, MultipleBaseFilesSinglePositionalDeleteFile) {
+  folly::SingletonVault::singleton()->registrationComplete();
+
+  assertMultipleBaseFileSingleDeleteFile({0, 1, 2, 3});
+  assertMultipleBaseFileSingleDeleteFile({0, 9999, 10000, 19999});
+  assertMultipleBaseFileSingleDeleteFile({10000, 10002, 19999});
+  assertMultipleBaseFileSingleDeleteFile(
+      makeRandomIncreasingValues(0, rowCount));
+  assertMultipleBaseFileSingleDeleteFile({});
+  assertMultipleBaseFileSingleDeleteFile(
+      makeContinuousIncreasingValues(0, rowCount));
+}
+
+/// This test creates one base data file/split with multiple delete files. The
+/// parameter passed to assertSingleBaseFileMultipleDeleteFiles is the vector
+/// of delete files. Each leaf vector represents the delete positions in that
+/// delete file.
+TEST_F(HiveIcebergTest, singleBaseFileMultiplePositionalDeleteFiles) {
+  folly::SingletonVault::singleton()->registrationComplete();
+
+  // Delete rows 1, 2, 3, 4, one per delete file, from the first batch out of
+  // two.
+  assertSingleBaseFileMultipleDeleteFiles({{1}, {2}, {3}, {4}});
+  // Delete the first and last row in each batch (10000 rows per batch).
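+  // That is, four delete files, each holding a single position: {0}, {9999},
+  // {10000} and {19999}.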
+ assertSingleBaseFileMultipleDeleteFiles({{0}, {9999}, {10000}, {19999}}); + + assertSingleBaseFileMultipleDeleteFiles({{500, 21000}}); + + assertSingleBaseFileMultipleDeleteFiles( + {makeRandomIncreasingValues(0, 10000), + makeRandomIncreasingValues(10000, 20000), + makeRandomIncreasingValues(5000, 15000)}); + + assertSingleBaseFileMultipleDeleteFiles( + {makeContinuousIncreasingValues(0, 10000), + makeContinuousIncreasingValues(10000, 20000)}); + + assertSingleBaseFileMultipleDeleteFiles( + {makeContinuousIncreasingValues(0, 10000), + makeContinuousIncreasingValues(10000, 20000), + makeRandomIncreasingValues(5000, 15000)}); + + assertSingleBaseFileMultipleDeleteFiles( + {makeContinuousIncreasingValues(0, 20000), + makeContinuousIncreasingValues(0, 20000)}); + + assertSingleBaseFileMultipleDeleteFiles( + {makeRandomIncreasingValues(0, 20000), + {}, + makeRandomIncreasingValues(5000, 15000)}); + + assertSingleBaseFileMultipleDeleteFiles({{}, {}}); +} + +/// This test creates 2 base data files, and 1 or 2 delete files, with unaligned +/// RowGroup boundaries +TEST_F(HiveIcebergTest, multipleBaseFileMultiplePositionalDeleteFiles) { + folly::SingletonVault::singleton()->registrationComplete(); + + std::map> rowGroupSizesForFiles; + std::unordered_map< + std::string, + std::multimap>> + deleteFilesForBaseDatafiles; + + // Create two data files, each with two RowGroups + rowGroupSizesForFiles["data_file_1"] = {100, 85}; + rowGroupSizesForFiles["data_file_2"] = {99, 1}; + + // Delete 3 rows from the first RowGroup in data_file_1 + deleteFilesForBaseDatafiles["delete_file_1"] = {{"data_file_1", {0, 1, 99}}}; + assertPositionalDeletes(rowGroupSizesForFiles, deleteFilesForBaseDatafiles); + + // Delete 3 rows from the second RowGroup in data_file_1 + deleteFilesForBaseDatafiles["delete_file_1"] = { + {"data_file_1", {100, 101, 184}}}; + assertPositionalDeletes(rowGroupSizesForFiles, deleteFilesForBaseDatafiles); + + // Delete random rows from the both RowGroups in data_file_1 + deleteFilesForBaseDatafiles["delete_file_1"] = { + {"data_file_1", makeRandomIncreasingValues(0, 185)}}; + assertPositionalDeletes(rowGroupSizesForFiles, deleteFilesForBaseDatafiles); + + // Delete all rows in data_file_1 + deleteFilesForBaseDatafiles["delete_file_1"] = { + {"data_file_1", makeContinuousIncreasingValues(0, 185)}}; + assertPositionalDeletes(rowGroupSizesForFiles, deleteFilesForBaseDatafiles); + // + // Delete non-existent rows from data_file_1 + deleteFilesForBaseDatafiles["delete_file_1"] = { + {"data_file_1", makeRandomIncreasingValues(186, 300)}}; + assertPositionalDeletes(rowGroupSizesForFiles, deleteFilesForBaseDatafiles); + + // Delete several rows from both RowGroups in both data files + deleteFilesForBaseDatafiles.clear(); + deleteFilesForBaseDatafiles["delete_file_1"] = { + {"data_file_1", {0, 100, 102, 184}}, {"data_file_2", {1, 98, 99}}}; + assertPositionalDeletes(rowGroupSizesForFiles, deleteFilesForBaseDatafiles); + + // The delete file delete_file_1 contains 3 RowGroups itself, with the first 3 + // deleting some repeating rows in data_file_1, and the last 2 RowGroups + // deleting some repeating rows in data_file_2 + deleteFilesForBaseDatafiles.clear(); + deleteFilesForBaseDatafiles["delete_file_1"] = { + {"data_file_1", {0, 1, 2, 3}}, + {"data_file_1", {1, 2, 3, 4}}, + {"data_file_1", makeRandomIncreasingValues(0, 185)}, + {"data_file_2", {1, 3, 5, 7}}, + {"data_file_2", makeRandomIncreasingValues(0, 100)}}; + assertPositionalDeletes(rowGroupSizesForFiles, deleteFilesForBaseDatafiles); + + // 
delete_file_2 contains non-overlapping delete rows for each data files in + // each RowGroup + deleteFilesForBaseDatafiles.clear(); + deleteFilesForBaseDatafiles["delete_file_1"] = { + {"data_file_1", {0, 1, 2, 3}}, {"data_file_2", {1, 3, 5, 7}}}; + deleteFilesForBaseDatafiles["delete_file_2"] = { + {"data_file_1", {1, 2, 3, 4}}, + {"data_file_1", {98, 99, 100, 101, 184}}, + {"data_file_2", {3, 5, 7, 9}}, + {"data_file_2", {98, 99, 100}}}; + assertPositionalDeletes(rowGroupSizesForFiles, deleteFilesForBaseDatafiles); + + // Two delete files each containing overlapping delete rows for both data + // files + deleteFilesForBaseDatafiles.clear(); + deleteFilesForBaseDatafiles["delete_file_1"] = { + {"data_file_1", makeRandomIncreasingValues(0, 185)}, + {"data_file_2", makeRandomIncreasingValues(0, 100)}}; + deleteFilesForBaseDatafiles["delete_file_2"] = { + {"data_file_1", makeRandomIncreasingValues(10, 120)}, + {"data_file_2", makeRandomIncreasingValues(50, 100)}}; + assertPositionalDeletes(rowGroupSizesForFiles, deleteFilesForBaseDatafiles); +} + +TEST_F(HiveIcebergTest, positionalDeletesMultipleSplits) { + folly::SingletonVault::singleton()->registrationComplete(); + + assertMultipleSplits({1, 2, 3, 4}, 10, 5); + assertMultipleSplits({1, 2, 3, 4}, 10, 0); + assertMultipleSplits({1, 2, 3, 4}, 10, 10); + assertMultipleSplits({0, 9999, 10000, 19999}, 10, 3); + assertMultipleSplits(makeRandomIncreasingValues(0, 20000), 10, 3); + assertMultipleSplits(makeContinuousIncreasingValues(0, 20000), 10, 3); + assertMultipleSplits({}, 10, 3); +} + +} // namespace facebook::velox::connector::hive::iceberg diff --git a/velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmark.cpp b/velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmark.cpp new file mode 100644 index 0000000000000..e0b2a6c31f853 --- /dev/null +++ b/velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmark.cpp @@ -0,0 +1,390 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmark.h" +#include +#include "velox/exec/tests/utils/PrefixSortUtils.h" + +using namespace facebook::velox; +using namespace facebook::velox::dwio; +using namespace facebook::velox::dwio::common; +using namespace facebook::velox::dwrf; +using namespace facebook::velox::connector::hive; +using namespace facebook::velox::connector::hive::iceberg; +using namespace facebook::velox::memory; + +namespace facebook::velox::iceberg::reader::test { +void IcebergSplitReaderBenchmark::writeToFile( + const std::vector& batches) { + auto path = fileFolder_->getPath() + "/" + fileName_; + auto localWriteFile = std::make_unique(path, true, false); + auto sink = std::make_unique(std::move(localWriteFile), path); + dwrf::WriterOptions options; + options.memoryPool = rootPool_.get(); + options.schema = batches[0]->type(); + dwrf::Writer dataFilewriter{std::move(sink), options}; + for (auto& batch : batches) { + dataFilewriter.write(batch); + } + dataFilewriter.flush(); + dataFilewriter.close(); +} + +void IcebergSplitReaderBenchmark::writeToPositionDeleteFile( + const std::string& filePath, + const std::vector& vectors) { + auto localPosWriteFile = + std::make_unique(filePath, true, false); + auto posDeletesink = + std::make_unique(std::move(localPosWriteFile), filePath); + dwrf::WriterOptions options; + options.memoryPool = rootPool_.get(); + options.schema = vectors[0]->type(); + dwrf::Writer posDeletewriter{std::move(posDeletesink), options}; + for (size_t i = 0; i < vectors.size(); ++i) { + posDeletewriter.write(vectors[i]); + } + posDeletewriter.close(); +} + +std::vector IcebergSplitReaderBenchmark::makeRandomDeleteRows( + int32_t deleteRowsCount) { + std::random_device rd; + std::mt19937 gen(rd()); + int64_t totalDataRows = kNumBatches * kNumRowsPerBatch; + std::uniform_int_distribution<> dis(0, totalDataRows - 1); + std::set uniqueDeleteRows; + while (uniqueDeleteRows.size() < deleteRowsCount) { + uniqueDeleteRows.insert(dis(gen)); + } + std::vector deleteRows( + uniqueDeleteRows.begin(), uniqueDeleteRows.end()); + return deleteRows; +} + +std::vector IcebergSplitReaderBenchmark::makeSequenceRows( + int32_t maxRowNumber) { + std::vector deleteRows; + deleteRows.resize(maxRowNumber); + std::iota(deleteRows.begin(), deleteRows.end(), 0); + return deleteRows; +} + +std::vector IcebergSplitReaderBenchmark::listFiles( + const std::string& dirPath) { + std::vector files; + for (auto& dirEntry : + std::filesystem::recursive_directory_iterator(dirPath)) { + if (dirEntry.is_regular_file()) { + files.push_back(dirEntry.path().string()); + } + } + return files; +} + +std::shared_ptr +IcebergSplitReaderBenchmark::makeIcebergSplit( + const std::string& dataFilePath, + const std::vector& deleteFiles) { + std::unordered_map> partitionKeys; + std::unordered_map customSplitInfo; + customSplitInfo["table_format"] = "hive-iceberg"; + + auto readFile = std::make_shared(dataFilePath); + const int64_t fileSize = readFile->size(); + + return std::make_shared( + kHiveConnectorId, + dataFilePath, + fileFomat_, + 0, + fileSize, + partitionKeys, + std::nullopt, + customSplitInfo, + nullptr, + deleteFiles); +} + +std::string IcebergSplitReaderBenchmark::writePositionDeleteFile( + const std::string& dataFilePath, + int64_t numDeleteRows) { + facebook::velox::test::VectorMaker vectorMaker{leafPool_.get()}; + auto filePathVector = + vectorMaker.flatVector(numDeleteRows, [&](auto row) { + if (row < numDeleteRows) { + return StringView(dataFilePath); + } 
else { + return StringView(); + } + }); + + std::vector deleteRowsVec; + deleteRowsVec.reserve(numDeleteRows); + auto deleteRows = makeRandomDeleteRows(numDeleteRows); + deleteRowsVec.insert( + deleteRowsVec.end(), deleteRows.begin(), deleteRows.end()); + + auto deletePositionsVector = vectorMaker.flatVector(deleteRowsVec); + + std::shared_ptr pathColumn = + IcebergMetadataColumn::icebergDeleteFilePathColumn(); + std::shared_ptr posColumn = + IcebergMetadataColumn::icebergDeletePosColumn(); + RowVectorPtr deleteFileVectors = vectorMaker.rowVector( + {pathColumn->name, posColumn->name}, + {filePathVector, deletePositionsVector}); + + auto deleteFilePath = deleteFileFolder_->getPath() + "/" + "posDelete.data"; + writeToPositionDeleteFile(deleteFilePath, std::vector{deleteFileVectors}); + + return deleteFilePath; +} + +std::vector> +IcebergSplitReaderBenchmark::createIcebergSplitsWithPositionalDelete( + int32_t deleteRowsPercentage, + int32_t deleteFilesCount) { + std::vector> splits; + + std::vector deleteFilePaths; + std::vector dataFilePaths = listFiles(fileFolder_->getPath()); + + for (const auto& dataFilePath : dataFilePaths) { + std::vector deleteFiles; + int64_t deleteRowsCount = + kNumBatches * kNumRowsPerBatch * deleteRowsPercentage * 0.01; + deleteFiles.reserve(deleteRowsCount); + for (int i = 0; i < deleteFilesCount; i++) { + std::string deleteFilePath = + writePositionDeleteFile(dataFilePath, deleteRowsCount); + + IcebergDeleteFile deleteFile( + FileContent::kPositionalDeletes, + deleteFilePath, + fileFomat_, + deleteRowsCount, + testing::internal::GetFileSize( + std::fopen(deleteFilePath.c_str(), "r"))); + deleteFilePaths.emplace_back(deleteFilePath); + deleteFiles.emplace_back(deleteFile); + } + splits.emplace_back(makeIcebergSplit(dataFilePath, deleteFiles)); + } + return splits; +} + +FilterSpec IcebergSplitReaderBenchmark::createFilterSpec( + const std::string& columnName, + float startPct, + float selectPct, + const TypePtr& type, + bool isForRowGroupSkip, + bool allowNulls) { + switch (type->childAt(0)->kind()) { + case TypeKind::BIGINT: + case TypeKind::INTEGER: + return FilterSpec( + columnName, + startPct, + selectPct, + FilterKind::kBigintRange, + isForRowGroupSkip, + allowNulls); + default: + VELOX_FAIL("Unsupported Data Type {}", type->childAt(0)->toString()); + } + return FilterSpec(columnName, startPct, selectPct, FilterKind(), false); +} + +std::shared_ptr IcebergSplitReaderBenchmark::createScanSpec( + const std::vector& batches, + RowTypePtr& rowType, + const std::vector& filterSpecs, + std::vector& hitRows, + std::unordered_map>& filters) { + std::unique_ptr filterGenerator = + std::make_unique(rowType, 0); + filters = filterGenerator->makeSubfieldFilters( + filterSpecs, batches, nullptr, hitRows); + auto scanSpec = filterGenerator->makeScanSpec(std::move(filters)); + return scanSpec; +} + +// This method is the place where we do the read operations using +// icebergSplitReader. scanSpec contains the setting of filters. e.g. +// filterRateX100 = 30 means it would filter out 70% of rows and 30% remain. +// deleteRateX100 = 30 means it would delete 30% of overall data rows and 70% +// remain. Return the number of rows after the filter and delete. 
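+// For example, with the constants in IcebergSplitReaderBenchmark.h
+// (kNumBatches = 50, kNumRowsPerBatch = 20000, i.e. 1,000,000 data rows),
+// filterRateX100 = 30 and deleteRateX100 = 20 leave roughly
+// 1,000,000 * 0.30 * 0.80 = 240,000 rows.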
+int IcebergSplitReaderBenchmark::read( + const RowTypePtr& rowType, + uint32_t nextSize, + std::unique_ptr icebergSplitReader) { + runtimeStats_ = RuntimeStatistics(); + icebergSplitReader->resetFilterCaches(); + int resultSize = 0; + auto result = BaseVector::create(rowType, 0, leafPool_.get()); + while (true) { + bool hasData = icebergSplitReader->next(nextSize, result); + if (!hasData) { + break; + } + auto rowsRemaining = result->size(); + resultSize += rowsRemaining; + } + icebergSplitReader->updateRuntimeStats(runtimeStats_); + return resultSize; +} + +void IcebergSplitReaderBenchmark::readSingleColumn( + const std::string& columnName, + const TypePtr& type, + float startPct, + float selectPct, + float deletePct, + uint32_t nextSize) { + folly::BenchmarkSuspender suspender; + auto rowType = ROW({columnName}, {type}); + + auto batches = + dataSetBuilder_->makeDataset(rowType, kNumBatches, kNumRowsPerBatch) + .withRowGroupSpecificData(kNumRowsPerRowGroup) + .withNullsForField(Subfield(columnName), 0) + .build(); + writeToFile(*batches); + std::vector filterSpecs; + + filterSpecs.emplace_back( + createFilterSpec(columnName, startPct, selectPct, rowType, false, false)); + + std::vector hitRows; + std::unordered_map> filters; + auto scanSpec = + createScanSpec(*batches, rowType, filterSpecs, hitRows, filters); + + std::vector> splits = + createIcebergSplitsWithPositionalDelete(deletePct, 1); + + core::TypedExprPtr remainingFilterExpr; + + std::shared_ptr hiveTableHandle = + std::make_shared( + "kHiveConnectorId", + "tableName", + false, + std::move(filters), + remainingFilterExpr, + rowType); + + std::shared_ptr hiveConfig = + std::make_shared(std::make_shared( + std::unordered_map(), true)); + const RowTypePtr readerOutputType; + const std::shared_ptr ioStats = + std::make_shared(); + + std::shared_ptr root = + memory::memoryManager()->addRootPool( + "IcebergSplitReader", kMaxMemory, MemoryReclaimer::create()); + std::shared_ptr opPool = root->addLeafChild("operator"); + std::shared_ptr connectorPool = + root->addAggregateChild(kHiveConnectorId, MemoryReclaimer::create()); + std::shared_ptr connectorSessionProperties_ = + std::make_shared( + std::unordered_map()); + + std::unique_ptr connectorQueryCtx_ = + std::make_unique( + opPool.get(), + connectorPool.get(), + connectorSessionProperties_.get(), + nullptr, + exec::test::defaultPrefixSortConfig(), + nullptr, + nullptr, + "query.IcebergSplitReader", + "task.IcebergSplitReader", + "planNodeId.IcebergSplitReader", + 0, + ""); + + FileHandleFactory fileHandleFactory( + std::make_unique>( + hiveConfig->numCacheFileHandles()), + std::make_unique(connectorSessionProperties_)); + + suspender.dismiss(); + + uint64_t resultSize = 0; + for (std::shared_ptr split : splits) { + scanSpec->resetCachedValues(true); + std::unique_ptr icebergSplitReader = + std::make_unique( + split, + hiveTableHandle, + nullptr, + connectorQueryCtx_.get(), + hiveConfig, + rowType, + ioStats, + &fileHandleFactory, + nullptr, + scanSpec); + + std::shared_ptr randomSkip; + icebergSplitReader->configureReaderOptions(randomSkip); + icebergSplitReader->prepareSplit(nullptr, runtimeStats_, nullptr); + + // Filter range is generated from a small sample data of 4096 rows. So the + // upperBound and lowerBound are introduced to estimate the result size. + resultSize += read(rowType, nextSize, std::move(icebergSplitReader)); + } + // Calculate the expected number of rows after the filters. + // Add one to expected to avoid 0 in calculating upperBound and lowerBound. 
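+  // Continuing the example above: expected = 1,000,000 * 0.30 * 0.80 + 1 =
+  // 240,001, and with kFilterErrorMargin = 0.2 the accepted window computed
+  // below is lowerBound = 191,999 to upperBound = 288,002.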
+ int expected = kNumBatches * kNumRowsPerBatch * ((double)selectPct / 100) * + (1 - (double)deletePct / 100) + + 1; + + // Make the upperBound and lowerBound large enough to avoid very small + // resultSize and expected size, where the diff ratio is relatively very + // large. + int upperBound = expected * (1 + kFilterErrorMargin) + 1; + int lowerBound = expected * (1 - kFilterErrorMargin) - 1; + upperBound = std::max(16, upperBound); + lowerBound = std::max(0, lowerBound); + + VELOX_CHECK( + resultSize <= upperBound && resultSize >= lowerBound, + "Result Size {} and Expected Size {} Mismatch", + resultSize, + expected); +} + +void run( + uint32_t, + const std::string& columnName, + const TypePtr& type, + float filterRateX100, + float deleteRateX100, + uint32_t nextSize) { + RowTypePtr rowType = ROW({columnName}, {type}); + IcebergSplitReaderBenchmark benchmark; + BIGINT()->toString(); + benchmark.readSingleColumn( + columnName, type, 0, filterRateX100, deleteRateX100, nextSize); +} + +} // namespace facebook::velox::iceberg::reader::test diff --git a/velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmark.h b/velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmark.h new file mode 100644 index 0000000000000..37b057e72dfa7 --- /dev/null +++ b/velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmark.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "velox/common/file/FileSystems.h" +#include "velox/connectors/hive/TableHandle.h" +#include "velox/connectors/hive/iceberg/IcebergDeleteFile.h" +#include "velox/connectors/hive/iceberg/IcebergMetadataColumns.h" +#include "velox/connectors/hive/iceberg/IcebergSplit.h" +#include "velox/connectors/hive/iceberg/IcebergSplitReader.h" +#include "velox/dwio/common/tests/utils/DataSetBuilder.h" +#include "velox/dwio/dwrf/writer/Writer.h" +#include "velox/exec/tests/utils/TempDirectoryPath.h" +#include "velox/vector/tests/utils/VectorTestBase.h" + +#include +#include + +namespace facebook::velox::iceberg::reader::test { + +constexpr uint32_t kNumRowsPerBatch = 20000; +constexpr uint32_t kNumBatches = 50; +constexpr uint32_t kNumRowsPerRowGroup = 10000; +constexpr double kFilterErrorMargin = 0.2; + +class IcebergSplitReaderBenchmark { + public: + explicit IcebergSplitReaderBenchmark() { + rootPool_ = + memory::memoryManager()->addRootPool("IcebergSplitReaderBenchmark"); + leafPool_ = rootPool_->addLeafChild("IcebergSplitReaderBenchmark"); + dataSetBuilder_ = + std::make_unique(*leafPool_, 0); + filesystems::registerLocalFileSystem(); + } + + ~IcebergSplitReaderBenchmark() {} + + void writeToFile(const std::vector& batches); + + void writeToPositionDeleteFile( + const std::string& filePath, + const std::vector& vectors); + + dwio::common::FilterSpec createFilterSpec( + const std::string& columnName, + float startPct, + float selectPct, + const TypePtr& type, + bool isForRowGroupSkip, + bool allowNulls); + + std::shared_ptr createScanSpec( + const std::vector& batches, + RowTypePtr& rowType, + const std::vector& filterSpecs, + std::vector& hitRows, + std::unordered_map< + facebook::velox::common::Subfield, + std::unique_ptr>& filters); + + int read( + const RowTypePtr& rowType, + uint32_t nextSize, + std::unique_ptr + icebergSplitReader); + + void readSingleColumn( + const std::string& columnName, + const TypePtr& type, + float startPct, + float selectPct, + float deleteRate, + uint32_t nextSize); + + std::vector> + createIcebergSplitsWithPositionalDelete( + int32_t deleteRowsPercentage, + int32_t deleteFilesCount); + + std::vector listFiles(const std::string& dirPath); + + std::shared_ptr makeIcebergSplit( + const std::string& dataFilePath, + const std::vector& + deleteFiles = {}); + + std::vector makeRandomDeleteRows(int32_t deleteRowsCount); + + std::vector makeSequenceRows(int32_t maxRowNumber); + + std::string writePositionDeleteFile( + const std::string& dataFilePath, + int64_t numDeleteRows); + + private: + const std::string fileName_ = "test.data"; + const std::shared_ptr fileFolder_ = + exec::test::TempDirectoryPath::create(); + const std::shared_ptr deleteFileFolder_ = + exec::test::TempDirectoryPath::create(); + + std::unique_ptr dataSetBuilder_; + std::shared_ptr rootPool_; + std::shared_ptr leafPool_; + std::unique_ptr writer_; + dwio::common::RuntimeStatistics runtimeStats_; + + dwio::common::FileFormat fileFomat_{dwio::common::FileFormat::DWRF}; + const std::string kHiveConnectorId = "hive-iceberg"; +}; + +void run( + uint32_t, + const std::string& columnName, + const TypePtr& type, + float filterRateX100, + float deleteRateX100, + uint32_t nextSize); + +} // namespace facebook::velox::iceberg::reader::test diff --git a/velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmarkMain.cpp b/velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmarkMain.cpp new file mode 100644 index 0000000000000..2efb3700bd233 --- /dev/null +++ 
b/velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmarkMain.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmark.h" + +using namespace facebook::velox; +using namespace facebook::velox::dwio; +using namespace facebook::velox::dwio::common; +using namespace facebook::velox::dwrf; +using namespace facebook::velox::iceberg::reader::test; +using namespace facebook::velox::test; + +#define PARQUET_BENCHMARKS_FILTER_DELETES(_type_, _name_, _filter_, _deletes_) \ + BENCHMARK_NAMED_PARAM( \ + run, \ + _name_##_Filter_##_filter_##_Delete_##_deletes_##_next_5k, \ + #_name_, \ + _type_, \ + _filter_, \ + _deletes_, \ + 5000); \ + BENCHMARK_NAMED_PARAM( \ + run, \ + _name_##_Filter_##_filter_##_Delete_##_deletes_##_next_10k, \ + #_name_, \ + _type_, \ + _filter_, \ + _deletes_, \ + 10000); \ + BENCHMARK_DRAW_LINE(); + +#define PARQUET_BENCHMARKS_FILTERS(_type_, _name_, _filter_) \ + PARQUET_BENCHMARKS_FILTER_DELETES(_type_, _name_, _filter_, 0) \ + PARQUET_BENCHMARKS_FILTER_DELETES(_type_, _name_, _filter_, 20) \ + PARQUET_BENCHMARKS_FILTER_DELETES(_type_, _name_, _filter_, 50) \ + PARQUET_BENCHMARKS_FILTER_DELETES(_type_, _name_, _filter_, 70) \ + PARQUET_BENCHMARKS_FILTER_DELETES(_type_, _name_, _filter_, 100) + +#define PARQUET_BENCHMARKS(_type_, _name_) \ + PARQUET_BENCHMARKS_FILTERS(_type_, _name_, 0) \ + PARQUET_BENCHMARKS_FILTERS(_type_, _name_, 20) \ + PARQUET_BENCHMARKS_FILTERS(_type_, _name_, 50) \ + PARQUET_BENCHMARKS_FILTERS(_type_, _name_, 70) \ + PARQUET_BENCHMARKS_FILTERS(_type_, _name_, 100) \ + BENCHMARK_DRAW_LINE(); + +PARQUET_BENCHMARKS(BIGINT(), BigInt); + +int main(int argc, char** argv) { + folly::Init init{&argc, &argv}; + memory::MemoryManager::initialize({}); + folly::runBenchmarks(); + return 0; +} diff --git a/velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmarkTest.cpp b/velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmarkTest.cpp new file mode 100644 index 0000000000000..d8d0f99fe8b89 --- /dev/null +++ b/velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmarkTest.cpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmark.h"
+#include <gtest/gtest.h>
+
+namespace facebook::velox::iceberg::reader::test {
+namespace {
+TEST(IcebergSplitReaderBenchmarkTest, basic) {
+  memory::MemoryManager::testingSetInstance({});
+  run(1, "BigInt", BIGINT(), 20, 0, 500);
+  run(1, "BigInt", BIGINT(), 50, 20, 500);
+  run(1, "BigInt", BIGINT(), 100, 20, 500);
+  run(1, "BigInt", BIGINT(), 100, 100, 500);
+}
+} // namespace
+} // namespace facebook::velox::iceberg::reader::test
diff --git a/velox/connectors/hive/storage_adapters/CMakeLists.txt b/velox/connectors/hive/storage_adapters/CMakeLists.txt
index 9c2e046a0b870..bd7c37f816405 100644
--- a/velox/connectors/hive/storage_adapters/CMakeLists.txt
+++ b/velox/connectors/hive/storage_adapters/CMakeLists.txt
@@ -15,3 +15,4 @@
 add_subdirectory(s3fs)
 add_subdirectory(hdfs)
 add_subdirectory(gcs)
+add_subdirectory(abfs)
diff --git a/velox/connectors/hive/storage_adapters/abfs/AbfsFileSystem.cpp b/velox/connectors/hive/storage_adapters/abfs/AbfsFileSystem.cpp
new file mode 100644
index 0000000000000..681d26b35e768
--- /dev/null
+++ b/velox/connectors/hive/storage_adapters/abfs/AbfsFileSystem.cpp
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "velox/connectors/hive/storage_adapters/abfs/AbfsFileSystem.h" + +#include +#include +#include +#include + +#include "velox/common/config/Config.h" +#include "velox/common/file/File.h" +#include "velox/connectors/hive/HiveConfig.h" +#include "velox/connectors/hive/storage_adapters/abfs/AbfsReadFile.h" +#include "velox/connectors/hive/storage_adapters/abfs/AbfsWriteFile.h" + +namespace facebook::velox::filesystems::abfs { +using namespace Azure::Storage::Blobs; + +class AbfsConfig { + public: + AbfsConfig(const config::ConfigBase* config) : config_(config) {} + + std::string connectionString(const std::string& path) const { + auto abfsAccount = AbfsAccount(path); + auto key = abfsAccount.credKey(); + VELOX_USER_CHECK( + config_->valueExists(key), "Failed to find storage credentials"); + + return abfsAccount.connectionString(config_->get(key).value()); + } + + private: + const config::ConfigBase* config_; +}; + +class AbfsReadFile::Impl { + constexpr static uint64_t kNaturalReadSize = 4 << 20; // 4M + constexpr static uint64_t kReadConcurrency = 8; + + public: + explicit Impl(const std::string& path, const std::string& connectStr) { + auto abfsAccount = AbfsAccount(path); + fileName_ = abfsAccount.filePath(); + fileClient_ = + std::make_unique(BlobClient::CreateFromConnectionString( + connectStr, abfsAccount.fileSystem(), fileName_)); + } + + void initialize(const FileOptions& options) { + if (options.fileSize.has_value()) { + VELOX_CHECK_GE( + options.fileSize.value(), 0, "File size must be non-negative"); + length_ = options.fileSize.value(); + } + + if (length_ != -1) { + return; + } + + try { + auto properties = fileClient_->GetProperties(); + length_ = properties.Value.BlobSize; + } catch (Azure::Storage::StorageException& e) { + throwStorageExceptionWithOperationDetails("GetProperties", fileName_, e); + } + + VELOX_CHECK_GE(length_, 0); + } + + std::string_view pread(uint64_t offset, uint64_t length, void* buffer) const { + preadInternal(offset, length, static_cast(buffer)); + return {static_cast(buffer), length}; + } + + std::string pread(uint64_t offset, uint64_t length) const { + std::string result(length, 0); + preadInternal(offset, length, result.data()); + return result; + } + + uint64_t preadv( + uint64_t offset, + const std::vector>& buffers) const { + size_t length = 0; + auto size = buffers.size(); + for (auto& range : buffers) { + length += range.size(); + } + std::string result(length, 0); + preadInternal(offset, length, static_cast(result.data())); + size_t resultOffset = 0; + for (auto range : buffers) { + if (range.data()) { + memcpy(range.data(), &(result.data()[resultOffset]), range.size()); + } + resultOffset += range.size(); + } + + return length; + } + + uint64_t preadv( + folly::Range regions, + folly::Range iobufs) const { + size_t length = 0; + VELOX_CHECK_EQ(regions.size(), iobufs.size()); + for (size_t i = 0; i < regions.size(); ++i) { + const auto& region = regions[i]; + auto& output = iobufs[i]; + output = folly::IOBuf(folly::IOBuf::CREATE, region.length); + pread(region.offset, region.length, output.writableData()); + output.append(region.length); + length += region.length; + } + + return length; + } + + uint64_t size() const { + return length_; + } + + uint64_t memoryUsage() const { + return 3 * sizeof(std::string) + sizeof(int64_t); + } + + bool shouldCoalesce() const { + return false; + } + + std::string getName() const { + return fileName_; + } + + uint64_t getNaturalReadSize() const { + return kNaturalReadSize; + } + + private: + void 
preadInternal(uint64_t offset, uint64_t length, char* position) const { + // Read the desired range of bytes. + Azure::Core::Http::HttpRange range; + range.Offset = offset; + range.Length = length; + + Azure::Storage::Blobs::DownloadBlobOptions blob; + blob.Range = range; + + auto response = fileClient_->Download(blob); + response.Value.BodyStream->ReadToCount( + reinterpret_cast(position), length); + } + + std::string fileName_; + std::unique_ptr fileClient_; + + int64_t length_ = -1; +}; + +AbfsReadFile::AbfsReadFile( + const std::string& path, + const std::string& connectStr) { + impl_ = std::make_shared(path, connectStr); +} + +void AbfsReadFile::initialize(const FileOptions& options) { + return impl_->initialize(options); +} + +std::string_view +AbfsReadFile::pread(uint64_t offset, uint64_t length, void* buffer) const { + return impl_->pread(offset, length, buffer); +} + +std::string AbfsReadFile::pread(uint64_t offset, uint64_t length) const { + return impl_->pread(offset, length); +} + +uint64_t AbfsReadFile::preadv( + uint64_t offset, + const std::vector>& buffers) const { + return impl_->preadv(offset, buffers); +} + +uint64_t AbfsReadFile::preadv( + folly::Range regions, + folly::Range iobufs) const { + return impl_->preadv(regions, iobufs); +} + +uint64_t AbfsReadFile::size() const { + return impl_->size(); +} + +uint64_t AbfsReadFile::memoryUsage() const { + return impl_->memoryUsage(); +} + +bool AbfsReadFile::shouldCoalesce() const { + return false; +} + +std::string AbfsReadFile::getName() const { + return impl_->getName(); +} + +uint64_t AbfsReadFile::getNaturalReadSize() const { + return impl_->getNaturalReadSize(); +} + +class AbfsFileSystem::Impl { + public: + explicit Impl(const config::ConfigBase* config) : abfsConfig_(config) { + LOG(INFO) << "Init Azure Blob file system"; + } + + ~Impl() { + LOG(INFO) << "Dispose Azure Blob file system"; + } + + const std::string connectionString(const std::string& path) const { + // Extract account name + return abfsConfig_.connectionString(path); + } + + private: + const AbfsConfig abfsConfig_; + std::shared_ptr ioExecutor_; +}; + +AbfsFileSystem::AbfsFileSystem( + const std::shared_ptr& config) + : FileSystem(config) { + impl_ = std::make_shared(config.get()); +} + +std::string AbfsFileSystem::name() const { + return "ABFS"; +} + +std::unique_ptr AbfsFileSystem::openFileForRead( + std::string_view path, + const FileOptions& options) { + auto abfsfile = std::make_unique( + std::string(path), impl_->connectionString(std::string(path))); + abfsfile->initialize(options); + return abfsfile; +} + +std::unique_ptr AbfsFileSystem::openFileForWrite( + std::string_view path, + const FileOptions& /*unused*/) { + auto abfsfile = std::make_unique( + std::string(path), impl_->connectionString(std::string(path))); + abfsfile->initialize(); + return abfsfile; +} +} // namespace facebook::velox::filesystems::abfs diff --git a/velox/connectors/hive/storage_adapters/abfs/AbfsFileSystem.h b/velox/connectors/hive/storage_adapters/abfs/AbfsFileSystem.h new file mode 100644 index 0000000000000..319a85b7a382c --- /dev/null +++ b/velox/connectors/hive/storage_adapters/abfs/AbfsFileSystem.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "velox/common/file/FileSystems.h" + +namespace facebook::velox::filesystems::abfs { + +/// Implementation of the ABS (Azure Blob Storage) filesystem and file +/// interface. We provide a registration method for reading and writing files so +/// that the appropriate type of file can be constructed based on a filename. +/// The supported schema is `abfs(s)://` to align with the valid scheme +/// identifiers used in the Hadoop Filesystem ABFS driver when integrating with +/// Azure Blob Storage. One key difference here is that the ABFS Hadoop client +/// driver always uses Transport Layer Security (TLS) regardless of the +/// authentication method chosen when using the `abfss` schema, but not mandated +/// when using the `abfs` schema. In our implementation, we always use the HTTPS +/// protocol, regardless of whether the schema is `abfs://` or `abfss://`. The +/// legacy wabs(s):// schema is not supported as it has been deprecated already +/// by Azure Storage team. Reference document - +/// https://learn.microsoft.com/en-us/azure/databricks/storage/azure-storage. +class AbfsFileSystem : public FileSystem { + public: + explicit AbfsFileSystem( + const std::shared_ptr& config); + + std::string name() const override; + + std::unique_ptr openFileForRead( + std::string_view path, + const FileOptions& options = {}) override; + + std::unique_ptr openFileForWrite( + std::string_view path, + const FileOptions& options = {}) override; + + void rename( + std::string_view path, + std::string_view newPath, + bool overWrite = false) override { + VELOX_UNSUPPORTED("rename for abfs not implemented"); + } + + void remove(std::string_view path) override { + VELOX_UNSUPPORTED("remove for abfs not implemented"); + } + + bool exists(std::string_view path) override { + VELOX_UNSUPPORTED("exists for abfs not implemented"); + } + + std::vector list(std::string_view path) override { + VELOX_UNSUPPORTED("list for abfs not implemented"); + } + + void mkdir(std::string_view path) override { + VELOX_UNSUPPORTED("mkdir for abfs not implemented"); + } + + void rmdir(std::string_view path) override { + VELOX_UNSUPPORTED("rmdir for abfs not implemented"); + } + + protected: + class Impl; + std::shared_ptr impl_; +}; + +void registerAbfsFileSystem(); +} // namespace facebook::velox::filesystems::abfs diff --git a/velox/connectors/hive/storage_adapters/abfs/AbfsReadFile.h b/velox/connectors/hive/storage_adapters/abfs/AbfsReadFile.h new file mode 100644 index 0000000000000..d7c0374d1651e --- /dev/null +++ b/velox/connectors/hive/storage_adapters/abfs/AbfsReadFile.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "velox/common/file/File.h" +#include "velox/connectors/hive/storage_adapters/abfs/AbfsUtil.h" + +namespace facebook::velox::filesystems::abfs { +class AbfsReadFile final : public ReadFile { + public: + explicit AbfsReadFile(const std::string& path, const std::string& connectStr); + + void initialize(const FileOptions& options); + + std::string_view pread(uint64_t offset, uint64_t length, void* buf) + const final; + + std::string pread(uint64_t offset, uint64_t length) const final; + + uint64_t preadv( + uint64_t offset, + const std::vector>& buffers) const final; + + uint64_t preadv( + folly::Range regions, + folly::Range iobufs) const final; + + uint64_t size() const final; + + uint64_t memoryUsage() const final; + + bool shouldCoalesce() const final; + + std::string getName() const final; + + uint64_t getNaturalReadSize() const final; + + protected: + class Impl; + std::shared_ptr impl_; +}; +} // namespace facebook::velox::filesystems::abfs diff --git a/velox/connectors/hive/storage_adapters/abfs/AbfsUtil.h b/velox/connectors/hive/storage_adapters/abfs/AbfsUtil.h new file mode 100644 index 0000000000000..2af0f42390095 --- /dev/null +++ b/velox/connectors/hive/storage_adapters/abfs/AbfsUtil.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include +#include "velox/common/file/File.h" + +#include +#include + +namespace facebook::velox::filesystems::abfs { +namespace { +constexpr std::string_view kAbfsScheme{"abfs://"}; +constexpr std::string_view kAbfssScheme{"abfss://"}; +} // namespace + +inline bool isAbfsFile(const std::string_view filename) { + return filename.find(kAbfsScheme) == 0 || filename.find(kAbfssScheme) == 0; +} + +class AbfsAccount { + public: + explicit AbfsAccount(const std::string path); + + const std::string accountNameWithSuffix() const; + + const std::string scheme() const; + + const std::string accountName() const; + + const std::string endpointSuffix() const; + + const std::string fileSystem() const; + + const std::string filePath() const; + + const std::string credKey() const; + + const std::string connectionString(const std::string accountKey) const; + + private: + std::string scheme_; + std::string accountName_; + std::string endpointSuffix_; + std::string accountNameWithSuffix_; + std::string fileSystem_; + std::string filePath_; + std::string path_; + std::string credKey_; +}; + +inline const std::string throwStorageExceptionWithOperationDetails( + std::string operation, + std::string path, + Azure::Storage::StorageException& error) { + const auto errMsg = fmt::format( + "Operation '{}' to path '{}' encountered azure storage exception, Details: '{}'.", + operation, + path, + error.what()); + if (error.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + VELOX_FILE_NOT_FOUND_ERROR(errMsg); + } + VELOX_FAIL(errMsg); +} + +} // namespace facebook::velox::filesystems::abfs diff --git a/velox/connectors/hive/storage_adapters/abfs/AbfsUtils.cpp b/velox/connectors/hive/storage_adapters/abfs/AbfsUtils.cpp new file mode 100644 index 0000000000000..700cfd915bbff --- /dev/null +++ b/velox/connectors/hive/storage_adapters/abfs/AbfsUtils.cpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/connectors/hive/storage_adapters/abfs/AbfsUtil.h" + +namespace facebook::velox::filesystems::abfs { +AbfsAccount::AbfsAccount(const std::string path) { + auto file = std::string(""); + if (path.find(kAbfssScheme) == 0) { + file = std::string(path.substr(8)); + scheme_ = kAbfssScheme.substr(0, 5); + } else { + file = std::string(path.substr(7)); + scheme_ = kAbfsScheme.substr(0, 4); + } + + auto firstAt = file.find_first_of("@"); + fileSystem_ = std::string(file.substr(0, firstAt)); + auto firstSep = file.find_first_of("/"); + filePath_ = std::string(file.substr(firstSep + 1)); + + accountNameWithSuffix_ = file.substr(firstAt + 1, firstSep - firstAt - 1); + auto firstDot = accountNameWithSuffix_.find_first_of("."); + accountName_ = accountNameWithSuffix_.substr(0, firstDot); + endpointSuffix_ = accountNameWithSuffix_.substr(firstDot + 5); + credKey_ = fmt::format("fs.azure.account.key.{}", accountNameWithSuffix_); +} + +const std::string AbfsAccount::accountNameWithSuffix() const { + return accountNameWithSuffix_; +} + +const std::string AbfsAccount::scheme() const { + return scheme_; +} + +const std::string AbfsAccount::accountName() const { + return accountName_; +} + +const std::string AbfsAccount::endpointSuffix() const { + return endpointSuffix_; +} + +const std::string AbfsAccount::fileSystem() const { + return fileSystem_; +} + +const std::string AbfsAccount::filePath() const { + return filePath_; +} + +const std::string AbfsAccount::credKey() const { + return credKey_; +} + +const std::string AbfsAccount::connectionString( + const std::string accountKey) const { + return fmt::format( + "DefaultEndpointsProtocol=https;AccountName={};AccountKey={};EndpointSuffix={}", + accountName(), + accountKey, + endpointSuffix()); +} +} // namespace facebook::velox::filesystems::abfs diff --git a/velox/connectors/hive/storage_adapters/abfs/AbfsWriteFile.cpp b/velox/connectors/hive/storage_adapters/abfs/AbfsWriteFile.cpp new file mode 100644 index 0000000000000..1e390ea729e4e --- /dev/null +++ b/velox/connectors/hive/storage_adapters/abfs/AbfsWriteFile.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/connectors/hive/storage_adapters/abfs/AbfsWriteFile.h" + +#include + +namespace facebook::velox::filesystems::abfs { +class BlobStorageFileClient final : public IBlobStorageFileClient { + public: + BlobStorageFileClient(std::unique_ptr client) + : client_(std::move(client)) {} + + void create() override { + client_->Create(); + } + + PathProperties getProperties() override { + return client_->GetProperties().Value; + } + + void append(const uint8_t* buffer, size_t size, uint64_t offset) override { + auto bodyStream = Azure::Core::IO::MemoryBodyStream(buffer, size); + client_->Append(bodyStream, offset); + } + + void flush(uint64_t position) override { + client_->Flush(position); + } + + void close() override { + // do nothing. 
+  }
+
+ private:
+  const std::unique_ptr<DataLakeFileClient> client_;
+};
+
+class AbfsWriteFile::Impl {
+ public:
+  explicit Impl(const std::string& path, const std::string& connectStr)
+      : path_(path), connectStr_(connectStr) {
+    // Make it a no-op if invoked twice.
+    if (position_ != -1) {
+      return;
+    }
+    position_ = 0;
+  }
+
+  void initialize() {
+    if (!blobStorageFileClient_) {
+      auto abfsAccount = AbfsAccount(path_);
+      blobStorageFileClient_ = std::make_unique<BlobStorageFileClient>(
+          std::make_unique<DataLakeFileClient>(
+              DataLakeFileClient::CreateFromConnectionString(
+                  connectStr_,
+                  abfsAccount.fileSystem(),
+                  abfsAccount.filePath())));
+    }
+
+    VELOX_CHECK(!checkIfFileExists(), "File already exists");
+    blobStorageFileClient_->create();
+  }
+
+  void testingSetFileClient(
+      const std::shared_ptr<IBlobStorageFileClient>& blobStorageManager) {
+    blobStorageFileClient_ = blobStorageManager;
+  }
+
+  void close() {
+    if (!closed_) {
+      flush();
+      blobStorageFileClient_->close();
+      closed_ = true;
+    }
+  }
+
+  void flush() {
+    if (!closed_) {
+      blobStorageFileClient_->flush(position_);
+    }
+  }
+
+  void append(std::string_view data) {
+    VELOX_CHECK(!closed_, "File is not open");
+    if (data.size() == 0) {
+      return;
+    }
+    append(data.data(), data.size());
+  }
+
+  uint64_t size() const {
+    return blobStorageFileClient_->getProperties().FileSize;
+  }
+
+  void append(const char* buffer, size_t size) {
+    blobStorageFileClient_->append(
+        reinterpret_cast<const uint8_t*>(buffer), size, position_);
+    position_ += size;
+  }
+
+ private:
+  bool checkIfFileExists() {
+    try {
+      blobStorageFileClient_->getProperties();
+      return true;
+    } catch (Azure::Storage::StorageException& e) {
+      if (e.StatusCode != Azure::Core::Http::HttpStatusCode::NotFound) {
+        throwStorageExceptionWithOperationDetails("GetProperties", path_, e);
+      }
+      return false;
+    }
+  }
+
+  const std::string path_;
+  const std::string connectStr_;
+  std::string fileSystem_;
+  std::string fileName_;
+  std::shared_ptr<IBlobStorageFileClient> blobStorageFileClient_;
+
+  uint64_t position_ = -1;
+  bool closed_ = false;
+};
+
+AbfsWriteFile::AbfsWriteFile(
+    const std::string& path,
+    const std::string& connectStr) {
+  impl_ = std::make_shared<Impl>(path, connectStr);
+}
+
+void AbfsWriteFile::initialize() {
+  impl_->initialize();
+}
+
+void AbfsWriteFile::close() {
+  impl_->close();
+}
+
+void AbfsWriteFile::flush() {
+  impl_->flush();
+}
+
+void AbfsWriteFile::append(std::string_view data) {
+  impl_->append(data);
+}
+
+uint64_t AbfsWriteFile::size() const {
+  return impl_->size();
+}
+
+void AbfsWriteFile::testingSetFileClient(
+    const std::shared_ptr<IBlobStorageFileClient>& fileClient) {
+  impl_->testingSetFileClient(fileClient);
+}
+} // namespace facebook::velox::filesystems::abfs
diff --git a/velox/connectors/hive/storage_adapters/abfs/AbfsWriteFile.h b/velox/connectors/hive/storage_adapters/abfs/AbfsWriteFile.h
new file mode 100644
index 0000000000000..acb701ac91ff9
--- /dev/null
+++ b/velox/connectors/hive/storage_adapters/abfs/AbfsWriteFile.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "velox/common/file/File.h"
+#include "velox/connectors/hive/storage_adapters/abfs/AbfsUtil.h"
+
+namespace Azure::Storage::Files::DataLake::Models {
+class PathProperties;
+}
+
+namespace facebook::velox::filesystems::abfs {
+using namespace Azure::Storage::Files::DataLake;
+using namespace Azure::Storage::Files::DataLake::Models;
+
+/*
+ * We are using the DFS (Data Lake Storage) endpoint for Azure Blob File write
+ * operations because the DFS endpoint is designed to be compatible with file
+ * operation semantics, such as `Append` to a file and file `Flush` operations.
+ * The legacy Blob endpoint can only be used for blob level append and flush
+ * operations. When using the Blob endpoint, we would need to manually manage
+ * the creation, appending, and committing of file-related blocks.
+ *
+ * However, the Azurite Simulator does not yet support the DFS endpoint.
+ * (For more information, see https://github.com/Azure/Azurite/issues/553 and
+ * https://github.com/Azure/Azurite/issues/409).
+ * You can find a comparison between DFS and Blob endpoints here:
+ * https://github.com/Azure/Azurite/wiki/ADLS-Gen2-Implementation-Guidance
+ *
+ * To facilitate unit testing of file write scenarios, we define the
+ * IBlobStorageFileClient here, which can be mocked during testing.
+ */
+class IBlobStorageFileClient {
+ public:
+  virtual ~IBlobStorageFileClient() {}
+
+  virtual void create() = 0;
+  virtual PathProperties getProperties() = 0;
+  virtual void append(const uint8_t* buffer, size_t size, uint64_t offset) = 0;
+  virtual void flush(uint64_t position) = 0;
+  virtual void close() = 0;
+};
+
+/// Implementation of abfs write file. Nothing written to the file should be
+/// read back until it is closed.
+class AbfsWriteFile : public WriteFile {
+ public:
+  constexpr static uint64_t kNaturalWriteSize = 8 << 20; // 8M
+  /// The constructor.
+  /// @param path The file path to write.
+  /// @param connectStr the connection string used to auth the storage account.
+  AbfsWriteFile(const std::string& path, const std::string& connectStr);
+
+  /// Initializes the file client and creates the remote file. Throws if the
+  /// file already exists.
+  void initialize();
+
+  /// Get the file size.
+  uint64_t size() const override;
+
+  /// Flush the data.
+  void flush() override;
+
+  /// Write the data by append mode.
+  void append(std::string_view data) override;
+
+  /// Close the file.
+  void close() override;
+
+  /// Used by tests to override the FileSystem client.
+  void testingSetFileClient(
+      const std::shared_ptr<IBlobStorageFileClient>& fileClient);
+
+ protected:
+  class Impl;
+  std::shared_ptr<Impl> impl_;
+};
+} // namespace facebook::velox::filesystems::abfs
diff --git a/velox/connectors/hive/storage_adapters/abfs/CMakeLists.txt b/velox/connectors/hive/storage_adapters/abfs/CMakeLists.txt
new file mode 100644
index 0000000000000..ec71a1353ccdd
--- /dev/null
+++ b/velox/connectors/hive/storage_adapters/abfs/CMakeLists.txt
@@ -0,0 +1,41 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# for generated headers
+
+velox_add_library(velox_abfs RegisterAbfsFileSystem.cpp)
+
+if(VELOX_ENABLE_ABFS)
+  velox_sources(
+    velox_abfs
+    PRIVATE
+    AbfsFileSystem.cpp
+    AbfsUtils.cpp
+    AbfsWriteFile.cpp)
+  velox_link_libraries(
+    velox_abfs
+    PUBLIC velox_file
+           velox_core
+           velox_hive_config
+           velox_dwio_common_exception
+           Azure::azure-storage-blobs
+           Azure::azure-storage-files-datalake
+           Folly::folly
+           glog::glog
+           fmt::fmt)
+
+  if(${VELOX_BUILD_TESTING})
+    add_subdirectory(tests)
+  endif()
+endif()
diff --git a/velox/connectors/hive/storage_adapters/abfs/RegisterAbfsFileSystem.cpp b/velox/connectors/hive/storage_adapters/abfs/RegisterAbfsFileSystem.cpp
new file mode 100644
index 0000000000000..a8b0df52a6dba
--- /dev/null
+++ b/velox/connectors/hive/storage_adapters/abfs/RegisterAbfsFileSystem.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef VELOX_ENABLE_ABFS
+#include "velox/common/config/Config.h"
+#include "velox/connectors/hive/storage_adapters/abfs/AbfsFileSystem.h" // @manual
+#include "velox/connectors/hive/storage_adapters/abfs/AbfsUtil.h" // @manual
+#endif
+
+namespace facebook::velox::filesystems::abfs {
+
+#ifdef VELOX_ENABLE_ABFS
+folly::once_flag abfsInitiationFlag;
+
+std::shared_ptr<FileSystem> abfsFileSystemGenerator(
+    std::shared_ptr<const config::ConfigBase> properties,
+    std::string_view filePath) {
+  static std::shared_ptr<FileSystem> filesystem;
+  folly::call_once(abfsInitiationFlag, [&properties]() {
+    filesystem = std::make_shared<AbfsFileSystem>(properties);
+  });
+  return filesystem;
+}
+#endif
+
+void registerAbfsFileSystem() {
+#ifdef VELOX_ENABLE_ABFS
+  LOG(INFO) << "Register ABFS";
+  registerFileSystem(isAbfsFile, std::function(abfsFileSystemGenerator));
+#endif
+}
+
+} // namespace facebook::velox::filesystems::abfs
diff --git a/velox/connectors/hive/storage_adapters/abfs/RegisterAbfsFileSystem.h b/velox/connectors/hive/storage_adapters/abfs/RegisterAbfsFileSystem.h
new file mode 100644
index 0000000000000..e725e084fa3ed
--- /dev/null
+++ b/velox/connectors/hive/storage_adapters/abfs/RegisterAbfsFileSystem.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace facebook::velox::filesystems::abfs {
+
+// Register the ABFS filesystem.
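+// Compiled as a no-op unless Velox is built with VELOX_ENABLE_ABFS.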
+void registerAbfsFileSystem();
+
+} // namespace facebook::velox::filesystems::abfs
diff --git a/velox/connectors/hive/storage_adapters/abfs/tests/AbfsFileSystemTest.cpp b/velox/connectors/hive/storage_adapters/abfs/tests/AbfsFileSystemTest.cpp
new file mode 100644
index 0000000000000..926f064ce28e4
--- /dev/null
+++ b/velox/connectors/hive/storage_adapters/abfs/tests/AbfsFileSystemTest.cpp
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <atomic>
+#include <random>
+#include <thread>
+
+#include "velox/common/base/tests/GTestUtils.h"
+#include "velox/common/file/File.h"
+#include "velox/common/file/FileSystems.h"
+#include "velox/connectors/hive/FileHandle.h"
+#include "velox/connectors/hive/HiveConfig.h"
+#include "velox/connectors/hive/storage_adapters/abfs/AbfsFileSystem.h"
+#include "velox/connectors/hive/storage_adapters/abfs/AbfsReadFile.h"
+#include "velox/connectors/hive/storage_adapters/abfs/AbfsWriteFile.h"
+#include "velox/connectors/hive/storage_adapters/abfs/tests/AzuriteServer.h"
+#include "velox/connectors/hive/storage_adapters/abfs/tests/MockBlobStorageFileClient.h"
+#include "velox/exec/tests/utils/PortUtil.h"
+#include "velox/exec/tests/utils/TempFilePath.h"
+
+using namespace facebook::velox;
+using namespace facebook::velox::filesystems;
+using namespace facebook::velox::filesystems::abfs;
+using ::facebook::velox::common::Region;
+
+constexpr int kOneMB = 1 << 20;
+static const std::string filePath = "test_file.txt";
+static const std::string fullFilePath =
+    filesystems::test::AzuriteABFSEndpoint + filePath;
+
+class AbfsFileSystemTest : public testing::Test {
+ public:
+  static std::shared_ptr<const config::ConfigBase> hiveConfig(
+      const std::unordered_map<std::string, std::string> configOverride = {}) {
+    std::unordered_map<std::string, std::string> config({});
+
+    // Update the default config map with the supplied configOverride map.
+    for (const auto& item : configOverride) {
+      config[item.first] = item.second;
+      std::cout << "config " + item.first + " value " + item.second
+                << std::endl;
+    }
+
+    return std::make_shared<const config::ConfigBase>(std::move(config));
+  }
+
+ public:
+  std::shared_ptr<filesystems::test::AzuriteServer> azuriteServer;
+
+  static void SetUpTestCase() {
+    registerAbfsFileSystem();
+  }
+
+  void SetUp() override {
+    auto port = facebook::velox::exec::test::getFreePort();
+    azuriteServer =
+        std::make_shared<filesystems::test::AzuriteServer>(port);
+    azuriteServer->start();
+    auto tempFile = createFile();
+    azuriteServer->addFile(tempFile->getPath(), filePath);
+  }
+
+  void TearDown() override {
+    azuriteServer->stop();
+  }
+
+  std::unique_ptr<AbfsWriteFile> openFileForWrite(
+      std::string_view path,
+      std::shared_ptr<filesystems::test::MockBlobStorageFileClient> client) {
+    auto abfsfile = std::make_unique<AbfsWriteFile>(
+        std::string(path), azuriteServer->connectionStr());
+    abfsfile->testingSetFileClient(client);
+    abfsfile->initialize();
+    return abfsfile;
+  }
+
+  static std::string generateRandomData(int size) {
+    static const char charset[] =
+        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+    std::string data(size, ' ');
+
+    for (int i = 0; i < size; ++i) {
+      int index = rand() % (sizeof(charset) - 1);
+      data[i] = charset[index];
+    }
+
+    return data;
+  }
+
+ private:
+  static std::shared_ptr<::exec::test::TempFilePath> createFile(
+      uint64_t size = -1) {
+    auto tempFile = exec::test::TempFilePath::create();
+    if (size == -1) {
+      tempFile->append("aaaaa");
+      tempFile->append("bbbbb");
+      tempFile->append(std::string(kOneMB, 'c'));
+      tempFile->append("ddddd");
+    } else {
+      const uint64_t totalSize = size * 1024 * 1024;
+      const uint64_t chunkSize = 5 * 1024 * 1024;
+      uint64_t remainingSize = totalSize;
+      while (remainingSize > 0) {
+        uint64_t dataSize = std::min(remainingSize, chunkSize);
+        std::string randomData = generateRandomData(dataSize);
+        tempFile->append(randomData);
+        remainingSize -= dataSize;
+      }
+    }
+    return tempFile;
+  }
+};
+
+void readData(ReadFile* readFile) {
+  ASSERT_EQ(readFile->size(), 15 + kOneMB);
+  char buffer1[5];
+  ASSERT_EQ(readFile->pread(10 + kOneMB, 5, &buffer1), "ddddd");
+  char buffer2[10];
+  ASSERT_EQ(readFile->pread(0, 10, &buffer2), "aaaaabbbbb");
+  auto buffer3 = new char[kOneMB];
+  ASSERT_EQ(readFile->pread(10, kOneMB, buffer3), std::string(kOneMB, 'c'));
+  delete[] buffer3;
+  ASSERT_EQ(readFile->size(), 15 + kOneMB);
+  char buffer4[10];
+  const std::string_view arf = readFile->pread(5, 10, &buffer4);
+  const std::string zarf = readFile->pread(kOneMB, 15);
+  auto buf = std::make_unique<char[]>(8);
+  const std::string_view warf = readFile->pread(4, 8, buf.get());
+  const std::string_view warfFromBuf(buf.get(), 8);
+  ASSERT_EQ(arf, "bbbbbccccc");
+  ASSERT_EQ(zarf, "ccccccccccddddd");
+  ASSERT_EQ(warf, "abbbbbcc");
+  ASSERT_EQ(warfFromBuf, "abbbbbcc");
+
+  char buff1[10];
+  char buff2[10];
+  std::vector<folly::Range<char*>> buffers = {
+      folly::Range<char*>(buff1, 10),
+      folly::Range<char*>(nullptr, kOneMB - 5),
+      folly::Range<char*>(buff2, 10)};
+  ASSERT_EQ(10 + kOneMB - 5 + 10, readFile->preadv(0, buffers));
+  ASSERT_EQ(std::string_view(buff1, sizeof(buff1)), "aaaaabbbbb");
+  ASSERT_EQ(std::string_view(buff2, sizeof(buff2)), "cccccddddd");
+
+  std::vector<folly::IOBuf> iobufs(2);
+  std::vector<Region> regions = {{0, 10}, {10, 5}};
+  ASSERT_EQ(
+      10 + 5,
+      readFile->preadv(
+          {regions.data(), regions.size()}, {iobufs.data(), iobufs.size()}));
+  ASSERT_EQ(
+      std::string_view(
+          reinterpret_cast<const char*>(iobufs[0].writableData()),
+          iobufs[0].length()),
+      "aaaaabbbbb");
+  ASSERT_EQ(
+      std::string_view(
+          reinterpret_cast<const char*>(iobufs[1].writableData()),
+          iobufs[1].length()),
+      "ccccc");
+}
+
+TEST_F(AbfsFileSystemTest, readFile) {
+  auto hiveConfig = AbfsFileSystemTest::hiveConfig(
+      {{"fs.azure.account.key.test.dfs.core.windows.net",
+        azuriteServer->connectionStr()}});
+  AbfsFileSystem abfs{hiveConfig};
+  auto readFile = abfs.openFileForRead(fullFilePath);
+  readData(readFile.get());
+}
+
+TEST_F(AbfsFileSystemTest, openFileForReadWithOptions) {
+  auto hiveConfig = AbfsFileSystemTest::hiveConfig(
+      {{"fs.azure.account.key.test.dfs.core.windows.net",
+        azuriteServer->connectionStr()}});
+  AbfsFileSystem abfs{hiveConfig};
+  FileOptions options;
+  options.fileSize = 15 + kOneMB;
+  auto readFile = abfs.openFileForRead(fullFilePath, options);
+  readData(readFile.get());
+}
+
+TEST_F(AbfsFileSystemTest, openFileForReadWithInvalidOptions) {
+  auto hiveConfig = AbfsFileSystemTest::hiveConfig(
+      {{"fs.azure.account.key.test.dfs.core.windows.net",
+        azuriteServer->connectionStr()}});
+  AbfsFileSystem abfs{hiveConfig};
+  FileOptions options;
+  options.fileSize = -kOneMB;
+  VELOX_ASSERT_THROW(
+      abfs.openFileForRead(fullFilePath, options),
+      "File size must be non-negative");
+}
+
+TEST_F(AbfsFileSystemTest, fileHandleWithProperties) {
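+  // The factory keeps generated handles in an LRU cache; supplying
+  // FileProperties lets the reader take the file size from the caller
+  // instead of fetching it from storage.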
+  auto hiveConfig = AbfsFileSystemTest::hiveConfig(
+      {{"fs.azure.account.key.test.dfs.core.windows.net",
+        azuriteServer->connectionStr()}});
+  FileHandleFactory factory(
+      std::make_unique<SimpleLRUCache<std::string, FileHandle>>(1),
+      std::make_unique<FileHandleGenerator>(hiveConfig));
+  FileProperties properties = {15 + kOneMB, 1};
+  auto fileHandleProperties = factory.generate(fullFilePath, &properties);
+  readData(fileHandleProperties->file.get());
+
+  auto fileHandleWithoutProperties = factory.generate(fullFilePath);
+  readData(fileHandleWithoutProperties->file.get());
+}
+
+TEST_F(AbfsFileSystemTest, multipleThreadsWithReadFile) {
+  std::atomic<bool> startThreads = false;
+  auto hiveConfig = AbfsFileSystemTest::hiveConfig(
+      {{"fs.azure.account.key.test.dfs.core.windows.net",
+        azuriteServer->connectionStr()}});
+  AbfsFileSystem abfs{hiveConfig};
+
+  std::vector<std::thread> threads;
+  std::mt19937 generator(std::random_device{}());
+  std::vector<int> sleepTimesInMicroseconds = {0, 500, 5000};
+  std::uniform_int_distribution<int> distribution(
+      0, sleepTimesInMicroseconds.size() - 1);
+  for (int i = 0; i < 10; i++) {
+    auto thread = std::thread([&] {
+      int index = distribution(generator);
+      while (!startThreads) {
+        std::this_thread::yield();
+      }
+      std::this_thread::sleep_for(
+          std::chrono::microseconds(sleepTimesInMicroseconds[index]));
+      auto readFile = abfs.openFileForRead(fullFilePath);
+      readData(readFile.get());
+    });
+    threads.emplace_back(std::move(thread));
+  }
+  startThreads = true;
+  for (auto& thread : threads) {
+    thread.join();
+  }
+}
+
+TEST_F(AbfsFileSystemTest, missingFile) {
+  auto hiveConfig = AbfsFileSystemTest::hiveConfig(
+      {{"fs.azure.account.key.test.dfs.core.windows.net",
+        azuriteServer->connectionStr()}});
+  const std::string abfsFile =
+      facebook::velox::filesystems::test::AzuriteABFSEndpoint + "test.txt";
+  AbfsFileSystem abfs{hiveConfig};
+  VELOX_ASSERT_RUNTIME_THROW_CODE(
+      abfs.openFileForRead(abfsFile), error_code::kFileNotFound, "404");
+}
+
+TEST_F(AbfsFileSystemTest, OpenFileForWriteTest) {
+  const std::string abfsFile =
+      filesystems::test::AzuriteABFSEndpoint + "writetest.txt";
+  auto mockClient =
+      std::make_shared<filesystems::test::MockBlobStorageFileClient>(
+          filesystems::test::MockBlobStorageFileClient());
+  auto abfsWriteFile = openFileForWrite(abfsFile, mockClient);
+  EXPECT_EQ(abfsWriteFile->size(), 0);
+  std::string dataContent = "";
+  uint64_t totalSize = 0;
+  std::string randomData =
+      AbfsFileSystemTest::generateRandomData(1 * 1024 * 1024);
+  for (int i = 0; i < 8; ++i) {
+    abfsWriteFile->append(randomData);
+    dataContent += randomData;
+  }
+  totalSize = randomData.size() * 8;
+  abfsWriteFile->flush();
+  EXPECT_EQ(abfsWriteFile->size(), totalSize);
+
+  randomData = AbfsFileSystemTest::generateRandomData(9 * 1024 * 1024);
+  dataContent += randomData;
+  abfsWriteFile->append(randomData);
+  totalSize += randomData.size();
+  randomData = AbfsFileSystemTest::generateRandomData(2 * 1024 * 1024);
+  dataContent += randomData;
+  totalSize += randomData.size();
+  abfsWriteFile->append(randomData);
+  abfsWriteFile->flush();
+  EXPECT_EQ(abfsWriteFile->size(), totalSize);
+  abfsWriteFile->flush();
+  abfsWriteFile->close();
+  VELOX_ASSERT_THROW(abfsWriteFile->append("abc"), "File is not open");
+  VELOX_ASSERT_THROW(
+      openFileForWrite(abfsFile, mockClient), "File already exists");
+  std::string fileContent = mockClient->readContent();
+  ASSERT_EQ(fileContent.size(), dataContent.size());
+  ASSERT_EQ(fileContent, dataContent);
+}
+
+TEST_F(AbfsFileSystemTest, renameNotImplemented) {
+  auto hiveConfig = AbfsFileSystemTest::hiveConfig(
+      {{"fs.azure.account.key.test.dfs.core.windows.net",
+        azuriteServer->connectionStr()}});
+  AbfsFileSystem abfs{hiveConfig};
+  VELOX_ASSERT_THROW(
+      abfs.rename("text", "text2"), "rename for abfs not implemented");
+}
+
+TEST_F(AbfsFileSystemTest, removeNotImplemented) {
+  auto hiveConfig = AbfsFileSystemTest::hiveConfig(
+      {{"fs.azure.account.key.test.dfs.core.windows.net",
+        azuriteServer->connectionStr()}});
+  AbfsFileSystem abfs{hiveConfig};
+  VELOX_ASSERT_THROW(abfs.remove("text"), "remove for abfs not implemented");
+}
+
+TEST_F(AbfsFileSystemTest, existsNotImplemented) {
+  auto hiveConfig = AbfsFileSystemTest::hiveConfig(
+      {{"fs.azure.account.key.test.dfs.core.windows.net",
+        azuriteServer->connectionStr()}});
+  AbfsFileSystem abfs{hiveConfig};
+  VELOX_ASSERT_THROW(abfs.exists("text"), "exists for abfs not implemented");
+}
+
+TEST_F(AbfsFileSystemTest, listNotImplemented) {
+  auto hiveConfig = AbfsFileSystemTest::hiveConfig(
+      {{"fs.azure.account.key.test.dfs.core.windows.net",
+        azuriteServer->connectionStr()}});
+  AbfsFileSystem abfs{hiveConfig};
+  VELOX_ASSERT_THROW(abfs.list("dir"), "list for abfs not implemented");
+}
+
+TEST_F(AbfsFileSystemTest, mkdirNotImplemented) {
+  auto hiveConfig = AbfsFileSystemTest::hiveConfig(
+      {{"fs.azure.account.key.test.dfs.core.windows.net",
+        azuriteServer->connectionStr()}});
+  AbfsFileSystem abfs{hiveConfig};
+  VELOX_ASSERT_THROW(abfs.mkdir("dir"), "mkdir for abfs not implemented");
+}
+
+TEST_F(AbfsFileSystemTest, rmdirNotImplemented) {
+  auto hiveConfig = AbfsFileSystemTest::hiveConfig(
+      {{"fs.azure.account.key.test.dfs.core.windows.net",
+        azuriteServer->connectionStr()}});
+  AbfsFileSystem abfs{hiveConfig};
+  VELOX_ASSERT_THROW(abfs.rmdir("dir"), "rmdir for abfs not implemented");
+}
+
+TEST_F(AbfsFileSystemTest, credNotFound) {
+  const std::string abfsFile =
+      std::string("abfs://test@test1.dfs.core.windows.net/test");
+  auto hiveConfig = AbfsFileSystemTest::hiveConfig({});
+  AbfsFileSystem abfs{hiveConfig};
+  VELOX_ASSERT_THROW(
+      abfs.openFileForRead(abfsFile), "Failed to find storage credentials");
+}
diff --git a/velox/connectors/hive/storage_adapters/abfs/tests/AbfsUtilTest.cpp b/velox/connectors/hive/storage_adapters/abfs/tests/AbfsUtilTest.cpp
new file mode 100644
index 0000000000000..c97aecb6b8e3d
--- /dev/null
+++ b/velox/connectors/hive/storage_adapters/abfs/tests/AbfsUtilTest.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "velox/connectors/hive/storage_adapters/abfs/AbfsUtil.h" + +#include "gtest/gtest.h" + +using namespace facebook::velox::filesystems::abfs; + +TEST(AbfsUtilsTest, isAbfsFile) { + EXPECT_FALSE(isAbfsFile("abfs:")); + EXPECT_FALSE(isAbfsFile("abfss:")); + EXPECT_FALSE(isAbfsFile("abfs:/")); + EXPECT_FALSE(isAbfsFile("abfss:/")); + EXPECT_TRUE(isAbfsFile("abfs://test@test.dfs.core.windows.net/test")); + EXPECT_TRUE(isAbfsFile("abfss://test@test.dfs.core.windows.net/test")); +} + +TEST(AbfsUtilsTest, abfsAccount) { + auto abfsAccount = AbfsAccount("abfs://test@test.dfs.core.windows.net/test"); + EXPECT_EQ(abfsAccount.accountNameWithSuffix(), "test.dfs.core.windows.net"); + EXPECT_EQ(abfsAccount.accountName(), "test"); + EXPECT_EQ(abfsAccount.endpointSuffix(), "core.windows.net"); + EXPECT_EQ(abfsAccount.fileSystem(), "test"); + EXPECT_EQ(abfsAccount.filePath(), "test"); + EXPECT_EQ( + abfsAccount.credKey(), "fs.azure.account.key.test.dfs.core.windows.net"); + EXPECT_EQ( + abfsAccount.connectionString("123"), + "DefaultEndpointsProtocol=https;AccountName=test;AccountKey=123;EndpointSuffix=core.windows.net"); + + auto abfssAccount = AbfsAccount( + "abfss://test@test.dfs.core.windows.net/sf_1/store_sales/ss_sold_date_sk=2450816/part-00002-a29c25f1-4638-494e-8428-a84f51dcea41.c000.snappy.parquet"); + EXPECT_EQ(abfssAccount.scheme(), "abfss"); + EXPECT_EQ(abfssAccount.accountNameWithSuffix(), "test.dfs.core.windows.net"); + EXPECT_EQ(abfssAccount.accountName(), "test"); + EXPECT_EQ(abfssAccount.endpointSuffix(), "core.windows.net"); + EXPECT_EQ(abfssAccount.fileSystem(), "test"); + EXPECT_EQ( + abfssAccount.filePath(), + "sf_1/store_sales/ss_sold_date_sk=2450816/part-00002-a29c25f1-4638-494e-8428-a84f51dcea41.c000.snappy.parquet"); + EXPECT_EQ( + abfssAccount.credKey(), "fs.azure.account.key.test.dfs.core.windows.net"); + + // test with special characters + auto abfssAccountWithSpecialCharacters = AbfsAccount( + "abfss://test@test.dfs.core.windows.net/main@dir/sub dir/test.txt"); + EXPECT_EQ(abfssAccountWithSpecialCharacters.scheme(), "abfss"); + EXPECT_EQ( + abfssAccountWithSpecialCharacters.accountNameWithSuffix(), + "test.dfs.core.windows.net"); + EXPECT_EQ(abfssAccountWithSpecialCharacters.accountName(), "test"); + EXPECT_EQ( + abfssAccountWithSpecialCharacters.endpointSuffix(), "core.windows.net"); + EXPECT_EQ(abfssAccountWithSpecialCharacters.fileSystem(), "test"); + EXPECT_EQ( + abfssAccountWithSpecialCharacters.filePath(), + "main@dir/sub dir/test.txt"); + EXPECT_EQ( + abfssAccountWithSpecialCharacters.credKey(), + "fs.azure.account.key.test.dfs.core.windows.net"); + + // china cloud + auto abfsChinaCloudAccount = + AbfsAccount("abfs://test@test.dfs.core.chinacloudapi.cn/test"); + EXPECT_EQ(abfsChinaCloudAccount.scheme(), "abfs"); + EXPECT_EQ( + abfsChinaCloudAccount.accountNameWithSuffix(), + "test.dfs.core.chinacloudapi.cn"); + EXPECT_EQ(abfsChinaCloudAccount.accountName(), "test"); + EXPECT_EQ(abfsChinaCloudAccount.endpointSuffix(), "core.chinacloudapi.cn"); + EXPECT_EQ(abfsChinaCloudAccount.fileSystem(), "test"); + EXPECT_EQ(abfsChinaCloudAccount.filePath(), "test"); + EXPECT_EQ( + abfsChinaCloudAccount.credKey(), + "fs.azure.account.key.test.dfs.core.chinacloudapi.cn"); + + // us gov cloud + auto abfsUsGovCloudAccount = + AbfsAccount("abfs://test@test.dfs.core.usgovcloudapi.net/test"); + EXPECT_EQ(abfsUsGovCloudAccount.scheme(), "abfs"); + EXPECT_EQ( + abfsUsGovCloudAccount.accountNameWithSuffix(), + "test.dfs.core.usgovcloudapi.net"); + 
EXPECT_EQ(abfsUsGovCloudAccount.accountName(), "test"); + EXPECT_EQ(abfsUsGovCloudAccount.endpointSuffix(), "core.usgovcloudapi.net"); + EXPECT_EQ(abfsUsGovCloudAccount.fileSystem(), "test"); + EXPECT_EQ(abfsUsGovCloudAccount.filePath(), "test"); + EXPECT_EQ( + abfsUsGovCloudAccount.credKey(), + "fs.azure.account.key.test.dfs.core.usgovcloudapi.net"); + + // germany cloud + auto abfsGermanyCloudAccount = + AbfsAccount("abfs://test@test.dfs.core.cloudapi.de/test"); + EXPECT_EQ(abfsGermanyCloudAccount.scheme(), "abfs"); + EXPECT_EQ( + abfsGermanyCloudAccount.accountNameWithSuffix(), + "test.dfs.core.cloudapi.de"); + EXPECT_EQ(abfsGermanyCloudAccount.accountName(), "test"); + EXPECT_EQ(abfsGermanyCloudAccount.endpointSuffix(), "core.cloudapi.de"); + EXPECT_EQ(abfsGermanyCloudAccount.fileSystem(), "test"); + EXPECT_EQ(abfsGermanyCloudAccount.filePath(), "test"); + EXPECT_EQ( + abfsGermanyCloudAccount.credKey(), + "fs.azure.account.key.test.dfs.core.cloudapi.de"); +} diff --git a/velox/connectors/hive/storage_adapters/abfs/tests/AzuriteServer.cpp b/velox/connectors/hive/storage_adapters/abfs/tests/AzuriteServer.cpp new file mode 100644 index 0000000000000..83461b2c58f24 --- /dev/null +++ b/velox/connectors/hive/storage_adapters/abfs/tests/AzuriteServer.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "velox/connectors/hive/storage_adapters/abfs/tests/AzuriteServer.h"
+
+namespace facebook::velox::filesystems::test {
+const std::string AzuriteServer::connectionStr() const {
+  return fmt::format(
+      "DefaultEndpointsProtocol=http;AccountName={};AccountKey={};BlobEndpoint=http://127.0.0.1:{}/{};",
+      AzuriteAccountName,
+      AzuriteAccountKey,
+      port_,
+      AzuriteAccountName);
+}
+
+void AzuriteServer::start() {
+  try {
+    serverProcess_ = std::make_unique<boost::process::child>(
+        env_, exePath_, commandOptions_);
+    serverProcess_->wait_for(std::chrono::duration<int, std::milli>(5000));
+    VELOX_CHECK_EQ(
+        serverProcess_->exit_code(),
+        383,
+        "AzuriteServer process exited, code: ",
+        serverProcess_->exit_code());
+  } catch (const std::exception& e) {
+    VELOX_FAIL("Failed to launch Azurite server: {}", e.what());
+  }
+}
+
+void AzuriteServer::stop() {
+  if (serverProcess_ && serverProcess_->valid()) {
+    serverProcess_->terminate();
+    serverProcess_->wait();
+    serverProcess_.reset();
+  }
+}
+
+bool AzuriteServer::isRunning() {
+  if (serverProcess_) {
+    return true;
+  }
+  return false;
+}
+
+// requires azurite executable to be on the PATH
+AzuriteServer::AzuriteServer(int64_t port) : port_(port) {
+  std::string dataLocation = fmt::format("/tmp/azurite_{}", port);
+  std::string logFilePath = fmt::format("/tmp/azurite/azurite_{}.log", port);
+  std::printf(
+      "Launch azurite instance with port - %s, data location - %s, log file path - %s\n",
+      std::to_string(port).c_str(),
+      dataLocation.c_str(),
+      logFilePath.c_str());
+  commandOptions_ = {
+      "--silent",
+      "--blobPort",
+      std::to_string(port),
+      "--location",
+      dataLocation,
+      "--debug",
+      logFilePath,
+  };
+  env_ = (boost::process::environment)boost::this_process::environment();
+  env_["PATH"] = env_["PATH"].to_string() + AzuriteSearchPath;
+  env_["AZURITE_ACCOUNTS"] =
+      fmt::format("{}:{}", AzuriteAccountName, AzuriteAccountKey);
+  auto path = env_["PATH"].to_vector();
+  exePath_ = boost::process::search_path(
+      AzuriteServerExecutableName,
+      std::vector<boost::filesystem::path>(path.begin(), path.end()));
+  std::printf("AzuriteServer executable path: %s\n", exePath_.c_str());
+  if (exePath_.empty()) {
+    VELOX_FAIL(
+        "Failed to find azurite executable '{}'", AzuriteServerExecutableName);
+  }
+}
+
+void AzuriteServer::addFile(std::string source, std::string destination) {
+  auto containerClient = BlobContainerClient::CreateFromConnectionString(
+      connectionStr(), AzuriteContainerName);
+  containerClient.CreateIfNotExists();
+  auto blobClient = containerClient.GetBlockBlobClient(destination);
+  blobClient.UploadFrom(source);
+}
+
+AzuriteServer::~AzuriteServer() {
+  // stop();
+}
+} // namespace facebook::velox::filesystems::test
diff --git a/velox/connectors/hive/storage_adapters/abfs/tests/AzuriteServer.h b/velox/connectors/hive/storage_adapters/abfs/tests/AzuriteServer.h
new file mode 100644
index 0000000000000..67a4f434cb4d3
--- /dev/null
+++ b/velox/connectors/hive/storage_adapters/abfs/tests/AzuriteServer.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "velox/exec/tests/utils/TempDirectoryPath.h"
+
+#include <azure/storage/blobs.hpp>
+#include <fmt/format.h>
+#include <chrono>
+#include <memory>
+#include <string>
+#include <vector>
+#include "boost/process.hpp"
+
+namespace facebook::velox::filesystems::test {
+using namespace Azure::Storage::Blobs;
+static const std::string AzuriteServerExecutableName{"azurite-blob"};
+static const std::string AzuriteSearchPath{":/usr/bin/azurite"};
+static const std::string AzuriteAccountName{"test"};
+static const std::string AzuriteContainerName{"test"};
+// the default key of Azurite Server used for connection
+static const std::string AzuriteAccountKey{
+    "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="};
+static const std::string AzuriteABFSEndpoint = fmt::format(
+    "abfs://{}@{}.dfs.core.windows.net/",
+    AzuriteContainerName,
+    AzuriteAccountName);
+
+class AzuriteServer {
+ public:
+  AzuriteServer(int64_t port);
+
+  const std::string connectionStr() const;
+
+  void start();
+
+  void stop();
+
+  bool isRunning();
+
+  void addFile(std::string source, std::string destination);
+
+  virtual ~AzuriteServer();
+
+ private:
+  int64_t port_;
+  std::vector<std::string> commandOptions_;
+  std::unique_ptr<::boost::process::child> serverProcess_;
+  boost::filesystem::path exePath_;
+  boost::process::environment env_;
+};
+} // namespace facebook::velox::filesystems::test
diff --git a/velox/connectors/hive/storage_adapters/abfs/tests/CMakeLists.txt b/velox/connectors/hive/storage_adapters/abfs/tests/CMakeLists.txt
new file mode 100644
index 0000000000000..0bb2d428ca1ca
--- /dev/null
+++ b/velox/connectors/hive/storage_adapters/abfs/tests/CMakeLists.txt
@@ -0,0 +1,33 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+add_executable(velox_abfs_test AbfsFileSystemTest.cpp AbfsUtilTest.cpp
+               AzuriteServer.cpp MockBlobStorageFileClient.cpp)
+add_test(velox_abfs_test velox_abfs_test)
+target_link_libraries(
+  velox_abfs_test
+  PRIVATE
+    velox_file
+    velox_abfs
+    velox_core
+    velox_exec_test_lib
+    velox_hive_connector
+    velox_dwio_common_exception
+    velox_exec
+    GTest::gtest
+    GTest::gtest_main
+    Azure::azure-storage-blobs
+    Azure::azure-storage-files-datalake)
+
+target_compile_options(velox_abfs_test PRIVATE -Wno-deprecated-declarations)
diff --git a/velox/connectors/hive/storage_adapters/abfs/tests/MockBlobStorageFileClient.cpp b/velox/connectors/hive/storage_adapters/abfs/tests/MockBlobStorageFileClient.cpp
new file mode 100644
index 0000000000000..5f0cf9fa1efd6
--- /dev/null
+++ b/velox/connectors/hive/storage_adapters/abfs/tests/MockBlobStorageFileClient.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/connectors/hive/storage_adapters/abfs/tests/MockBlobStorageFileClient.h"
+
+#include <filesystem>
+
+#include <fstream>
+
+using namespace Azure::Storage::Files::DataLake;
+namespace facebook::velox::filesystems::test {
+void MockBlobStorageFileClient::create() {
+  fileStream_ = std::ofstream(
+      filePath_,
+      std::ios_base::out | std::ios_base::binary | std::ios_base::app);
+}
+
+PathProperties MockBlobStorageFileClient::getProperties() {
+  if (!std::filesystem::exists(filePath_)) {
+    Azure::Storage::StorageException exp(filePath_ + " doesn't exist");
+    exp.StatusCode = Azure::Core::Http::HttpStatusCode::NotFound;
+    throw exp;
+  }
+  std::ifstream file(filePath_, std::ios::binary | std::ios::ate);
+  uint64_t size = static_cast<uint64_t>(file.tellg());
+  PathProperties ret;
+  ret.FileSize = size;
+  return ret;
+}
+
+void MockBlobStorageFileClient::append(
+    const uint8_t* buffer,
+    size_t size,
+    uint64_t offset) {
+  fileStream_.seekp(offset);
+  fileStream_.write(reinterpret_cast<const char*>(buffer), size);
+}
+
+void MockBlobStorageFileClient::flush(uint64_t position) {
+  fileStream_.flush();
+}
+
+void MockBlobStorageFileClient::close() {
+  fileStream_.flush();
+  fileStream_.close();
+}
+} // namespace facebook::velox::filesystems::test
diff --git a/velox/connectors/hive/storage_adapters/abfs/tests/MockBlobStorageFileClient.h b/velox/connectors/hive/storage_adapters/abfs/tests/MockBlobStorageFileClient.h
new file mode 100644
index 0000000000000..1ef79b31c8b70
--- /dev/null
+++ b/velox/connectors/hive/storage_adapters/abfs/tests/MockBlobStorageFileClient.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "velox/connectors/hive/storage_adapters/abfs/AbfsWriteFile.h"
+
+#include "velox/exec/tests/utils/TempFilePath.h"
+
+using namespace facebook::velox;
+using namespace facebook::velox::filesystems::abfs;
+
+namespace facebook::velox::filesystems::test {
+// A mocked blob storage file client backend with local file store.
+class MockBlobStorageFileClient : public IBlobStorageFileClient {
+ public:
+  MockBlobStorageFileClient() {
+    auto tempFile = ::exec::test::TempFilePath::create();
+    filePath_ = tempFile->getPath();
+  }
+
+  void create() override;
+  PathProperties getProperties() override;
+  void append(const uint8_t* buffer, size_t size, uint64_t offset) override;
+  void flush(uint64_t position) override;
+  void close() override;
+
+  // For testing: reads back the written content to verify it is correct.
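+  // Note: reads the whole staged file into memory, which is acceptable for
+  // the modest amounts of data written by these tests.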
+  std::string readContent() {
+    std::ifstream inputFile(filePath_);
+    std::string content;
+    inputFile.seekg(0, std::ios::end);
+    std::streamsize fileSize = inputFile.tellg();
+    inputFile.seekg(0, std::ios::beg);
+    content.resize(fileSize);
+    inputFile.read(&content[0], fileSize);
+    inputFile.close();
+    return content;
+  }
+
+ private:
+  std::string filePath_;
+  std::ofstream fileStream_;
+};
+} // namespace facebook::velox::filesystems::test
diff --git a/velox/connectors/hive/storage_adapters/gcs/CMakeLists.txt b/velox/connectors/hive/storage_adapters/gcs/CMakeLists.txt
index 02d3035ff7a46..c5ac37c73fd52 100644
--- a/velox/connectors/hive/storage_adapters/gcs/CMakeLists.txt
+++ b/velox/connectors/hive/storage_adapters/gcs/CMakeLists.txt
@@ -14,20 +14,17 @@
 
 # for generated headers
 
-add_library(velox_gcs RegisterGCSFileSystem.cpp)
+velox_add_library(velox_gcs RegisterGCSFileSystem.cpp)
 
 if(VELOX_ENABLE_GCS)
-  target_sources(velox_gcs PRIVATE GCSFileSystem.cpp GCSUtil.cpp)
-  target_link_libraries(velox_gcs Folly::folly google-cloud-cpp::storage)
+  velox_sources(velox_gcs PRIVATE GCSFileSystem.cpp GCSUtil.cpp)
+  velox_link_libraries(velox_gcs velox_exception Folly::folly
+                       google-cloud-cpp::storage)
 
   if(${VELOX_BUILD_TESTING})
     add_subdirectory(tests)
   endif()
 
-  if(${VELOX_ENABLE_BENCHMARKS})
-    add_subdirectory(benchmark)
-  endif()
-
   if(${VELOX_ENABLE_EXAMPLES})
     add_subdirectory(examples)
   endif()
diff --git a/velox/connectors/hive/storage_adapters/gcs/GCSFileSystem.cpp b/velox/connectors/hive/storage_adapters/gcs/GCSFileSystem.cpp
index e628bbf10fd02..bd4ccfa907ba1 100644
--- a/velox/connectors/hive/storage_adapters/gcs/GCSFileSystem.cpp
+++ b/velox/connectors/hive/storage_adapters/gcs/GCSFileSystem.cpp
@@ -15,10 +15,12 @@
  */
 
 #include "velox/connectors/hive/storage_adapters/gcs/GCSFileSystem.h"
+#include "velox/common/base/Exceptions.h"
+#include "velox/common/config/Config.h"
 #include "velox/common/file/File.h"
 #include "velox/connectors/hive/HiveConfig.h"
 #include "velox/connectors/hive/storage_adapters/gcs/GCSUtil.h"
-#include "velox/core/Config.h"
+#include "velox/core/QueryConfig.h"
 
 #include <fmt/format.h>
 #include <google/cloud/storage/client.h>
@@ -47,14 +49,17 @@ inline void checkGCSStatus(
     const std::string& bucket,
     const std::string& key) {
   if (!outcome.ok()) {
-    auto error = outcome.error_info();
-    VELOX_FAIL(
+    const auto errMsg = fmt::format(
        "{} due to: Path:'{}', SDK Error Type:{}, GCS Status Code:{}, Message:'{}'",
        errorMsgPrefix,
        gcsURI(bucket, key),
-       error.domain(),
+       outcome.error_info().domain(),
        getErrorStringFromGCSError(outcome.code()),
        outcome.message());
+    if (outcome.code() == gc::StatusCode::kNotFound) {
+      VELOX_FILE_NOT_FOUND_ERROR(errMsg);
+    }
+    VELOX_FAIL(errMsg);
   }
 }
 
@@ -68,7 +73,13 @@ class GCSReadFile final : public ReadFile {
 
   // Gets the length of the file.
   // Checks if there are any issues reading the file.
-  void initialize() {
+  void initialize(const filesystems::FileOptions& options) {
+    if (options.fileSize.has_value()) {
+      VELOX_CHECK_GE(
+          options.fileSize.value(), 0, "File size must be non-negative");
+      length_ = options.fileSize.value();
+    }
+
     // Make it a no-op if invoked twice.
     if (length_ != -1) {
      return;
@@ -247,14 +258,16 @@ auto constexpr kGCSInvalidPath = "File {} is not a valid gcs file";
 
 class GCSFileSystem::Impl {
  public:
-  Impl(const Config* config) : config_(config) {}
+  Impl(const config::ConfigBase* config)
+      : hiveConfig_(std::make_shared<HiveConfig>(
+            std::make_shared<config::ConfigBase>(config->rawConfigsCopy()))) {}
 
   ~Impl() = default;
 
   // Use the input Config parameters and initialize the GCSClient.
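+  // Assembles the google::cloud::Options for the client: credentials,
+  // upload buffer size, and optional retry policy and endpoint overrides.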
   void initializeClient() {
     auto options = gc::Options{};
-    auto scheme = HiveConfig::gcsScheme(config_);
+    auto scheme = hiveConfig_->gcsScheme();
     if (scheme == "https") {
       options.set<gc::UnifiedCredentialsOption>(
           gc::MakeGoogleDefaultCredentials());
     }
@@ -263,12 +276,26 @@ class GCSFileSystem::Impl {
     }
     options.set<gcs::UploadBufferSizeOption>(kUploadBufferSize);
 
-    auto endpointOverride = HiveConfig::gcsEndpoint(config_);
+    auto max_retry_count = hiveConfig_->gcsMaxRetryCount();
+    if (max_retry_count) {
+      options.set<gcs::RetryPolicyOption>(
+          gcs::LimitedErrorCountRetryPolicy(max_retry_count.value()).clone());
+    }
+
+    auto max_retry_time = hiveConfig_->gcsMaxRetryTime();
+    if (max_retry_time) {
+      auto retry_time = std::chrono::duration_cast<std::chrono::milliseconds>(
+          facebook::velox::config::toDuration(max_retry_time.value()));
+      options.set<gcs::RetryPolicyOption>(
+          gcs::LimitedTimeRetryPolicy(retry_time).clone());
+    }
+
+    auto endpointOverride = hiveConfig_->gcsEndpoint();
     if (!endpointOverride.empty()) {
       options.set<gcs::RestEndpointOption>(scheme + "://" + endpointOverride);
     }
 
-    auto cred = HiveConfig::gcsCredentials(config_);
+    auto cred = hiveConfig_->gcsCredentials();
     if (!cred.empty()) {
       auto credentials = gc::MakeServiceAccountCredentials(cred);
       options.set<gc::UnifiedCredentialsOption>(credentials);
@@ -284,11 +311,11 @@ class GCSFileSystem::Impl {
   }
 
  private:
-  const Config* FOLLY_NONNULL config_;
+  const std::shared_ptr<HiveConfig> hiveConfig_;
   std::shared_ptr<gcs::Client> client_;
 };
 
-GCSFileSystem::GCSFileSystem(std::shared_ptr<const Config> config)
+GCSFileSystem::GCSFileSystem(std::shared_ptr<const config::ConfigBase> config)
     : FileSystem(config) {
   impl_ = std::make_shared<Impl>(config.get());
 }
@@ -299,10 +326,10 @@ void GCSFileSystem::initializeClient() {
 
 std::unique_ptr<ReadFile> GCSFileSystem::openFileForRead(
     std::string_view path,
-    const FileOptions& /*unused*/) {
+    const FileOptions& options) {
   const auto gcspath = gcsPath(path);
   auto gcsfile = std::make_unique<GCSReadFile>(gcspath, impl_->getClient());
-  gcsfile->initialize();
+  gcsfile->initialize(options);
   return gcsfile;
 }
 
@@ -400,5 +427,5 @@ void GCSFileSystem::rmdir(std::string_view path) {
   VELOX_UNSUPPORTED("rmdir for GCS not implemented");
 }
 
-}; // namespace filesystems
-}; // namespace facebook::velox
+} // namespace filesystems
+} // namespace facebook::velox
diff --git a/velox/connectors/hive/storage_adapters/gcs/GCSFileSystem.h b/velox/connectors/hive/storage_adapters/gcs/GCSFileSystem.h
index 5cd98ed0fd724..0d80cacd9df13 100644
--- a/velox/connectors/hive/storage_adapters/gcs/GCSFileSystem.h
+++ b/velox/connectors/hive/storage_adapters/gcs/GCSFileSystem.h
@@ -26,7 +26,7 @@ namespace facebook::velox::filesystems {
 /// (register|generate)ReadFile and (register|generate)WriteFile functions.
 class GCSFileSystem : public FileSystem {
  public:
-  explicit GCSFileSystem(std::shared_ptr<const Config> config);
+  explicit GCSFileSystem(std::shared_ptr<const config::ConfigBase> config);
 
   /// Initialize the google::cloud::storage::Client from the input Config
   /// parameters.
diff --git a/velox/connectors/hive/storage_adapters/gcs/RegisterGCSFileSystem.cpp b/velox/connectors/hive/storage_adapters/gcs/RegisterGCSFileSystem.cpp
index 4c76478dfca21..3474c8d4dfb9b 100644
--- a/velox/connectors/hive/storage_adapters/gcs/RegisterGCSFileSystem.cpp
+++ b/velox/connectors/hive/storage_adapters/gcs/RegisterGCSFileSystem.cpp
@@ -15,9 +15,9 @@
  */
 
 #ifdef VELOX_ENABLE_GCS
-#include "velox/connectors/hive/storage_adapters/gcs/GCSFileSystem.h"
-#include "velox/connectors/hive/storage_adapters/gcs/GCSUtil.h"
-#include "velox/core/Config.h"
+#include "velox/common/config/Config.h"
+#include "velox/connectors/hive/storage_adapters/gcs/GCSFileSystem.h" // @manual
+#include "velox/connectors/hive/storage_adapters/gcs/GCSUtil.h" // @manual
 #endif
 
 namespace facebook::velox::filesystems {
@@ -26,28 +26,30 @@ namespace facebook::velox::filesystems {
 folly::once_flag GCSInstantiationFlag;
 
 std::function<std::shared_ptr<
-    FileSystem>(std::shared_ptr<const Config>, std::string_view)>
+    FileSystem>(std::shared_ptr<const config::ConfigBase>, std::string_view)>
 gcsFileSystemGenerator() {
-  static auto filesystemGenerator = [](std::shared_ptr<const Config> properties,
-                                       std::string_view filePath) {
-    // Only one instance of GCSFileSystem is supported for now (follow S3 for
-    // now).
-    // TODO: Support multiple GCSFileSystem instances using a cache
-    // Initialize on first access and reuse after that.
-    static std::shared_ptr<FileSystem> gcsfs;
-    folly::call_once(GCSInstantiationFlag, [&properties]() {
-      std::shared_ptr<GCSFileSystem> fs;
-      if (properties != nullptr) {
-        fs = std::make_shared<GCSFileSystem>(properties);
-      } else {
-        fs = std::make_shared<GCSFileSystem>(
-            std::make_shared<core::MemConfig>());
-      }
-      fs->initializeClient();
-      gcsfs = fs;
-    });
-    return gcsfs;
-  };
+  static auto filesystemGenerator =
+      [](std::shared_ptr<const config::ConfigBase> properties,
+         std::string_view filePath) {
+        // Only one instance of GCSFileSystem is supported for now (follow S3
+        // for now).
+        // TODO: Support multiple GCSFileSystem instances using a cache
+        // Initialize on first access and reuse after that.
+        static std::shared_ptr<FileSystem> gcsfs;
+        folly::call_once(GCSInstantiationFlag, [&properties]() {
+          std::shared_ptr<GCSFileSystem> fs;
+          if (properties != nullptr) {
+            fs = std::make_shared<GCSFileSystem>(properties);
+          } else {
+            fs = std::make_shared<GCSFileSystem>(
+                std::make_shared<config::ConfigBase>(
+                    std::unordered_map<std::string, std::string>()));
+          }
+          fs->initializeClient();
+          gcsfs = fs;
+        });
+        return gcsfs;
+      };
   return filesystemGenerator;
 }
 #endif
diff --git a/velox/connectors/hive/storage_adapters/gcs/benchmark/CMakeLists.txt b/velox/connectors/hive/storage_adapters/gcs/benchmark/CMakeLists.txt
deleted file mode 100644
index 0eb61670736cd..0000000000000
--- a/velox/connectors/hive/storage_adapters/gcs/benchmark/CMakeLists.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-add_executable(velox_gcsread_benchmark GCSReadBenchmark.cpp
-               GCSReadBenchmarkMain.cpp)
-
-target_link_libraries(
-  velox_gcsread_benchmark
-  velox_read_benchmark_lib
-  velox_gcs
-  velox_exception
-  velox_exec_test_lib
-  fmt::fmt
-  Folly::folly)
diff --git a/velox/connectors/hive/storage_adapters/gcs/benchmark/GCSReadBenchmark.cpp b/velox/connectors/hive/storage_adapters/gcs/benchmark/GCSReadBenchmark.cpp
deleted file mode 100644
index 5093a04f32ba3..0000000000000
--- a/velox/connectors/hive/storage_adapters/gcs/benchmark/GCSReadBenchmark.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "velox/connectors/hive/storage_adapters/gcs/benchmark/GCSReadBenchmark.h"
-#include "velox/core/Config.h"
-
-#include <fstream>
-
-DEFINE_string(gcs_config, "", "Path of GCS config file");
-
-namespace facebook::velox {
-
-// From presto-cpp
-std::shared_ptr<Config> readConfig(const std::string& filePath) {
-  std::ifstream configFile(filePath);
-  if (!configFile.is_open()) {
-    throw std::runtime_error(
-        fmt::format("Couldn't open config file {} for reading.", filePath));
-  }
-
-  std::unordered_map<std::string, std::string> properties;
-  std::string line;
-  while (getline(configFile, line)) {
-    line.erase(std::remove_if(line.begin(), line.end(), isspace), line.end());
-    if (line[0] == '#' || line.empty()) {
-      continue;
-    }
-    auto delimiterPos = line.find('=');
-    auto name = line.substr(0, delimiterPos);
-    auto value = line.substr(delimiterPos + 1);
-    properties.emplace(name, value);
-  }
-
-  return std::make_shared<core::MemConfig>(properties);
-}
-
-} // namespace facebook::velox
diff --git a/velox/connectors/hive/storage_adapters/gcs/benchmark/GCSReadBenchmark.h b/velox/connectors/hive/storage_adapters/gcs/benchmark/GCSReadBenchmark.h
deleted file mode 100644
index 5de1e852bf273..0000000000000
--- a/velox/connectors/hive/storage_adapters/gcs/benchmark/GCSReadBenchmark.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "velox/common/file/benchmark/ReadBenchmark.h"
-#include "velox/connectors/hive/storage_adapters/gcs/GCSFileSystem.h"
-
-DECLARE_string(gcs_config);
-
-namespace facebook::velox {
-
-std::shared_ptr<Config> readConfig(const std::string& filePath);
-
-class GCSReadBenchmark : public ReadBenchmark {
- public:
-  // Initialize a GCSReadFile instance for the specified 'path'.
-  void initialize() override {
-    executor_ =
-        std::make_unique<folly::IOThreadPoolExecutor>(FLAGS_num_threads);
-
-    std::shared_ptr<Config> config;
-    if (!FLAGS_gcs_config.empty()) {
-      config = readConfig(FLAGS_gcs_config);
-    }
-    auto gcsfs = filesystems::getFileSystem(FLAGS_path, config);
-    readFile_ = gcsfs->openFileForRead(FLAGS_path);
-
-    fileSize_ = readFile_->size();
-    if (FLAGS_file_size_gb) {
-      fileSize_ = std::min(FLAGS_file_size_gb << 30, fileSize_);
-    }
-
-    if (fileSize_ <= FLAGS_measurement_size) {
-      LOG(ERROR) << "File size " << fileSize_
-                 << " is <= then --measurement_size " << FLAGS_measurement_size;
-      exit(1);
-    }
-    if (FLAGS_seed) {
-      rng_.seed(FLAGS_seed);
-    }
-  }
-};
-
-} // namespace facebook::velox
diff --git a/velox/connectors/hive/storage_adapters/gcs/benchmark/GCSReadBenchmarkMain.cpp b/velox/connectors/hive/storage_adapters/gcs/benchmark/GCSReadBenchmarkMain.cpp
deleted file mode 100644
index f18437f9e490b..0000000000000
--- a/velox/connectors/hive/storage_adapters/gcs/benchmark/GCSReadBenchmarkMain.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "velox/connectors/hive/storage_adapters/gcs/benchmark/GCSReadBenchmark.h"
-
-using namespace facebook::velox;
-
-// This benchmark measures the throughput of an GCS compatible FileSystem for
-// various ReadFile APIs. The output helps us understand the maximum possible
-// gains for queries. Example: If a single thread requires reading 1GB of data
-// and the IO throughput is 100 MBps, then it takes 10 seconds to just read the
-// data.
-int main(int argc, char** argv) {
-  folly::init(&argc, &argv, false);
-  GCSReadBenchmark bm;
-  bm.initialize();
-  bm.run();
-}
diff --git a/velox/connectors/hive/storage_adapters/gcs/examples/GCSFileSystemExample.cpp b/velox/connectors/hive/storage_adapters/gcs/examples/GCSFileSystemExample.cpp
index 5495e560bc2ff..ee026a86e0db9 100644
--- a/velox/connectors/hive/storage_adapters/gcs/examples/GCSFileSystemExample.cpp
+++ b/velox/connectors/hive/storage_adapters/gcs/examples/GCSFileSystemExample.cpp
@@ -13,22 +13,29 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include "velox/common/config/Config.h"
 #include "velox/common/file/File.h"
 #include "velox/connectors/hive/storage_adapters/gcs/GCSFileSystem.h"
-#include "velox/core/Config.h"
 
 #include <gflags/gflags.h>
-
 #include <iostream>
-
 #include <memory>
 
 DEFINE_string(gcs_path, "", "Path of GCS bucket");
+DEFINE_string(gcs_max_retry_count, "", "Max retry count");
+DEFINE_string(gcs_max_retry_time, "", "Max retry time");
 
 auto newConfiguration() {
   using namespace facebook::velox;
   std::unordered_map<std::string, std::string> configOverride = {};
-  return std::make_shared<core::MemConfig>(std::move(configOverride));
+  if (!FLAGS_gcs_max_retry_count.empty()) {
+    configOverride.emplace(
+        "hive.gcs.max-retry-count", FLAGS_gcs_max_retry_count);
+  }
+  if (!FLAGS_gcs_max_retry_time.empty()) {
+    configOverride.emplace("hive.gcs.max-retry-time", FLAGS_gcs_max_retry_time);
+  }
+  return std::make_shared<config::ConfigBase>(std::move(configOverride));
 }
 
 int main(int argc, char** argv) {
@@ -40,7 +47,7 @@ int main(int argc, char** argv) {
   }
   filesystems::GCSFileSystem gcfs(newConfiguration());
   gcfs.initializeClient();
-  std::cout << "Opening file " << FLAGS_gcs_path << std::endl;
+  std::cout << "Opening file for read " << FLAGS_gcs_path << std::endl;
   std::unique_ptr<ReadFile> file_read = gcfs.openFileForRead(FLAGS_gcs_path);
   std::size_t file_size = file_read->size();
   std::cout << "File size = " << file_size << std::endl;
diff --git a/velox/connectors/hive/storage_adapters/gcs/tests/CMakeLists.txt b/velox/connectors/hive/storage_adapters/gcs/tests/CMakeLists.txt
index b44aa355c09fc..edf4c69eeffd8 100644
--- a/velox/connectors/hive/storage_adapters/gcs/tests/CMakeLists.txt
+++ b/velox/connectors/hive/storage_adapters/gcs/tests/CMakeLists.txt
@@ -22,6 +22,6 @@ target_link_libraries(
   velox_hive_connector
   velox_dwio_common_exception
   velox_exec
-  gmock
-  gtest
-  gtest_main)
+  GTest::gmock
+  GTest::gtest
+  GTest::gtest_main)
diff --git a/velox/connectors/hive/storage_adapters/gcs/tests/GCSFileSystemTest.cpp b/velox/connectors/hive/storage_adapters/gcs/tests/GCSFileSystemTest.cpp
index a0cb3c7c5222c..5293901f61f92 100644
--- a/velox/connectors/hive/storage_adapters/gcs/tests/GCSFileSystemTest.cpp
+++ b/velox/connectors/hive/storage_adapters/gcs/tests/GCSFileSystemTest.cpp
@@ -16,10 +16,10 @@
 
 #include "velox/connectors/hive/storage_adapters/gcs/GCSFileSystem.h"
 #include "velox/common/base/tests/GTestUtils.h"
+#include "velox/common/config/Config.h"
 #include "velox/common/file/File.h"
 #include "velox/connectors/hive/FileHandle.h"
 #include "velox/connectors/hive/storage_adapters/gcs/GCSUtil.h"
-#include "velox/core/Config.h"
 #include "velox/exec/tests/utils/TempFilePath.h"
 
 #include <gtest/gtest.h>
@@ -139,12 +139,13 @@ class GCSFileSystemTest : public testing::Test {
         << ">, status=" << object.status();
   }
 
-  std::shared_ptr<const Config> testGcsOptions() const {
+  std::shared_ptr<const config::ConfigBase> testGcsOptions() const {
     std::unordered_map<std::string, std::string> configOverride = {};
 
     configOverride["hive.gcs.scheme"] = "http";
     configOverride["hive.gcs.endpoint"] = "localhost:" + testbench_->port();
-    return std::make_shared<const core::MemConfig>(std::move(configOverride));
+    return std::make_shared<const config::ConfigBase>(
+        std::move(configOverride));
   }
 
   std::string preexistingBucketName() {
@@ -285,30 +286,20 @@ TEST_F(GCSFileSystemTest, missingFile) {
   const std::string gcsFile = gcsURI(preexistingBucketName(), file);
   filesystems::GCSFileSystem gcfs(testGcsOptions());
   gcfs.initializeClient();
-  try {
-    gcfs.openFileForRead(gcsFile);
-    FAIL() << "Expected VeloxException";
-  } catch (VeloxException const& err) {
-    EXPECT_THAT(
-        err.message(),
-        ::testing::HasSubstr(
-            "\\\"message\\\": \\\"Live version of object test1-gcs/newTest.txt does not exist.\\\""));
exist.\\\"")); - } + VELOX_ASSERT_RUNTIME_THROW_CODE( + gcfs.openFileForRead(gcsFile), + error_code::kFileNotFound, + "\\\"message\\\": \\\"Live version of object test1-gcs/newTest.txt does not exist.\\\""); } TEST_F(GCSFileSystemTest, missingBucket) { filesystems::GCSFileSystem gcfs(testGcsOptions()); gcfs.initializeClient(); - try { - const char* gcsFile = "gs://dummy/foo.txt"; - gcfs.openFileForRead(gcsFile); - FAIL() << "Expected VeloxException"; - } catch (VeloxException const& err) { - EXPECT_THAT( - err.message(), - ::testing::HasSubstr( - "\\\"message\\\": \\\"Bucket dummy does not exist.\\\"")); - } + const char* gcsFile = "gs://dummy/foo.txt"; + VELOX_ASSERT_RUNTIME_THROW_CODE( + gcfs.openFileForRead(gcsFile), + error_code::kFileNotFound, + "\\\"message\\\": \\\"Bucket dummy does not exist.\\\""); } TEST_F(GCSFileSystemTest, credentialsConfig) { @@ -355,8 +346,8 @@ TEST_F(GCSFileSystemTest, credentialsConfig) { })"""; configOverride["hive.gcs.scheme"] = "http"; configOverride["hive.gcs.endpoint"] = "localhost:" + testbench_->port(); - std::shared_ptr conf = - std::make_shared(std::move(configOverride)); + std::shared_ptr conf = + std::make_shared(std::move(configOverride)); filesystems::GCSFileSystem gcfs(conf); diff --git a/velox/connectors/hive/storage_adapters/hdfs/CMakeLists.txt b/velox/connectors/hive/storage_adapters/hdfs/CMakeLists.txt index d6363d9e71c55..6c1e84aec4040 100644 --- a/velox/connectors/hive/storage_adapters/hdfs/CMakeLists.txt +++ b/velox/connectors/hive/storage_adapters/hdfs/CMakeLists.txt @@ -14,12 +14,16 @@ # for generated headers -add_library(velox_hdfs RegisterHdfsFileSystem.cpp) +velox_add_library(velox_hdfs RegisterHdfsFileSystem.cpp) if(VELOX_ENABLE_HDFS) - target_sources(velox_hdfs PRIVATE HdfsFileSystem.cpp HdfsReadFile.cpp - HdfsWriteFile.cpp) - target_link_libraries(velox_hdfs Folly::folly ${LIBHDFS3} xsimd) + velox_sources( + velox_hdfs + PRIVATE + HdfsFileSystem.cpp + HdfsReadFile.cpp + HdfsWriteFile.cpp) + velox_link_libraries(velox_hdfs Folly::folly ${LIBHDFS3} xsimd) if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) diff --git a/velox/connectors/hive/storage_adapters/hdfs/HdfsFileSystem.cpp b/velox/connectors/hive/storage_adapters/hdfs/HdfsFileSystem.cpp index ff49163bba9f9..4e7b9ddc0ec52 100644 --- a/velox/connectors/hive/storage_adapters/hdfs/HdfsFileSystem.cpp +++ b/velox/connectors/hive/storage_adapters/hdfs/HdfsFileSystem.cpp @@ -16,9 +16,9 @@ #include "velox/connectors/hive/storage_adapters/hdfs/HdfsFileSystem.h" #include #include +#include "velox/common/config/Config.h" #include "velox/connectors/hive/storage_adapters/hdfs/HdfsReadFile.h" #include "velox/connectors/hive/storage_adapters/hdfs/HdfsWriteFile.h" -#include "velox/core/Config.h" namespace facebook::velox::filesystems { std::string_view HdfsFileSystem::kScheme("hdfs://"); @@ -26,16 +26,19 @@ std::string_view HdfsFileSystem::kScheme("hdfs://"); class HdfsFileSystem::Impl { public: // Keep config here for possible use in the future. 
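+ // Note: hdfsBuilderConnect does not release the builder in this code
+ // path, so it is freed explicitly with hdfsFreeBuilder once the
+ // connection handle has been obtained.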
- explicit Impl(const Config* config, const HdfsServiceEndpoint& endpoint) { + explicit Impl( + const config::ConfigBase* config, + const HdfsServiceEndpoint& endpoint) { auto builder = hdfsNewBuilder(); hdfsBuilderSetNameNode(builder, endpoint.host.c_str()); hdfsBuilderSetNameNodePort(builder, atoi(endpoint.port.data())); hdfsClient_ = hdfsBuilderConnect(builder); + hdfsFreeBuilder(builder); VELOX_CHECK_NOT_NULL( hdfsClient_, "Unable to connect to HDFS: {}, got error: {}.", endpoint.identity(), - hdfsGetLastError()) + hdfsGetLastError()); } ~Impl() { @@ -56,7 +59,7 @@ class HdfsFileSystem::Impl { }; HdfsFileSystem::HdfsFileSystem( - const std::shared_ptr& config, + const std::shared_ptr& config, const HdfsServiceEndpoint& endpoint) : FileSystem(config) { impl_ = std::make_shared(config.get(), endpoint); @@ -93,17 +96,17 @@ bool HdfsFileSystem::isHdfsFile(const std::string_view filePath) { /// fixed one from configuration. HdfsServiceEndpoint HdfsFileSystem::getServiceEndpoint( const std::string_view filePath, - const Config* config) { + const config::ConfigBase* config) { auto endOfIdentityInfo = filePath.find('/', kScheme.size()); std::string hdfsIdentity{ filePath.data(), kScheme.size(), endOfIdentityInfo - kScheme.size()}; if (hdfsIdentity.empty()) { // Fall back to get a fixed endpoint from config. - auto hdfsHost = config->get("hive.hdfs.host"); + auto hdfsHost = config->get("hive.hdfs.host"); VELOX_CHECK( hdfsHost.hasValue(), "hdfsHost is empty, configuration missing for hdfs host"); - auto hdfsPort = config->get("hive.hdfs.port"); + auto hdfsPort = config->get("hive.hdfs.port"); VELOX_CHECK( hdfsPort.hasValue(), "hdfsPort is empty, configuration missing for hdfs port"); diff --git a/velox/connectors/hive/storage_adapters/hdfs/HdfsFileSystem.h b/velox/connectors/hive/storage_adapters/hdfs/HdfsFileSystem.h index 295df6f8f0f67..25602a470f169 100644 --- a/velox/connectors/hive/storage_adapters/hdfs/HdfsFileSystem.h +++ b/velox/connectors/hive/storage_adapters/hdfs/HdfsFileSystem.h @@ -43,7 +43,7 @@ struct HdfsServiceEndpoint { class HdfsFileSystem : public FileSystem { public: explicit HdfsFileSystem( - const std::shared_ptr& config, + const std::shared_ptr& config, const HdfsServiceEndpoint& endpoint); std::string name() const override; @@ -61,7 +61,7 @@ class HdfsFileSystem : public FileSystem { virtual void rename( std::string_view path, std::string_view newPath, - bool overWrite = false) { + bool overWrite = false) override { VELOX_UNSUPPORTED("rename for HDFs not implemented"); } @@ -88,7 +88,7 @@ class HdfsFileSystem : public FileSystem { /// will be used. static HdfsServiceEndpoint getServiceEndpoint( const std::string_view filePath, - const Config* config); + const config::ConfigBase* config); static std::string_view kScheme; diff --git a/velox/connectors/hive/storage_adapters/hdfs/HdfsReadFile.cpp b/velox/connectors/hive/storage_adapters/hdfs/HdfsReadFile.cpp index 84bbd217d4741..dedc2bb4a4c9a 100644 --- a/velox/connectors/hive/storage_adapters/hdfs/HdfsReadFile.cpp +++ b/velox/connectors/hive/storage_adapters/hdfs/HdfsReadFile.cpp @@ -23,11 +23,17 @@ namespace facebook::velox { HdfsReadFile::HdfsReadFile(hdfsFS hdfs, const std::string_view path) : hdfsClient_(hdfs), filePath_(path) { fileInfo_ = hdfsGetPathInfo(hdfsClient_, filePath_.data()); - VELOX_CHECK_NOT_NULL( - fileInfo_, - "Unable to get file path info for file: {}. 
got error: {}", - filePath_, - hdfsGetLastError()); + if (fileInfo_ == nullptr) { + auto error = hdfsGetLastError(); + auto errMsg = fmt::format( + "Unable to get file path info for file: {}. got error: {}", + filePath_, + error); + if (std::strstr(error, "FileNotFoundException") != nullptr) { + VELOX_FILE_NOT_FOUND_ERROR(errMsg); + } + VELOX_FAIL(errMsg); + } } HdfsReadFile::~HdfsReadFile() { @@ -85,6 +91,6 @@ void HdfsReadFile::checkFileReadParameters(uint64_t offset, uint64_t length) "Cannot read HDFS file beyond its size: {}, offset: {}, end point: {}", fileSize, offset, - endPoint) + endPoint); } } // namespace facebook::velox diff --git a/velox/connectors/hive/storage_adapters/hdfs/HdfsReadFile.h b/velox/connectors/hive/storage_adapters/hdfs/HdfsReadFile.h index 2bd94bf9c8aa5..1d531956f0eee 100644 --- a/velox/connectors/hive/storage_adapters/hdfs/HdfsReadFile.h +++ b/velox/connectors/hive/storage_adapters/hdfs/HdfsReadFile.h @@ -50,7 +50,7 @@ struct HdfsFile { int32_t read(char* pos, uint64_t length) const { auto bytesRead = hdfsRead(client_, handle_, pos, length); - VELOX_CHECK(bytesRead >= 0, "Read failure in HDFSReadFile::preadInternal.") + VELOX_CHECK(bytesRead >= 0, "Read failure in HDFSReadFile::preadInternal."); return bytesRead; } }; diff --git a/velox/connectors/hive/storage_adapters/hdfs/HdfsWriteFile.cpp b/velox/connectors/hive/storage_adapters/hdfs/HdfsWriteFile.cpp index 883ab1f649042..60f98a88c972b 100644 --- a/velox/connectors/hive/storage_adapters/hdfs/HdfsWriteFile.cpp +++ b/velox/connectors/hive/storage_adapters/hdfs/HdfsWriteFile.cpp @@ -25,6 +25,12 @@ HdfsWriteFile::HdfsWriteFile( short replication, int blockSize) : hdfsClient_(hdfsClient), filePath_(path) { + auto pos = filePath_.rfind("/"); + auto parentDir = filePath_.substr(0, pos + 1); + if (hdfsExists(hdfsClient_, parentDir.c_str()) == -1) { + hdfsCreateDirectory(hdfsClient_, parentDir.c_str()); + } + hdfsFile_ = hdfsOpenFile( hdfsClient_, filePath_.c_str(), diff --git a/velox/connectors/hive/storage_adapters/hdfs/RegisterHdfsFileSystem.cpp b/velox/connectors/hive/storage_adapters/hdfs/RegisterHdfsFileSystem.cpp index 47734838838f8..bdff4a7a4fdc7 100644 --- a/velox/connectors/hive/storage_adapters/hdfs/RegisterHdfsFileSystem.cpp +++ b/velox/connectors/hive/storage_adapters/hdfs/RegisterHdfsFileSystem.cpp @@ -17,9 +17,9 @@ #ifdef VELOX_ENABLE_HDFS3 #include "folly/concurrency/ConcurrentHashMap.h" -#include "velox/connectors/hive/storage_adapters/hdfs/HdfsFileSystem.h" -#include "velox/connectors/hive/storage_adapters/hdfs/HdfsUtil.h" -#include "velox/core/Config.h" +#include "velox/common/config/Config.h" +#include "velox/connectors/hive/storage_adapters/hdfs/HdfsFileSystem.h" // @manual +#include "velox/connectors/hive/storage_adapters/hdfs/HdfsUtil.h" // @manual #include "velox/dwio/common/FileSink.h" #endif @@ -29,9 +29,10 @@ namespace facebook::velox::filesystems { std::mutex mtx; std::function(std::shared_ptr, std::string_view)> + FileSystem>(std::shared_ptr, std::string_view)> hdfsFileSystemGenerator() { - static auto filesystemGenerator = [](std::shared_ptr properties, + static auto filesystemGenerator = [](std::shared_ptr + properties, std::string_view filePath) { static folly::ConcurrentHashMap> filesystems; diff --git a/velox/connectors/hive/storage_adapters/hdfs/tests/CMakeLists.txt b/velox/connectors/hive/storage_adapters/hdfs/tests/CMakeLists.txt index f0bb039ac7137..6c3067e2de481 100644 --- a/velox/connectors/hive/storage_adapters/hdfs/tests/CMakeLists.txt +++ 
b/velox/connectors/hive/storage_adapters/hdfs/tests/CMakeLists.txt @@ -26,6 +26,9 @@ target_link_libraries( velox_hive_connector velox_dwio_common_exception velox_exec - gtest - gtest_main - gmock) + GTest::gtest + GTest::gtest_main + GTest::gmock) + +target_compile_options(velox_hdfs_file_test + PRIVATE -Wno-deprecated-declarations) diff --git a/velox/connectors/hive/storage_adapters/hdfs/tests/HdfsFileSystemTest.cpp b/velox/connectors/hive/storage_adapters/hdfs/tests/HdfsFileSystemTest.cpp index 51d933167333c..da65d8e03478d 100644 --- a/velox/connectors/hive/storage_adapters/hdfs/tests/HdfsFileSystemTest.cpp +++ b/velox/connectors/hive/storage_adapters/hdfs/tests/HdfsFileSystemTest.cpp @@ -46,7 +46,7 @@ class HdfsFileSystemTest : public testing::Test { miniCluster = std::make_shared(); miniCluster->start(); auto tempFile = createFile(); - miniCluster->addFile(tempFile->path, destinationPath); + miniCluster->addFile(tempFile->getPath(), destinationPath); } } @@ -64,7 +64,7 @@ class HdfsFileSystemTest : public testing::Test { private: static std::shared_ptr<::exec::test::TempFilePath> createFile() { - auto tempFile = ::exec::test::TempFilePath::create(); + auto tempFile = exec::test::TempFilePath::create(); tempFile->append("aaaaa"); tempFile->append("bbbbb"); tempFile->append(std::string(kOneMB, 'c')); @@ -100,10 +100,11 @@ void readData(ReadFile* readFile) { } std::unique_ptr openFileForWrite(std::string_view path) { - auto memConfig = std::make_shared(configurationValues); + auto config = std::make_shared( + std::unordered_map(configurationValues)); std::string hdfsFilePath = "hdfs://" + localhost + ":" + hdfsPort + std::string(path); - auto hdfsFileSystem = filesystems::getFileSystem(hdfsFilePath, memConfig); + auto hdfsFileSystem = filesystems::getFileSystem(hdfsFilePath, config); return hdfsFileSystem->openFileForWrite(path); } @@ -156,12 +157,12 @@ void verifyFailures(hdfsFS hdfs) { HdfsFileSystemTest::miniCluster->stop(); checkReadErrorMessages(&readFile2, readFailErrorMessage, 1); try { - auto memConfig = - std::make_shared(configurationValues); + auto config = std::make_shared( + std::unordered_map(configurationValues)); filesystems::HdfsFileSystem hdfsFileSystem( - memConfig, + config, filesystems::HdfsFileSystem::getServiceEndpoint( - simpleDestinationPath, memConfig.get())); + simpleDestinationPath, config.get())); FAIL() << "expected VeloxException"; } catch (VeloxException const& error) { EXPECT_THAT(error.message(), testing::HasSubstr(builderErrorMessage)); @@ -178,18 +179,18 @@ TEST_F(HdfsFileSystemTest, read) { } TEST_F(HdfsFileSystemTest, viaFileSystem) { - auto memConfig = std::make_shared(configurationValues); - auto hdfsFileSystem = - filesystems::getFileSystem(fullDestinationPath, memConfig); + auto config = std::make_shared( + std::unordered_map(configurationValues)); + auto hdfsFileSystem = filesystems::getFileSystem(fullDestinationPath, config); auto readFile = hdfsFileSystem->openFileForRead(fullDestinationPath); readData(readFile.get()); } TEST_F(HdfsFileSystemTest, initializeFsWithEndpointInfoInFilePath) { // Without host/port configured. 
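// The "hdfs://host:port/" authority embedded in the file path supplies the
// endpoint here; hive.hdfs.host and hive.hdfs.port are only consulted as a
// fallback when the path carries no authority (see fallbackToUseConfig).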
- auto memConfig = std::make_shared(); - auto hdfsFileSystem = - filesystems::getFileSystem(fullDestinationPath, memConfig); + auto config = std::make_shared( + std::unordered_map()); + auto hdfsFileSystem = filesystems::getFileSystem(fullDestinationPath, config); auto readFile = hdfsFileSystem->openFileForRead(fullDestinationPath); readData(readFile.get()); @@ -197,14 +198,15 @@ TEST_F(HdfsFileSystemTest, initializeFsWithEndpointInfoInFilePath) { const std::string wrongFullDestinationPath = "hdfs://not_exist_host:" + hdfsPort + destinationPath; VELOX_ASSERT_THROW( - filesystems::getFileSystem(wrongFullDestinationPath, memConfig), + filesystems::getFileSystem(wrongFullDestinationPath, config), "Unable to connect to HDFS"); } TEST_F(HdfsFileSystemTest, fallbackToUseConfig) { - auto memConfig = std::make_shared(configurationValues); + auto config = std::make_shared( + std::unordered_map(configurationValues)); auto hdfsFileSystem = - filesystems::getFileSystem(simpleDestinationPath, memConfig); + filesystems::getFileSystem(simpleDestinationPath, config); auto readFile = hdfsFileSystem->openFileForRead(simpleDestinationPath); readData(readFile.get()); } @@ -218,32 +220,26 @@ TEST_F(HdfsFileSystemTest, oneFsInstanceForOneEndpoint) { } TEST_F(HdfsFileSystemTest, missingFileViaFileSystem) { - try { - auto memConfig = - std::make_shared(configurationValues); - auto hdfsFileSystem = - filesystems::getFileSystem(fullDestinationPath, memConfig); - auto readFile = hdfsFileSystem->openFileForRead( - "hdfs://localhost:7777/path/that/does/not/exist"); - FAIL() << "expected VeloxException"; - } catch (VeloxException const& error) { - EXPECT_THAT( - error.message(), - testing::HasSubstr( - "Unable to get file path info for file: /path/that/does/not/exist. got error: FileNotFoundException: Path /path/that/does/not/exist does not exist.")); - } + auto config = std::make_shared( + std::unordered_map(configurationValues)); + auto hdfsFileSystem = filesystems::getFileSystem(fullDestinationPath, config); + VELOX_ASSERT_RUNTIME_THROW_CODE( + hdfsFileSystem->openFileForRead( + "hdfs://localhost:7777/path/that/does/not/exist"), + error_code::kFileNotFound, + "Unable to get file path info for file: /path/that/does/not/exist. 
got error: FileNotFoundException: Path /path/that/does/not/exist does not exist."); } TEST_F(HdfsFileSystemTest, missingHost) { try { std::unordered_map missingHostConfiguration( {{"hive.hdfs.port", hdfsPort}}); - auto memConfig = - std::make_shared(missingHostConfiguration); + auto config = std::make_shared( + std::move(missingHostConfiguration)); filesystems::HdfsFileSystem hdfsFileSystem( - memConfig, + config, filesystems::HdfsFileSystem::getServiceEndpoint( - simpleDestinationPath, memConfig.get())); + simpleDestinationPath, config.get())); FAIL() << "expected VeloxException"; } catch (VeloxException const& error) { EXPECT_THAT( @@ -257,12 +253,12 @@ TEST_F(HdfsFileSystemTest, missingPort) { try { std::unordered_map missingPortConfiguration( {{"hive.hdfs.host", localhost}}); - auto memConfig = - std::make_shared(missingPortConfiguration); + auto config = std::make_shared( + std::move(missingPortConfiguration)); filesystems::HdfsFileSystem hdfsFileSystem( - memConfig, + config, filesystems::HdfsFileSystem::getServiceEndpoint( - simpleDestinationPath, memConfig.get())); + simpleDestinationPath, config.get())); FAIL() << "expected VeloxException"; } catch (VeloxException const& error) { EXPECT_THAT( @@ -306,10 +302,10 @@ TEST_F(HdfsFileSystemTest, schemeMatching) { TEST_F(HdfsFileSystemTest, writeNotSupported) { try { - auto memConfig = - std::make_shared(configurationValues); + auto config = std::make_shared( + std::unordered_map(configurationValues)); auto hdfsFileSystem = - filesystems::getFileSystem(fullDestinationPath, memConfig); + filesystems::getFileSystem(fullDestinationPath, config); hdfsFileSystem->openFileForWrite("/path"); } catch (VeloxException const& error) { EXPECT_EQ(error.message(), "Write to HDFS is unsupported"); @@ -318,10 +314,10 @@ TEST_F(HdfsFileSystemTest, writeNotSupported) { TEST_F(HdfsFileSystemTest, removeNotSupported) { try { - auto memConfig = - std::make_shared(configurationValues); + auto config = std::make_shared( + std::unordered_map(configurationValues)); auto hdfsFileSystem = - filesystems::getFileSystem(fullDestinationPath, memConfig); + filesystems::getFileSystem(fullDestinationPath, config); hdfsFileSystem->remove("/path"); } catch (VeloxException const& error) { EXPECT_EQ(error.message(), "Does not support removing files from hdfs"); @@ -361,9 +357,9 @@ TEST_F(HdfsFileSystemTest, multipleThreadsWithReadFile) { TEST_F(HdfsFileSystemTest, multipleThreadsWithFileSystem) { startThreads = false; - auto memConfig = std::make_shared(configurationValues); - auto hdfsFileSystem = - filesystems::getFileSystem(fullDestinationPath, memConfig); + auto config = std::make_shared( + std::unordered_map(configurationValues)); + auto hdfsFileSystem = filesystems::getFileSystem(fullDestinationPath, config); std::vector threads; std::mt19937 generator(std::random_device{}()); @@ -429,6 +425,20 @@ TEST_F(HdfsFileSystemTest, writeFlushFailures) { "Cannot flush HDFS file because file handle is null, file path: /a.txt"); } +TEST_F(HdfsFileSystemTest, writeWithParentDirNotExist) { + std::string path = "/parent/directory/that/does/not/exist/a.txt"; + auto writeFile = openFileForWrite(path); + std::string data = "abcdefghijk"; + writeFile->append(data); + writeFile->flush(); + ASSERT_EQ(writeFile->size(), 0); + writeFile->append(data); + writeFile->append(data); + writeFile->flush(); + writeFile->close(); + ASSERT_EQ(writeFile->size(), data.size() * 3); +} + TEST_F(HdfsFileSystemTest, readFailures) { struct hdfsBuilder* builder = hdfsNewBuilder(); 
hdfsBuilderSetNameNode(builder, localhost.c_str()); diff --git a/velox/connectors/hive/storage_adapters/hdfs/tests/HdfsMiniCluster.cpp b/velox/connectors/hive/storage_adapters/hdfs/tests/HdfsMiniCluster.cpp index 10ee508ba638c..c262bad3d1d1e 100644 --- a/velox/connectors/hive/storage_adapters/hdfs/tests/HdfsMiniCluster.cpp +++ b/velox/connectors/hive/storage_adapters/hdfs/tests/HdfsMiniCluster.cpp @@ -38,7 +38,7 @@ void HdfsMiniCluster::start() { serverProcess_->exit_code(), 383, "Minicluster process exited, code: ", - serverProcess_->exit_code()) + serverProcess_->exit_code()); } catch (const std::exception& e) { VELOX_FAIL("Failed to launch Minicluster server: {}", e.what()); } @@ -91,7 +91,7 @@ void HdfsMiniCluster::addFile(std::string source, std::string destination) { if (!isExited) { VELOX_FAIL( "Failed to add file to hdfs, exit code: {}", - filePutProcess->exit_code()) + filePutProcess->exit_code()); } } diff --git a/velox/connectors/hive/storage_adapters/hdfs/tests/HdfsMiniCluster.h b/velox/connectors/hive/storage_adapters/hdfs/tests/HdfsMiniCluster.h index 55c0e3d8ba2e6..9571bb3c63790 100644 --- a/velox/connectors/hive/storage_adapters/hdfs/tests/HdfsMiniCluster.h +++ b/velox/connectors/hive/storage_adapters/hdfs/tests/HdfsMiniCluster.h @@ -23,10 +23,10 @@ namespace facebook::velox::filesystems::test { static const std::string miniClusterExecutableName{"hadoop"}; -static const std::string hadoopSearchPath{":/usr/local/hadoop-2.10.1/bin"}; +static const std::string hadoopSearchPath{":/usr/local/hadoop/bin"}; static const std::string jarCommand{"jar"}; static const std::string miniclusterJar{ - "/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-2.10.1-tests.jar"}; + "/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-3.3.0-tests.jar"}; static const std::string miniclusterCommand{"minicluster"}; static const std::string noMapReduceOption{"-nomr"}; static const std::string formatNameNodeOption{"-format"}; diff --git a/velox/connectors/hive/storage_adapters/s3fs/CMakeLists.txt b/velox/connectors/hive/storage_adapters/s3fs/CMakeLists.txt index ac84135d2b039..96d6031c1728d 100644 --- a/velox/connectors/hive/storage_adapters/s3fs/CMakeLists.txt +++ b/velox/connectors/hive/storage_adapters/s3fs/CMakeLists.txt @@ -14,18 +14,15 @@ # for generated headers -add_library(velox_s3fs RegisterS3FileSystem.cpp) +velox_add_library(velox_s3fs RegisterS3FileSystem.cpp) if(VELOX_ENABLE_S3) - target_sources(velox_s3fs PRIVATE S3FileSystem.cpp S3Util.cpp) + velox_sources(velox_s3fs PRIVATE S3FileSystem.cpp S3Util.cpp) - target_include_directories(velox_s3fs PUBLIC ${AWSSDK_INCLUDE_DIRS}) - target_link_libraries(velox_s3fs velox_dwio_common_exception Folly::folly - ${AWSSDK_LIBRARIES} xsimd) + velox_include_directories(velox_s3fs PUBLIC ${AWSSDK_INCLUDE_DIRS}) + velox_link_libraries(velox_s3fs velox_dwio_common Folly::folly + ${AWSSDK_LIBRARIES}) if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) endif() - if(${VELOX_ENABLE_BENCHMARKS}) - add_subdirectory(benchmark) - endif() endif() diff --git a/velox/connectors/hive/storage_adapters/s3fs/RegisterS3FileSystem.cpp b/velox/connectors/hive/storage_adapters/s3fs/RegisterS3FileSystem.cpp index 5e8a9fd650c2d..6f746b1c1bc4d 100644 --- a/velox/connectors/hive/storage_adapters/s3fs/RegisterS3FileSystem.cpp +++ b/velox/connectors/hive/storage_adapters/s3fs/RegisterS3FileSystem.cpp @@ -15,84 +15,102 @@ */ #ifdef VELOX_ENABLE_S3 -#include "velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.h" -#include 
"velox/connectors/hive/storage_adapters/s3fs/S3Util.h" -#include "velox/core/Config.h" +#include "velox/connectors/hive/HiveConfig.h" // @manual +#include "velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.h" // @manual +#include "velox/connectors/hive/storage_adapters/s3fs/S3Util.h" // @manual #include "velox/dwio/common/FileSink.h" #endif -#include "velox/connectors/hive/storage_adapters/s3fs/RegisterS3FileSystem.h" +#include "velox/connectors/hive/storage_adapters/s3fs/RegisterS3FileSystem.h" // @manual namespace facebook::velox::filesystems { #ifdef VELOX_ENABLE_S3 -folly::once_flag S3FSInstantiationFlag; +using FileSystemMap = folly::Synchronized< + std::unordered_map>>; -// Only one instance of S3FileSystem is supported for now. -// TODO: Support multiple S3FileSystem instances using a cache -static std::shared_ptr s3fs = nullptr; +/// Multiple S3 filesystems are supported. +/// Key is the endpoint value specified in the config using hive.s3.endpoint. +/// If the endpoint is empty, it will default to AWS S3. +FileSystemMap& fileSystems() { + static FileSystemMap instances; + return instances; +} -std::function(std::shared_ptr, std::string_view)> -fileSystemGenerator() { - static auto filesystemGenerator = [](std::shared_ptr properties, - std::string_view filePath) { - folly::call_once(S3FSInstantiationFlag, [&properties]() { - std::shared_ptr fs; - if (properties != nullptr) { - initializeS3(properties.get()); - fs = std::make_shared(properties); - } else { - auto config = std::make_shared(); - initializeS3(config.get()); - fs = std::make_shared(config); - } - s3fs = fs; - }); - return s3fs; - }; - return filesystemGenerator; +std::string getS3Identity(const std::shared_ptr& config) { + HiveConfig hiveConfig = HiveConfig(config); + auto endpoint = hiveConfig.s3Endpoint(); + if (!endpoint.empty()) { + // The identity is the endpoint. + return endpoint; + } + // Default key value. 
+ return "aws-s3-key"; } -std::function( - const std::string&, - const velox::dwio::common::FileSink::Options& options)> -s3WriteFileSinkGenerator() { - static auto s3WriteFileSink = - [](const std::string& fileURI, - const velox::dwio::common::FileSink::Options& options) - -> std::unique_ptr { - if (isS3File(fileURI)) { - auto fileSystem = - filesystems::getFileSystem(fileURI, options.connectorProperties); - return std::make_unique( - fileSystem->openFileForWrite(fileURI, {{}, options.pool}), - fileURI, - options.metricLogger, - options.stats); - } - return nullptr; - }; +std::shared_ptr fileSystemGenerator( + std::shared_ptr properties, + std::string_view /*filePath*/) { + std::shared_ptr config = + std::make_shared( + std::unordered_map()); + if (properties) { + config = std::make_shared(properties->rawConfigsCopy()); + } + const auto s3Identity = getS3Identity(config); - return s3WriteFileSink; + return fileSystems().withWLock( + [&](auto& instanceMap) -> std::shared_ptr { + initializeS3(config.get()); + auto iterator = instanceMap.find(s3Identity); + if (iterator == instanceMap.end()) { + auto fs = std::make_shared(properties); + instanceMap.insert({s3Identity, fs}); + return fs; + } + return iterator->second; + }); +} + +std::unique_ptr s3WriteFileSinkGenerator( + const std::string& fileURI, + const velox::dwio::common::FileSink::Options& options) { + if (isS3File(fileURI)) { + auto fileSystem = + filesystems::getFileSystem(fileURI, options.connectorProperties); + return std::make_unique( + fileSystem->openFileForWrite(fileURI, {{}, options.pool, std::nullopt}), + fileURI, + options.metricLogger, + options.stats); + } + return nullptr; } #endif void registerS3FileSystem() { #ifdef VELOX_ENABLE_S3 - if (!s3fs) { - registerFileSystem(isS3File, fileSystemGenerator()); - dwio::common::FileSink::registerFactory(s3WriteFileSinkGenerator()); - } + fileSystems().withWLock([&](auto& instanceMap) { + if (instanceMap.empty()) { + registerFileSystem(isS3File, std::function(fileSystemGenerator)); + dwio::common::FileSink::registerFactory( + std::function(s3WriteFileSinkGenerator)); + } + }); #endif } void finalizeS3FileSystem() { #ifdef VELOX_ENABLE_S3 - VELOX_CHECK( - !s3fs || (s3fs && s3fs.use_count() == 1), - "Cannot finalize S3FileSystem while in use"); - s3fs.reset(); + bool singleUseCount = true; + fileSystems().withWLock([&](auto& instanceMap) { + for (const auto& [id, fs] : instanceMap) { + singleUseCount &= (fs.use_count() == 1); + } + VELOX_CHECK(singleUseCount, "Cannot finalize S3FileSystem while in use"); + instanceMap.clear(); + }); + finalizeS3(); #endif } diff --git a/velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.cpp b/velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.cpp index 13f170da92923..7126774cea1fc 100644 --- a/velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.cpp +++ b/velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.cpp @@ -15,11 +15,12 @@ */ #include "velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.h" +#include "velox/common/config/Config.h" #include "velox/common/file/File.h" #include "velox/connectors/hive/HiveConfig.h" #include "velox/connectors/hive/storage_adapters/s3fs/S3Util.h" #include "velox/connectors/hive/storage_adapters/s3fs/S3WriteFile.h" -#include "velox/core/Config.h" +#include "velox/core/QueryConfig.h" #include "velox/dwio/common/DataBuffer.h" #include @@ -29,6 +30,8 @@ #include #include +#include +#include #include #include #include @@ -69,7 +72,6 @@ Aws::IOStreamFactory AwsWriteableStreamFactory(void* data, 
int64_t nbytes) { return [=]() { return Aws::New("", data, nbytes); }; } -// TODO: Implement retry on failure. class S3ReadFile final : public ReadFile { public: S3ReadFile(const std::string& path, Aws::S3::S3Client* client) @@ -79,7 +81,13 @@ class S3ReadFile final : public ReadFile { // Gets the length of the file. // Checks if there are any issues reading the file. - void initialize() { + void initialize(const filesystems::FileOptions& options) { + if (options.fileSize.has_value()) { + VELOX_CHECK_GE( + options.fileSize.value(), 0, "File size must be non-negative"); + length_ = options.fileSize.value(); + } + // Make it a no-op if invoked twice. if (length_ != -1) { return; @@ -255,6 +263,10 @@ class S3WriteFile::Impl { /// (https://github.com/apache/arrow/issues/11934). So we instead default /// to application/octet-stream which is less misleading. request.SetContentType(kApplicationOctetStream); + // The default algorithm used is MD5. However, MD5 is not supported with + // fips and can cause a SIGSEGV. Set CRC32 instead which is a standard for + // checksum computation and is not restricted by fips. + request.SetChecksumAlgorithm(Aws::S3::Model::ChecksumAlgorithm::CRC32); auto outcome = client_->CreateMultipartUpload(request); VELOX_CHECK_AWS_OUTCOME( @@ -369,6 +381,10 @@ class S3WriteFile::Impl { request.SetContentLength(part.size()); request.SetBody( std::make_shared(part.data(), part.size())); + // The default algorithm used is MD5. However, MD5 is not supported with + // fips and can cause a SIGSEGV. Set CRC32 instead which is a standard for + // checksum computation and is not restricted by fips. + request.SetChecksumAlgorithm(Aws::S3::Model::ChecksumAlgorithm::CRC32); auto outcome = client_->UploadPart(request); VELOX_CHECK_AWS_OUTCOME(outcome, "Failed to upload", bucket_, key_); // Append ETag and part number for this uploaded part. @@ -378,6 +394,11 @@ class S3WriteFile::Impl { part.SetPartNumber(uploadState_.partNumber); part.SetETag(result.GetETag()); + // Don't add the checksum to the part if the checksum is empty. + // Some filesystems such as IBM COS require this to be not set. + if (!result.GetChecksumCRC32().empty()) { + part.SetChecksumCRC32(result.GetChecksumCRC32()); + } uploadState_.completedParts.push_back(std::move(part)); } } @@ -430,7 +451,7 @@ struct AwsInstance { } // Returns true iff the instance was newly initialized with config. - bool initialize(const Config* config) { + bool initialize(const config::ConfigBase* config) { if (isFinalized_.load()) { VELOX_FAIL("Attempt to initialize S3 after it has been finalized."); } @@ -468,9 +489,11 @@ struct AwsInstance { } private: - void doInitialize(const Config* config) { + void doInitialize(const config::ConfigBase* config) { + std::shared_ptr hiveConfig = std::make_shared( + std::make_shared(config->rawConfigsCopy())); awsOptions_.loggingOptions.logLevel = - inferS3LogLevel(HiveConfig::s3GetLogLevel(config)); + inferS3LogLevel(hiveConfig->s3GetLogLevel()); // In some situations, curl triggers a SIGPIPE signal causing the entire // process to be terminated without any notification. // This behavior is seen via Prestissimo on AmazonLinux2 on AWS EC2. 
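The `options.fileSize` fast path in `S3ReadFile::initialize()` above is worth a sketch: when the caller already knows the object's length (say, from a table's file listing), the HeadObject metadata round trip is skipped entirely. A minimal sketch of that usage, assuming an already-initialized S3 filesystem; `openWithKnownSize` and `sizeFromListing` are hypothetical names:

```
#include "velox/common/file/FileSystems.h"

using namespace facebook::velox;

// 'sizeFromListing' is hypothetical: any out-of-band source of the object
// length (metastore, manifest, directory listing) works.
std::unique_ptr<ReadFile> openWithKnownSize(
    filesystems::FileSystem& fs,
    const std::string& path,
    int64_t sizeFromListing) {
  filesystems::FileOptions options;
  // Seeds S3ReadFile::initialize() with the length, so no HeadObject
  // request is issued before the first read.
  options.fileSize = sizeFromListing;
  return fs.openFileForRead(path, options);
}
```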
@@ -493,7 +516,7 @@ AwsInstance* getAwsInstance() { return instance.get(); } -bool initializeS3(const Config* config) { +bool initializeS3(const config::ConfigBase* config) { return getAwsInstance()->initialize(config); } @@ -506,24 +529,65 @@ void finalizeS3() { class S3FileSystem::Impl { public: - Impl(const Config* config) : config_(config) { + Impl(const config::ConfigBase* config) { + hiveConfig_ = std::make_shared( + std::make_shared(config->rawConfigsCopy())); VELOX_CHECK(getAwsInstance()->isInitialized(), "S3 is not initialized"); Aws::Client::ClientConfiguration clientConfig; - clientConfig.endpointOverride = HiveConfig::s3Endpoint(config_); + clientConfig.endpointOverride = hiveConfig_->s3Endpoint(); + + if (hiveConfig_->s3UseProxyFromEnv()) { + auto proxyConfig = S3ProxyConfigurationBuilder(hiveConfig_->s3Endpoint()) + .useSsl(hiveConfig_->s3UseSSL()) + .build(); + if (proxyConfig.has_value()) { + clientConfig.proxyScheme = Aws::Http::SchemeMapper::FromString( + proxyConfig.value().scheme().c_str()); + clientConfig.proxyHost = awsString(proxyConfig.value().host()); + clientConfig.proxyPort = proxyConfig.value().port(); + clientConfig.proxyUserName = awsString(proxyConfig.value().username()); + clientConfig.proxyPassword = awsString(proxyConfig.value().password()); + } + } - if (HiveConfig::s3UseSSL(config_)) { + if (hiveConfig_->s3UseSSL()) { clientConfig.scheme = Aws::Http::Scheme::HTTPS; } else { clientConfig.scheme = Aws::Http::Scheme::HTTP; } + if (hiveConfig_->s3ConnectTimeout().has_value()) { + clientConfig.connectTimeoutMs = + std::chrono::duration_cast( + facebook::velox::config::toDuration( + hiveConfig_->s3ConnectTimeout().value())) + .count(); + } + + if (hiveConfig_->s3SocketTimeout().has_value()) { + clientConfig.requestTimeoutMs = + std::chrono::duration_cast( + facebook::velox::config::toDuration( + hiveConfig_->s3SocketTimeout().value())) + .count(); + } + + if (hiveConfig_->s3MaxConnections().has_value()) { + clientConfig.maxConnections = hiveConfig_->s3MaxConnections().value(); + } + + auto retryStrategy = getRetryStrategy(); + if (retryStrategy.has_value()) { + clientConfig.retryStrategy = retryStrategy.value(); + } + auto credentialsProvider = getCredentialsProvider(); client_ = std::make_shared( credentialsProvider, clientConfig, Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, - HiveConfig::s3UseVirtualAddressing(config_)); + hiveConfig_->s3UseVirtualAddressing()); ++fileSystemCount; } @@ -560,9 +624,9 @@ class S3FileSystem::Impl { // Return an AWSCredentialsProvider based on the config. 
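// At most one credential source may be configured: explicit access/secret
// keys, instance credentials, or an IAM role. When none is set, the AWS
// default credentials provider chain (environment, profile, instance
// metadata) is used.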
std::shared_ptr getCredentialsProvider() const { - auto accessKey = HiveConfig::s3AccessKey(config_); - auto secretKey = HiveConfig::s3SecretKey(config_); - const auto iamRole = HiveConfig::s3IAMRole(config_); + auto accessKey = hiveConfig_->s3AccessKey(); + auto secretKey = hiveConfig_->s3SecretKey(); + const auto iamRole = hiveConfig_->s3IAMRole(); int keyCount = accessKey.has_value() + secretKey.has_value(); // keyCount=0 means both are not specified @@ -573,7 +637,7 @@ class S3FileSystem::Impl { "Invalid configuration: both access key and secret key must be specified"); int configCount = (accessKey.has_value() && secretKey.has_value()) + - iamRole.has_value() + HiveConfig::s3UseInstanceCredentials(config_); + iamRole.has_value() + hiveConfig_->s3UseInstanceCredentials(); VELOX_USER_CHECK( (configCount <= 1), "Invalid configuration: specify only one among 'access/secret keys', 'use instance credentials', 'IAM role'"); @@ -583,18 +647,70 @@ class S3FileSystem::Impl { accessKey.value(), secretKey.value()); } - if (HiveConfig::s3UseInstanceCredentials(config_)) { + if (hiveConfig_->s3UseInstanceCredentials()) { return getDefaultCredentialsProvider(); } if (iamRole.has_value()) { return getIAMRoleCredentialsProvider( - iamRole.value(), HiveConfig::s3IAMRoleSessionName(config_)); + iamRole.value(), hiveConfig_->s3IAMRoleSessionName()); } return getDefaultCredentialsProvider(); } + // Return a client RetryStrategy based on the config. + std::optional> getRetryStrategy() + const { + auto retryMode = hiveConfig_->s3RetryMode(); + auto maxAttempts = hiveConfig_->s3MaxAttempts(); + if (retryMode.has_value()) { + if (retryMode.value() == "standard") { + if (maxAttempts.has_value()) { + VELOX_USER_CHECK_GE( + maxAttempts.value(), + 0, + "Invalid configuration: specified 'hive.s3.max-attempts' value {} is < 0.", + maxAttempts.value()); + return std::make_shared( + maxAttempts.value()); + } else { + // Otherwise, use default value 3. + return std::make_shared(); + } + } else if (retryMode.value() == "adaptive") { + if (maxAttempts.has_value()) { + VELOX_USER_CHECK_GE( + maxAttempts.value(), + 0, + "Invalid configuration: specified 'hive.s3.max-attempts' value {} is < 0.", + maxAttempts.value()); + return std::make_shared( + maxAttempts.value()); + } else { + // Otherwise, use default value 3. + return std::make_shared(); + } + } else if (retryMode.value() == "legacy") { + if (maxAttempts.has_value()) { + VELOX_USER_CHECK_GE( + maxAttempts.value(), + 0, + "Invalid configuration: specified 'hive.s3.max-attempts' value {} is < 0.", + maxAttempts.value()); + return std::make_shared( + maxAttempts.value()); + } else { + // Otherwise, use default value maxRetries = 10, scaleFactor = 25 + return std::make_shared(); + } + } else { + VELOX_USER_FAIL("Invalid retry mode for S3: {}", retryMode.value()); + } + } + return std::nullopt; + } + // Make it clear that the S3FileSystem instance owns the S3Client. // Once the S3FileSystem is destroyed, the S3Client fails to work // due to the Aws::ShutdownAPI invocation in the destructor. 
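Since every branch of `getRetryStrategy` is driven purely by config strings, retry behavior becomes tunable without code changes. A minimal sketch of wiring it up, assuming the `hive.s3.retry-mode` key name behind `HiveConfig::s3RetryMode()` (the `hive.s3.max-attempts` key is confirmed by the error text above) and an illustrative local endpoint:

```
#include "velox/common/config/Config.h"
#include "velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.h"

using namespace facebook::velox;

std::shared_ptr<filesystems::S3FileSystem> makeRetryingS3Fs() {
  std::unordered_map<std::string, std::string> values = {
      // "standard" and "adaptive" map to the matching AWS SDK retry
      // strategies; "legacy" falls back to DefaultRetryStrategy.
      {"hive.s3.retry-mode", "standard"},
      {"hive.s3.max-attempts", "5"},
      {"hive.s3.endpoint", "127.0.0.1:9000"}, // illustrative endpoint
  };
  auto config = std::make_shared<config::ConfigBase>(std::move(values));
  filesystems::initializeS3(config.get());
  return std::make_shared<filesystems::S3FileSystem>(config);
}
```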
@@ -607,11 +723,11 @@ class S3FileSystem::Impl { } private: - const Config* config_; + std::shared_ptr hiveConfig_; std::shared_ptr client_; }; -S3FileSystem::S3FileSystem(std::shared_ptr config) +S3FileSystem::S3FileSystem(std::shared_ptr config) : FileSystem(config) { impl_ = std::make_shared(config.get()); } @@ -622,10 +738,10 @@ std::string S3FileSystem::getLogLevelName() const { std::unique_ptr S3FileSystem::openFileForRead( std::string_view path, - const FileOptions& /*unused*/) { + const FileOptions& options) { const auto file = s3Path(path); auto s3file = std::make_unique(file, impl_->s3Client()); - s3file->initialize(); + s3file->initialize(options); return s3file; } diff --git a/velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.h b/velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.h index 4240451ea2caa..088575760f991 100644 --- a/velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.h +++ b/velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.h @@ -17,10 +17,12 @@ #pragma once #include "velox/common/file/FileSystems.h" +#include "velox/connectors/hive/HiveConfig.h" namespace facebook::velox::filesystems { +using namespace facebook::velox::connector::hive; -bool initializeS3(const Config* config); +bool initializeS3(const config::ConfigBase* config); void finalizeS3(); @@ -29,7 +31,7 @@ void finalizeS3(); /// type of file can be constructed based on a filename. class S3FileSystem : public FileSystem { public: - explicit S3FileSystem(std::shared_ptr config); + explicit S3FileSystem(std::shared_ptr config); std::string name() const override; diff --git a/velox/connectors/hive/storage_adapters/s3fs/S3Util.cpp b/velox/connectors/hive/storage_adapters/s3fs/S3Util.cpp index 2df80d356cc4a..e312ca30ca51d 100644 --- a/velox/connectors/hive/storage_adapters/s3fs/S3Util.cpp +++ b/velox/connectors/hive/storage_adapters/s3fs/S3Util.cpp @@ -19,6 +19,8 @@ // type of file can be constructed based on a filename. See the // (register|generate)ReadFile and (register|generate)WriteFile functions. +#include "folly/IPAddress.h" + #include "velox/connectors/hive/storage_adapters/s3fs/S3Util.h" namespace facebook::velox { @@ -43,4 +45,107 @@ std::string getErrorStringFromS3Error( } } +/// The noProxyList is a comma separated list of subdomains, domains or IP +/// ranges (using CIDR). For a given hostname check if it has a matching +/// subdomain, domain or IP range in the noProxyList. +bool isHostExcludedFromProxy( + const std::string& hostname, + const std::string& noProxyList) { + std::vector noProxyListElements{}; + + if (noProxyList.empty()) { + return false; + } + + auto hostAsIp = folly::IPAddress::tryFromString(hostname); + folly::split(',', noProxyList, noProxyListElements); + for (auto elem : noProxyListElements) { + // Elem contains "/" which separates IP and subnet mask e.g. 192.168.1.0/24. + if (elem.find("/") != std::string::npos && hostAsIp.hasValue()) { + return hostAsIp.value().inSubnet(elem); + } + // Match subdomain, domain names and IP address strings. + else if ( + elem.length() < hostname.length() && elem[0] == '.' && + !hostname.compare( + hostname.length() - elem.length(), elem.length(), elem)) { + return true; + } else if ( + elem.length() < hostname.length() && elem[0] == '*' && elem[1] == '.' 
&& + !hostname.compare( + hostname.length() - elem.length() + 1, + elem.length() - 1, + elem.substr(1))) { + return true; + } else if (elem.length() == hostname.length() && !hostname.compare(elem)) { + return true; + } + } + return false; +} + +/// Reading the various proxy related environment variables. +/// There is no standard. The environment variables can be +/// defined lower case or upper case. The lower case values are checked +/// first and, if set, returned, therefore taking precedence. +/// Note, the envVar input is expected to be lower case. +namespace { +std::string readProxyEnvVar(std::string envVar) { + auto httpProxy = getenv(envVar.c_str()); + if (httpProxy) { + return std::string(httpProxy); + } + + std::transform(envVar.begin(), envVar.end(), envVar.begin(), ::toupper); + httpProxy = getenv(envVar.c_str()); + if (httpProxy) { + return std::string(httpProxy); + } + return ""; +}; +} // namespace + +std::string getHttpProxyEnvVar() { + return readProxyEnvVar("http_proxy"); +} + +std::string getHttpsProxyEnvVar() { + return readProxyEnvVar("https_proxy"); +}; + +std::string getNoProxyEnvVar() { + return readProxyEnvVar("no_proxy"); +}; + +std::optional S3ProxyConfigurationBuilder::build() { + std::string proxyUrl; + if (useSsl_) { + proxyUrl = getHttpsProxyEnvVar(); + } else { + proxyUrl = getHttpProxyEnvVar(); + } + + if (proxyUrl.empty()) { + return std::nullopt; + } + folly::Uri proxyUri(proxyUrl); + + /// The endpoint is usually a domain with port or an + /// IP address with port. It is assumed that there are + /// 2 parts separated by a colon. + std::vector endpointElements{}; + folly::split(':', s3Endpoint_, endpointElements); + if (FOLLY_UNLIKELY(endpointElements.size() > 2)) { + LOG(ERROR) << fmt::format( + "Too many parts in S3 endpoint URI {} ", s3Endpoint_); + return std::nullopt; + } + + auto noProxy = getNoProxyEnvVar(); + if (isHostExcludedFromProxy(endpointElements[0], noProxy)) { + return std::nullopt; + } + return proxyUri; +} + } // namespace facebook::velox diff --git a/velox/connectors/hive/storage_adapters/s3fs/S3Util.h b/velox/connectors/hive/storage_adapters/s3fs/S3Util.h index c5a51d28c7afd..399b5c2740e69 100644 --- a/velox/connectors/hive/storage_adapters/s3fs/S3Util.h +++ b/velox/connectors/hive/storage_adapters/s3fs/S3Util.h @@ -23,9 +23,12 @@ #include #include +#include #include "velox/common/base/Exceptions.h" +#include + namespace facebook::velox { namespace { @@ -154,21 +157,63 @@ inline std::string getRequestID( } // namespace /// Only Amazon (amz) and Alibaba (oss) request IDs are supported. -#define VELOX_CHECK_AWS_OUTCOME(outcome, errorMsgPrefix, bucket, key) \ - { \ - if (!outcome.IsSuccess()) { \ - auto error = outcome.GetError(); \ - VELOX_FAIL( \ - "{} due to: '{}'. Path:'{}', SDK Error Type:{}, HTTP Status Code:{}, S3 Service:'{}', Message:'{}', RequestID:'{}'", \ - errorMsgPrefix, \ - getErrorStringFromS3Error(error), \ - s3URI(bucket, key), \ - error.GetErrorType(), \ - error.GetResponseCode(), \ - getS3BackendService(error.GetResponseHeaders()), \ - error.GetMessage(), \ - getRequestID(error.GetResponseHeaders())) \ - } \ +#define VELOX_CHECK_AWS_OUTCOME(outcome, errorMsgPrefix, bucket, key) \ + { \ + if (!outcome.IsSuccess()) { \ + auto error = outcome.GetError(); \ + auto errMsg = fmt::format( \ + "{} due to: '{}'. 
Path:'{}', SDK Error Type:{}, HTTP Status Code:{}, S3 Service:'{}', Message:'{}', RequestID:'{}'.", \ + errorMsgPrefix, \ + getErrorStringFromS3Error(error), \ + s3URI(bucket, key), \ + static_cast(error.GetErrorType()), \ + error.GetResponseCode(), \ + getS3BackendService(error.GetResponseHeaders()), \ + error.GetMessage(), \ + getRequestID(error.GetResponseHeaders())); \ + if (IsRetryableHttpResponseCode(error.GetResponseCode())) { \ + auto retryHint = fmt::format( \ + " Request failed after retrying {} times. Try increasing the value of 'hive.s3.max-attempts'.", \ + outcome.GetRetryCount()); \ + errMsg.append(retryHint); \ + } \ + if (error.GetResponseCode() == Aws::Http::HttpResponseCode::NOT_FOUND) { \ + VELOX_FILE_NOT_FOUND_ERROR(errMsg); \ + } \ + VELOX_FAIL(errMsg); \ + } \ + } + +bool isHostExcludedFromProxy( + const std::string& hostname, + const std::string& noProxyList); + +std::string getHttpProxyEnvVar(); +std::string getHttpsProxyEnvVar(); +std::string getNoProxyEnvVar(); + +class S3ProxyConfigurationBuilder { + public: + S3ProxyConfigurationBuilder(const std::string& s3Endpoint) + : s3Endpoint_(s3Endpoint){}; + + S3ProxyConfigurationBuilder& useSsl(const bool& useSsl) { + useSsl_ = useSsl; + return *this; } + std::optional build(); + + private: + const std::string s3Endpoint_; + bool useSsl_; +}; + } // namespace facebook::velox + +template <> +struct fmt::formatter : formatter { + auto format(Aws::Http::HttpResponseCode s, format_context& ctx) { + return formatter::format(static_cast(s), ctx); + } +}; diff --git a/velox/connectors/hive/storage_adapters/s3fs/benchmark/CMakeLists.txt b/velox/connectors/hive/storage_adapters/s3fs/benchmark/CMakeLists.txt deleted file mode 100644 index b3614d0f3d113..0000000000000 --- a/velox/connectors/hive/storage_adapters/s3fs/benchmark/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -add_executable(velox_s3read_benchmark S3ReadBenchmark.cpp - S3ReadBenchmarkMain.cpp) - -target_link_libraries( - velox_s3read_benchmark - velox_read_benchmark_lib - velox_s3fs - velox_exception - velox_exec_test_lib - fmt::fmt - Folly::folly) diff --git a/velox/connectors/hive/storage_adapters/s3fs/benchmark/S3ReadBenchmark.cpp b/velox/connectors/hive/storage_adapters/s3fs/benchmark/S3ReadBenchmark.cpp deleted file mode 100644 index 8720f13c0b519..0000000000000 --- a/velox/connectors/hive/storage_adapters/s3fs/benchmark/S3ReadBenchmark.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "velox/connectors/hive/storage_adapters/s3fs/benchmark/S3ReadBenchmark.h" -#include "velox/core/Config.h" - -#include - -DEFINE_string(s3_config, "", "Path of S3 config file"); - -namespace facebook::velox { - -// From presto-cpp -std::shared_ptr readConfig(const std::string& filePath) { - std::ifstream configFile(filePath); - if (!configFile.is_open()) { - throw std::runtime_error( - fmt::format("Couldn't open config file {} for reading.", filePath)); - } - - std::unordered_map properties; - std::string line; - while (getline(configFile, line)) { - line.erase(std::remove_if(line.begin(), line.end(), isspace), line.end()); - if (line[0] == '#' || line.empty()) { - continue; - } - auto delimiterPos = line.find('='); - auto name = line.substr(0, delimiterPos); - auto value = line.substr(delimiterPos + 1); - properties.emplace(name, value); - } - - return std::make_shared(properties); -} - -} // namespace facebook::velox diff --git a/velox/connectors/hive/storage_adapters/s3fs/benchmark/S3ReadBenchmark.h b/velox/connectors/hive/storage_adapters/s3fs/benchmark/S3ReadBenchmark.h deleted file mode 100644 index 0f95f0f1ed62a..0000000000000 --- a/velox/connectors/hive/storage_adapters/s3fs/benchmark/S3ReadBenchmark.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "velox/common/file/benchmark/ReadBenchmark.h" -#include "velox/connectors/hive/storage_adapters/s3fs/RegisterS3FileSystem.h" -#include "velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.h" - -DECLARE_string(s3_config); - -namespace facebook::velox { - -std::shared_ptr readConfig(const std::string& filePath); - -class S3ReadBenchmark : public ReadBenchmark { - public: - // Initialize a S3ReadFile instance for the specified 'path'. 
- void initialize() override { - executor_ = - std::make_unique(FLAGS_num_threads); - - filesystems::registerS3FileSystem(); - std::shared_ptr config; - if (!FLAGS_s3_config.empty()) { - config = readConfig(FLAGS_s3_config); - } - auto s3fs = filesystems::getFileSystem(FLAGS_path, config); - readFile_ = s3fs->openFileForRead(FLAGS_path); - - fileSize_ = readFile_->size(); - if (FLAGS_file_size_gb) { - fileSize_ = std::min(FLAGS_file_size_gb << 30, fileSize_); - } - - if (fileSize_ <= FLAGS_measurement_size) { - LOG(ERROR) << "File size " << fileSize_ - << " is <= then --measurement_size " << FLAGS_measurement_size; - exit(1); - } - if (FLAGS_seed) { - rng_.seed(FLAGS_seed); - } - } -}; - -} // namespace facebook::velox diff --git a/velox/connectors/hive/storage_adapters/s3fs/benchmark/S3ReadBenchmarkMain.cpp b/velox/connectors/hive/storage_adapters/s3fs/benchmark/S3ReadBenchmarkMain.cpp deleted file mode 100644 index 5b9d233f56f15..0000000000000 --- a/velox/connectors/hive/storage_adapters/s3fs/benchmark/S3ReadBenchmarkMain.cpp +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "velox/connectors/hive/storage_adapters/s3fs/benchmark/S3ReadBenchmark.h" - -using namespace facebook::velox; - -// This benchmark measures the throughput of an S3 compatible FileSystem for -// various ReadFile APIs. The output helps us understand the maximum possible -// gains for queries. Example: If a single thread requires reading 1GB of data -// and the IO throughput is 100 MBps, then it takes 10 seconds to just read the -// data. -int main(int argc, char** argv) { - folly::init(&argc, &argv, false); - S3ReadBenchmark bm; - bm.initialize(); - bm.run(); -} diff --git a/velox/connectors/hive/storage_adapters/s3fs/tests/CMakeLists.txt b/velox/connectors/hive/storage_adapters/s3fs/tests/CMakeLists.txt index 21e70c1e3d1d3..852038c483b2c 100644 --- a/velox/connectors/hive/storage_adapters/s3fs/tests/CMakeLists.txt +++ b/velox/connectors/hive/storage_adapters/s3fs/tests/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-add_executable(velox_s3file_test S3UtilTest.cpp S3FileSystemTest.cpp) +add_executable(velox_s3file_test S3FileSystemTest.cpp S3UtilTest.cpp) add_test(velox_s3file_test velox_s3file_test) target_link_libraries( velox_s3file_test @@ -23,8 +23,8 @@ target_link_libraries( velox_exec_test_lib velox_dwio_common_exception velox_exec - gtest - gtest_main) + GTest::gtest + GTest::gtest_main) add_executable(velox_s3registration_test S3FileSystemRegistrationTest.cpp) add_test(velox_s3registration_test velox_s3registration_test) @@ -37,8 +37,8 @@ target_link_libraries( velox_exec_test_lib velox_dwio_common_exception velox_exec - gtest - gtest_main) + GTest::gtest + GTest::gtest_main) add_executable(velox_s3finalize_test S3FileSystemFinalizeTest.cpp) add_test(velox_s3finalize_test velox_s3finalize_test) @@ -48,8 +48,8 @@ target_link_libraries( velox_hive_config velox_file velox_core - gtest - gtest_main) + GTest::gtest + GTest::gtest_main) add_executable(velox_s3insert_test S3InsertTest.cpp) add_test(velox_s3insert_test velox_s3insert_test) @@ -62,5 +62,36 @@ target_link_libraries( velox_exec_test_lib velox_dwio_common_exception velox_exec - gtest - gtest_main) + GTest::gtest + GTest::gtest_main) + +add_executable(velox_s3read_test S3ReadTest.cpp) +add_test( + NAME velox_s3read_test + COMMAND velox_s3read_test + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries( + velox_s3read_test + velox_file + velox_s3fs + velox_hive_config + velox_core + velox_exec_test_lib + velox_dwio_common_exception + velox_exec + GTest::gtest + GTest::gtest_main) + +add_executable(velox_s3multiendpoints_test S3MultipleEndpointsTest.cpp) +add_test(velox_s3multiendpoints_test velox_s3multiendpoints_test) +target_link_libraries( + velox_s3multiendpoints_test + velox_file + velox_s3fs + velox_hive_config + velox_core + velox_exec_test_lib + velox_dwio_common_exception + velox_exec + GTest::gtest + GTest::gtest_main) diff --git a/velox/connectors/hive/storage_adapters/s3fs/tests/MinioServer.h b/velox/connectors/hive/storage_adapters/s3fs/tests/MinioServer.h index 64cf52b45b584..591ed403f350a 100644 --- a/velox/connectors/hive/storage_adapters/s3fs/tests/MinioServer.h +++ b/velox/connectors/hive/storage_adapters/s3fs/tests/MinioServer.h @@ -14,7 +14,10 @@ * limitations under the License. */ -#include "velox/core/Config.h" +#pragma once + +#include "velox/common/config/Config.h" +#include "velox/exec/tests/utils/PortUtil.h" #include "velox/exec/tests/utils/TempDirectoryPath.h" #include "boost/process.hpp" @@ -22,7 +25,7 @@ using namespace facebook::velox; namespace { -constexpr char const* kMinioExecutableName{"minio"}; +constexpr char const* kMinioExecutableName{"minio-2022-05-26"}; constexpr char const* kMinioAccessKey{"minio"}; constexpr char const* kMinioSecretKey{"miniopass"}; } // namespace @@ -31,24 +34,27 @@ constexpr char const* kMinioSecretKey{"miniopass"}; // Adapted from the Apache Arrow library. 
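// Connection and console ports are picked dynamically with getFreePorts,
// so concurrent test runs can each host their own Minio instance without
// colliding on a fixed address.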
 class MinioServer {
  public:
-  MinioServer(const std::string_view& connectionString)
-      : tempPath_(::exec::test::TempDirectoryPath::create()),
-        connectionString_(connectionString) {}
+  MinioServer() : tempPath_(::exec::test::TempDirectoryPath::create()) {
+    constexpr auto kHostAddressTemplate = "127.0.0.1:{}";
+    auto ports = facebook::velox::exec::test::getFreePorts(2);
+    connectionString_ = fmt::format(kHostAddressTemplate, ports[0]);
+    consoleAddress_ = fmt::format(kHostAddressTemplate, ports[1]);
+  }
 
   void start();
 
   void stop();
 
   void addBucket(const char* bucket) {
-    const std::string path = tempPath_->path + "/" + bucket;
+    const std::string path = tempPath_->getPath() + "/" + bucket;
     mkdir(path.c_str(), S_IRWXU | S_IRWXG);
   }
 
   std::string path() const {
-    return tempPath_->path;
+    return tempPath_->getPath();
   }
 
-  std::shared_ptr<const Config> hiveConfig(
+  std::shared_ptr<config::ConfigBase> hiveConfig(
       const std::unordered_map<std::string, std::string> configOverride = {})
       const {
     std::unordered_map<std::string, std::string> config({
@@ -64,12 +70,13 @@
       config[configName] = configValue;
     }
 
-    return std::make_shared<const core::MemConfig>(std::move(config));
+    return std::make_shared<config::ConfigBase>(std::move(config));
   }
 
  private:
   const std::shared_ptr<::exec::test::TempDirectoryPath> tempPath_;
-  const std::string connectionString_;
+  std::string connectionString_;
+  std::string consoleAddress_;
   const std::string accessKey_ = kMinioAccessKey;
   const std::string secretKey_ = kMinioSecretKey;
   std::shared_ptr<::boost::process::child> serverProcess_;
@@ -85,6 +92,7 @@ void MinioServer::start() {
     VELOX_FAIL("Failed to find minio executable {}'", kMinioExecutableName);
   }
 
+  const auto path = tempPath_->getPath();
   try {
     serverProcess_ = std::make_shared<::boost::process::child>(
         env,
@@ -94,7 +102,9 @@
         "--compat",
         "--address",
         connectionString_,
-        tempPath_->path.c_str());
+        "--console-address",
+        consoleAddress_,
+        path.c_str());
   } catch (const std::exception& e) {
     VELOX_FAIL("Failed to launch Minio server: {}", e.what());
   }
diff --git a/velox/connectors/hive/storage_adapters/s3fs/tests/S3FileSystemFinalizeTest.cpp b/velox/connectors/hive/storage_adapters/s3fs/tests/S3FileSystemFinalizeTest.cpp
index 1d713553e260f..1ee0387faa0ec 100644
--- a/velox/connectors/hive/storage_adapters/s3fs/tests/S3FileSystemFinalizeTest.cpp
+++ b/velox/connectors/hive/storage_adapters/s3fs/tests/S3FileSystemFinalizeTest.cpp
@@ -15,15 +15,17 @@
  */
 
 #include "velox/common/base/tests/GTestUtils.h"
+#include "velox/common/config/Config.h"
 #include "velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.h"
-#include "velox/core/Config.h"
 
 #include "gtest/gtest.h"
 
-using namespace facebook::velox;
+namespace facebook::velox {
+namespace {
 
 TEST(S3FileSystemFinalizeTest, finalize) {
-  auto s3Config = std::make_shared<const core::MemConfig>();
+  auto s3Config = std::make_shared<config::ConfigBase>(
+      std::unordered_map<std::string, std::string>());
   ASSERT_TRUE(filesystems::initializeS3(s3Config.get()));
   ASSERT_FALSE(filesystems::initializeS3(s3Config.get()));
   {
@@ -36,3 +38,6 @@ TEST(S3FileSystemFinalizeTest, finalize) {
       filesystems::initializeS3(s3Config.get()),
       "Attempt to initialize S3 after it has been finalized.");
 }
+
+} // namespace
+} // namespace facebook::velox
diff --git a/velox/connectors/hive/storage_adapters/s3fs/tests/S3FileSystemRegistrationTest.cpp b/velox/connectors/hive/storage_adapters/s3fs/tests/S3FileSystemRegistrationTest.cpp
index 60b8fabb3e887..c247a74309438 100644
--- a/velox/connectors/hive/storage_adapters/s3fs/tests/S3FileSystemRegistrationTest.cpp
+++ b/velox/connectors/hive/storage_adapters/s3fs/tests/S3FileSystemRegistrationTest.cpp
@@ -17,28 +17,21 @@
 #include "velox/connectors/hive/storage_adapters/s3fs/RegisterS3FileSystem.h"
 #include "velox/connectors/hive/storage_adapters/s3fs/tests/S3Test.h"
 
-using namespace facebook::velox;
-
-static constexpr std::string_view kMinioConnectionString = "127.0.0.1:8000";
+namespace facebook::velox {
+namespace {
 
 class S3FileSystemRegistrationTest : public S3Test {
  protected:
-  static void SetUpTestSuite() {
-    if (minioServer_ == nullptr) {
-      minioServer_ = std::make_shared<MinioServer>(kMinioConnectionString);
-      minioServer_->start();
-    }
+  static void SetUpTestCase() {
+    memory::MemoryManager::testingSetInstance({});
     filesystems::registerS3FileSystem();
   }
 
-  static void TearDownTestSuite() {
+  static void TearDownTestCase() {
     filesystems::finalizeS3FileSystem();
-    if (minioServer_ != nullptr) {
-      minioServer_->stop();
-      minioServer_ = nullptr;
-    }
   }
 };
+} // namespace
 
 TEST_F(S3FileSystemRegistrationTest, readViaRegistry) {
   const char* bucketName = "data2";
@@ -71,11 +64,10 @@ TEST_F(S3FileSystemRegistrationTest, fileHandle) {
   }
   auto hiveConfig = minioServer_->hiveConfig();
   FileHandleFactory factory(
-      std::make_unique<
-          SimpleLRUCache<std::string, std::shared_ptr<FileHandle>>>(1000),
+      std::make_unique<SimpleLRUCache<std::string, FileHandle>>(1000),
      std::make_unique<FileHandleGenerator>(hiveConfig));
-  auto fileHandle = factory.generate(s3File).second;
-  readData(fileHandle->file.get());
+  auto fileHandleCachePtr = factory.generate(s3File);
+  readData(fileHandleCachePtr->file.get());
 }
 
 TEST_F(S3FileSystemRegistrationTest, finalize) {
@@ -85,3 +77,4 @@
       filesystems::finalizeS3FileSystem(),
       "Cannot finalize S3FileSystem while in use");
 }
+} // namespace facebook::velox
diff --git a/velox/connectors/hive/storage_adapters/s3fs/tests/S3FileSystemTest.cpp b/velox/connectors/hive/storage_adapters/s3fs/tests/S3FileSystemTest.cpp
index d74ba33d2c6ae..1b92147d5b88d 100644
--- a/velox/connectors/hive/storage_adapters/s3fs/tests/S3FileSystemTest.cpp
+++ b/velox/connectors/hive/storage_adapters/s3fs/tests/S3FileSystemTest.cpp
@@ -20,31 +20,31 @@
 #include
 
-using namespace facebook::velox;
-
-static constexpr std::string_view kMinioConnectionString = "127.0.0.1:9000";
+namespace facebook::velox {
+namespace {
 
 class S3FileSystemTest : public S3Test {
  protected:
-  static void SetUpTestSuite() {
-    if (minioServer_ == nullptr) {
-      minioServer_ = std::make_shared<MinioServer>(kMinioConnectionString);
-      minioServer_->start();
-    }
+  static void SetUpTestCase() {
+    memory::MemoryManager::testingSetInstance({});
+  }
+
+  void SetUp() override {
+    S3Test::SetUp();
     auto hiveConfig = minioServer_->hiveConfig({{"hive.s3.log-level", "Info"}});
     filesystems::initializeS3(hiveConfig.get());
   }
 
   static void TearDownTestSuite() {
-    if (minioServer_ != nullptr) {
-      minioServer_->stop();
-      minioServer_ = nullptr;
-    }
     filesystems::finalizeS3();
   }
 };
+} // namespace
 
 TEST_F(S3FileSystemTest, writeAndRead) {
+  /// The hive config used for Minio defaults to disabling the use of proxy
+  /// settings, even when the environment provides them.
+  setenv("HTTP_PROXY", "http://test:test@127.0.0.1:8888", 1);
   const char* bucketName = "data";
   const char* file = "test.txt";
   const std::string filename = localPath(bucketName) + "/" + file;
@@ -62,11 +62,11 @@ TEST_F(S3FileSystemTest, writeAndRead) {
 
 TEST_F(S3FileSystemTest, invalidCredentialsConfig) {
   {
-    const std::unordered_map<std::string, std::string> config(
+    std::unordered_map<std::string, std::string> config(
         {{"hive.s3.use-instance-credentials", "true"},
          {"hive.s3.iam-role", "dummy-iam-role"}});
     auto hiveConfig =
-        std::make_shared<const core::MemConfig>(std::move(config));
+        std::make_shared<config::ConfigBase>(std::move(config));
 
     // Both instance credentials and iam-role cannot be specified
     VELOX_ASSERT_THROW(
@@ -74,34 +74,34 @@
         "Invalid configuration: specify only one among 'access/secret keys', 'use instance credentials', 'IAM role'");
   }
   {
-    const std::unordered_map<std::string, std::string> config(
+    std::unordered_map<std::string, std::string> config(
         {{"hive.s3.aws-secret-key", "dummy-key"},
          {"hive.s3.aws-access-key", "dummy-key"},
          {"hive.s3.iam-role", "dummy-iam-role"}});
     auto hiveConfig =
-        std::make_shared<const core::MemConfig>(std::move(config));
+        std::make_shared<config::ConfigBase>(std::move(config));
 
     // Both access/secret keys and iam-role cannot be specified
     VELOX_ASSERT_THROW(
         filesystems::S3FileSystem(hiveConfig),
         "Invalid configuration: specify only one among 'access/secret keys', 'use instance credentials', 'IAM role'");
   }
   {
-    const std::unordered_map<std::string, std::string> config(
+    std::unordered_map<std::string, std::string> config(
         {{"hive.s3.aws-secret-key", "dummy"},
          {"hive.s3.aws-access-key", "dummy"},
          {"hive.s3.use-instance-credentials", "true"}});
     auto hiveConfig =
-        std::make_shared<const core::MemConfig>(std::move(config));
+        std::make_shared<config::ConfigBase>(std::move(config));
 
     // Both access/secret keys and instance credentials cannot be specified
     VELOX_ASSERT_THROW(
         filesystems::S3FileSystem(hiveConfig),
         "Invalid configuration: specify only one among 'access/secret keys', 'use instance credentials', 'IAM role'");
   }
   {
-    const std::unordered_map<std::string, std::string> config(
+    std::unordered_map<std::string, std::string> config(
         {{"hive.s3.aws-secret-key", "dummy"}});
     auto hiveConfig =
-        std::make_shared<const core::MemConfig>(std::move(config));
+        std::make_shared<config::ConfigBase>(std::move(config));
 
     // Both access key and secret key must be specified
     VELOX_ASSERT_THROW(
         filesystems::S3FileSystem(hiveConfig),
@@ -116,16 +116,18 @@ TEST_F(S3FileSystemTest, missingFile) {
   addBucket(bucketName);
   auto hiveConfig = minioServer_->hiveConfig();
   filesystems::S3FileSystem s3fs(hiveConfig);
-  VELOX_ASSERT_THROW(
+  VELOX_ASSERT_RUNTIME_THROW_CODE(
       s3fs.openFileForRead(s3File),
+      error_code::kFileNotFound,
       "Failed to get metadata for S3 object due to: 'Resource not found'. Path:'s3://data1/i-do-not-exist.txt', SDK Error Type:16, HTTP Status Code:404, S3 Service:'MinIO', Message:'No response body.'");
 }
 
 TEST_F(S3FileSystemTest, missingBucket) {
   auto hiveConfig = minioServer_->hiveConfig();
   filesystems::S3FileSystem s3fs(hiveConfig);
-  VELOX_ASSERT_THROW(
+  VELOX_ASSERT_RUNTIME_THROW_CODE(
       s3fs.openFileForRead(kDummyPath),
+      error_code::kFileNotFound,
       "Failed to get metadata for S3 object due to: 'Resource not found'. Path:'s3://dummy/foo.txt', SDK Error Type:16, HTTP Status Code:404, S3 Service:'MinIO', Message:'No response body.'");
 }
@@ -165,7 +167,8 @@ TEST_F(S3FileSystemTest, noBackendServer) {
 
 TEST_F(S3FileSystemTest, logLevel) {
   std::unordered_map<std::string, std::string> config;
   auto checkLogLevelName = [&config](std::string_view expected) {
-    auto s3Config = std::make_shared<const core::MemConfig>(config);
+    auto s3Config =
+        std::make_shared<config::ConfigBase>(std::move(config));
     filesystems::S3FileSystem s3fs(s3Config);
     EXPECT_EQ(s3fs.getLogLevelName(), expected);
   };
@@ -187,8 +190,9 @@ TEST_F(S3FileSystemTest, writeFileAndRead) {
   auto hiveConfig = minioServer_->hiveConfig();
   filesystems::S3FileSystem s3fs(hiveConfig);
 
-  auto pool = memory::defaultMemoryManager().addLeafPool("S3FileSystemTest");
-  auto writeFile = s3fs.openFileForWrite(s3File, {{}, pool.get()});
+  auto pool = memory::memoryManager()->addLeafPool("S3FileSystemTest");
+  auto writeFile =
+      s3fs.openFileForWrite(s3File, {{}, pool.get(), std::nullopt});
   auto s3WriteFile = dynamic_cast<filesystems::S3WriteFile*>(writeFile.get());
   std::string dataContent =
       "Dance me to your beauty with a burning violin"
@@ -250,3 +254,13 @@
   // Verify the last chunk.
   ASSERT_EQ(readFile->pread(contentSize * 250'000, contentSize), dataContent);
 }
+
+TEST_F(S3FileSystemTest, invalidConnectionSettings) {
+  auto hiveConfig =
+      minioServer_->hiveConfig({{"hive.s3.connect-timeout", "400"}});
+  VELOX_ASSERT_THROW(filesystems::S3FileSystem(hiveConfig), "Invalid duration");
+
+  hiveConfig = minioServer_->hiveConfig({{"hive.s3.socket-timeout", "abc"}});
+  VELOX_ASSERT_THROW(filesystems::S3FileSystem(hiveConfig), "Invalid duration");
+}
+} // namespace facebook::velox
diff --git a/velox/connectors/hive/storage_adapters/s3fs/tests/S3InsertTest.cpp b/velox/connectors/hive/storage_adapters/s3fs/tests/S3InsertTest.cpp
index 57dfcded14c07..727b62f98e178 100644
--- a/velox/connectors/hive/storage_adapters/s3fs/tests/S3InsertTest.cpp
+++ b/velox/connectors/hive/storage_adapters/s3fs/tests/S3InsertTest.cpp
@@ -15,62 +15,46 @@
  */
 
 #include <folly/init/Init.h>
+#include <gtest/gtest.h>
 
-#include "gtest/gtest.h"
-#include "velox/common/file/FileSystems.h"
+#include "velox/common/memory/Memory.h"
 #include "velox/connectors/hive/storage_adapters/s3fs/RegisterS3FileSystem.h"
-#include "velox/connectors/hive/storage_adapters/s3fs/tests/MinioServer.h"
-#include "velox/dwio/parquet/reader/ParquetReader.h"
+#include "velox/connectors/hive/storage_adapters/s3fs/tests/S3Test.h"
 #include "velox/exec/TableWriter.h"
 #include "velox/exec/tests/utils/AssertQueryBuilder.h"
-#include "velox/exec/tests/utils/HiveConnectorTestBase.h"
 #include "velox/exec/tests/utils/PlanBuilder.h"
 
-using namespace facebook::velox;
-using namespace facebook::velox::core;
-using namespace facebook::velox::exec;
 using namespace facebook::velox::exec::test;
-using namespace facebook::velox::connector;
-using namespace facebook::velox::connector::hive;
-using namespace facebook::velox::dwio::common;
-using namespace facebook::velox::test;
-using namespace facebook::velox::filesystems;
-
-class S3InsertTest : public testing::Test, public VectorTestBase {
- public:
-  static constexpr char const* kMinioConnectionString{"127.0.0.1:7000"};
-  /// We use static initialization because we want a single version of the
-  /// Minio server running.
-  /// Each test must use a unique bucket to avoid concurrency issues.
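[Editor's note] The block being removed here is the old pattern: one static, suite-level MinioServer shared by every test, which is why each test had to use its own bucket. The replacement, inherited from the S3Test base class defined later in this diff, starts a fresh server per test; SetUpTestSuite also becomes SetUpTestCase, presumably to match the googletest version pinned by the build. A condensed sketch of the new fixture shape, assuming the MinioServer class from MinioServer.h above:

```
// Per-test MinIO lifecycle (condensed from the S3Test base class below).
#include <gtest/gtest.h>
#include <memory>

class S3TestSketch : public ::testing::Test {
 protected:
  void SetUp() override {
    server_ = std::make_unique<MinioServer>(); // Fresh server and ports per test.
    server_->start();
  }
  void TearDown() override {
    server_->stop();
  }
  std::unique_ptr<MinioServer> server_;
};
```

The trade-off is longer runtime (one server launch per test) in exchange for isolation: tests no longer share bucket namespaces or leak S3 state across suites.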
- static void SetUpTestSuite() { - minioServer_ = std::make_shared(kMinioConnectionString); - minioServer_->start(); - - ioExecutor_ = std::make_unique(3); + +namespace facebook::velox { +namespace { + +class S3InsertTest : public S3Test { + protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + + void SetUp() override { + S3Test::SetUp(); filesystems::registerS3FileSystem(); auto hiveConnector = connector::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector( - kHiveConnectorId, + ::exec::test::kHiveConnectorId, minioServer_->hiveConfig(), ioExecutor_.get()); connector::registerConnector(hiveConnector); } - static void TearDownTestSuite() { + void TearDown() override { + connector::unregisterConnector(::exec::test::kHiveConnectorId); + S3Test::TearDown(); filesystems::finalizeS3FileSystem(); - unregisterConnector(kHiveConnectorId); - minioServer_->stop(); - minioServer_ = nullptr; } - - static std::shared_ptr minioServer_; - static std::unique_ptr ioExecutor_; }; -std::shared_ptr S3InsertTest::minioServer_ = nullptr; -std::unique_ptr S3InsertTest::ioExecutor_ = - nullptr; +} // namespace TEST_F(S3InsertTest, s3InsertTest) { const int64_t kExpectedRows = 1'000; @@ -100,14 +84,14 @@ TEST_F(S3InsertTest, s3InsertTest) { // First column has number of rows written in the first row and nulls in other // rows. - auto rowCount = results->childAt(TableWriteTraits::kRowCountChannel) + auto rowCount = results->childAt(exec::TableWriteTraits::kRowCountChannel) ->as>(); ASSERT_FALSE(rowCount->isNullAt(0)); ASSERT_EQ(kExpectedRows, rowCount->valueAt(0)); ASSERT_TRUE(rowCount->isNullAt(1)); // Second column contains details about written files. - auto details = results->childAt(TableWriteTraits::kFragmentChannel) + auto details = results->childAt(exec::TableWriteTraits::kFragmentChannel) ->as>(); ASSERT_TRUE(details->isNullAt(0)); ASSERT_FALSE(details->isNullAt(1)); @@ -122,16 +106,19 @@ TEST_F(S3InsertTest, s3InsertTest) { // Read from 'writeFileName' and verify the data matches the original. plan = PlanBuilder().tableScan(rowType).planNode(); - auto splits = HiveConnectorTestBase::makeHiveConnectorSplits( - fmt::format("{}/{}", kOutputDirectory, writeFileName), - 1, - dwio::common::FileFormat::PARQUET); - auto copy = AssertQueryBuilder(plan).split(splits[0]).copyResults(pool()); + auto filePath = fmt::format("{}{}", kOutputDirectory, writeFileName); + const int64_t fileSize = fileWriteInfos[0]["fileSize"].asInt(); + auto split = HiveConnectorSplitBuilder(filePath) + .fileFormat(dwio::common::FileFormat::PARQUET) + .length(fileSize) + .build(); + auto copy = AssertQueryBuilder(plan).split(split).copyResults(pool()); assertEqualResults({input}, {copy}); } +} // namespace facebook::velox int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); - folly::init(&argc, &argv, false); + folly::Init init{&argc, &argv, false}; return RUN_ALL_TESTS(); } diff --git a/velox/connectors/hive/storage_adapters/s3fs/tests/S3MultipleEndpointsTest.cpp b/velox/connectors/hive/storage_adapters/s3fs/tests/S3MultipleEndpointsTest.cpp new file mode 100644 index 0000000000000..6e87626bf675d --- /dev/null +++ b/velox/connectors/hive/storage_adapters/s3fs/tests/S3MultipleEndpointsTest.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "gtest/gtest.h" +#include "velox/connectors/hive/storage_adapters/s3fs/RegisterS3FileSystem.h" +#include "velox/connectors/hive/storage_adapters/s3fs/tests/S3Test.h" +#include "velox/exec/TableWriter.h" +#include "velox/exec/tests/utils/AssertQueryBuilder.h" +#include "velox/exec/tests/utils/PlanBuilder.h" + +static const std::string_view kConnectorId1 = "test-hive1"; +static const std::string_view kConnectorId2 = "test-hive2"; + +using namespace facebook::velox::exec::test; + +namespace facebook::velox { +namespace { + +class S3MultipleEndpoints : public S3Test { + public: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + + void SetUp() override { + S3Test::SetUp(); + filesystems::registerS3FileSystem(); + auto hiveConnector1 = + connector::getConnectorFactory( + connector::hive::HiveConnectorFactory::kHiveConnectorName) + ->newConnector( + std::string(kConnectorId1), + minioServer_->hiveConfig(), + ioExecutor_.get()); + connector::registerConnector(hiveConnector1); + minioSecondServer_ = std::make_unique(); + minioSecondServer_->start(); + auto hiveConnector2 = + connector::getConnectorFactory( + connector::hive::HiveConnectorFactory::kHiveConnectorName) + ->newConnector( + std::string(kConnectorId2), + minioSecondServer_->hiveConfig(), + ioExecutor_.get()); + connector::registerConnector(hiveConnector2); + } + + void TearDown() override { + connector::unregisterConnector(std::string(kConnectorId1)); + connector::unregisterConnector(std::string(kConnectorId2)); + S3Test::TearDown(); + filesystems::finalizeS3FileSystem(); + } + + folly::dynamic writeData( + const RowVectorPtr input, + const std::string& outputDirectory, + const std::string& connectorId) { + auto plan = PlanBuilder() + .values({input}) + .tableWrite( + outputDirectory.data(), + {}, + 0, + {}, + {}, + dwio::common::FileFormat::PARQUET, + {}, + connectorId) + .planNode(); + // Execute the write plan. + auto results = AssertQueryBuilder(plan).copyResults(pool()); + // Second column contains details about written files. 
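[Editor's note] For context on the parsing below: the table writer's fragment column holds one JSON document per written fragment, and these tests pull the written file's name and size out of its fileWriteInfos array. A hedged sketch of the access pattern (field names are taken from this diff's call sites, not from a schema definition):

```
// Extract the first written file's size from a fragment JSON string.
// Shape relied on here:
//   {"fileWriteInfos": [{"writeFileName": "...", "fileSize": 123, ...}], ...}
#include <folly/json.h>
#include <string>

int64_t firstFileSize(const std::string& fragmentJson) {
  const folly::dynamic obj = folly::parseJson(fragmentJson);
  return obj["fileWriteInfos"][0]["fileSize"].asInt();
}
```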
+ auto details = results->childAt(exec::TableWriteTraits::kFragmentChannel) + ->as>(); + folly::dynamic obj = folly::parseJson(details->valueAt(1)); + return obj["fileWriteInfos"]; + } + + std::shared_ptr createSplit( + folly::dynamic tableWriteInfo, + std::string outputDirectory, + std::string connectorId) { + auto writeFileName = tableWriteInfo[0]["writeFileName"].asString(); + auto filePath = fmt::format("{}{}", outputDirectory, writeFileName); + const int64_t fileSize = tableWriteInfo[0]["fileSize"].asInt(); + + return HiveConnectorSplitBuilder(filePath) + .connectorId(connectorId) + .fileFormat(dwio::common::FileFormat::PARQUET) + .length(fileSize) + .build(); + } + + std::unique_ptr minioSecondServer_; +}; +} // namespace + +TEST_F(S3MultipleEndpoints, s3Join) { + const int64_t kExpectedRows = 1'000; + const std::string_view kOutputDirectory{"s3://writedata/"}; + + auto rowType1 = ROW( + {"a0", "a1", "a2", "a3"}, {BIGINT(), INTEGER(), SMALLINT(), DOUBLE()}); + auto rowType2 = ROW( + {"b0", "b1", "b2", "b3"}, {BIGINT(), INTEGER(), SMALLINT(), DOUBLE()}); + + auto input1 = makeRowVector( + rowType1->names(), + {makeFlatVector(kExpectedRows, [](auto row) { return row; }), + makeFlatVector(kExpectedRows, [](auto row) { return row; }), + makeFlatVector(kExpectedRows, [](auto row) { return row; }), + makeFlatVector(kExpectedRows, [](auto row) { return row; })}); + auto input2 = makeRowVector(rowType2->names(), input1->children()); + minioServer_->addBucket("writedata"); + minioSecondServer_->addBucket("writedata"); + + // Insert input data into both tables. + auto table1WriteInfo = + writeData(input1, kOutputDirectory.data(), std::string(kConnectorId1)); + auto table2WriteInfo = + writeData(input2, kOutputDirectory.data(), std::string(kConnectorId2)); + + // Inner Join both the tables. + core::PlanNodeId scan1, scan2; + auto planNodeIdGenerator = std::make_shared(); + auto table1Scan = PlanBuilder(planNodeIdGenerator, pool()) + .startTableScan() + .tableName("hive_table1") + .outputType(rowType1) + .connectorId(std::string(kConnectorId1)) + .endTableScan() + .capturePlanNodeId(scan1) + .planNode(); + auto join = + PlanBuilder(planNodeIdGenerator, pool()) + .startTableScan() + .tableName("hive_table1") + .outputType(rowType2) + .connectorId(std::string(kConnectorId2)) + .endTableScan() + .capturePlanNodeId(scan2) + .hashJoin({"b0"}, {"a0"}, table1Scan, "", {"a0", "a1", "a2", "a3"}) + .planNode(); + + auto split1 = createSplit( + table1WriteInfo, kOutputDirectory.data(), std::string(kConnectorId1)); + auto split2 = createSplit( + table2WriteInfo, kOutputDirectory.data(), std::string(kConnectorId2)); + auto results = AssertQueryBuilder(join) + .split(scan1, split1) + .split(scan2, split2) + .copyResults(pool()); + assertEqualResults({input1}, {results}); +} +} // namespace facebook::velox + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + folly::Init init{&argc, &argv, false}; + return RUN_ALL_TESTS(); +} diff --git a/velox/connectors/hive/storage_adapters/s3fs/tests/S3ReadTest.cpp b/velox/connectors/hive/storage_adapters/s3fs/tests/S3ReadTest.cpp new file mode 100644 index 0000000000000..c53d08764ecbc --- /dev/null +++ b/velox/connectors/hive/storage_adapters/s3fs/tests/S3ReadTest.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "velox/common/memory/Memory.h" +#include "velox/connectors/hive/storage_adapters/s3fs/RegisterS3FileSystem.h" +#include "velox/connectors/hive/storage_adapters/s3fs/tests/S3Test.h" +#include "velox/dwio/common/tests/utils/DataFiles.h" +#include "velox/exec/tests/utils/AssertQueryBuilder.h" +#include "velox/exec/tests/utils/PlanBuilder.h" + +using namespace facebook::velox::exec::test; + +namespace facebook::velox { +namespace { + +class S3ReadTest : public S3Test { + protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + + void SetUp() override { + S3Test::SetUp(); + filesystems::registerS3FileSystem(); + auto hiveConnector = + connector::getConnectorFactory( + connector::hive::HiveConnectorFactory::kHiveConnectorName) + ->newConnector(kHiveConnectorId, minioServer_->hiveConfig()); + connector::registerConnector(hiveConnector); + } + + void TearDown() override { + filesystems::finalizeS3FileSystem(); + connector::unregisterConnector(kHiveConnectorId); + S3Test::TearDown(); + } +}; +} // namespace + +TEST_F(S3ReadTest, s3ReadTest) { + const auto sourceFile = test::getDataFilePath( + "velox/connectors/hive/storage_adapters/s3fs/tests", + "../../../../../dwio/parquet/tests/examples/int.parquet"); + const char* bucketName = "data"; + const auto destinationFile = S3Test::localPath(bucketName) + "/int.parquet"; + minioServer_->addBucket(bucketName); + std::ifstream src(sourceFile, std::ios::binary); + std::ofstream dest(destinationFile, std::ios::binary); + // Copy source file to destination bucket. + dest << src.rdbuf(); + ASSERT_GT(dest.tellp(), 0) << "Unable to copy from source " << sourceFile; + dest.close(); + + // Read the parquet file via the S3 bucket. + const auto readDirectory{s3URI(bucketName)}; + auto rowType = ROW({"int", "bigint"}, {INTEGER(), BIGINT()}); + auto plan = PlanBuilder().tableScan(rowType).planNode(); + auto split = HiveConnectorSplitBuilder( + fmt::format("{}/{}", readDirectory, "int.parquet")) + .fileFormat(dwio::common::FileFormat::PARQUET) + .build(); + auto copy = AssertQueryBuilder(plan).split(split).copyResults(pool()); + + // expectedResults is the data in int.parquet file. 
+ const int64_t kExpectedRows = 10; + auto expectedResults = makeRowVector( + {makeFlatVector( + kExpectedRows, [](auto row) { return row + 100; }), + makeFlatVector( + kExpectedRows, [](auto row) { return row + 1000; })}); + assertEqualResults({expectedResults}, {copy}); +} +} // namespace facebook::velox + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + folly::Init init{&argc, &argv, false}; + return RUN_ALL_TESTS(); +} diff --git a/velox/connectors/hive/storage_adapters/s3fs/tests/S3Test.h b/velox/connectors/hive/storage_adapters/s3fs/tests/S3Test.h index ef43f899468f5..cc7f003ee877d 100644 --- a/velox/connectors/hive/storage_adapters/s3fs/tests/S3Test.h +++ b/velox/connectors/hive/storage_adapters/s3fs/tests/S3Test.h @@ -20,6 +20,7 @@ #include "velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.h" #include "velox/connectors/hive/storage_adapters/s3fs/S3Util.h" #include "velox/connectors/hive/storage_adapters/s3fs/tests/MinioServer.h" +#include "velox/exec/tests/utils/HiveConnectorTestBase.h" #include "velox/exec/tests/utils/TempFilePath.h" #include "gtest/gtest.h" @@ -30,8 +31,18 @@ constexpr int kOneMB = 1 << 20; static constexpr std::string_view kDummyPath = "s3://dummy/foo.txt"; -class S3Test : public testing::Test { +class S3Test : public testing::Test, public ::test::VectorTestBase { protected: + void SetUp() override { + minioServer_ = std::make_unique(); + minioServer_->start(); + ioExecutor_ = std::make_unique(3); + } + + void TearDown() override { + minioServer_->stop(); + } + void addBucket(const char* bucket) { minioServer_->addBucket(bucket); } @@ -76,7 +87,8 @@ class S3Test : public testing::Test { folly::Range(middle, sizeof(middle)), folly::Range( nullptr, - (char*)(uint64_t)(15 + kOneMB - 500000 - sizeof(head) - sizeof(middle) - sizeof(tail))), + (char*)(uint64_t)(15 + kOneMB - 500000 - sizeof(head) - + sizeof(middle) - sizeof(tail))), folly::Range(tail, sizeof(tail))}; ASSERT_EQ(15 + kOneMB, readFile->preadv(0, buffers)); ASSERT_EQ(std::string_view(head, sizeof(head)), "aaaaabbbbbcc"); @@ -84,7 +96,6 @@ class S3Test : public testing::Test { ASSERT_EQ(std::string_view(tail, sizeof(tail)), "ccddddd"); } - static std::shared_ptr minioServer_; + std::unique_ptr minioServer_; + std::unique_ptr ioExecutor_; }; - -std::shared_ptr S3Test::minioServer_ = nullptr; diff --git a/velox/connectors/hive/storage_adapters/s3fs/tests/S3UtilTest.cpp b/velox/connectors/hive/storage_adapters/s3fs/tests/S3UtilTest.cpp index 9501781bcda9d..eb3b5dbff450d 100644 --- a/velox/connectors/hive/storage_adapters/s3fs/tests/S3UtilTest.cpp +++ b/velox/connectors/hive/storage_adapters/s3fs/tests/S3UtilTest.cpp @@ -18,7 +18,7 @@ #include "gtest/gtest.h" -using namespace facebook::velox; +namespace facebook::velox { // TODO: Each prefix should be implemented as its own filesystem. 
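[Editor's note] The S3UtilTest additions below pin down no_proxy-style exclusion matching in isHostExcludedFromProxy: exact host names, leading-dot and "*." domain suffixes, and CIDR blocks for IP literals all mark a host as bypassing the proxy. Usage, with the signature inferred from the test call sites rather than quoted from the header:

```
#include <string>

// Declared in velox/connectors/hive/storage_adapters/s3fs/S3Util.h; this
// signature is inferred from the test call sites below.
bool isHostExcludedFromProxy(
    const std::string& hostname,
    const std::string& noProxyList);

void proxyExclusionExamples() {
  // Domain-suffix match: ".foobar.com" covers any host under foobar.com.
  bool domain =
      isHostExcludedFromProxy("test.foobar.com", "localhost,.foobar.com"); // true
  // CIDR match applies when the host is an IP literal.
  bool ip =
      isHostExcludedFromProxy("127.0.0.1", "localhost,127.0.0.0/24"); // true
  (void)domain;
  (void)ip;
}
```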
 TEST(S3UtilTest, isS3File) {
@@ -108,3 +108,142 @@ TEST(S3UtilTest, bucketAndKeyFromS3Path) {
   EXPECT_EQ(bucket, "bucket");
   EXPECT_EQ(key, "file.txt");
 }
+
+TEST(S3UtilTest, isDomainExcludedFromProxy) {
+  auto hostname = "test.foobar.com";
+
+  std::vector<std::pair<std::string, bool>> tests = {
+      {"localhost,.foobar.com", true},
+      {"localhost,.,foobar.com,.com", true},
+      {"localhost,test.foobar.com", true},
+      {"localhost,foobar.com,*.com", true},
+      {"localhost,*.foobar.com", true},
+      {"localhost", false},
+      {"localhost,foobar.com", false},
+      {"", false},
+  };
+
+  for (auto pair : tests) {
+    EXPECT_EQ(isHostExcludedFromProxy(hostname, pair.first), pair.second);
+  }
+}
+
+TEST(S3UtilTest, isIpExcludedFromProxy) {
+  auto hostname = "127.0.0.1";
+
+  std::vector<std::pair<std::string, bool>> tests = {
+      {"localhost,127.0.0.1,.foobar.com", true},
+      {"localhost,127.0.0.0/24,.foobar.com", true},
+      {"localhost,foobar.com,127.0.0.0/16,.1,.com", true},
+      {"localhost,foobar.com,.1,.com", true},
+      {"localhost,test.foobar.com", false},
+      {"localhost,foobar.com,*.1,*.com", true},
+      {"localhost", false},
+      {"localhost,127.1.0.1", false},
+      {"", false},
+  };
+
+  for (auto pair : tests) {
+    EXPECT_EQ(isHostExcludedFromProxy(hostname, pair.first), pair.second)
+        << pair.first;
+  }
+}
+
+class S3UtilProxyTest : public ::testing::TestWithParam<bool> {};
+
+TEST_P(S3UtilProxyTest, proxyBuilderBadEndpoint) {
+  auto s3Endpoint = "http://127.0.0.1:8888";
+  auto useSsl = GetParam();
+
+  setenv("HTTP_PROXY", "http://127.0.0.1:12345", 1);
+  setenv("HTTPS_PROXY", "http://127.0.0.1:12345", 1);
+  EXPECT_FALSE(S3ProxyConfigurationBuilder(s3Endpoint)
+                   .useSsl(useSsl)
+                   .build()
+                   .has_value());
+}
+
+TEST_P(S3UtilProxyTest, proxyBuilderNoProxy) {
+  auto s3Endpoint = "127.0.0.1:8888";
+  auto useSsl = GetParam();
+
+  setenv("HTTP_PROXY", "", 1);
+  setenv("HTTPS_PROXY", "", 1);
+  EXPECT_FALSE(S3ProxyConfigurationBuilder(s3Endpoint)
+                   .useSsl(useSsl)
+                   .build()
+                   .has_value());
+}
+
+TEST_P(S3UtilProxyTest, proxyBuilderSameHttpProxy) {
+  auto s3Endpoint = "192.168.0.1:12345";
+  auto useSsl = GetParam();
+
+  setenv("HTTP_PROXY", "http://127.0.0.1:8888", 1);
+  setenv("HTTPS_PROXY", "http://127.0.0.1:8888", 1);
+  auto proxyConfig =
+      S3ProxyConfigurationBuilder(s3Endpoint).useSsl(useSsl).build();
+  ASSERT_TRUE(proxyConfig.has_value());
+  EXPECT_EQ(proxyConfig.value().scheme(), "http");
+  EXPECT_EQ(proxyConfig.value().host(), "127.0.0.1");
+  EXPECT_EQ(proxyConfig.value().port(), 8888);
+  EXPECT_EQ(proxyConfig.value().username(), "");
+  EXPECT_EQ(proxyConfig.value().password(), "");
+}
+
+TEST_P(S3UtilProxyTest, proxyBuilderMixProxy) {
+  auto s3Endpoint = "192.168.0.1:12345";
+  auto useSsl = GetParam();
+
+  const std::string httpProxy = "https://test1:testpw1@80.67.3.1:35631";
+  setenv("HTTP_PROXY", httpProxy.c_str(), 1);
+  EXPECT_EQ(getHttpProxyEnvVar(), httpProxy)
+      << "HTTP_PROXY environment variable not set.";
+  const std::string httpsProxy = "http://test2:testpw2@80.80.5.1:45631";
+  setenv("HTTPS_PROXY", httpsProxy.c_str(), 1);
+  EXPECT_EQ(getHttpsProxyEnvVar(), httpsProxy)
+      << "HTTPS_PROXY environment variable not set.";
+  auto proxyConfig =
+      S3ProxyConfigurationBuilder(s3Endpoint).useSsl(useSsl).build();
+  ASSERT_TRUE(proxyConfig.has_value());
+  EXPECT_EQ(proxyConfig.value().scheme(), (useSsl ? "http" : "https"));
+  EXPECT_EQ(proxyConfig.value().host(), (useSsl ? "80.80.5.1" : "80.67.3.1"));
+  EXPECT_EQ(proxyConfig.value().port(), (useSsl ? 45631 : 35631));
+  EXPECT_EQ(proxyConfig.value().username(), (useSsl ? "test2" : "test1"));
+  EXPECT_EQ(proxyConfig.value().password(), (useSsl ? "testpw2" : "testpw1"));
+}
+
+TEST_P(S3UtilProxyTest, proxyBuilderMixProxyLowerCase) {
+  auto s3Endpoint = "192.168.0.1:12345";
+  auto useSsl = GetParam();
+
+  const std::string lcHttpProxy = "https://lctest1:lctestpw1@80.67.3.1:35631";
+  const std::string ucHttpProxy = "https://uctest1:uctestpw1@80.67.3.2:35632";
+  setenv("http_proxy", lcHttpProxy.c_str(), 1);
+  setenv("HTTP_PROXY", ucHttpProxy.c_str(), 1);
+  // Lower case value takes precedence.
+  EXPECT_EQ(getHttpProxyEnvVar(), lcHttpProxy)
+      << "http_proxy environment variable not set.";
+  const std::string lcHttpsProxy = "http://lctest2:lctestpw2@80.80.5.1:45631";
+  const std::string ucHttpsProxy = "http://uctest2:uctestpw2@80.80.5.2:45632";
+  setenv("https_proxy", lcHttpsProxy.c_str(), 1);
+  setenv("HTTPS_PROXY", ucHttpsProxy.c_str(), 1);
+  EXPECT_EQ(getHttpsProxyEnvVar(), lcHttpsProxy)
+      << "https_proxy environment variable not set.";
+  auto proxyConfig =
+      S3ProxyConfigurationBuilder(s3Endpoint).useSsl(useSsl).build();
+  ASSERT_TRUE(proxyConfig.has_value());
+  EXPECT_EQ(proxyConfig.value().scheme(), (useSsl ? "http" : "https"));
+  EXPECT_EQ(proxyConfig.value().host(), (useSsl ? "80.80.5.1" : "80.67.3.1"));
+  EXPECT_EQ(proxyConfig.value().port(), (useSsl ? 45631 : 35631));
+  EXPECT_EQ(proxyConfig.value().username(), (useSsl ? "lctest2" : "lctest1"));
+  EXPECT_EQ(
+      proxyConfig.value().password(), (useSsl ? "lctestpw2" : "lctestpw1"));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    S3UtilTest,
+    S3UtilProxyTest,
+    ::testing::Values(true, false));
+
+} // namespace facebook::velox
diff --git a/velox/connectors/hive/tests/CMakeLists.txt b/velox/connectors/hive/tests/CMakeLists.txt
index f3b348ca1767b..f4235cfa13c05 100644
--- a/velox/connectors/hive/tests/CMakeLists.txt
+++ b/velox/connectors/hive/tests/CMakeLists.txt
@@ -13,12 +13,14 @@
 # limitations under the License.
add_executable( velox_hive_connector_test - HiveDataSinkTest.cpp - HivePartitionFunctionTest.cpp FileHandleTest.cpp - HivePartitionUtilTest.cpp + HiveConfigTest.cpp + HiveDataSinkTest.cpp HiveConnectorTest.cpp + HiveConnectorUtilTest.cpp HiveConnectorSerDeTest.cpp + HivePartitionFunctionTest.cpp + HivePartitionUtilTest.cpp PartitionIdGeneratorTest.cpp TableHandleTest.cpp) add_test(velox_hive_connector_test velox_hive_connector_test) @@ -32,5 +34,14 @@ target_link_libraries( velox_vector_test_lib velox_exec velox_exec_test_lib - gtest - gtest_main) + GTest::gtest + GTest::gtest_main) + +if(VELOX_ENABLE_PARQUET) + + target_include_directories(velox_hive_connector_test + PUBLIC ${ARROW_PREFIX}/install/include) + target_link_libraries(velox_hive_connector_test + velox_dwio_native_parquet_reader) + +endif() diff --git a/velox/connectors/hive/tests/FileHandleTest.cpp b/velox/connectors/hive/tests/FileHandleTest.cpp index 659f0299f9eeb..e641cb3916270 100644 --- a/velox/connectors/hive/tests/FileHandleTest.cpp +++ b/velox/connectors/hive/tests/FileHandleTest.cpp @@ -27,8 +27,8 @@ using namespace facebook::velox; TEST(FileHandleTest, localFile) { filesystems::registerLocalFileSystem(); - auto tempFile = ::exec::test::TempFilePath::create(); - const auto& filename = tempFile->path; + auto tempFile = exec::test::TempFilePath::create(); + const auto& filename = tempFile->getPath(); remove(filename.c_str()); { @@ -37,10 +37,35 @@ TEST(FileHandleTest, localFile) { } FileHandleFactory factory( - std::make_unique< - SimpleLRUCache>>(1000), + std::make_unique>(1000), std::make_unique()); - auto fileHandle = factory.generate(filename).second; + auto fileHandle = factory.generate(filename); + ASSERT_EQ(fileHandle->file->size(), 3); + char buffer[3]; + ASSERT_EQ(fileHandle->file->pread(0, 3, &buffer), "foo"); + + // Clean up + remove(filename.c_str()); +} + +TEST(FileHandleTest, localFileWithProperties) { + filesystems::registerLocalFileSystem(); + + auto tempFile = exec::test::TempFilePath::create(); + const auto& filename = tempFile->getPath(); + remove(filename.c_str()); + + { + LocalWriteFile writeFile(filename); + writeFile.append("foo"); + } + + FileHandleFactory factory( + std::make_unique>(1000), + std::make_unique()); + FileProperties properties = { + tempFile->fileSize(), tempFile->fileModifiedTime()}; + auto fileHandle = factory.generate(filename, &properties); ASSERT_EQ(fileHandle->file->size(), 3); char buffer[3]; ASSERT_EQ(fileHandle->file->pread(0, 3, &buffer), "foo"); diff --git a/velox/connectors/hive/tests/HiveConfigTest.cpp b/velox/connectors/hive/tests/HiveConfigTest.cpp new file mode 100644 index 0000000000000..b7bbfdbc7904b --- /dev/null +++ b/velox/connectors/hive/tests/HiveConfigTest.cpp @@ -0,0 +1,238 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/connectors/hive/HiveConfig.h" +#include "gtest/gtest.h" +#include "velox/common/config/Config.h" + +using namespace facebook::velox; +using namespace facebook::velox::connector::hive; +using facebook::velox::connector::hive::HiveConfig; + +TEST(HiveConfigTest, defaultConfig) { + HiveConfig hiveConfig(std::make_shared( + std::unordered_map())); + const auto emptySession = std::make_unique( + std::unordered_map()); + ASSERT_EQ( + hiveConfig.insertExistingPartitionsBehavior(emptySession.get()), + facebook::velox::connector::hive::HiveConfig:: + InsertExistingPartitionsBehavior::kError); + ASSERT_EQ(hiveConfig.maxPartitionsPerWriters(emptySession.get()), 100); + ASSERT_EQ(hiveConfig.immutablePartitions(), false); + ASSERT_EQ(hiveConfig.s3UseVirtualAddressing(), true); + ASSERT_EQ(hiveConfig.s3GetLogLevel(), "FATAL"); + ASSERT_EQ(hiveConfig.s3UseSSL(), true); + ASSERT_EQ(hiveConfig.s3UseInstanceCredentials(), false); + ASSERT_EQ(hiveConfig.s3Endpoint(), ""); + ASSERT_EQ(hiveConfig.s3AccessKey(), std::nullopt); + ASSERT_EQ(hiveConfig.s3SecretKey(), std::nullopt); + ASSERT_EQ(hiveConfig.s3IAMRole(), std::nullopt); + ASSERT_EQ(hiveConfig.s3IAMRoleSessionName(), "velox-session"); + ASSERT_EQ(hiveConfig.gcsEndpoint(), ""); + ASSERT_EQ(hiveConfig.gcsScheme(), "https"); + ASSERT_EQ(hiveConfig.gcsCredentials(), ""); + ASSERT_EQ(hiveConfig.isOrcUseColumnNames(emptySession.get()), false); + ASSERT_EQ( + hiveConfig.isFileColumnNamesReadAsLowerCase(emptySession.get()), false); + + ASSERT_EQ(hiveConfig.maxCoalescedBytes(), 128 << 20); + ASSERT_EQ(hiveConfig.maxCoalescedDistanceBytes(), 512 << 10); + ASSERT_EQ(hiveConfig.numCacheFileHandles(), 20'000); + ASSERT_EQ(hiveConfig.isFileHandleCacheEnabled(), true); + ASSERT_EQ( + hiveConfig.orcWriterMaxStripeSize(emptySession.get()), + 64L * 1024L * 1024L); + ASSERT_EQ( + hiveConfig.orcWriterMaxDictionaryMemory(emptySession.get()), + 16L * 1024L * 1024L); + ASSERT_EQ( + hiveConfig.isOrcWriterIntegerDictionaryEncodingEnabled( + emptySession.get()), + true); + ASSERT_EQ( + hiveConfig.isOrcWriterStringDictionaryEncodingEnabled(emptySession.get()), + true); + ASSERT_EQ(hiveConfig.sortWriterMaxOutputRows(emptySession.get()), 1024); + ASSERT_EQ( + hiveConfig.sortWriterMaxOutputBytes(emptySession.get()), 10UL << 20); + ASSERT_EQ(hiveConfig.isPartitionPathAsLowerCase(emptySession.get()), true); + ASSERT_EQ(hiveConfig.allowNullPartitionKeys(emptySession.get()), true); + ASSERT_EQ(hiveConfig.orcWriterMinCompressionSize(emptySession.get()), 1024); + ASSERT_EQ( + hiveConfig.orcWriterCompressionLevel(emptySession.get()), std::nullopt); + ASSERT_EQ( + hiveConfig.orcWriterLinearStripeSizeHeuristics(emptySession.get()), true); + ASSERT_FALSE(hiveConfig.cacheNoRetention(emptySession.get())); +} + +TEST(HiveConfigTest, overrideConfig) { + std::unordered_map configFromFile = { + {HiveConfig::kInsertExistingPartitionsBehavior, "OVERWRITE"}, + {HiveConfig::kMaxPartitionsPerWriters, "120"}, + {HiveConfig::kImmutablePartitions, "true"}, + {HiveConfig::kS3PathStyleAccess, "true"}, + {HiveConfig::kS3LogLevel, "Warning"}, + {HiveConfig::kS3SSLEnabled, "false"}, + {HiveConfig::kS3UseInstanceCredentials, "true"}, + {HiveConfig::kS3Endpoint, "hey"}, + {HiveConfig::kS3AwsAccessKey, "hello"}, + {HiveConfig::kS3AwsSecretKey, "hello"}, + {HiveConfig::kS3IamRole, "hello"}, + {HiveConfig::kS3IamRoleSessionName, "velox"}, + {HiveConfig::kGCSEndpoint, "hey"}, + {HiveConfig::kGCSScheme, "http"}, + {HiveConfig::kGCSCredentials, "hey"}, + {HiveConfig::kOrcUseColumnNames, "true"}, 
+ {HiveConfig::kFileColumnNamesReadAsLowerCase, "true"}, + {HiveConfig::kAllowNullPartitionKeys, "false"}, + {HiveConfig::kMaxCoalescedBytes, "100"}, + {HiveConfig::kMaxCoalescedDistanceBytes, "100"}, + {HiveConfig::kNumCacheFileHandles, "100"}, + {HiveConfig::kEnableFileHandleCache, "false"}, + {HiveConfig::kOrcWriterMaxStripeSize, "100MB"}, + {HiveConfig::kOrcWriterMaxDictionaryMemory, "100MB"}, + {HiveConfig::kOrcWriterIntegerDictionaryEncodingEnabled, "false"}, + {HiveConfig::kOrcWriterStringDictionaryEncodingEnabled, "false"}, + {HiveConfig::kSortWriterMaxOutputRows, "100"}, + {HiveConfig::kSortWriterMaxOutputBytes, "100MB"}, + {HiveConfig::kOrcWriterLinearStripeSizeHeuristics, "false"}, + {HiveConfig::kOrcWriterMinCompressionSize, "512"}, + {HiveConfig::kOrcWriterCompressionLevel, "1"}, + {HiveConfig::kCacheNoRetention, "true"}}; + HiveConfig hiveConfig( + std::make_shared(std::move(configFromFile))); + auto emptySession = std::make_shared( + std::unordered_map()); + ASSERT_EQ( + hiveConfig.insertExistingPartitionsBehavior(emptySession.get()), + facebook::velox::connector::hive::HiveConfig:: + InsertExistingPartitionsBehavior::kOverwrite); + ASSERT_EQ(hiveConfig.maxPartitionsPerWriters(emptySession.get()), 120); + ASSERT_EQ(hiveConfig.immutablePartitions(), true); + ASSERT_EQ(hiveConfig.s3UseVirtualAddressing(), false); + ASSERT_EQ(hiveConfig.s3GetLogLevel(), "Warning"); + ASSERT_EQ(hiveConfig.s3UseSSL(), false); + ASSERT_EQ(hiveConfig.s3UseInstanceCredentials(), true); + ASSERT_EQ(hiveConfig.s3Endpoint(), "hey"); + ASSERT_EQ(hiveConfig.s3AccessKey(), std::optional("hello")); + ASSERT_EQ(hiveConfig.s3SecretKey(), std::optional("hello")); + ASSERT_EQ(hiveConfig.s3IAMRole(), std::optional("hello")); + ASSERT_EQ(hiveConfig.s3IAMRoleSessionName(), "velox"); + ASSERT_EQ(hiveConfig.gcsEndpoint(), "hey"); + ASSERT_EQ(hiveConfig.gcsScheme(), "http"); + ASSERT_EQ(hiveConfig.gcsCredentials(), "hey"); + ASSERT_EQ(hiveConfig.isOrcUseColumnNames(emptySession.get()), true); + ASSERT_EQ( + hiveConfig.isFileColumnNamesReadAsLowerCase(emptySession.get()), true); + ASSERT_EQ(hiveConfig.allowNullPartitionKeys(emptySession.get()), false); + ASSERT_EQ(hiveConfig.maxCoalescedBytes(), 100); + ASSERT_EQ(hiveConfig.maxCoalescedDistanceBytes(), 100); + ASSERT_EQ(hiveConfig.numCacheFileHandles(), 100); + ASSERT_EQ(hiveConfig.isFileHandleCacheEnabled(), false); + ASSERT_EQ( + hiveConfig.orcWriterMaxStripeSize(emptySession.get()), + 100L * 1024L * 1024L); + ASSERT_EQ( + hiveConfig.orcWriterMaxDictionaryMemory(emptySession.get()), + 100L * 1024L * 1024L); + ASSERT_EQ( + hiveConfig.isOrcWriterIntegerDictionaryEncodingEnabled( + emptySession.get()), + false); + ASSERT_EQ( + hiveConfig.isOrcWriterStringDictionaryEncodingEnabled(emptySession.get()), + false); + ASSERT_EQ(hiveConfig.sortWriterMaxOutputRows(emptySession.get()), 100); + ASSERT_EQ( + hiveConfig.sortWriterMaxOutputBytes(emptySession.get()), 100UL << 20); + ASSERT_EQ(hiveConfig.orcWriterMinCompressionSize(emptySession.get()), 512); + ASSERT_EQ(hiveConfig.orcWriterCompressionLevel(emptySession.get()), 1); + ASSERT_EQ( + hiveConfig.orcWriterLinearStripeSizeHeuristics(emptySession.get()), + false); + ASSERT_TRUE(hiveConfig.cacheNoRetention(emptySession.get())); +} + +TEST(HiveConfigTest, overrideSession) { + HiveConfig hiveConfig(std::make_shared( + std::unordered_map())); + std::unordered_map sessionOverride = { + {HiveConfig::kInsertExistingPartitionsBehaviorSession, "OVERWRITE"}, + {HiveConfig::kOrcUseColumnNamesSession, "true"}, + 
{HiveConfig::kFileColumnNamesReadAsLowerCaseSession, "true"}, + {HiveConfig::kOrcWriterMaxStripeSizeSession, "22MB"}, + {HiveConfig::kOrcWriterMaxDictionaryMemorySession, "22MB"}, + {HiveConfig::kOrcWriterIntegerDictionaryEncodingEnabledSession, "false"}, + {HiveConfig::kOrcWriterStringDictionaryEncodingEnabledSession, "false"}, + {HiveConfig::kSortWriterMaxOutputRowsSession, "20"}, + {HiveConfig::kSortWriterMaxOutputBytesSession, "20MB"}, + {HiveConfig::kPartitionPathAsLowerCaseSession, "false"}, + {HiveConfig::kAllowNullPartitionKeysSession, "false"}, + {HiveConfig::kIgnoreMissingFilesSession, "true"}, + {HiveConfig::kOrcWriterMinCompressionSizeSession, "512"}, + {HiveConfig::kOrcWriterCompressionLevelSession, "1"}, + {HiveConfig::kOrcWriterLinearStripeSizeHeuristicsSession, "false"}, + {HiveConfig::kCacheNoRetentionSession, "true"}}; + const auto session = + std::make_unique(std::move(sessionOverride)); + ASSERT_EQ( + hiveConfig.insertExistingPartitionsBehavior(session.get()), + facebook::velox::connector::hive::HiveConfig:: + InsertExistingPartitionsBehavior::kOverwrite); + ASSERT_EQ(hiveConfig.maxPartitionsPerWriters(session.get()), 100); + ASSERT_EQ(hiveConfig.immutablePartitions(), false); + ASSERT_EQ(hiveConfig.s3UseVirtualAddressing(), true); + ASSERT_EQ(hiveConfig.s3GetLogLevel(), "FATAL"); + ASSERT_EQ(hiveConfig.s3UseSSL(), true); + ASSERT_EQ(hiveConfig.s3UseInstanceCredentials(), false); + ASSERT_EQ(hiveConfig.s3Endpoint(), ""); + ASSERT_EQ(hiveConfig.s3AccessKey(), std::nullopt); + ASSERT_EQ(hiveConfig.s3SecretKey(), std::nullopt); + ASSERT_EQ(hiveConfig.s3IAMRole(), std::nullopt); + ASSERT_EQ(hiveConfig.s3IAMRoleSessionName(), "velox-session"); + ASSERT_EQ(hiveConfig.gcsEndpoint(), ""); + ASSERT_EQ(hiveConfig.gcsScheme(), "https"); + ASSERT_EQ(hiveConfig.gcsCredentials(), ""); + ASSERT_EQ(hiveConfig.isOrcUseColumnNames(session.get()), true); + ASSERT_EQ(hiveConfig.isFileColumnNamesReadAsLowerCase(session.get()), true); + + ASSERT_EQ(hiveConfig.maxCoalescedBytes(), 128 << 20); + ASSERT_EQ(hiveConfig.maxCoalescedDistanceBytes(), 512 << 10); + ASSERT_EQ(hiveConfig.numCacheFileHandles(), 20'000); + ASSERT_EQ(hiveConfig.isFileHandleCacheEnabled(), true); + ASSERT_EQ( + hiveConfig.orcWriterMaxStripeSize(session.get()), 22L * 1024L * 1024L); + ASSERT_EQ( + hiveConfig.orcWriterMaxDictionaryMemory(session.get()), + 22L * 1024L * 1024L); + ASSERT_EQ( + hiveConfig.isOrcWriterIntegerDictionaryEncodingEnabled(session.get()), + false); + ASSERT_EQ( + hiveConfig.isOrcWriterStringDictionaryEncodingEnabled(session.get()), + false); + ASSERT_EQ(hiveConfig.sortWriterMaxOutputRows(session.get()), 20); + ASSERT_EQ(hiveConfig.sortWriterMaxOutputBytes(session.get()), 20UL << 20); + ASSERT_EQ(hiveConfig.isPartitionPathAsLowerCase(session.get()), false); + ASSERT_EQ(hiveConfig.allowNullPartitionKeys(session.get()), false); + ASSERT_EQ(hiveConfig.ignoreMissingFiles(session.get()), true); + ASSERT_EQ( + hiveConfig.orcWriterLinearStripeSizeHeuristics(session.get()), false); + ASSERT_EQ(hiveConfig.orcWriterMinCompressionSize(session.get()), 512); + ASSERT_EQ(hiveConfig.orcWriterCompressionLevel(session.get()), 1); + ASSERT_TRUE(hiveConfig.cacheNoRetention(session.get())); +} diff --git a/velox/connectors/hive/tests/HiveConnectorTest.cpp b/velox/connectors/hive/tests/HiveConnectorTest.cpp index b0becde79d98a..49f12ac6a113b 100644 --- a/velox/connectors/hive/tests/HiveConnectorTest.cpp +++ b/velox/connectors/hive/tests/HiveConnectorTest.cpp @@ -17,8 +17,9 @@ #include #include 
"velox/exec/tests/utils/HiveConnectorTestBase.h" +#include "velox/common/base/tests/GTestUtils.h" #include "velox/connectors/hive/HiveConfig.h" -#include "velox/connectors/hive/HiveConnector.h" +#include "velox/connectors/hive/HiveConnectorUtil.h" #include "velox/connectors/hive/HiveDataSource.h" #include "velox/expression/ExprToSubfieldFilter.h" @@ -31,7 +32,7 @@ using namespace facebook::velox::exec::test; class HiveConnectorTest : public exec::test::HiveConnectorTestBase { protected: std::shared_ptr pool_ = - memory::addDefaultLeafMemoryPool(); + memory::memoryManager()->addLeafPool(); }; void validateNullConstant(const ScanSpec& spec, const Type& type) { @@ -62,6 +63,11 @@ groupSubfields(const std::vector& subfields) { return grouped; } +bool mapKeyIsNotNull(const ScanSpec& mapSpec) { + return dynamic_cast( + mapSpec.childByName(ScanSpec::kMapKeysFieldName)->filter()); +} + TEST_F(HiveConnectorTest, hiveConfig) { ASSERT_EQ( HiveConfig::insertExistingPartitionsBehaviorString( @@ -85,8 +91,15 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_multilevel) { VARCHAR(), ROW({{"c0c1c0", BIGINT()}, {"c0c1c1", BIGINT()}})))}}); auto rowType = ROW({{"c0", columnType}}); auto subfields = makeSubfields({"c0.c0c1[3][\"foo\"].c0c1c0"}); - auto scanSpec = HiveDataSource::makeScanSpec( - rowType, groupSubfields(subfields), {}, nullptr, pool_.get()); + auto scanSpec = makeScanSpec( + rowType, + groupSubfields(subfields), + {}, + nullptr, + {}, + {}, + nullptr, + pool_.get()); auto* c0c0 = scanSpec->childByName("c0")->childByName("c0c0"); validateNullConstant(*c0c0, *BIGINT()); auto* c0c1 = scanSpec->childByName("c0")->childByName("c0c1"); @@ -114,12 +127,15 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_mergeFields) { {"c0c0c2", BIGINT()}})}, {"c0c1", ROW({{"c0c1c0", BIGINT()}, {"c0c1c1", BIGINT()}})}}); auto rowType = ROW({{"c0", columnType}}); - auto scanSpec = HiveDataSource::makeScanSpec( + auto scanSpec = makeScanSpec( rowType, groupSubfields(makeSubfields( {"c0.c0c0.c0c0c0", "c0.c0c0.c0c0c2", "c0.c0c1", "c0.c0c1.c0c1c0"})), {}, nullptr, + {}, + {}, + nullptr, pool_.get()); auto* c0c0 = scanSpec->childByName("c0")->childByName("c0c0"); ASSERT_FALSE(c0c0->childByName("c0c0c0")->isConstant()); @@ -136,32 +152,53 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_mergeArray) { auto columnType = ARRAY(ROW({{"c0c0", BIGINT()}, {"c0c1", BIGINT()}, {"c0c2", BIGINT()}})); auto rowType = ROW({{"c0", columnType}}); - auto scanSpec = HiveDataSource::makeScanSpec( + auto scanSpec = makeScanSpec( rowType, groupSubfields(makeSubfields({"c0[1].c0c0", "c0[2].c0c2"})), {}, nullptr, + {}, + {}, + nullptr, pool_.get()); auto* c0 = scanSpec->childByName("c0"); ASSERT_EQ(c0->maxArrayElementsCount(), 2); + ASSERT_TRUE(c0->flatMapFeatureSelection().empty()); auto* elements = c0->childByName(ScanSpec::kArrayElementsFieldName); ASSERT_FALSE(elements->childByName("c0c0")->isConstant()); ASSERT_FALSE(elements->childByName("c0c2")->isConstant()); validateNullConstant(*elements->childByName("c0c1"), *BIGINT()); } +TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_mergeArrayNegative) { + auto columnType = + ARRAY(ROW({{"c0c0", BIGINT()}, {"c0c1", BIGINT()}, {"c0c2", BIGINT()}})); + auto rowType = ROW({{"c0", columnType}}); + auto subfields = makeSubfields({"c0[1].c0c0", "c0[-1].c0c2"}); + auto groupedSubfields = groupSubfields(subfields); + VELOX_ASSERT_USER_THROW( + makeScanSpec( + rowType, groupedSubfields, {}, nullptr, {}, {}, nullptr, pool_.get()), + "Non-positive array subscript 
cannot be push down"); +} + TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_mergeMap) { auto columnType = MAP(BIGINT(), ROW({{"c0c0", BIGINT()}, {"c0c1", BIGINT()}, {"c0c2", BIGINT()}})); auto rowType = ROW({{"c0", columnType}}); - auto scanSpec = HiveDataSource::makeScanSpec( + auto scanSpec = makeScanSpec( rowType, groupSubfields(makeSubfields({"c0[10].c0c0", "c0[20].c0c2"})), {}, nullptr, + {}, + {}, + nullptr, pool_.get()); auto* c0 = scanSpec->childByName("c0"); + ASSERT_EQ( + c0->flatMapFeatureSelection(), std::vector({"10", "20"})); auto* keysFilter = c0->childByName(ScanSpec::kMapKeysFieldName)->filter(); ASSERT_TRUE(keysFilter); ASSERT_TRUE(applyFilter(*keysFilter, 10)); @@ -179,14 +216,18 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_allSubscripts) { auto rowType = ROW({{"c0", columnType}}); for (auto* path : {"c0", "c0[*]", "c0[*][*]"}) { SCOPED_TRACE(path); - auto scanSpec = HiveDataSource::makeScanSpec( + auto scanSpec = makeScanSpec( rowType, groupSubfields(makeSubfields({path})), {}, nullptr, + {}, + {}, + nullptr, pool_.get()); auto* c0 = scanSpec->childByName("c0"); - ASSERT_FALSE(c0->childByName(ScanSpec::kMapKeysFieldName)->filter()); + ASSERT_TRUE(c0->flatMapFeatureSelection().empty()); + ASSERT_TRUE(mapKeyIsNotNull(*c0)); auto* values = c0->childByName(ScanSpec::kMapValuesFieldName); ASSERT_EQ( values->maxArrayElementsCount(), @@ -196,14 +237,17 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_allSubscripts) { ASSERT_FALSE(elements->childByName("c0c0")->isConstant()); ASSERT_FALSE(elements->childByName("c0c1")->isConstant()); } - auto scanSpec = HiveDataSource::makeScanSpec( + auto scanSpec = makeScanSpec( rowType, groupSubfields(makeSubfields({"c0[*][*].c0c0"})), {}, nullptr, + {}, + {}, + nullptr, pool_.get()); auto* c0 = scanSpec->childByName("c0"); - ASSERT_FALSE(c0->childByName(ScanSpec::kMapKeysFieldName)->filter()); + ASSERT_TRUE(mapKeyIsNotNull(*c0)); auto* values = c0->childByName(ScanSpec::kMapValuesFieldName); ASSERT_EQ( values->maxArrayElementsCount(), @@ -217,11 +261,14 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_allSubscripts) { TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_doubleMapKey) { auto rowType = ROW({{"c0", MAP(REAL(), BIGINT())}, {"c1", MAP(DOUBLE(), BIGINT())}}); - auto scanSpec = HiveDataSource::makeScanSpec( + auto scanSpec = makeScanSpec( rowType, groupSubfields(makeSubfields({"c0[0]", "c1[-1]"})), {}, nullptr, + {}, + {}, + nullptr, pool_.get()); auto* keysFilter = scanSpec->childByName("c0") ->childByName(ScanSpec::kMapKeysFieldName) @@ -242,12 +289,15 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_doubleMapKey) { ASSERT_FALSE(applyFilter(*keysFilter, -2.0)); // Integer min and max means infinities. 
- scanSpec = HiveDataSource::makeScanSpec( + scanSpec = makeScanSpec( rowType, groupSubfields(makeSubfields( {"c0[-9223372036854775808]", "c1[9223372036854775807]"})), {}, nullptr, + {}, + {}, + nullptr, pool_.get()); keysFilter = scanSpec->childByName("c0") ->childByName(ScanSpec::kMapKeysFieldName) @@ -259,12 +309,15 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_doubleMapKey) { ->filter(); ASSERT_TRUE(applyFilter(*keysFilter, 1e100)); ASSERT_FALSE(applyFilter(*keysFilter, 9223372036854700000.0)); - scanSpec = HiveDataSource::makeScanSpec( + scanSpec = makeScanSpec( rowType, groupSubfields(makeSubfields( {"c0[9223372036854775807]", "c0[-9223372036854775808]"})), {}, nullptr, + {}, + {}, + nullptr, pool_.get()); keysFilter = scanSpec->childByName("c0") ->childByName(ScanSpec::kMapKeysFieldName) @@ -274,11 +327,14 @@ TEST_F(HiveConnectorTest, makeScanSpec_requiredSubfields_doubleMapKey) { ASSERT_TRUE(applyFilter(*keysFilter, 1e30f)); // Unrepresentable values. - scanSpec = HiveDataSource::makeScanSpec( + scanSpec = makeScanSpec( rowType, groupSubfields(makeSubfields({"c0[-100000000]", "c0[100000000]"})), {}, nullptr, + {}, + {}, + nullptr, pool_.get()); keysFilter = scanSpec->childByName("c0") ->childByName(ScanSpec::kMapKeysFieldName) @@ -308,11 +364,14 @@ TEST_F(HiveConnectorTest, makeScanSpec_filtersNotInRequiredSubfields) { filters.emplace(Subfield("c0.c0c2"), exec::isNotNull()); filters.emplace(Subfield("c0.c0c3"), exec::isNotNull()); filters.emplace(Subfield("c1.c1c0.c1c0c0"), exec::equal(43)); - auto scanSpec = HiveDataSource::makeScanSpec( + auto scanSpec = makeScanSpec( ROW({{"c0", c0Type}}), groupSubfields(makeSubfields({"c0.c0c1", "c0.c0c3"})), filters, ROW({{"c0", c0Type}, {"c1", c1Type}}), + {}, + {}, + nullptr, pool_.get()); auto c0 = scanSpec->childByName("c0"); ASSERT_FALSE(c0->isConstant()); @@ -350,12 +409,15 @@ TEST_F(HiveConnectorTest, makeScanSpec_duplicateSubfields) { auto c0Type = MAP(BIGINT(), MAP(BIGINT(), BIGINT())); auto c1Type = MAP(VARCHAR(), MAP(BIGINT(), BIGINT())); auto rowType = ROW({{"c0", c0Type}, {"c1", c1Type}}); - auto scanSpec = HiveDataSource::makeScanSpec( + auto scanSpec = makeScanSpec( rowType, groupSubfields(makeSubfields( {"c0[10][1]", "c0[10][2]", "c1[\"foo\"][1]", "c1[\"foo\"][2]"})), {}, nullptr, + {}, + {}, + nullptr, pool_.get()); auto* c0 = scanSpec->childByName("c0"); ASSERT_EQ(c0->children().size(), 2); @@ -363,24 +425,65 @@ TEST_F(HiveConnectorTest, makeScanSpec_duplicateSubfields) { ASSERT_EQ(c1->children().size(), 2); } +// For TEXTFILE, partition key is not included in data columns. 
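[Editor's note] A quick orientation before the partition-key test that follows: this diff moves makeScanSpec off HiveDataSource and into velox/connectors/hive/HiveConnectorUtil.h as a free function, and every call site gains three arguments ahead of the pool. From the call sites alone, the fourth argument is the table's data columns and the fifth carries partition keys (the filterPartitionKey case below passes {{"ds", nullptr}} there); the roles of the remaining two new arguments are not visible in this diff, so the annotations here are inferences, not the header's parameter names:

```
// Annotated call shape, copied from the tests in this file; comments are
// editorial guesses except where a test exercises the argument directly.
auto scanSpec = makeScanSpec(
    rowType, // output row type
    groupSubfields(makeSubfields({"c0.c0c1"})), // required subfields
    {}, // subfield filters
    nullptr, // data columns (full file schema), per filtersNotInRequiredSubfields
    {}, // partition keys, per makeScanSpec_filterPartitionKey
    {}, // new argument, always empty in these tests
    nullptr, // new argument, always null in these tests
    pool_.get()); // memory pool
```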
+TEST_F(HiveConnectorTest, makeScanSpec_filterPartitionKey) { + auto rowType = ROW({{"c0", BIGINT()}}); + SubfieldFilters filters; + filters.emplace(Subfield("ds"), exec::equal("2023-10-13")); + auto scanSpec = makeScanSpec( + rowType, + {}, + filters, + rowType, + {{"ds", nullptr}}, + {}, + nullptr, + pool_.get()); + ASSERT_TRUE(scanSpec->childByName("c0")->projectOut()); + ASSERT_FALSE(scanSpec->childByName("ds")->projectOut()); +} + +TEST_F(HiveConnectorTest, makeScanSpec_prunedMapNonNullMapKey) { + auto rowType = + ROW({"c0"}, + {ROW( + {{"c0c0", MAP(BIGINT(), MAP(BIGINT(), BIGINT()))}, + {"c0c1", BIGINT()}})}); + auto scanSpec = makeScanSpec( + rowType, + groupSubfields(makeSubfields({"c0.c0c1"})), + {}, + nullptr, + {}, + {}, + nullptr, + pool_.get()); + auto* c0 = scanSpec->childByName("c0"); + ASSERT_EQ(c0->children().size(), 2); + ASSERT_TRUE(c0->childByName("c0c0")->isConstant()); +} + TEST_F(HiveConnectorTest, extractFiltersFromRemainingFilter) { - core::QueryCtx queryCtx; - exec::SimpleExpressionEvaluator evaluator(&queryCtx, pool_.get()); + auto queryCtx = core::QueryCtx::create(); + exec::SimpleExpressionEvaluator evaluator(queryCtx.get(), pool_.get()); auto rowType = ROW({"c0", "c1", "c2"}, {BIGINT(), BIGINT(), DECIMAL(20, 0)}); auto expr = parseExpr("not (c0 > 0 or c1 > 0)", rowType); SubfieldFilters filters; - auto remaining = HiveDataSource::extractFiltersFromRemainingFilter( - expr, &evaluator, false, filters); + double sampleRate = 1; + auto remaining = extractFiltersFromRemainingFilter( + expr, &evaluator, false, filters, sampleRate); ASSERT_FALSE(remaining); + ASSERT_EQ(sampleRate, 1); ASSERT_EQ(filters.size(), 2); ASSERT_GT(filters.count(Subfield("c0")), 0); ASSERT_GT(filters.count(Subfield("c1")), 0); expr = parseExpr("not (c0 > 0 or c1 > c0)", rowType); filters.clear(); - remaining = HiveDataSource::extractFiltersFromRemainingFilter( - expr, &evaluator, false, filters); + remaining = extractFiltersFromRemainingFilter( + expr, &evaluator, false, filters, sampleRate); + ASSERT_EQ(sampleRate, 1); ASSERT_EQ(filters.size(), 1); ASSERT_GT(filters.count(Subfield("c0")), 0); ASSERT_TRUE(remaining); @@ -389,8 +492,9 @@ TEST_F(HiveConnectorTest, extractFiltersFromRemainingFilter) { expr = parseExpr( "not (c2 > 1::decimal(20, 0) or c2 < 0::decimal(20, 0))", rowType); filters.clear(); - remaining = HiveDataSource::extractFiltersFromRemainingFilter( - expr, &evaluator, false, filters); + remaining = extractFiltersFromRemainingFilter( + expr, &evaluator, false, filters, sampleRate); + ASSERT_EQ(sampleRate, 1); ASSERT_GT(filters.count(Subfield("c2")), 0); // Change these once HUGEINT filter merge is fixed. 
ASSERT_TRUE(remaining); @@ -398,5 +502,49 @@ TEST_F(HiveConnectorTest, extractFiltersFromRemainingFilter) { remaining->toString(), "not(lt(ROW[\"c2\"],cast 0 as DECIMAL(20, 0)))"); } +TEST_F(HiveConnectorTest, prestoTableSampling) { + auto queryCtx = core::QueryCtx::create(); + exec::SimpleExpressionEvaluator evaluator(queryCtx.get(), pool_.get()); + auto rowType = ROW({"c0"}, {BIGINT()}); + + auto expr = parseExpr("rand() < 0.5", rowType); + SubfieldFilters filters; + double sampleRate = 1; + auto remaining = extractFiltersFromRemainingFilter( + expr, &evaluator, false, filters, sampleRate); + ASSERT_FALSE(remaining); + ASSERT_EQ(sampleRate, 0.5); + ASSERT_TRUE(filters.empty()); + + expr = parseExpr("c0 > 0 and rand() < 0.5", rowType); + filters.clear(); + sampleRate = 1; + remaining = extractFiltersFromRemainingFilter( + expr, &evaluator, false, filters, sampleRate); + ASSERT_FALSE(remaining); + ASSERT_EQ(sampleRate, 0.5); + ASSERT_EQ(filters.size(), 1); + ASSERT_GT(filters.count(Subfield("c0")), 0); + + expr = parseExpr("rand() < 0.5 and rand() < 0.5", rowType); + filters.clear(); + sampleRate = 1; + remaining = extractFiltersFromRemainingFilter( + expr, &evaluator, false, filters, sampleRate); + ASSERT_FALSE(remaining); + ASSERT_EQ(sampleRate, 0.25); + ASSERT_TRUE(filters.empty()); + + expr = parseExpr("c0 > 0 or rand() < 0.5", rowType); + filters.clear(); + sampleRate = 1; + remaining = extractFiltersFromRemainingFilter( + expr, &evaluator, false, filters, sampleRate); + ASSERT_TRUE(remaining); + ASSERT_EQ(*remaining, *expr); + ASSERT_EQ(sampleRate, 1); + ASSERT_TRUE(filters.empty()); +} + } // namespace } // namespace facebook::velox::connector::hive diff --git a/velox/connectors/hive/tests/HiveConnectorUtilTest.cpp b/velox/connectors/hive/tests/HiveConnectorUtilTest.cpp new file mode 100644 index 0000000000000..394304b7951d9 --- /dev/null +++ b/velox/connectors/hive/tests/HiveConnectorUtilTest.cpp @@ -0,0 +1,420 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "velox/connectors/hive/HiveConnectorUtil.h"
+#include <gtest/gtest.h>
+#include "velox/connectors/hive/HiveConfig.h"
+#include "velox/connectors/hive/HiveConnectorSplit.h"
+#include "velox/connectors/hive/TableHandle.h"
+#include "velox/exec/tests/utils/HiveConnectorTestBase.h"
+#include "velox/exec/tests/utils/PrefixSortUtils.h"
+
+#include "velox/dwio/dwrf/writer/Writer.h"
+
+#ifdef VELOX_ENABLE_PARQUET
+#include "velox/dwio/parquet/writer/Writer.h"
+#endif
+
+namespace facebook::velox::connector {
+
+using namespace dwio::common;
+
+class HiveConnectorUtilTest : public exec::test::HiveConnectorTestBase {
+ protected:
+  static bool compareSerDeOptions(
+      const SerDeOptions& l,
+      const SerDeOptions& r) {
+    return l.isEscaped == r.isEscaped && l.escapeChar == r.escapeChar &&
+        l.lastColumnTakesRest == r.lastColumnTakesRest &&
+        l.nullString == r.nullString && l.separators == r.separators;
+  }
+
+  std::shared_ptr<memory::MemoryPool> pool_ =
+      memory::memoryManager()->addLeafPool();
+};
+
+TEST_F(HiveConnectorUtilTest, configureReaderOptions) {
+  config::ConfigBase sessionProperties({});
+  auto connectorQueryCtx = std::make_unique<ConnectorQueryCtx>(
+      pool_.get(),
+      pool_.get(),
+      &sessionProperties,
+      nullptr,
+      exec::test::defaultPrefixSortConfig(),
+      nullptr,
+      nullptr,
+      "query.HiveConnectorUtilTest",
+      "task.HiveConnectorUtilTest",
+      "planNodeId.HiveConnectorUtilTest",
+      0,
+      "");
+  auto hiveConfig =
+      std::make_shared<hive::HiveConfig>(std::make_shared<config::ConfigBase>(
+          std::unordered_map<std::string, std::string>()));
+  const std::unordered_map<std::string, std::optional<std::string>>
+      partitionKeys;
+  const std::unordered_map<std::string, std::string> customSplitInfo;
+
+  // Dynamic parameters.
+  dwio::common::ReaderOptions readerOptions(pool_.get());
+  FileFormat fileFormat{FileFormat::DWRF};
+  std::unordered_map<std::string, std::string> tableParameters;
+  std::unordered_map<std::string, std::string> serdeParameters;
+  SerDeOptions expectedSerDe;
+
+  auto createTableHandle = [&]() {
+    return std::make_shared<hive::HiveTableHandle>(
+        "testConnectorId",
+        "testTable",
+        false,
+        hive::SubfieldFilters{},
+        nullptr,
+        nullptr,
+        tableParameters);
+  };
+
+  auto createSplit = [&]() {
+    return std::make_shared<hive::HiveConnectorSplit>(
+        "testConnectorId",
+        "/tmp/",
+        fileFormat,
+        0UL,
+        std::numeric_limits<uint64_t>::max(),
+        partitionKeys,
+        std::nullopt,
+        customSplitInfo,
+        nullptr,
+        serdeParameters);
+  };
+
+  auto performConfigure = [&]() {
+    auto tableHandle = createTableHandle();
+    auto split = createSplit();
+    configureReaderOptions(
+        readerOptions, hiveConfig, connectorQueryCtx.get(), tableHandle, split);
+  };
+
+  auto clearDynamicParameters = [&](FileFormat newFileFormat) {
+    readerOptions = dwio::common::ReaderOptions(pool_.get());
+    fileFormat = newFileFormat;
+    tableParameters.clear();
+    serdeParameters.clear();
+    expectedSerDe = SerDeOptions{};
+  };
+
+  // Default.
+ performConfigure(); + EXPECT_EQ(readerOptions.fileFormat(), fileFormat); + EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); + EXPECT_EQ(readerOptions.loadQuantum(), hiveConfig->loadQuantum()); + EXPECT_EQ(readerOptions.maxCoalesceBytes(), hiveConfig->maxCoalescedBytes()); + EXPECT_EQ( + readerOptions.maxCoalesceDistance(), + hiveConfig->maxCoalescedDistanceBytes()); + EXPECT_EQ( + readerOptions.fileColumnNamesReadAsLowerCase(), + hiveConfig->isFileColumnNamesReadAsLowerCase(&sessionProperties)); + EXPECT_EQ( + readerOptions.useColumnNamesForColumnMapping(), + hiveConfig->isOrcUseColumnNames(&sessionProperties)); + EXPECT_EQ( + readerOptions.footerEstimatedSize(), hiveConfig->footerEstimatedSize()); + EXPECT_EQ( + readerOptions.filePreloadThreshold(), hiveConfig->filePreloadThreshold()); + EXPECT_EQ(readerOptions.prefetchRowGroups(), hiveConfig->prefetchRowGroups()); + + // Modify field delimiter and change the file format. + clearDynamicParameters(FileFormat::TEXT); + serdeParameters[SerDeOptions::kFieldDelim] = '\t'; + expectedSerDe.separators[size_t(SerDeSeparator::FIELD_DELIM)] = '\t'; + performConfigure(); + EXPECT_EQ(readerOptions.fileFormat(), fileFormat); + EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); + + // Modify collection delimiter. + clearDynamicParameters(FileFormat::TEXT); + serdeParameters[SerDeOptions::kCollectionDelim] = '='; + expectedSerDe.separators[size_t(SerDeSeparator::COLLECTION_DELIM)] = '='; + performConfigure(); + EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); + + // Modify map key delimiter. + clearDynamicParameters(FileFormat::TEXT); + serdeParameters[SerDeOptions::kMapKeyDelim] = '&'; + expectedSerDe.separators[size_t(SerDeSeparator::MAP_KEY_DELIM)] = '&'; + performConfigure(); + EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); + + // Modify null string. + clearDynamicParameters(FileFormat::TEXT); + tableParameters[TableParameter::kSerializationNullFormat] = "x-x"; + expectedSerDe.nullString = "x-x"; + performConfigure(); + EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); + + // Empty escape delim means default escape char. + clearDynamicParameters(FileFormat::TEXT); + serdeParameters[SerDeOptions::kEscapeChar] = ""; + expectedSerDe.escapeChar = '\\'; + expectedSerDe.isEscaped = true; + performConfigure(); + EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); + + // Convertible to byte escape char - use it. + clearDynamicParameters(FileFormat::TEXT); + serdeParameters[SerDeOptions::kEscapeChar] = "38"; + expectedSerDe.escapeChar = '&'; + expectedSerDe.isEscaped = true; + performConfigure(); + EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); + + // Overflow byte escape char - fall back to the 1st character of the string. + clearDynamicParameters(FileFormat::TEXT); + serdeParameters[SerDeOptions::kEscapeChar] = "381"; + expectedSerDe.escapeChar = '3'; + expectedSerDe.isEscaped = true; + performConfigure(); + EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); + + // Not convertible string - fall back to the 1st character of the string. 
+ clearDynamicParameters(FileFormat::TEXT); + serdeParameters[SerDeOptions::kEscapeChar] = "7!"; + expectedSerDe.escapeChar = '7'; + expectedSerDe.isEscaped = true; + performConfigure(); + EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); + + // Modify all previous together. + clearDynamicParameters(FileFormat::TEXT); + serdeParameters[SerDeOptions::kFieldDelim] = '~'; + expectedSerDe.separators[size_t(SerDeSeparator::FIELD_DELIM)] = '~'; + serdeParameters[SerDeOptions::kCollectionDelim] = '$'; + expectedSerDe.separators[size_t(SerDeSeparator::COLLECTION_DELIM)] = '$'; + serdeParameters[SerDeOptions::kMapKeyDelim] = '*'; + expectedSerDe.separators[size_t(SerDeSeparator::MAP_KEY_DELIM)] = '*'; + serdeParameters[SerDeOptions::kEscapeChar] = '*'; + expectedSerDe.escapeChar = '*'; + expectedSerDe.isEscaped = true; + tableParameters[TableParameter::kSerializationNullFormat] = ""; + expectedSerDe.nullString = ""; + performConfigure(); + EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); + + // Tests other custom reader options. + clearDynamicParameters(FileFormat::TEXT); + std::unordered_map customHiveConfigProps; + customHiveConfigProps[hive::HiveConfig::kLoadQuantum] = "321"; + customHiveConfigProps[hive::HiveConfig::kMaxCoalescedBytes] = "129"; + customHiveConfigProps[hive::HiveConfig::kMaxCoalescedDistanceBytes] = "513"; + customHiveConfigProps[hive::HiveConfig::kFileColumnNamesReadAsLowerCase] = + "true"; + customHiveConfigProps[hive::HiveConfig::kOrcUseColumnNames] = "true"; + customHiveConfigProps[hive::HiveConfig::kFooterEstimatedSize] = "1111"; + customHiveConfigProps[hive::HiveConfig::kFilePreloadThreshold] = "9999"; + customHiveConfigProps[hive::HiveConfig::kPrefetchRowGroups] = "10"; + hiveConfig = std::make_shared( + std::make_shared(std::move(customHiveConfigProps))); + performConfigure(); + EXPECT_EQ(readerOptions.loadQuantum(), hiveConfig->loadQuantum()); + EXPECT_EQ(readerOptions.maxCoalesceBytes(), hiveConfig->maxCoalescedBytes()); + EXPECT_EQ( + readerOptions.maxCoalesceDistance(), + hiveConfig->maxCoalescedDistanceBytes()); + EXPECT_EQ( + readerOptions.fileColumnNamesReadAsLowerCase(), + hiveConfig->isFileColumnNamesReadAsLowerCase(&sessionProperties)); + EXPECT_EQ( + readerOptions.useColumnNamesForColumnMapping(), + hiveConfig->isOrcUseColumnNames(&sessionProperties)); + EXPECT_EQ( + readerOptions.footerEstimatedSize(), hiveConfig->footerEstimatedSize()); + EXPECT_EQ( + readerOptions.filePreloadThreshold(), hiveConfig->filePreloadThreshold()); + EXPECT_EQ(readerOptions.prefetchRowGroups(), hiveConfig->prefetchRowGroups()); +} + +TEST_F(HiveConnectorUtilTest, configureRowReaderOptions) { + auto split = + std::make_shared("", "", FileFormat::UNKNOWN); + auto rowType = ROW({{"float_features", MAP(INTEGER(), REAL())}}); + auto spec = std::make_shared(""); + spec->addAllChildFields(*rowType); + auto* float_features = spec->childByName("float_features"); + float_features->childByName(common::ScanSpec::kMapKeysFieldName) + ->setFilter(common::createBigintValues({1, 3}, false)); + float_features->setFlatMapFeatureSelection({"1", "3"}); +} + +TEST_F( + HiveConnectorUtilTest, + updateWriterOptionsFromHiveConfigDWRFWithoutSessionProperties) { + auto fileFormat = dwio::common::FileFormat::DWRF; + std::unordered_map connectorConfig = { + {hive::HiveConfig::kOrcWriterMaxStripeSize, "100MB"}, + {hive::HiveConfig::kOrcWriterMaxDictionaryMemory, "128MB"}, + {hive::HiveConfig::kOrcWriterIntegerDictionaryEncodingEnabled, "true"}, + 
{hive::HiveConfig::kOrcWriterStringDictionaryEncodingEnabled, "false"}, + {hive::HiveConfig::kOrcWriterLinearStripeSizeHeuristics, "true"}, + {hive::HiveConfig::kOrcWriterMinCompressionSize, "512"}, + {hive::HiveConfig::kOrcWriterCompressionLevel, "1"}}; + auto hiveConfig = std::make_shared( + std::make_shared(std::move(connectorConfig))); + std::shared_ptr connectorSessionProperties = + std::make_shared( + std::unordered_map()); + std::shared_ptr options = + std::make_shared(); + options->compressionKind = velox::common::CompressionKind_ZLIB; + + updateWriterOptionsFromHiveConfig( + fileFormat, hiveConfig, connectorSessionProperties.get(), options); + + auto dwrfOptions = std::dynamic_pointer_cast(options); + ASSERT_EQ( + dwrfOptions->config->get(dwrf::Config::COMPRESSION.key), + "1"); + ASSERT_EQ( + dwrfOptions->config->get(dwrf::Config::STRIPE_SIZE.key), + std::to_string(100 * 1024 * 1024)); + ASSERT_EQ( + dwrfOptions->config->get( + dwrf::Config::MAX_DICTIONARY_SIZE.key), + std::to_string(128 * 1024 * 1024)); + ASSERT_EQ( + dwrfOptions->config->get( + dwrf::Config::INTEGER_DICTIONARY_ENCODING_ENABLED.key), + true); + ASSERT_EQ( + dwrfOptions->config->get( + dwrf::Config::STRING_DICTIONARY_ENCODING_ENABLED.key), + false); + ASSERT_EQ( + dwrfOptions->config->get( + dwrf::Config::LINEAR_STRIPE_SIZE_HEURISTICS.key), + true); + ASSERT_EQ( + dwrfOptions->config->get( + dwrf::Config::COMPRESSION_BLOCK_SIZE_MIN.key), + "512"); + ASSERT_EQ( + dwrfOptions->config->get( + dwrf::Config::ZLIB_COMPRESSION_LEVEL.key), + "1"); + ASSERT_EQ( + dwrfOptions->config->get( + dwrf::Config::ZSTD_COMPRESSION_LEVEL.key), + "1"); +} + +TEST_F( + HiveConnectorUtilTest, + updateWriterOptionsFromHiveConfigDWRFWithSessionProperties) { + auto fileFormat = dwio::common::FileFormat::DWRF; + std::unordered_map connectorConfig = { + {hive::HiveConfig::kOrcWriterMaxStripeSize, "100MB"}, + {hive::HiveConfig::kOrcWriterMaxDictionaryMemory, "128MB"}, + {hive::HiveConfig::kOrcWriterIntegerDictionaryEncodingEnabled, "true"}, + {hive::HiveConfig::kOrcWriterStringDictionaryEncodingEnabled, "false"}, + {hive::HiveConfig::kOrcWriterLinearStripeSizeHeuristics, "true"}, + {hive::HiveConfig::kOrcWriterMinCompressionSize, "512"}, + {hive::HiveConfig::kOrcWriterCompressionLevel, "1"}}; + auto hiveConfig = std::make_shared( + std::make_shared(std::move(connectorConfig))); + + std::unordered_map sessionConfig = { + {hive::HiveConfig::kOrcWriterMaxStripeSizeSession, "128MB"}, + {hive::HiveConfig::kOrcWriterMaxDictionaryMemorySession, "100MB"}, + {hive::HiveConfig::kOrcWriterIntegerDictionaryEncodingEnabledSession, + "false"}, + {hive::HiveConfig::kOrcWriterStringDictionaryEncodingEnabledSession, + "true"}, + {hive::HiveConfig::kOrcWriterLinearStripeSizeHeuristicsSession, "false"}, + {hive::HiveConfig::kOrcWriterMinCompressionSizeSession, "1024"}, + {hive::HiveConfig::kOrcWriterCompressionLevelSession, "2"}}; + + std::shared_ptr connectorSessionProperties = + std::make_shared(std::move(sessionConfig)); + std::shared_ptr options = + std::make_shared(); + options->compressionKind = velox::common::CompressionKind_ZLIB; + + updateWriterOptionsFromHiveConfig( + fileFormat, hiveConfig, connectorSessionProperties.get(), options); + + auto dwrfOptions = std::dynamic_pointer_cast(options); + ASSERT_EQ( + dwrfOptions->config->get(dwrf::Config::COMPRESSION.key), + "1"); + ASSERT_EQ( + dwrfOptions->config->get(dwrf::Config::STRIPE_SIZE.key), + std::to_string(128 * 1024 * 1024)); + ASSERT_EQ( + dwrfOptions->config->get( + 
dwrf::Config::MAX_DICTIONARY_SIZE.key), + std::to_string(100 * 1024 * 1024)); + ASSERT_EQ( + dwrfOptions->config->get( + dwrf::Config::INTEGER_DICTIONARY_ENCODING_ENABLED.key), + false); + ASSERT_EQ( + dwrfOptions->config->get( + dwrf::Config::STRING_DICTIONARY_ENCODING_ENABLED.key), + true); + ASSERT_EQ( + dwrfOptions->config->get( + dwrf::Config::LINEAR_STRIPE_SIZE_HEURISTICS.key), + false); + ASSERT_EQ( + dwrfOptions->config->get( + dwrf::Config::COMPRESSION_BLOCK_SIZE_MIN.key), + "1024"); + ASSERT_EQ( + dwrfOptions->config->get( + dwrf::Config::ZLIB_COMPRESSION_LEVEL.key), + "2"); + ASSERT_EQ( + dwrfOptions->config->get( + dwrf::Config::ZSTD_COMPRESSION_LEVEL.key), + "2"); +} + +#ifdef VELOX_ENABLE_PARQUET +TEST_F(HiveConnectorUtilTest, updateWriterOptionsFromHiveConfigParquet) { + auto fileFormat = dwio::common::FileFormat::PARQUET; + std::unordered_map connectorConfig = { + {parquet::WriterOptions::kParquetSessionWriteTimestampUnit, "3"}, + {core::QueryConfig::kSessionTimezone, "UTC"}}; + auto hiveConfig = std::make_shared( + std::make_shared(std::move(connectorConfig))); + std::shared_ptr connectorSessionProperties = + std::make_shared( + std::unordered_map()); + std::shared_ptr options = + std::make_shared(); + options->compressionKind = velox::common::CompressionKind_ZLIB; + + updateWriterOptionsFromHiveConfig( + fileFormat, hiveConfig, connectorSessionProperties.get(), options); + + auto parquetOptions = + std::dynamic_pointer_cast(options); + ASSERT_EQ( + parquetOptions->parquetWriteTimestampUnit.value(), TimestampUnit::kMilli); + ASSERT_EQ(parquetOptions->parquetWriteTimestampTimeZone.value(), "UTC"); +} +#endif + +} // namespace facebook::velox::connector diff --git a/velox/connectors/hive/tests/HiveDataSinkTest.cpp b/velox/connectors/hive/tests/HiveDataSinkTest.cpp index 2504d068e000e..e464e14784ef9 100644 --- a/velox/connectors/hive/tests/HiveDataSinkTest.cpp +++ b/velox/connectors/hive/tests/HiveDataSinkTest.cpp @@ -18,11 +18,23 @@ #include "velox/exec/tests/utils/HiveConnectorTestBase.h" #include +#include #include "velox/common/base/Fs.h" #include "velox/common/base/tests/GTestUtils.h" -#include "velox/core/Config.h" +#include "velox/common/testutil/TestValue.h" +#include "velox/dwio/common/BufferedInput.h" #include "velox/dwio/common/Options.h" +#include "velox/dwio/dwrf/reader/DwrfReader.h" +#include "velox/dwio/dwrf/writer/FlushPolicy.h" +#include "velox/dwio/dwrf/writer/Writer.h" + +#ifdef VELOX_ENABLE_PARQUET +#include "velox/dwio/parquet/reader/ParquetReader.h" +#include "velox/dwio/parquet/writer/Writer.h" +#endif + #include "velox/exec/tests/utils/PlanBuilder.h" +#include "velox/exec/tests/utils/PrefixSortUtils.h" #include "velox/exec/tests/utils/TempDirectoryPath.h" #include "velox/vector/fuzzer/VectorFuzzer.h" @@ -31,62 +43,141 @@ namespace { using namespace facebook::velox::common; using namespace facebook::velox::exec::test; - -constexpr const char* kHiveConnectorId = "test-hive"; +using namespace facebook::velox::common::testutil; class HiveDataSinkTest : public exec::test::HiveConnectorTestBase { protected: void SetUp() override { + HiveConnectorTestBase::SetUp(); Type::registerSerDe(); HiveSortingColumn::registerSerDe(); HiveBucketProperty::registerSerDe(); rowType_ = - ROW({"c0", "c1", "c2", "c3", "c4", "c5"}, - {BIGINT(), INTEGER(), SMALLINT(), REAL(), DOUBLE(), VARCHAR()}); + ROW({"c0", "c1", "c2", "c3", "c4", "c5", "c6"}, + {BIGINT(), + INTEGER(), + SMALLINT(), + REAL(), + DOUBLE(), + VARCHAR(), + BOOLEAN()}); + + setupMemoryPools(); + + spillExecutor_ 
= std::make_unique( + std::thread::hardware_concurrency()); + } + + void TearDown() override { + connectorQueryCtx_.reset(); + connectorPool_.reset(); + opPool_.reset(); + root_.reset(); + HiveConnectorTestBase::TearDown(); + } + + std::vector createVectors(int vectorSize, int numVectors) { + VectorFuzzer::Options options; + options.vectorSize = vectorSize; + VectorFuzzer fuzzer(options, pool()); + std::vector vectors; + for (int i = 0; i < numVectors; ++i) { + vectors.push_back(fuzzer.fuzzInputRow(rowType_)); + } + return vectors; + } + + std::unique_ptr getSpillConfig( + const std::string& spillPath, + uint64_t writerFlushThreshold) { + return std::make_unique( + [spillPath]() -> const std::string& { return spillPath; }, + [&](uint64_t) {}, + "", + 0, + 0, + /*readBufferSize=*/1 << 20, + spillExecutor_.get(), + 10, + 20, + 0, + 0, + 0, + 0, + writerFlushThreshold, + "none"); + } + + void setupMemoryPools() { + connectorQueryCtx_.reset(); + connectorPool_.reset(); + opPool_.reset(); + root_.reset(); + + root_ = memory::memoryManager()->addRootPool( + "HiveDataSinkTest", 1L << 30, exec::MemoryReclaimer::create()); + opPool_ = root_->addLeafChild("operator"); + connectorPool_ = + root_->addAggregateChild("connector", exec::MemoryReclaimer::create()); + connectorQueryCtx_ = std::make_unique( opPool_.get(), connectorPool_.get(), + connectorSessionProperties_.get(), nullptr, - connectorConfig_.get(), - nullptr, + exec::test::defaultPrefixSortConfig(), nullptr, nullptr, "query.HiveDataSinkTest", "task.HiveDataSinkTest", "planNodeId.HiveDataSinkTest", - 0); - - auto hiveConnector = - connector::getConnectorFactory( - connector::hive::HiveConnectorFactory::kHiveConnectorName) - ->newConnector(kHiveConnectorId, nullptr); - connector::registerConnector(std::move(hiveConnector)); + 0, + ""); } std::shared_ptr createHiveInsertTableHandle( const RowTypePtr& outputRowType, - const std::string& outputDirectoryPath) { + const std::string& outputDirectoryPath, + dwio::common::FileFormat fileFormat = dwio::common::FileFormat::DWRF, + const std::vector& partitionedBy = {}, + const std::shared_ptr& + bucketProperty = nullptr, + const std::shared_ptr& writerOptions = + nullptr) { return makeHiveInsertTableHandle( outputRowType->names(), outputRowType->children(), - {}, - nullptr, + partitionedBy, + bucketProperty, makeLocationHandle( outputDirectoryPath, std::nullopt, connector::hive::LocationHandle::TableType::kNew), - dwio::common::FileFormat::DWRF, - CompressionKind::CompressionKind_ZSTD); + fileFormat, + CompressionKind::CompressionKind_ZSTD, + writerOptions); } std::shared_ptr createDataSink( const RowTypePtr& rowType, - const std::string& outputDirectoryPath) { + const std::string& outputDirectoryPath, + dwio::common::FileFormat fileFormat = dwio::common::FileFormat::DWRF, + const std::vector& partitionedBy = {}, + const std::shared_ptr& + bucketProperty = nullptr, + const std::shared_ptr& writerOptions = + nullptr) { return std::make_shared( rowType, - createHiveInsertTableHandle(rowType, outputDirectoryPath), + createHiveInsertTableHandle( + rowType, + outputDirectoryPath, + fileFormat, + partitionedBy, + bucketProperty, + writerOptions), connectorQueryCtx_.get(), CommitStrategy::kNoCommit, connectorConfig_); @@ -102,28 +193,39 @@ class HiveDataSinkTest : public exec::test::HiveConnectorTestBase { return files; } - void verifyWrittenData(const std::string& dirPath) { + void verifyWrittenData(const std::string& dirPath, int32_t numFiles = 1) { const std::vector filePaths = listFiles(dirPath); - 
ASSERT_EQ(filePaths.size(), 1); + ASSERT_EQ(filePaths.size(), numFiles); + std::vector> splits; + std::for_each(filePaths.begin(), filePaths.end(), [&](auto filePath) { + splits.push_back(makeHiveConnectorSplit(filePath)); + }); HiveConnectorTestBase::assertQuery( PlanBuilder().tableScan(rowType_).planNode(), - {makeHiveConnectorSplit(filePaths[0])}, + splits, fmt::format("SELECT * FROM tmp")); } + void setConnectorQueryContext( + std::unique_ptr connectorQueryCtx) { + connectorQueryCtx_ = std::move(connectorQueryCtx); + } + const std::shared_ptr pool_ = - memory::addDefaultLeafMemoryPool(); - const std::shared_ptr root_ = - memory::defaultMemoryManager().addRootPool("HiveDataSinkTest"); - const std::shared_ptr opPool_ = - root_->addLeafChild("operator"); - const std::shared_ptr connectorPool_ = - root_->addAggregateChild("connector"); - const std::shared_ptr connectorConfig_{ - std::make_unique()}; + memory::memoryManager()->addLeafPool(); + std::shared_ptr root_; + std::shared_ptr opPool_; + std::shared_ptr connectorPool_; RowTypePtr rowType_; + std::shared_ptr connectorSessionProperties_ = + std::make_shared( + std::unordered_map()); std::unique_ptr connectorQueryCtx_; + std::shared_ptr connectorConfig_ = + std::make_shared(std::make_shared( + std::unordered_map())); + std::unique_ptr spillExecutor_; }; TEST_F(HiveDataSinkTest, hiveSortingColumn) { @@ -397,58 +499,116 @@ TEST_F(HiveDataSinkTest, hiveBucketProperty) { } TEST_F(HiveDataSinkTest, basic) { + const auto outputDirectory = TempDirectoryPath::create(); + auto dataSink = createDataSink(rowType_, outputDirectory->getPath()); + auto stats = dataSink->stats(); + ASSERT_TRUE(stats.empty()) << stats.toString(); + ASSERT_EQ( + stats.toString(), + "numWrittenBytes 0B numWrittenFiles 0 spillRuns[0] spilledInputBytes[0B] " + "spilledBytes[0B] spilledRows[0] spilledPartitions[0] spilledFiles[0] " + "spillFillTimeNanos[0ns] spillSortTimeNanos[0ns] spillSerializationTimeNanos[0ns] " + "spillWrites[0] spillFlushTimeNanos[0ns] spillWriteTimeNanos[0ns] " + "maxSpillExceededLimitCount[0] spillReadBytes[0B] spillReads[0] " + "spillReadTimeNanos[0ns] spillReadDeserializationTimeNanos[0ns]"); + const int numBatches = 10; + const auto vectors = createVectors(500, numBatches); + for (const auto& vector : vectors) { + dataSink->appendData(vector); + } + stats = dataSink->stats(); + ASSERT_FALSE(stats.empty()); + ASSERT_GT(stats.numWrittenBytes, 0); + ASSERT_EQ(stats.numWrittenFiles, 0); + + const auto partitions = dataSink->close(); + stats = dataSink->stats(); + ASSERT_FALSE(stats.empty()); + ASSERT_EQ(partitions.size(), 1); + + createDuckDbTable(vectors); + verifyWrittenData(outputDirectory->getPath()); +} + +TEST_F(HiveDataSinkTest, basicBucket) { const auto outputDirectory = TempDirectoryPath::create(); - auto dataSink = createDataSink(rowType_, outputDirectory->path); - - VectorFuzzer::Options options; - options.vectorSize = 500; - VectorFuzzer fuzzer(options, pool()); - std::vector vectors; - for (int i = 0; i < numBatches; ++i) { - vectors.push_back(fuzzer.fuzzRow(rowType_)); - dataSink->appendData(vectors.back()); + + const int32_t numBuckets = 4; + auto bucketProperty = std::make_shared( + HiveBucketProperty::Kind::kHiveCompatible, + numBuckets, + std::vector{"c0"}, + std::vector{BIGINT()}, + std::vector>{ + std::make_shared( + "c1", core::SortOrder{false, false})}); + auto dataSink = createDataSink( + rowType_, + outputDirectory->getPath(), + dwio::common::FileFormat::DWRF, + {}, + bucketProperty); + auto stats = dataSink->stats(); + 
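From here on the tests read accounting through `DataSink::stats()` rather than the removed `getCompletedBytes()` and `close(bool)` surface. The shape of the new loop, using only calls that appear in this diff:

```cpp
// Append batches, then inspect the running counters; files are only
// finalized (and numWrittenFiles bumped) by close().
for (const auto& vector : vectors) {
  dataSink->appendData(vector);
}
auto stats = dataSink->stats();
ASSERT_GT(stats.numWrittenBytes, 0);
ASSERT_EQ(stats.numWrittenFiles, 0);
const auto partitions = dataSink->close();
```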
ASSERT_TRUE(stats.empty()) << stats.toString(); + ASSERT_EQ( + stats.toString(), + "numWrittenBytes 0B numWrittenFiles 0 spillRuns[0] spilledInputBytes[0B] " + "spilledBytes[0B] spilledRows[0] spilledPartitions[0] spilledFiles[0] " + "spillFillTimeNanos[0ns] spillSortTimeNanos[0ns] spillSerializationTimeNanos[0ns] " + "spillWrites[0] spillFlushTimeNanos[0ns] spillWriteTimeNanos[0ns] " + "maxSpillExceededLimitCount[0] spillReadBytes[0B] spillReads[0] " + "spillReadTimeNanos[0ns] spillReadDeserializationTimeNanos[0ns]"); + + const int numBatches = 10; + const auto vectors = createVectors(500, numBatches); + for (const auto& vector : vectors) { + dataSink->appendData(vector); } - const auto results = dataSink->close(true); - ASSERT_EQ(results.size(), 1); + stats = dataSink->stats(); + ASSERT_FALSE(stats.empty()); + ASSERT_GT(stats.numWrittenBytes, 0); + ASSERT_EQ(stats.numWrittenFiles, 0); + + const auto partitions = dataSink->close(); + stats = dataSink->stats(); + ASSERT_FALSE(stats.empty()); + ASSERT_EQ(partitions.size(), numBuckets); createDuckDbTable(vectors); - verifyWrittenData(outputDirectory->path); + verifyWrittenData(outputDirectory->getPath(), numBuckets); } TEST_F(HiveDataSinkTest, close) { for (bool empty : {true, false}) { SCOPED_TRACE(fmt::format("Data sink is empty: {}", empty)); const auto outputDirectory = TempDirectoryPath::create(); - auto dataSink = createDataSink(rowType_, outputDirectory->path); + auto dataSink = createDataSink(rowType_, outputDirectory->getPath()); + + auto vectors = createVectors(500, 1); - std::vector vectors; - VectorFuzzer::Options options; - options.vectorSize = 1; - VectorFuzzer fuzzer(options, pool()); - vectors.push_back(fuzzer.fuzzRow(rowType_)); if (!empty) { - dataSink->appendData(vectors.back()); - ASSERT_GT(dataSink->getCompletedBytes(), 0); + dataSink->appendData(vectors[0]); + ASSERT_GT(dataSink->stats().numWrittenBytes, 0); } else { - ASSERT_EQ(dataSink->getCompletedBytes(), 0); + ASSERT_EQ(dataSink->stats().numWrittenBytes, 0); } - const auto results = dataSink->close(true); + const auto partitions = dataSink->close(); // Can't append after close. 
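The lifecycle assertions that follow, and the matching ones in the abort test below, check that every post-terminal operation fails with the same message. Restated as a sketch with illustrative names, not Velox's implementation:

```cpp
#include "velox/common/base/Exceptions.h"

// Illustrative state machine: close() and abort() are terminal, and
// appendData/close/abort all require the running state.
enum class SinkState { kRunning, kClosed, kAborted };

void ensureRunning(SinkState state) {
  if (state != SinkState::kRunning) {
    VELOX_FAIL("Hive data sink is not running");
  }
}
```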
VELOX_ASSERT_THROW( - dataSink->appendData(vectors.back()), "Hive data sink has been closed"); - ASSERT_EQ(dataSink->close(true), results); - VELOX_ASSERT_THROW( - dataSink->close(false), "Can't abort a closed hive data sink"); + dataSink->appendData(vectors.back()), "Hive data sink is not running"); + VELOX_ASSERT_THROW(dataSink->close(), "Hive data sink is not running"); + VELOX_ASSERT_THROW(dataSink->abort(), "Hive data sink is not running"); + const auto stats = dataSink->stats(); if (!empty) { - ASSERT_EQ(results.size(), 1); - ASSERT_GT(dataSink->getCompletedBytes(), 0); + ASSERT_EQ(partitions.size(), 1); + ASSERT_GT(stats.numWrittenBytes, 0); createDuckDbTable(vectors); - verifyWrittenData(outputDirectory->path); + verifyWrittenData(outputDirectory->getPath()); } else { - ASSERT_TRUE(results.empty()); - ASSERT_EQ(dataSink->getCompletedBytes(), 0); + ASSERT_TRUE(partitions.empty()); + ASSERT_EQ(stats.numWrittenBytes, 0); } } } @@ -457,33 +617,467 @@ TEST_F(HiveDataSinkTest, abort) { for (bool empty : {true, false}) { SCOPED_TRACE(fmt::format("Data sink is empty: {}", empty)); const auto outputDirectory = TempDirectoryPath::create(); - auto dataSink = createDataSink(rowType_, outputDirectory->path); + auto dataSink = createDataSink(rowType_, outputDirectory->getPath()); - std::vector vectors; - VectorFuzzer::Options options; - options.vectorSize = 1; - VectorFuzzer fuzzer(options, pool()); - vectors.push_back(fuzzer.fuzzRow(rowType_)); + auto vectors = createVectors(1, 1); int initialBytes = 0; if (!empty) { dataSink->appendData(vectors.back()); - initialBytes = dataSink->getCompletedBytes(); + initialBytes = dataSink->stats().numWrittenBytes; ASSERT_GT(initialBytes, 0); } else { - initialBytes = dataSink->getCompletedBytes(); + initialBytes = dataSink->stats().numWrittenBytes; ASSERT_EQ(initialBytes, 0); } - ASSERT_TRUE(dataSink->close(false).empty()); + dataSink->abort(); + const auto stats = dataSink->stats(); + ASSERT_TRUE(stats.empty()); // Can't close after abort. - VELOX_ASSERT_THROW( - dataSink->close(true), "Can't close an aborted hive data sink"); - ASSERT_TRUE(dataSink->close(false).empty()); + VELOX_ASSERT_THROW(dataSink->close(), "Hive data sink is not running"); + VELOX_ASSERT_THROW(dataSink->abort(), "Hive data sink is not running"); // Can't append after abort. 
VELOX_ASSERT_THROW( - dataSink->appendData(vectors.back()), - "Hive data sink hash been aborted"); + dataSink->appendData(vectors.back()), "Hive data sink is not running"); + } +} + +DEBUG_ONLY_TEST_F(HiveDataSinkTest, memoryReclaim) { + const int numBatches = 200; + auto vectors = createVectors(500, 200); + + struct { + dwio::common::FileFormat format; + bool sortWriter; + bool writerSpillEnabled; + uint64_t writerFlushThreshold; + bool expectedWriterReclaimEnabled; + bool expectedWriterReclaimed; + + std::string debugString() const { + return fmt::format( + "format: {}, sortWriter: {}, writerSpillEnabled: {}, writerFlushThreshold: {}, expectedWriterReclaimEnabled: {}, expectedWriterReclaimed: {}", + dwio::common::toString(format), + sortWriter, + writerSpillEnabled, + succinctBytes(writerFlushThreshold), + expectedWriterReclaimEnabled, + expectedWriterReclaimed); + } + } testSettings[] = { + {dwio::common::FileFormat::DWRF, true, true, 1 << 30, true, true}, + {dwio::common::FileFormat::DWRF, true, true, 1, true, true}, + {dwio::common::FileFormat::DWRF, true, false, 1 << 30, false, false}, + {dwio::common::FileFormat::DWRF, true, false, 1, false, false}, + {dwio::common::FileFormat::DWRF, false, true, 1 << 30, true, false}, + {dwio::common::FileFormat::DWRF, false, true, 1, true, true}, + {dwio::common::FileFormat::DWRF, false, false, 1 << 30, false, false}, + {dwio::common::FileFormat::DWRF, false, false, 1, false, false}, + // Add Parquet with https://github.com/facebookincubator/velox/issues/5560 +#if 0 + {dwio::common::FileFormat::PARQUET, true, true, 1 << 30, false, false}, + {dwio::common::FileFormat::PARQUET, true, true, 1, false, false}, + {dwio::common::FileFormat::PARQUET, true, false, 1 << 30, false, false}, + {dwio::common::FileFormat::PARQUET, true, false, 1, false, false}, + {dwio::common::FileFormat::PARQUET, false, true, 1 << 30, false, false}, + {dwio::common::FileFormat::PARQUET, false, true, 1, false, false}, + {dwio::common::FileFormat::PARQUET, false, false, 1 << 30, false, false}, + {dwio::common::FileFormat::PARQUET, false, false, 1, false, false} +#endif + }; + SCOPED_TESTVALUE_SET( + "facebook::velox::dwrf::Writer::MemoryReclaimer::reclaimableBytes", + std::function([&](dwrf::Writer* writer) { + // Release before reclaim to make it not able to reclaim from reserved + // memory. 
+ writer->getContext().releaseMemoryReservation(); + })); + for (const auto& testData : testSettings) { + SCOPED_TRACE(testData.debugString()); + setupMemoryPools(); + + const auto outputDirectory = TempDirectoryPath::create(); + std::shared_ptr bucketProperty; + std::vector partitionBy; + if (testData.sortWriter) { + partitionBy = {"c6"}; + bucketProperty = std::make_shared( + HiveBucketProperty::Kind::kHiveCompatible, + 4, + std::vector{"c0"}, + std::vector{BIGINT()}, + std::vector>{ + std::make_shared( + "c1", core::SortOrder{false, false})}); + } + std::shared_ptr spillDirectory; + std::unique_ptr spillConfig; + if (testData.writerSpillEnabled) { + spillDirectory = exec::test::TempDirectoryPath::create(); + spillConfig = getSpillConfig( + spillDirectory->getPath(), testData.writerFlushThreshold); + auto connectorQueryCtx = std::make_unique( + opPool_.get(), + connectorPool_.get(), + connectorSessionProperties_.get(), + spillConfig.get(), + exec::test::defaultPrefixSortConfig(), + nullptr, + nullptr, + "query.HiveDataSinkTest", + "task.HiveDataSinkTest", + "planNodeId.HiveDataSinkTest", + 0, + ""); + setConnectorQueryContext(std::move(connectorQueryCtx)); + } else { + auto connectorQueryCtx = std::make_unique( + opPool_.get(), + connectorPool_.get(), + connectorSessionProperties_.get(), + nullptr, + exec::test::defaultPrefixSortConfig(), + nullptr, + nullptr, + "query.HiveDataSinkTest", + "task.HiveDataSinkTest", + "planNodeId.HiveDataSinkTest", + 0, + ""); + setConnectorQueryContext(std::move(connectorQueryCtx)); + } + + auto dataSink = createDataSink( + rowType_, + outputDirectory->getPath(), + testData.format, + partitionBy, + bucketProperty); + auto* hiveDataSink = static_cast(dataSink.get()); + ASSERT_EQ( + hiveDataSink->canReclaim(), testData.expectedWriterReclaimEnabled); + for (int i = 0; i < numBatches; ++i) { + dataSink->appendData(vectors[i]); + } + memory::MemoryArbitrator::Stats oldStats = + memory::memoryManager()->arbitrator()->stats(); + uint64_t reclaimableBytes{0}; + if (testData.expectedWriterReclaimed) { + reclaimableBytes = root_->reclaimableBytes().value(); + ASSERT_GT(reclaimableBytes, 0); + memory::testingRunArbitration(); + memory::MemoryArbitrator::Stats curStats = + memory::memoryManager()->arbitrator()->stats(); + ASSERT_GT(curStats.reclaimTimeUs - oldStats.reclaimTimeUs, 0); + ASSERT_GT(curStats.numReclaimedBytes - oldStats.numReclaimedBytes, 0); + // We expect dwrf writer set numNonReclaimableAttempts counter. 
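The reclaim branch above follows a snapshot/trigger/delta pattern that recurs throughout these tests; reduced to its core using the same calls:

```cpp
// Snapshot arbitrator counters, force a global arbitration round, then
// assert on the deltas rather than on absolute values.
const auto before = memory::memoryManager()->arbitrator()->stats();
memory::testingRunArbitration();
const auto after = memory::memoryManager()->arbitrator()->stats();
ASSERT_GT(after.reclaimTimeUs - before.reclaimTimeUs, 0);
ASSERT_GT(after.numReclaimedBytes - before.numReclaimedBytes, 0);
```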
+      ASSERT_LE(
+          curStats.numNonReclaimableAttempts -
+              oldStats.numNonReclaimableAttempts,
+          1);
+    } else {
+      ASSERT_FALSE(root_->reclaimableBytes().has_value());
+      memory::testingRunArbitration();
+      memory::MemoryArbitrator::Stats curStats =
+          memory::memoryManager()->arbitrator()->stats();
+      ASSERT_EQ(curStats.reclaimTimeUs - oldStats.reclaimTimeUs, 0);
+      ASSERT_EQ(curStats.numReclaimedBytes - oldStats.numReclaimedBytes, 0);
+    }
+    const auto partitions = dataSink->close();
+    if (testData.sortWriter && testData.expectedWriterReclaimed) {
+      ASSERT_FALSE(dataSink->stats().spillStats.empty());
+    } else {
+      ASSERT_TRUE(dataSink->stats().spillStats.empty());
+    }
+    ASSERT_GE(partitions.size(), 1);
+  }
+}
+
+TEST_F(HiveDataSinkTest, memoryReclaimAfterClose) {
+  const int numBatches = 10;
+  const auto vectors = createVectors(500, 10);
+
+  struct {
+    dwio::common::FileFormat format;
+    bool sortWriter;
+    bool writerSpillEnabled;
+    bool close;
+    bool expectedWriterReclaimEnabled;
+
+    std::string debugString() const {
+      return fmt::format(
+          "format: {}, sortWriter: {}, writerSpillEnabled: {}, close: {}, expectedWriterReclaimEnabled: {}",
+          dwio::common::toString(format),
+          sortWriter,
+          writerSpillEnabled,
+          close,
+          expectedWriterReclaimEnabled);
+    }
+  } testSettings[] = {
+      {dwio::common::FileFormat::DWRF, true, true, true, true},
+      {dwio::common::FileFormat::DWRF, true, false, true, false},
+      {dwio::common::FileFormat::DWRF, true, true, false, true},
+      {dwio::common::FileFormat::DWRF, true, false, false, false},
+      {dwio::common::FileFormat::DWRF, false, true, true, true},
+      {dwio::common::FileFormat::DWRF, false, false, true, false},
+      {dwio::common::FileFormat::DWRF, false, true, false, true},
+      {dwio::common::FileFormat::DWRF, false, false, false, false}
+      // Add parquet file format after fix
+      // https://github.com/facebookincubator/velox/issues/5560
+  };
+  for (const auto& testData : testSettings) {
+    SCOPED_TRACE(testData.debugString());
+
+    std::unordered_map<std::string, std::string> connectorConfig;
+    // Always allow memory reclaim from the file writer.
+    connectorConfig.emplace(
+        "file_writer_flush_threshold_bytes", folly::to<std::string>(0));
+    // Avoid internal stripe flushes while data is being written.
+ connectorConfig.emplace("hive.orc.writer.stripe-max-size", "1GB"); + connectorConfig.emplace("hive.orc.writer.dictionary-max-memory", "1GB"); + + connectorConfig_ = std::make_shared( + std::make_shared(std::move(connectorConfig))); + const auto outputDirectory = TempDirectoryPath::create(); + std::shared_ptr bucketProperty; + std::vector partitionBy; + if (testData.sortWriter) { + partitionBy = {"c6"}; + bucketProperty = std::make_shared( + HiveBucketProperty::Kind::kHiveCompatible, + 4, + std::vector{"c0"}, + std::vector{BIGINT()}, + std::vector>{ + std::make_shared( + "c1", core::SortOrder{false, false})}); + } + std::shared_ptr spillDirectory; + std::unique_ptr spillConfig; + if (testData.writerSpillEnabled) { + spillDirectory = exec::test::TempDirectoryPath::create(); + spillConfig = getSpillConfig(spillDirectory->getPath(), 0); + auto connectorQueryCtx = std::make_unique( + opPool_.get(), + connectorPool_.get(), + connectorSessionProperties_.get(), + spillConfig.get(), + exec::test::defaultPrefixSortConfig(), + nullptr, + nullptr, + "query.HiveDataSinkTest", + "task.HiveDataSinkTest", + "planNodeId.HiveDataSinkTest", + 0, + ""); + setConnectorQueryContext(std::move(connectorQueryCtx)); + } else { + auto connectorQueryCtx = std::make_unique( + opPool_.get(), + connectorPool_.get(), + connectorSessionProperties_.get(), + nullptr, + exec::test::defaultPrefixSortConfig(), + nullptr, + nullptr, + "query.HiveDataSinkTest", + "task.HiveDataSinkTest", + "planNodeId.HiveDataSinkTest", + 0, + ""); + setConnectorQueryContext(std::move(connectorQueryCtx)); + } + + auto dataSink = createDataSink( + rowType_, + outputDirectory->getPath(), + testData.format, + partitionBy, + bucketProperty); + auto* hiveDataSink = static_cast(dataSink.get()); + ASSERT_EQ( + hiveDataSink->canReclaim(), testData.expectedWriterReclaimEnabled); + + for (int i = 0; i < numBatches; ++i) { + dataSink->appendData(vectors[i]); + } + if (testData.close) { + const auto partitions = dataSink->close(); + ASSERT_GE(partitions.size(), 1); + } else { + dataSink->abort(); + ASSERT_TRUE(dataSink->stats().empty()); + } + + memory::MemoryReclaimer::Stats stats; + uint64_t reclaimableBytes{0}; + if (testData.expectedWriterReclaimEnabled) { + reclaimableBytes = root_->reclaimableBytes().value(); + if (testData.close) { + // NOTE: file writer might not release all the memory on close + // immediately. + ASSERT_GE(reclaimableBytes, 0); + } else { + ASSERT_EQ(reclaimableBytes, 0); + } + } else { + ASSERT_FALSE(root_->reclaimableBytes().has_value()); + } + ASSERT_EQ(root_->reclaim(1L << 30, 0, stats), 0); + ASSERT_EQ(stats.reclaimExecTimeUs, 0); + ASSERT_EQ(stats.reclaimedBytes, 0); + if (testData.expectedWriterReclaimEnabled) { + ASSERT_GE(stats.numNonReclaimableAttempts, 0); + } else { + ASSERT_EQ(stats.numNonReclaimableAttempts, 0); + } } } + +DEBUG_ONLY_TEST_F(HiveDataSinkTest, sortWriterFailureTest) { + auto vectors = createVectors(500, 10); + + const auto outputDirectory = TempDirectoryPath::create(); + const std::vector partitionBy{"c6"}; + const auto bucketProperty = std::make_shared( + HiveBucketProperty::Kind::kHiveCompatible, + 4, + std::vector{"c0"}, + std::vector{BIGINT()}, + std::vector>{ + std::make_shared( + "c1", core::SortOrder{false, false})}); + const std::shared_ptr spillDirectory = + exec::test::TempDirectoryPath::create(); + std::unique_ptr spillConfig = + getSpillConfig(spillDirectory->getPath(), 0); + // Triggers the memory reservation in sort buffer. 
+ spillConfig->minSpillableReservationPct = 1'000; + auto connectorQueryCtx = std::make_unique( + opPool_.get(), + connectorPool_.get(), + connectorSessionProperties_.get(), + spillConfig.get(), + exec::test::defaultPrefixSortConfig(), + nullptr, + nullptr, + "query.HiveDataSinkTest", + "task.HiveDataSinkTest", + "planNodeId.HiveDataSinkTest", + 0, + ""); + setConnectorQueryContext(std::move(connectorQueryCtx)); + + auto dataSink = createDataSink( + rowType_, + outputDirectory->getPath(), + dwio::common::FileFormat::DWRF, + partitionBy, + bucketProperty); + for (auto& vector : vectors) { + dataSink->appendData(vector); + } + + SCOPED_TESTVALUE_SET( + "facebook::velox::dwrf::Writer::write", + std::function( + [&](memory::MemoryPool* pool) { VELOX_FAIL("inject failure"); })); + + VELOX_ASSERT_THROW(dataSink->close(), "inject failure"); +} + +TEST_F(HiveDataSinkTest, insertTableHandleToString) { + const int32_t numBuckets = 4; + auto bucketProperty = std::make_shared( + HiveBucketProperty::Kind::kHiveCompatible, + numBuckets, + std::vector{"c5"}, + std::vector{VARCHAR()}, + std::vector>{ + std::make_shared( + "c5", core::SortOrder{false, false})}); + auto insertTableHandle = createHiveInsertTableHandle( + rowType_, + "/path/to/test", + dwio::common::FileFormat::DWRF, + {"c5", "c6"}, + bucketProperty); + ASSERT_EQ( + insertTableHandle->toString(), + "HiveInsertTableHandle [dwrf zstd], [inputColumns: [ HiveColumnHandle [name: c0, columnType: Regular, dataType: BIGINT, requiredSubfields: [ ]] HiveColumnHandle [name: c1, columnType: Regular, dataType: INTEGER, requiredSubfields: [ ]] HiveColumnHandle [name: c2, columnType: Regular, dataType: SMALLINT, requiredSubfields: [ ]] HiveColumnHandle [name: c3, columnType: Regular, dataType: REAL, requiredSubfields: [ ]] HiveColumnHandle [name: c4, columnType: Regular, dataType: DOUBLE, requiredSubfields: [ ]] HiveColumnHandle [name: c5, columnType: PartitionKey, dataType: VARCHAR, requiredSubfields: [ ]] HiveColumnHandle [name: c6, columnType: PartitionKey, dataType: BOOLEAN, requiredSubfields: [ ]] ], locationHandle: LocationHandle [targetPath: /path/to/test, writePath: /path/to/test, tableType: kNew,, bucketProperty: \nHiveBucketProperty[\n\tBucket Columns:\n\t\tc5\n\tBucket Types:\n\t\tVARCHAR\n\tSortedBy Columns:\n\t\t[COLUMN[c5] ORDER[DESC NULLS LAST]]\n]\n]"); +} + +#ifdef VELOX_ENABLE_PARQUET +TEST_F(HiveDataSinkTest, flushPolicyWithParquet) { + const auto outputDirectory = TempDirectoryPath::create(); + auto flushPolicyFactory = []() { + return std::make_unique(1234, 0); + }; + auto writeOptions = std::make_shared(); + writeOptions->flushPolicyFactory = flushPolicyFactory; + auto dataSink = createDataSink( + rowType_, + outputDirectory->getPath(), + dwio::common::FileFormat::PARQUET, + {}, + nullptr, + writeOptions); + + const int numBatches = 10; + const auto vectors = createVectors(500, numBatches); + for (const auto& vector : vectors) { + dataSink->appendData(vector); + } + dataSink->close(); + + dwio::common::ReaderOptions readerOpts{pool_.get()}; + const std::vector filePaths = + listFiles(outputDirectory->getPath()); + auto bufferedInput = std::make_unique( + std::make_shared(filePaths[0]), readerOpts.memoryPool()); + + auto reader = std::make_unique( + std::move(bufferedInput), readerOpts); + auto fileMeta = reader->fileMetaData(); + EXPECT_EQ(fileMeta.numRowGroups(), 10); + EXPECT_EQ(fileMeta.rowGroup(0).numRows(), 500); +} +#endif + +TEST_F(HiveDataSinkTest, flushPolicyWithDWRF) { + const auto outputDirectory = 
TempDirectoryPath::create(); + auto flushPolicyFactory = []() { + return std::make_unique(1234, 0); + }; + + auto writeOptions = std::make_shared(); + writeOptions->flushPolicyFactory = flushPolicyFactory; + auto dataSink = createDataSink( + rowType_, + outputDirectory->getPath(), + dwio::common::FileFormat::DWRF, + {}, + nullptr, + writeOptions); + + const int numBatches = 10; + const auto vectors = createVectors(500, numBatches); + for (const auto& vector : vectors) { + dataSink->appendData(vector); + } + dataSink->close(); + + dwio::common::ReaderOptions readerOpts{pool_.get()}; + const std::vector filePaths = + listFiles(outputDirectory->getPath()); + auto bufferedInput = std::make_unique( + std::make_shared(filePaths[0]), readerOpts.memoryPool()); + + auto reader = std::make_unique( + readerOpts, std::move(bufferedInput)); + EXPECT_EQ(reader->getNumberOfStripes(), 10); + EXPECT_EQ(reader->getRowsPerStripe()[0], 500); +} + } // namespace } // namespace facebook::velox::connector::hive @@ -492,6 +1086,6 @@ int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); // Signal handler required for ThreadDebugInfoTest facebook::velox::process::addDefaultFatalSignalHandler(); - folly::init(&argc, &argv, false); + folly::Init init{&argc, &argv, false}; return RUN_ALL_TESTS(); } diff --git a/velox/connectors/hive/tests/HivePartitionFunctionTest.cpp b/velox/connectors/hive/tests/HivePartitionFunctionTest.cpp index 0b35f596bb545..eee2eb8139c49 100644 --- a/velox/connectors/hive/tests/HivePartitionFunctionTest.cpp +++ b/velox/connectors/hive/tests/HivePartitionFunctionTest.cpp @@ -25,6 +25,10 @@ using namespace facebook::velox; class HivePartitionFunctionTest : public ::testing::Test, public test::VectorTestBase { protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + void assertPartitions( const VectorPtr& vector, int bucketCount, @@ -315,23 +319,17 @@ TEST_F(HivePartitionFunctionTest, arrayElementsEncoded) { auto rawSizes = sizesBuffer->asMutable(); auto rawNulls = nullsBuffer->asMutable(); - // Make the elements overlap and have gaps. + // Make the elements have gaps. // Set the values in position 2 to be invalid since that Array should be null. 
std::vector offsets{ - 0, 2, std::numeric_limits().max(), 1, 8}; + 0, 2, std::numeric_limits().max(), 4, 8}; std::vector sizes{ - 4, 3, std::numeric_limits().max(), 5, 2}; + 2, 1, std::numeric_limits().max(), 3, 2}; memcpy(rawOffsets, offsets.data(), size * sizeof(vector_size_t)); memcpy(rawSizes, sizes.data(), size * sizeof(vector_size_t)); bits::setNull(rawNulls, 2); - // Produces arrays that look like: - // [9, NULL, 7, 6] - // [7, 6, 5] - // NULL - // [NULL, 7, 6, 5, NULL] - // [1, NULL] auto values = std::make_shared( pool_.get(), ARRAY(elements->type()), @@ -342,9 +340,9 @@ TEST_F(HivePartitionFunctionTest, arrayElementsEncoded) { encodedElements); assertPartitions(values, 1, {0, 0, 0, 0, 0}); - assertPartitions(values, 2, {0, 0, 0, 0, 1}); - assertPartitions(values, 500, {342, 418, 0, 458, 31}); - assertPartitions(values, 997, {149, 936, 0, 103, 31}); + assertPartitions(values, 2, {1, 1, 0, 0, 1}); + assertPartitions(values, 500, {279, 7, 0, 308, 31}); + assertPartitions(values, 997, {279, 7, 0, 820, 31}); assertPartitionsWithConstChannel(values, 1); assertPartitionsWithConstChannel(values, 2); @@ -424,23 +422,17 @@ TEST_F(HivePartitionFunctionTest, mapEntriesEncoded) { auto rawSizes = sizesBuffer->asMutable(); auto rawNulls = nullsBuffer->asMutable(); - // Make the elements overlap and have gaps. + // Make the elements have gaps. // Set the values in position 2 to be invalid since that Map should be null. std::vector offsets{ - 0, 2, std::numeric_limits().max(), 1, 8}; + 0, 2, std::numeric_limits().max(), 4, 8}; std::vector sizes{ - 4, 3, std::numeric_limits().max(), 5, 2}; + 2, 1, std::numeric_limits().max(), 3, 2}; memcpy(rawOffsets, offsets.data(), size * sizeof(vector_size_t)); memcpy(rawSizes, sizes.data(), size * sizeof(vector_size_t)); bits::setNull(rawNulls, 2); - // Produces maps that look like: - // {key_0: 9, key_3: NULL, key_6: 7, key_9: 6} - // {key_6: 7, key_9: 6, key_2: 5} - // NULL - // {key_3: NULL, key_6: 7, key_9: 6, key_2: 5, key_5: NULL} - // {key_4: 1, key_7: NULL} auto values = std::make_shared( pool_.get(), MAP(mapKeys->type(), mapValues->type()), @@ -453,8 +445,8 @@ TEST_F(HivePartitionFunctionTest, mapEntriesEncoded) { assertPartitions(values, 1, {0, 0, 0, 0, 0}); assertPartitions(values, 2, {0, 1, 0, 1, 0}); - assertPartitions(values, 500, {176, 259, 0, 91, 336}); - assertPartitions(values, 997, {694, 24, 0, 365, 345}); + assertPartitions(values, 500, {336, 413, 0, 259, 336}); + assertPartitions(values, 997, {345, 666, 0, 24, 345}); assertPartitionsWithConstChannel(values, 1); assertPartitionsWithConstChannel(values, 2); @@ -727,3 +719,17 @@ TEST_F(HivePartitionFunctionTest, function) { } } } + +TEST_F(HivePartitionFunctionTest, unknown) { + auto values = makeAllNullFlatVector(4); + + assertPartitions(values, 1, {0, 0, 0, 0}); + assertPartitions(values, 2, {0, 0, 0, 0}); + assertPartitions(values, 500, {0, 0, 0, 0}); + assertPartitions(values, 997, {0, 0, 0, 0}); + + assertPartitionsWithConstChannel(values, 1); + assertPartitionsWithConstChannel(values, 2); + assertPartitionsWithConstChannel(values, 500); + assertPartitionsWithConstChannel(values, 997); +} diff --git a/velox/connectors/hive/tests/HivePartitionUtilTest.cpp b/velox/connectors/hive/tests/HivePartitionUtilTest.cpp index 7bd16286dec5b..187e5c49b9755 100644 --- a/velox/connectors/hive/tests/HivePartitionUtilTest.cpp +++ b/velox/connectors/hive/tests/HivePartitionUtilTest.cpp @@ -92,8 +92,10 @@ TEST_F(HivePartitionUtilTest, partitionName) { std::iota(partitionChannels.begin(), 
partitionChannels.end(), 0); EXPECT_EQ( - FileUtils::makePartName(extractPartitionKeyValues( - makePartitionsVector(input, partitionChannels), 0)), + FileUtils::makePartName( + extractPartitionKeyValues( + makePartitionsVector(input, partitionChannels), 0), + true), folly::join( "/", std::vector( @@ -112,8 +114,40 @@ TEST_F(HivePartitionUtilTest, partitionName) { std::vector partitionChannels{0}; VELOX_ASSERT_THROW( - FileUtils::makePartName(extractPartitionKeyValues( - makePartitionsVector(input, partitionChannels), 0)), + FileUtils::makePartName( + extractPartitionKeyValues( + makePartitionsVector(input, partitionChannels), 0), + true), "Unsupported partition type: MAP"); } } + +TEST_F(HivePartitionUtilTest, partitionNameForNull) { + std::vector partitionColumnNames{ + "flat_bool_col", + "flat_tinyint_col", + "flat_smallint_col", + "flat_int_col", + "flat_bigint_col", + "flat_string_col", + "const_date_col"}; + + RowVectorPtr input = makeRowVector( + partitionColumnNames, + {makeNullableFlatVector({std::nullopt}), + makeNullableFlatVector({std::nullopt}), + makeNullableFlatVector({std::nullopt}), + makeNullableFlatVector({std::nullopt}), + makeNullableFlatVector({std::nullopt}), + makeNullableFlatVector({std::nullopt}), + makeConstant(std::nullopt, 1, DATE())}); + + for (auto i = 0; i < partitionColumnNames.size(); i++) { + std::vector partitionChannels = {(column_index_t)i}; + auto partitionEntries = extractPartitionKeyValues( + makePartitionsVector(input, partitionChannels), 0); + EXPECT_EQ(1, partitionEntries.size()); + EXPECT_EQ(partitionColumnNames[i], partitionEntries[0].first); + EXPECT_EQ("", partitionEntries[0].second); + } +} diff --git a/velox/connectors/hive/tests/PartitionIdGeneratorTest.cpp b/velox/connectors/hive/tests/PartitionIdGeneratorTest.cpp index a366a8b38f362..7f980cffe7b52 100644 --- a/velox/connectors/hive/tests/PartitionIdGeneratorTest.cpp +++ b/velox/connectors/hive/tests/PartitionIdGeneratorTest.cpp @@ -23,12 +23,17 @@ namespace facebook::velox::connector::hive { class PartitionIdGeneratorTest : public ::testing::Test, - public test::VectorTestBase {}; + public test::VectorTestBase { + protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } +}; TEST_F(PartitionIdGeneratorTest, consecutiveIdsSingleKey) { auto numPartitions = 100; - PartitionIdGenerator idGenerator(ROW({VARCHAR()}), {0}, 100, pool()); + PartitionIdGenerator idGenerator(ROW({VARCHAR()}), {0}, 100, pool(), true); auto input = makeRowVector( {makeFlatVector(numPartitions * 3, [&](auto row) { @@ -50,7 +55,7 @@ TEST_F(PartitionIdGeneratorTest, consecutiveIdsSingleKey) { TEST_F(PartitionIdGeneratorTest, consecutiveIdsMultipleKeys) { PartitionIdGenerator idGenerator( - ROW({VARCHAR(), INTEGER()}), {0, 1}, 100, pool()); + ROW({VARCHAR(), INTEGER()}), {0, 1}, 100, pool(), true); auto input = makeRowVector({ makeFlatVector( @@ -75,8 +80,35 @@ TEST_F(PartitionIdGeneratorTest, consecutiveIdsMultipleKeys) { numPartitions - 1); } +TEST_F(PartitionIdGeneratorTest, multipleBoolKeys) { + PartitionIdGenerator idGenerator( + ROW({BOOLEAN(), BOOLEAN()}), {0, 1}, 100, pool(), true); + + auto input = makeRowVector({ + makeFlatVector( + 1'000, [](vector_size_t row) { return row < 50; }, nullEvery(7)), + makeFlatVector( + 1'000, + [](vector_size_t row) { return (row % 2) == 0; }, + nullEvery(3)), + }); + + raw_vector ids; + idGenerator.run(input, ids); + + // distinctIds contains 9 ids. 
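The expected count follows from the key domains: each nullable BOOLEAN key takes one of three observable states (true, false, null), so two keys yield at most 3 x 3 distinct partitions, and `nullEvery(7)`/`nullEvery(3)` over 1'000 rows ensures every combination actually occurs:

```cpp
// Why 9: three observable states per nullable boolean key.
constexpr int kStatesPerKey = 3; // true, false, null
static_assert(kStatesPerKey * kStatesPerKey == 9);
```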
+  const auto numPartitions = 9;
+
+  std::unordered_set<uint64_t> distinctIds(ids.begin(), ids.end());
+  EXPECT_EQ(distinctIds.size(), numPartitions);
+  EXPECT_EQ(*std::min_element(distinctIds.begin(), distinctIds.end()), 0);
+  EXPECT_EQ(
+      *std::max_element(distinctIds.begin(), distinctIds.end()),
+      numPartitions - 1);
+}
+
 TEST_F(PartitionIdGeneratorTest, stableIdsSingleKey) {
-  PartitionIdGenerator idGenerator(ROW({BIGINT()}), {0}, 100, pool());
+  PartitionIdGenerator idGenerator(ROW({BIGINT()}), {0}, 100, pool(), true);
 
   auto numPartitions = 40;
   auto input = makeRowVector({
@@ -103,7 +135,7 @@ TEST_F(PartitionIdGeneratorTest, stableIdsSingleKey) {
 
 TEST_F(PartitionIdGeneratorTest, stableIdsMultipleKeys) {
   PartitionIdGenerator idGenerator(
-      ROW({BIGINT(), VARCHAR(), INTEGER()}), {1, 2}, 100, pool());
+      ROW({BIGINT(), VARCHAR(), INTEGER()}), {1, 2}, 100, pool(), true);
 
   const vector_size_t size = 1'000;
   auto input = makeRowVector({
@@ -140,8 +172,23 @@ TEST_F(PartitionIdGeneratorTest, stableIdsMultipleKeys) {
   }
 }
 
+TEST_F(PartitionIdGeneratorTest, partitionKeysCaseSensitive) {
+  PartitionIdGenerator idGenerator(
+      ROW({"cc0", "Cc1"}, {BIGINT(), VARCHAR()}), {1}, 100, pool(), false);
+
+  auto input = makeRowVector({
+      makeFlatVector<int64_t>({1, 2, 3}),
+      makeFlatVector<std::string>({"apple", "orange", "apple"}),
+  });
+
+  raw_vector<uint64_t> firstTimeIds;
+  idGenerator.run(input, firstTimeIds);
+  EXPECT_EQ("Cc1=apple", idGenerator.partitionName(0));
+  EXPECT_EQ("Cc1=orange", idGenerator.partitionName(1));
+}
+
 TEST_F(PartitionIdGeneratorTest, numPartitions) {
-  PartitionIdGenerator idGenerator(ROW({BIGINT()}), {0}, 100, pool());
+  PartitionIdGenerator idGenerator(ROW({BIGINT()}), {0}, 100, pool(), true);
 
   // First run to process partition 0,..,9. Total num of partitions processed by
   // far is 10.
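Every `PartitionIdGenerator` constructor call in this file gains a trailing boolean. Judging from the `partitionKeysCaseSensitive` test above, which passes `false` and still sees the original `Cc1` casing in `partitionName()`, the flag appears to control lower-casing of generated partition names; the parameter name in the sketch below is an inference, not taken from this diff:

```cpp
// Sketch: keep the partition key's original casing in generated names.
PartitionIdGenerator idGenerator(
    ROW({"cc0", "Cc1"}, {BIGINT(), VARCHAR()}),
    {1},
    100,
    pool(),
    /*partitionPathAsLowerCase=*/false);
// idGenerator.partitionName(0) then starts with "Cc1=", not "cc1=".
```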
@@ -176,7 +223,7 @@ TEST_F(PartitionIdGeneratorTest, limitOfPartitionNumber) { auto maxPartitions = 100; PartitionIdGenerator idGenerator( - ROW({INTEGER()}), {0}, maxPartitions, pool()); + ROW({INTEGER()}), {0}, maxPartitions, pool(), true); auto input = makeRowVector({ makeFlatVector(maxPartitions + 1, [](auto row) { return row; }), @@ -204,7 +251,8 @@ TEST_F(PartitionIdGeneratorTest, supportedPartitionKeyTypes) { }), {0, 1, 2, 3, 4, 5, 6}, 100, - pool()); + pool(), + true); auto input = makeRowVector({ makeNullableFlatVector( @@ -238,7 +286,8 @@ TEST_F(PartitionIdGeneratorTest, supportedPartitionKeyTypes) { for (column_index_t i = 1; i < input->childrenSize(); i++) { VELOX_ASSERT_THROW( - PartitionIdGenerator(asRowType(input->type()), {i}, 100, pool()), + PartitionIdGenerator( + asRowType(input->type()), {i}, 100, pool(), true), fmt::format( "Unsupported partition type: {}.", input->childAt(i)->type()->toString())); diff --git a/velox/connectors/tests/CMakeLists.txt b/velox/connectors/tests/CMakeLists.txt index c024da41b3e05..4475ce474ee8f 100644 --- a/velox/connectors/tests/CMakeLists.txt +++ b/velox/connectors/tests/CMakeLists.txt @@ -17,8 +17,8 @@ add_test(velox_connector_test velox_connector_test) target_link_libraries( velox_connector_test velox_connector - gtest - gtest_main + GTest::gtest + GTest::gtest_main glog::glog gflags::gflags Folly::folly) diff --git a/velox/connectors/tests/ConnectorTest.cpp b/velox/connectors/tests/ConnectorTest.cpp index ba54bceed625b..5681bc80cb04e 100644 --- a/velox/connectors/tests/ConnectorTest.cpp +++ b/velox/connectors/tests/ConnectorTest.cpp @@ -15,6 +15,9 @@ */ #include "velox/connectors/Connector.h" +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/common/config/Config.h" + #include namespace facebook::velox::connector { @@ -25,7 +28,7 @@ namespace { class TestConnector : public connector::Connector { public: - TestConnector(const std::string& id) : connector::Connector(id, nullptr) {} + TestConnector(const std::string& id) : connector::Connector(id) {} std::unique_ptr createDataSource( const RowTypePtr& /* outputType */, @@ -47,13 +50,36 @@ class TestConnector : public connector::Connector { } }; +class TestConnectorFactory : public connector::ConnectorFactory { + public: + static constexpr const char* kConnectorFactoryName = "test-factory"; + + TestConnectorFactory() : ConnectorFactory(kConnectorFactoryName) {} + + std::shared_ptr newConnector( + const std::string& id, + std::shared_ptr /*config*/, + folly::Executor* /*executor*/ = nullptr) override { + return std::make_shared(id); + } +}; + } // namespace TEST_F(ConnectorTest, getAllConnectors) { + registerConnectorFactory(std::make_shared()); + VELOX_ASSERT_THROW( + registerConnectorFactory(std::make_shared()), + "ConnectorFactory with name 'test-factory' is already registered"); + EXPECT_TRUE(hasConnectorFactory(TestConnectorFactory::kConnectorFactoryName)); const int32_t numConnectors = 10; for (int32_t i = 0; i < numConnectors; i++) { registerConnector( - std::make_shared(fmt::format("connector-{}", i))); + getConnectorFactory(TestConnectorFactory::kConnectorFactoryName) + ->newConnector( + fmt::format("connector-{}", i), + std::make_shared( + std::unordered_map()))); } const auto& connectors = getAllConnectors(); EXPECT_EQ(connectors.size(), numConnectors); @@ -64,5 +90,9 @@ TEST_F(ConnectorTest, getAllConnectors) { unregisterConnector(fmt::format("connector-{}", i)); } EXPECT_EQ(getAllConnectors().size(), 0); + EXPECT_TRUE( + 
unregisterConnectorFactory(TestConnectorFactory::kConnectorFactoryName)); + EXPECT_FALSE( + unregisterConnectorFactory(TestConnectorFactory::kConnectorFactoryName)); } } // namespace facebook::velox::connector diff --git a/velox/connectors/tpch/CMakeLists.txt b/velox/connectors/tpch/CMakeLists.txt index b7123f2f6f0e8..b8373349e7d00 100644 --- a/velox/connectors/tpch/CMakeLists.txt +++ b/velox/connectors/tpch/CMakeLists.txt @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_tpch_connector OBJECT TpchConnector.cpp) +velox_add_library(velox_tpch_connector OBJECT TpchConnector.cpp) -target_link_libraries(velox_tpch_connector velox_connector velox_tpch_gen) +velox_link_libraries(velox_tpch_connector velox_connector velox_tpch_gen + fmt::fmt) if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) diff --git a/velox/connectors/tpch/TpchConnector.cpp b/velox/connectors/tpch/TpchConnector.cpp index 1bf4eac4c5901..41df3e7251c28 100644 --- a/velox/connectors/tpch/TpchConnector.cpp +++ b/velox/connectors/tpch/TpchConnector.cpp @@ -63,7 +63,7 @@ TpchDataSource::TpchDataSource( const std::unordered_map< std::string, std::shared_ptr>& columnHandles, - velox::memory::MemoryPool* FOLLY_NONNULL pool) + velox::memory::MemoryPool* pool) : pool_(pool) { auto tpchTableHandle = std::dynamic_pointer_cast(tableHandle); diff --git a/velox/connectors/tpch/TpchConnector.h b/velox/connectors/tpch/TpchConnector.h index 8f25c4649c914..babcda8e5cb01 100644 --- a/velox/connectors/tpch/TpchConnector.h +++ b/velox/connectors/tpch/TpchConnector.h @@ -15,6 +15,7 @@ */ #pragma once +#include "velox/common/config/Config.h" #include "velox/connectors/Connector.h" #include "velox/connectors/tpch/TpchConnectorSplit.h" #include "velox/tpch/gen/TpchGen.h" @@ -75,7 +76,7 @@ class TpchDataSource : public DataSource { const std::unordered_map< std::string, std::shared_ptr>& columnHandles, - velox::memory::MemoryPool* FOLLY_NONNULL pool); + velox::memory::MemoryPool* pool); void addSplit(std::shared_ptr split) override; @@ -123,16 +124,16 @@ class TpchDataSource : public DataSource { size_t completedRows_{0}; size_t completedBytes_{0}; - memory::MemoryPool* FOLLY_NONNULL pool_; + memory::MemoryPool* pool_; }; class TpchConnector final : public Connector { public: TpchConnector( const std::string& id, - std::shared_ptr properties, - folly::Executor* FOLLY_NULLABLE /*executor*/) - : Connector(id, properties) {} + std::shared_ptr config, + folly::Executor* /*executor*/) + : Connector(id) {} std::unique_ptr createDataSource( const std::shared_ptr& outputType, @@ -140,7 +141,7 @@ class TpchConnector final : public Connector { const std::unordered_map< std::string, std::shared_ptr>& columnHandles, - ConnectorQueryCtx* FOLLY_NONNULL connectorQueryCtx) override final { + ConnectorQueryCtx* connectorQueryCtx) override final { return std::make_unique( outputType, tableHandle, @@ -160,18 +161,18 @@ class TpchConnector final : public Connector { class TpchConnectorFactory : public ConnectorFactory { public: - static constexpr const char* FOLLY_NONNULL kTpchConnectorName{"tpch"}; + static constexpr const char* kTpchConnectorName{"tpch"}; TpchConnectorFactory() : ConnectorFactory(kTpchConnectorName) {} - explicit TpchConnectorFactory(const char* FOLLY_NONNULL connectorName) + explicit TpchConnectorFactory(const char* connectorName) : ConnectorFactory(connectorName) {} std::shared_ptr newConnector( const std::string& id, - std::shared_ptr properties, - 
folly::Executor* FOLLY_NULLABLE executor = nullptr) override { - return std::make_shared(id, properties, executor); + std::shared_ptr config, + folly::Executor* executor = nullptr) override { + return std::make_shared(id, config, executor); } }; diff --git a/velox/connectors/tpch/TpchConnectorSplit.h b/velox/connectors/tpch/TpchConnectorSplit.h index 298a36ac85df5..47588ab6d44a0 100644 --- a/velox/connectors/tpch/TpchConnectorSplit.h +++ b/velox/connectors/tpch/TpchConnectorSplit.h @@ -15,6 +15,7 @@ */ #pragma once +#include #include "velox/connectors/Connector.h" namespace facebook::velox::connector::tpch { @@ -40,3 +41,24 @@ struct TpchConnectorSplit : public connector::ConnectorSplit { }; } // namespace facebook::velox::connector::tpch + +template <> +struct fmt::formatter + : formatter { + auto format( + facebook::velox::connector::tpch::TpchConnectorSplit s, + format_context& ctx) { + return formatter::format(s.toString(), ctx); + } +}; + +template <> +struct fmt::formatter< + std::shared_ptr> + : formatter { + auto format( + std::shared_ptr s, + format_context& ctx) const { + return formatter::format(s->toString(), ctx); + } +}; diff --git a/velox/connectors/tpch/tests/CMakeLists.txt b/velox/connectors/tpch/tests/CMakeLists.txt index 3071da3eb57c0..5474de3cfa906 100644 --- a/velox/connectors/tpch/tests/CMakeLists.txt +++ b/velox/connectors/tpch/tests/CMakeLists.txt @@ -21,10 +21,15 @@ target_link_libraries( velox_vector_test_lib velox_exec_test_lib velox_aggregates - gtest - gtest_main) + GTest::gtest + GTest::gtest_main) add_executable(velox_tpch_speed_test SpeedTest.cpp) -target_link_libraries(velox_tpch_speed_test velox_tpch_connector velox_exec - velox_exec_test_lib velox_memory fmt::fmt) +target_link_libraries( + velox_tpch_speed_test + velox_tpch_connector + velox_exec + velox_exec_test_lib + velox_memory + fmt::fmt) diff --git a/velox/connectors/tpch/tests/SpeedTest.cpp b/velox/connectors/tpch/tests/SpeedTest.cpp index 7df845877df62..8bb713c9c69b1 100644 --- a/velox/connectors/tpch/tests/SpeedTest.cpp +++ b/velox/connectors/tpch/tests/SpeedTest.cpp @@ -59,7 +59,10 @@ class TpchSpeedTest { auto tpchConnector = connector::getConnectorFactory( connector::tpch::TpchConnectorFactory::kTpchConnectorName) - ->newConnector(kTpchConnectorId_, nullptr); + ->newConnector( + kTpchConnectorId_, + std::make_shared( + std::unordered_map())); connector::registerConnector(tpchConnector); } @@ -74,7 +77,7 @@ class TpchSpeedTest { core::PlanNodeId scanId; auto plan = PlanBuilder() - .tableScan( + .tpchTableScan( table, folly::copy(getTableSchema(table)->names()), scaleFactor) .capturePlanNodeId(scanId) .planNode(); @@ -91,19 +94,18 @@ class TpchSpeedTest { params.planNode = plan; params.maxDrivers = FLAGS_max_drivers; - TaskCursor taskCursor(params); - taskCursor.start(); + auto taskCursor = TaskCursor::create(params); + taskCursor->start(); - auto task = taskCursor.task(); + auto task = taskCursor->task(); addSplits(*task, scanId, numSplits); - while (taskCursor.moveNext()) { - processBatch(taskCursor.current()); + while (taskCursor->moveNext()) { + processBatch(taskCursor->current()); } // Wait for the task to finish. 
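+  // taskCompletionFuture() returns a folly::SemiFuture that is fulfilled once + // all drivers of the task finish; wait() blocks the calling thread until then.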
- auto& inlineExecutor = folly::QueuedImmediateExecutor::instance(); - task->taskCompletionFuture(0).via(&inlineExecutor).wait(); + task->taskCompletionFuture().wait(); std::chrono::duration elapsed = system_clock::now() - startTime; LOG(INFO) << "Summary:"; @@ -179,7 +181,7 @@ class TpchSpeedTest { } // namespace int main(int argc, char** argv) { - folly::init(&argc, &argv, false); + folly::Init init{&argc, &argv, false}; TpchSpeedTest speedTest; speedTest.run( diff --git a/velox/connectors/tpch/tests/TpchConnectorTest.cpp b/velox/connectors/tpch/tests/TpchConnectorTest.cpp index 7685bebf516bd..04f4199d348a8 100644 --- a/velox/connectors/tpch/tests/TpchConnectorTest.cpp +++ b/velox/connectors/tpch/tests/TpchConnectorTest.cpp @@ -39,7 +39,10 @@ class TpchConnectorTest : public exec::test::OperatorTestBase { auto tpchConnector = connector::getConnectorFactory( connector::tpch::TpchConnectorFactory::kTpchConnectorName) - ->newConnector(kTpchConnectorId, nullptr); + ->newConnector( + kTpchConnectorId, + std::make_shared( + std::unordered_map())); connector::registerConnector(tpchConnector); } @@ -68,7 +71,7 @@ class TpchConnectorTest : public exec::test::OperatorTestBase { // Simple scan of first 5 rows of "nation". TEST_F(TpchConnectorTest, simple) { auto plan = PlanBuilder() - .tableScan( + .tpchTableScan( Table::TBL_NATION, {"n_nationkey", "n_name", "n_regionkey", "n_comment"}) .limit(0, 5, false) @@ -102,7 +105,8 @@ TEST_F(TpchConnectorTest, simple) { // Extract single column from "nation". TEST_F(TpchConnectorTest, singleColumn) { - auto plan = PlanBuilder().tableScan(Table::TBL_NATION, {"n_name"}).planNode(); + auto plan = + PlanBuilder().tpchTableScan(Table::TBL_NATION, {"n_name"}).planNode(); auto output = getResults(plan, {makeTpchSplit()}); auto expected = makeRowVector({makeFlatVector({ @@ -125,16 +129,17 @@ TEST_F(TpchConnectorTest, singleColumnWithAlias) { auto outputType = ROW({aliasedName}, {VARCHAR()}); auto plan = PlanBuilder() - .tableScan( - outputType, - std::make_shared( - kTpchConnectorId, Table::TBL_NATION), - { - {aliasedName, std::make_shared("n_name")}, - {"other_name", std::make_shared("n_name")}, - {"third_column", - std::make_shared("n_regionkey")}, - }) + .startTableScan() + .outputType(outputType) + .tableHandle(std::make_shared( + kTpchConnectorId, Table::TBL_NATION)) + .assignments({ + {aliasedName, std::make_shared("n_name")}, + {"other_name", std::make_shared("n_name")}, + {"third_column", + std::make_shared("n_regionkey")}, + }) + .endTableScan() .limit(0, 1, false) .planNode(); @@ -150,11 +155,11 @@ TEST_F(TpchConnectorTest, singleColumnWithAlias) { void TpchConnectorTest::runScaleFactorTest(double scaleFactor) { auto plan = PlanBuilder() - .tableScan( - ROW({}, {}), - std::make_shared( - kTpchConnectorId, Table::TBL_SUPPLIER, scaleFactor), - {}) + .startTableScan() + .outputType(ROW({}, {})) + .tableHandle(std::make_shared( + kTpchConnectorId, Table::TBL_SUPPLIER, scaleFactor)) + .endTableScan() .singleAggregation({}, {"count(1)"}) .planNode(); @@ -179,11 +184,11 @@ TEST_F(TpchConnectorTest, lineitemTinyRowCount) { // Lineitem row count depends on the orders. // Verify against Java tiny result. 
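+ // At the 0.01 ("tiny") scale factor used below, lineitem is expected to + // contain 60,175 rows, the same count Presto Java reports for tpch.tiny.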
auto plan = PlanBuilder() - .tableScan( - ROW({}, {}), - std::make_shared( - kTpchConnectorId, Table::TBL_LINEITEM, 0.01), - {}) + .startTableScan() + .outputType(ROW({}, {})) + .tableHandle(std::make_shared( + kTpchConnectorId, Table::TBL_LINEITEM, 0.01)) + .endTableScan() .singleAggregation({}, {"count(1)"}) .planNode(); @@ -195,7 +200,7 @@ TEST_F(TpchConnectorTest, unknownColumn) { EXPECT_THROW( { PlanBuilder() - .tableScan(Table::TBL_NATION, {"does_not_exist"}) + .tpchTableScan(Table::TBL_NATION, {"does_not_exist"}) .planNode(); }, VeloxUserError); @@ -205,7 +210,7 @@ TEST_F(TpchConnectorTest, unknownColumn) { // same dataset in the end. TEST_F(TpchConnectorTest, multipleSplits) { auto plan = PlanBuilder() - .tableScan( + .tpchTableScan( Table::TBL_NATION, {"n_nationkey", "n_name", "n_regionkey", "n_comment"}) .planNode(); @@ -237,14 +242,14 @@ TEST_F(TpchConnectorTest, join) { core::PlanNodeId regionScanId; auto plan = PlanBuilder(planNodeIdGenerator) - .tableScan( + .tpchTableScan( tpch::Table::TBL_NATION, {"n_regionkey"}, 1.0 /*scaleFactor*/) .capturePlanNodeId(nationScanId) .hashJoin( {"n_regionkey"}, {"r_regionkey"}, PlanBuilder(planNodeIdGenerator) - .tableScan( + .tpchTableScan( tpch::Table::TBL_REGION, {"r_regionkey", "r_name"}, 1.0 /*scaleFactor*/) @@ -271,7 +276,7 @@ TEST_F(TpchConnectorTest, join) { TEST_F(TpchConnectorTest, orderDateCount) { auto plan = PlanBuilder() - .tableScan(Table::TBL_ORDERS, {"o_orderdate"}, 0.01) + .tpchTableScan(Table::TBL_ORDERS, {"o_orderdate"}, 0.01) .filter("o_orderdate = '1992-01-01'::DATE") .limit(0, 10, false) .planNode(); @@ -287,6 +292,6 @@ TEST_F(TpchConnectorTest, orderDateCount) { int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); - folly::init(&argc, &argv, false); + folly::Init init{&argc, &argv, false}; return RUN_ALL_TESTS(); } diff --git a/velox/core/CMakeLists.txt b/velox/core/CMakeLists.txt index eb4c18cdbe802..33c2bb5370435 100644 --- a/velox/core/CMakeLists.txt +++ b/velox/core/CMakeLists.txt @@ -15,17 +15,20 @@ if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) endif() -add_library(velox_config Config.cpp) -target_link_libraries(velox_config PUBLIC velox_exception Folly::folly) - -add_library(velox_core Expressions.cpp PlanFragment.cpp PlanNode.cpp - QueryConfig.cpp QueryCtx.cpp SimpleFunctionMetadata.cpp) +velox_add_library( + velox_core + Expressions.cpp + PlanFragment.cpp + PlanNode.cpp + QueryConfig.cpp + QueryCtx.cpp + SimpleFunctionMetadata.cpp) -target_link_libraries( +velox_link_libraries( velox_core PUBLIC velox_arrow_bridge velox_caching - velox_config + velox_common_config velox_connector velox_exception velox_expression_functions @@ -34,4 +37,5 @@ target_link_libraries( velox_vector Boost::headers Folly::folly + fmt::fmt PRIVATE velox_encode) diff --git a/velox/core/Config.cpp b/velox/core/Config.cpp deleted file mode 100644 index 8ccd7a8ccbd28..0000000000000 --- a/velox/core/Config.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "velox/core/Config.h" - -namespace facebook::velox::core { - -folly::Optional MemConfig::get(const std::string& key) const { - folly::Optional val; - auto it = values_.find(key); - if (it != values_.end()) { - val = it->second; - } - return val; -} - -bool MemConfig::isValueExists(const std::string& key) const { - return values_.find(key) != values_.end(); -} - -folly::Optional MemConfigMutable::get( - const std::string& key) const { - auto lockedValues = values_.rlock(); - folly::Optional val; - auto it = lockedValues->find(key); - if (it != lockedValues->end()) { - val = it->second; - } - return val; -} - -bool MemConfigMutable::isValueExists(const std::string& key) const { - auto lockedValues = values_.rlock(); - return lockedValues->find(key) != lockedValues->end(); -} - -} // namespace facebook::velox::core diff --git a/velox/core/Config.h b/velox/core/Config.h deleted file mode 100644 index 11ccea060588d..0000000000000 --- a/velox/core/Config.h +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include "velox/common/base/Exceptions.h" - -namespace facebook::velox { - -class Config { - public: - virtual ~Config() = default; - - virtual folly::Optional get(const std::string& key) const = 0; - // virtual const string operator[](const std::string& key) = 0; - // overload and disable not supported cases. - - template - folly::Optional get(const std::string& key) const { - auto val = get(key); - if (val.hasValue()) { - return folly::to(val.value()); - } else { - return folly::none; - } - } - - template - T get(const std::string& key, const T& defaultValue) const { - auto val = get(key); - if (val.hasValue()) { - return folly::to(val.value()); - } else { - return defaultValue; - } - } - - virtual bool isValueExists(const std::string& key) const = 0; - - virtual const std::unordered_map& values() const { - VELOX_UNSUPPORTED("method values() is not supported by this config"); - } - - virtual std::unordered_map valuesCopy() const { - VELOX_UNSUPPORTED("method valuesCopy() is not supported by this config"); - } -}; - -namespace core { - -class MemConfig : public Config { - public: - explicit MemConfig(const std::unordered_map& values) - : values_(values) {} - - explicit MemConfig() : values_{} {} - - explicit MemConfig(std::unordered_map&& values) - : values_(std::move(values)) {} - - folly::Optional get(const std::string& key) const override; - - bool isValueExists(const std::string& key) const override; - - const std::unordered_map& values() const override { - return values_; - } - - std::unordered_map valuesCopy() const override { - return values_; - } - - private: - std::unordered_map values_; -}; - -/// In-memory config allowing changing properties at runtime. 
-class MemConfigMutable : public Config { - public: - explicit MemConfigMutable( - const std::unordered_map& values) - : values_(values) {} - - explicit MemConfigMutable() : values_{} {} - - explicit MemConfigMutable( - std::unordered_map&& values) - : values_(std::move(values)) {} - - folly::Optional get(const std::string& key) const override; - - bool isValueExists(const std::string& key) const override; - - const std::unordered_map& values() const override { - VELOX_UNSUPPORTED( - "Mutable config cannot return unprotected reference to values."); - return *values_.rlock(); - } - - std::unordered_map valuesCopy() const override { - return *values_.rlock(); - } - - /// Adds or replaces value at the given key. Can be used by debugging or - /// testing code. - void setValue(const std::string& key, const std::string& value) { - (*values_.wlock())[key] = value; - } - - private: - folly::Synchronized> values_; -}; - -} // namespace core -} // namespace facebook::velox diff --git a/velox/core/Expressions.h b/velox/core/Expressions.h index e2036ceb35fe4..e9e9fa65f32fa 100644 --- a/velox/core/Expressions.h +++ b/velox/core/Expressions.h @@ -15,6 +15,7 @@ */ #pragma once +#include #include "velox/common/base/Exceptions.h" #include "velox/core/ITypedExpr.h" #include "velox/vector/BaseVector.h" @@ -23,8 +24,7 @@ namespace facebook::velox::core { class InputTypedExpr : public ITypedExpr { public: - InputTypedExpr(std::shared_ptr type) - : ITypedExpr{std::move(type)} {} + explicit InputTypedExpr(TypePtr type) : ITypedExpr{std::move(type)} {} bool operator==(const ITypedExpr& other) const final { const auto* casted = dynamic_cast(&other); @@ -55,7 +55,7 @@ class ConstantTypedExpr : public ITypedExpr { public: // Creates constant expression. For complex types, only // variant::null() value is supported. - ConstantTypedExpr(std::shared_ptr type, variant value) + ConstantTypedExpr(TypePtr type, variant value) : ITypedExpr{std::move(type)}, value_{std::move(value)} {} // Creates constant expression of scalar or complex type. The value comes from @@ -87,7 +87,7 @@ class ConstantTypedExpr : public ITypedExpr { return valueVector_ != nullptr; } - // Returns scalar value as variant if hasValueVector() is false. + /// Returns scalar value as variant if hasValueVector() is false. const variant& value() const { return value_; } @@ -161,14 +161,48 @@ class ConstantTypedExpr : public ITypedExpr { const VectorPtr valueVector_; }; +using ConstantTypedExprPtr = std::shared_ptr; + +/// Evaluates a scalar function or a special form. +/// +/// Supported special forms are: and, or, cast, try_cast, coalesce, if, switch, +/// try. See registerFunctionCallToSpecialForms in +/// expression/RegisterSpecialForm.h for the up-to-date list. +/// +/// Regular functions have the following properties: (1) return type is fully +/// defined by function name and input types; (2) during evaluation all function +/// arguments are evaluated first before the function itself is evaluated on the +/// results, a failure to evaluate function argument prevents the function from +/// being evaluated. +/// +/// Special forms are different from regular scalar functions as they do not +/// always have the above properties. +/// +/// - CAST doesn't have (1): return type is not defined by input type as it is +/// possible to cast VARCHAR to INTEGER, BOOLEAN, and many other types. +/// - Conjuncts AND, OR don't have (2): these have logic to stop evaluating +/// arguments if the outcome is already decided. 
For example, a > 10 AND b < 3 +/// applied to a = 0 and b = 0 is fully decided after evaluating a > 10. The +/// result is FALSE. This is important not only from efficiency standpoint, but +/// semantically as well. Not evaluating unnecessary arguments implicitly +/// suppresses the errors that might have happened if evaluation proceeded. For +/// example, a > 10 AND b / a > 1 would fail if both expressions were evaluated +/// on a = 0. +/// - Coalesce, if, switch also don't have (2): these also have logic to stop +/// evaluating arguments if the outcome is already decided. +/// - TRY doesn't have (2) either: it needs to capture and suppress errors +/// received while evaluating the input. class CallTypedExpr : public ITypedExpr { public: + /// @param type Return type. + /// @param inputs List of input expressions. May be empty. + /// @param name Name of the function or special form. CallTypedExpr( - std::shared_ptr<const Type> type, + TypePtr type, std::vector<TypedExprPtr> inputs, - std::string funcName) + std::string name) : ITypedExpr{std::move(type), std::move(inputs)}, - name_(std::move(funcName)) {} + name_(std::move(name)) {} virtual const std::string& name() const { return name_; @@ -206,17 +240,21 @@ class CallTypedExpr : public ITypedExpr { if (!casted) { return false; } - if (casted->name() != this->name()) { + return operator==(*casted); + } + + bool operator==(const CallTypedExpr& other) const { + if (other.name() != this->name()) { return false; } - if (*casted->type() != *this->type()) { + if (*other.type() != *this->type()) { return false; } return std::equal( this->inputs().begin(), this->inputs().end(), - casted->inputs().begin(), - casted->inputs().end(), + other.inputs().begin(), + other.inputs().end(), [](const auto& p1, const auto& p2) { return *p1 == *p2; }); } @@ -282,12 +320,15 @@ class FieldAccessTypedExpr : public ITypedExpr { } std::string toString() const override { + std::stringstream ss; + ss << std::quoted(name(), '"', '"'); if (inputs().empty()) { - return fmt::format("{}", std::quoted(name(), '"', '"')); + return fmt::format("{}", ss.str()); } - return fmt::format( - "{}[{}]", inputs()[0]->toString(), std::quoted(name(), '"', '"')); + return fmt::format("{}[{}]", inputs()[0]->toString(), ss.str()); } size_t localHash() const override { @@ -301,17 +342,21 @@ class FieldAccessTypedExpr : public ITypedExpr { if (!casted) { return false; } - if (casted->name_ != this->name_) { + return operator==(*casted); + } + + bool operator==(const FieldAccessTypedExpr& other) const { + if (other.name_ != this->name_) { return false; } - if (*casted->type() != *this->type()) { + if (*other.type() != *this->type()) { return false; } return std::equal( this->inputs().begin(), this->inputs().end(), - casted->inputs().begin(), - casted->inputs().end(), + other.inputs().begin(), + other.inputs().end(), [](const auto& p1, const auto& p2) { return *p1 == *p2; }); } @@ -360,8 +405,7 @@ class DereferenceTypedExpr : public ITypedExpr { } std::string toString() const override { - return fmt::format( - "{}[{}]", inputs()[0]->toString(), std::quoted(name(), '"', '"')); + return fmt::format("{}[{}]", inputs()[0]->toString(), name()); } size_t localHash() const override { @@ -375,14 +419,18 @@ class DereferenceTypedExpr : public ITypedExpr { if (!casted) { return false; } - if (casted->index_ != this->index_) { + return operator==(*casted); + } + + bool operator==(const DereferenceTypedExpr& other) const { + if (other.index_ != this->index_) { return false; } return std::equal( this->inputs().begin(),
this->inputs().end(), - casted->inputs().begin(), - casted->inputs().end(), + other.inputs().begin(), + other.inputs().end(), [](const auto& p1, const auto& p2) { return *p1 == *p2; }); } @@ -396,9 +444,7 @@ class DereferenceTypedExpr : public ITypedExpr { using DereferenceTypedExprPtr = std::shared_ptr<const DereferenceTypedExpr>; -/* - * Evaluates a list of expressions to produce a row. - */ +/// Evaluates a list of expressions to produce a row. class ConcatTypedExpr : public ITypedExpr { public: ConcatTypedExpr( @@ -437,14 +483,18 @@ class ConcatTypedExpr : public ITypedExpr { if (!casted) { return false; } - if (*casted->type() != *this->type()) { + return operator==(*casted); + } + + bool operator==(const ConcatTypedExpr& other) const { + if (*other.type() != *this->type()) { return false; } return std::equal( this->inputs().begin(), this->inputs().end(), - casted->inputs().begin(), - casted->inputs().end(), + other.inputs().begin(), + other.inputs().end(), [](const auto& p1, const auto& p2) { return *p1 == *p2; }); } @@ -453,10 +503,10 @@ static TypedExprPtr create(const folly::dynamic& obj, void* context); private: - static std::shared_ptr<const Type> toType( + static TypePtr toType( const std::vector<std::string>& names, const std::vector<TypedExprPtr>& expressions) { - std::vector<std::shared_ptr<const Type>> children{}; + std::vector<TypePtr> children{}; std::vector<std::string> namesCopy{}; for (size_t i = 0; i < names.size(); ++i) { namesCopy.push_back(names.at(i)); @@ -470,7 +520,7 @@ class LambdaTypedExpr : public ITypedExpr { public: LambdaTypedExpr(RowTypePtr signature, TypedExprPtr body) : ITypedExpr(std::make_shared<FunctionType>( - std::vector<std::shared_ptr<const Type>>(signature->children()), + std::vector<TypePtr>(signature->children()), body->type())), signature_(signature), body_(body) {} @@ -527,13 +577,28 @@ class LambdaTypedExpr : public ITypedExpr { using LambdaTypedExprPtr = std::shared_ptr<const LambdaTypedExpr>; +/// Converts input values to specified type. class CastTypedExpr : public ITypedExpr { public: + /// @param type Type to convert to. This is the return type of the CAST + /// expression. + /// @param input Single input. The type of input is referred to as from-type + /// and expected to be different from to-type. + /// @param nullOnFailure Whether to suppress cast errors and return null. CastTypedExpr( - const std::shared_ptr<const Type>& type, + const TypePtr& type, + const TypedExprPtr& input, + bool nullOnFailure) + : ITypedExpr{type, {input}}, nullOnFailure_(nullOnFailure) {} + + CastTypedExpr( + const TypePtr& type, const std::vector<TypedExprPtr>& inputs, bool nullOnFailure) - : ITypedExpr{type, inputs}, nullOnFailure_(nullOnFailure) {} + : ITypedExpr{type, inputs}, nullOnFailure_(nullOnFailure) { + VELOX_USER_CHECK_EQ( + 1, inputs.size(), "Cast expression requires exactly one input"); + } TypedExprPtr rewriteInputNames( const std::unordered_map<std::string, TypedExprPtr>& mapping) @@ -580,9 +645,45 @@ class CastTypedExpr : public ITypedExpr { static TypedExprPtr create(const folly::dynamic& obj, void* context); private: - // This flag prevents throws and instead returns - // null on cast failure + // Suppress exception and return null on failure to cast. const bool nullOnFailure_; }; +using CastTypedExprPtr = std::shared_ptr<const CastTypedExpr>; + +/// A collection of convenience methods for working with expressions. +class TypedExprs { + public: + /// Returns true if 'expr' is a field access expression. + static bool isFieldAccess(const TypedExprPtr& expr) { + return dynamic_cast<const FieldAccessTypedExpr*>(expr.get()) != nullptr; + } + + /// Returns 'expr' as FieldAccessTypedExprPtr or null if not field access + /// expression.
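+ /// Example (illustrative): + /// if (auto field = TypedExprs::asFieldAccess(expr)) { + /// names.push_back(field->name()); + /// }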
+ static FieldAccessTypedExprPtr asFieldAccess(const TypedExprPtr& expr) { + return std::dynamic_pointer_cast(expr); + } + + /// Returns true if 'expr' is a constant expression. + static bool isConstant(const TypedExprPtr& expr) { + return dynamic_cast(expr.get()) != nullptr; + } + + /// Returns 'expr' as ConstantTypedExprPtr or null if not a constant + /// expression. + static ConstantTypedExprPtr asConstant(const TypedExprPtr& expr) { + return std::dynamic_pointer_cast(expr); + } + + /// Returns true if 'expr' is a lambda expression. + static bool isLambda(const TypedExprPtr& expr) { + return dynamic_cast(expr.get()) != nullptr; + } + + /// Returns 'expr' as LambdaTypedExprPtr or null if not a lambda expression. + static LambdaTypedExprPtr asLambda(const TypedExprPtr& expr) { + return std::dynamic_pointer_cast(expr); + } +}; } // namespace facebook::velox::core diff --git a/velox/core/ITypedExpr.h b/velox/core/ITypedExpr.h index 8a8066ddb06c4..0e573507d5637 100644 --- a/velox/core/ITypedExpr.h +++ b/velox/core/ITypedExpr.h @@ -24,16 +24,15 @@ class ITypedExpr; using TypedExprPtr = std::shared_ptr; -/* a strongly-typed expression, such as literal, function call, etc... */ +/// Strongly-typed expression, e.g. literal, function call, etc. class ITypedExpr : public ISerializable { public: - explicit ITypedExpr(std::shared_ptr type) - : type_{std::move(type)}, inputs_{} {} + explicit ITypedExpr(TypePtr type) : type_{std::move(type)}, inputs_{} {} - ITypedExpr(std::shared_ptr type, std::vector inputs) + ITypedExpr(TypePtr type, std::vector inputs) : type_{std::move(type)}, inputs_{std::move(inputs)} {} - const std::shared_ptr& type() const { + const TypePtr& type() const { return type_; } @@ -44,12 +43,12 @@ class ITypedExpr : public ISerializable { } /// Returns a copy of this expression with input fields replaced according - /// to specified 'mapping'. Fields specified in the 'mapping are replaced + /// to specified 'mapping'. Fields specified in the 'mapping' are replaced /// by the corresponding expression in 'mapping'. /// Fields not present in 'mapping' are left unmodified. /// /// Used to bind inputs to lambda functions. - virtual std::shared_ptr rewriteInputNames( + virtual TypedExprPtr rewriteInputNames( const std::unordered_map& mapping) const = 0; virtual std::string toString() const = 0; @@ -64,9 +63,9 @@ class ITypedExpr : public ISerializable { return hash; } - // Returns true if other is recursively equal to 'this'. We do not - // overload == because this is overloaded in a subclass for a - // different purpose. + /// Returns true if other is recursively equal to 'this'. We do not + /// overload == because this is overloaded in a subclass for a + /// different purpose. 
bool equals(const ITypedExpr& other) const { if (type_ != other.type_ || inputs_.size() != other.inputs_.size()) { return false; @@ -104,8 +103,8 @@ class ITypedExpr : public ISerializable { return false; } - std::shared_ptr type_; - std::vector> inputs_; + TypePtr type_; + std::vector inputs_; }; } // namespace facebook::velox::core diff --git a/velox/core/Metaprogramming.h b/velox/core/Metaprogramming.h index d50a153932506..317837bfb98a2 100644 --- a/velox/core/Metaprogramming.h +++ b/velox/core/Metaprogramming.h @@ -124,10 +124,11 @@ template struct has_method { private: template - static constexpr auto check(T*) -> typename std::is_same< - decltype(std::declval().template resolve( - std::declval()...)), - TRet>::type { + static constexpr auto check(T*) -> + typename std::is_same< + decltype(std::declval().template resolve( + std::declval()...)), + TRet>::type { return {}; } diff --git a/velox/core/PlanFragment.cpp b/velox/core/PlanFragment.cpp index 338860ab2c9de..4fa862cbcc7c1 100644 --- a/velox/core/PlanFragment.cpp +++ b/velox/core/PlanFragment.cpp @@ -28,4 +28,14 @@ bool PlanFragment::canSpill(const QueryConfig& queryConfig) const { }) != nullptr; } +std::string executionStrategyToString(ExecutionStrategy strategy) { + switch (strategy) { + case ExecutionStrategy::kGrouped: + return "GROUPED"; + case ExecutionStrategy::kUngrouped: + return "UNGROUPED"; + default: + return fmt::format("UNKNOWN: {}", static_cast(strategy)); + } +} } // namespace facebook::velox::core diff --git a/velox/core/PlanFragment.h b/velox/core/PlanFragment.h index a2990a7eda23c..365234495b3c8 100644 --- a/velox/core/PlanFragment.h +++ b/velox/core/PlanFragment.h @@ -35,6 +35,8 @@ enum class ExecutionStrategy { kGrouped, }; +std::string executionStrategyToString(ExecutionStrategy strategy); + /// Contains some information on how to execute the fragment of a plan. /// Used to construct Task. struct PlanFragment { @@ -81,3 +83,13 @@ struct PlanFragment { }; } // namespace facebook::velox::core + +template <> +struct fmt::formatter + : formatter { + auto format( + const facebook::velox::core::ExecutionStrategy& s, + format_context& ctx) const { + return formatter::format(static_cast(s), ctx); + } +}; diff --git a/velox/core/PlanNode.cpp b/velox/core/PlanNode.cpp index 0903fe85bf670..f1e56ed7cfd62 100644 --- a/velox/core/PlanNode.cpp +++ b/velox/core/PlanNode.cpp @@ -13,13 +13,22 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "velox/core/PlanNode.h" +#include + #include "velox/common/encode/Base64.h" +#include "velox/core/PlanNode.h" #include "velox/vector/VectorSaver.h" namespace facebook::velox::core { namespace { + +void appendComma(int32_t i, std::stringstream& sql) { + if (i > 0) { + sql << ", "; + } +} + std::vector deserializeSources( const folly::dynamic& obj, void* context) { @@ -76,8 +85,7 @@ RowTypePtr getAggregationOutputType( std::vector types; for (auto& key : groupingKeys) { - auto field = - std::dynamic_pointer_cast(key); + auto field = TypedExprs::asFieldAccess(key); VELOX_CHECK(field, "Grouping key must be a field reference"); names.push_back(field->name()); types.push_back(field->type()); @@ -90,6 +98,7 @@ RowTypePtr getAggregationOutputType( return std::make_shared(std::move(names), std::move(types)); } + } // namespace AggregationNode::AggregationNode( @@ -99,6 +108,8 @@ AggregationNode::AggregationNode( const std::vector& preGroupedKeys, const std::vector& aggregateNames, const std::vector& aggregates, + const std::vector& globalGroupingSets, + const std::optional& groupId, bool ignoreNullKeys, PlanNodePtr source) : PlanNode(id), @@ -108,6 +119,8 @@ AggregationNode::AggregationNode( aggregateNames_(aggregateNames), aggregates_(aggregates), ignoreNullKeys_(ignoreNullKeys), + groupId_(groupId), + globalGroupingSets_(globalGroupingSets), sources_{source}, outputType_(getAggregationOutputType( groupingKeys_, @@ -134,16 +147,53 @@ AggregationNode::AggregationNode( "Pre-grouped key must be one of the grouping keys: {}.", key->name()); } + + if (groupId_.has_value()) { + VELOX_USER_CHECK_GT( + groupingKeyNames.count(groupId_.value()->name()), + 0, + "GroupId key {} must be one of the grouping keys", + groupId_.value()->name()); + + VELOX_USER_CHECK( + !globalGroupingSets_.empty(), + "GroupId key {} must have global grouping sets", + groupId_.value()->name()); + } + + if (!globalGroupingSets_.empty()) { + VELOX_USER_CHECK( + groupId_.has_value(), "Global grouping sets require GroupId key"); + } } +AggregationNode::AggregationNode( + const PlanNodeId& id, + Step step, + const std::vector& groupingKeys, + const std::vector& preGroupedKeys, + const std::vector& aggregateNames, + const std::vector& aggregates, + bool ignoreNullKeys, + PlanNodePtr source) + : AggregationNode( + id, + step, + groupingKeys, + preGroupedKeys, + aggregateNames, + aggregates, + {}, + std::nullopt, + ignoreNullKeys, + source) {} + namespace { void addFields( std::stringstream& stream, const std::vector& keys) { for (auto i = 0; i < keys.size(); ++i) { - if (i > 0) { - stream << ", "; - } + appendComma(i, stream); stream << keys[i]->name(); } } @@ -151,15 +201,10 @@ void addFields( void addKeys(std::stringstream& stream, const std::vector& keys) { for (auto i = 0; i < keys.size(); ++i) { const auto& expr = keys[i]; - if (i > 0) { - stream << ", "; - } - if (auto field = - std::dynamic_pointer_cast(expr)) { + appendComma(i, stream); + if (auto field = TypedExprs::asFieldAccess(expr)) { stream << field->name(); - } else if ( - auto constant = - std::dynamic_pointer_cast(expr)) { + } else if (auto constant = TypedExprs::asConstant(expr)) { stream << constant->toString(); } else { stream << expr->toString(); @@ -172,17 +217,33 @@ void addSortingKeys( const std::vector& sortingOrders, std::stringstream& stream) { for (auto i = 0; i < sortingKeys.size(); ++i) { - if (i > 0) { - stream << ", "; - } + appendComma(i, stream); stream << sortingKeys[i]->name() << " " << sortingOrders[i].toString(); } } } // namespace +bool 
AggregationNode::canSpill(const QueryConfig& queryConfig) const { + // TODO: Add spilling for aggregations over distinct inputs. + // https://github.com/facebookincubator/velox/issues/7454 + for (const auto& aggregate : aggregates_) { + if (aggregate.distinct) { + return false; + } + } + // TODO: add spilling for pre-grouped aggregation later: + // https://github.com/facebookincubator/velox/issues/3264 + return (isFinal() || isSingle()) && preGroupedKeys().empty() && + queryConfig.aggregationSpillEnabled(); +} + void AggregationNode::addDetails(std::stringstream& stream) const { stream << stepName(step_) << " "; + if (isPreGrouped()) { + stream << "STREAMING "; + } + if (!groupingKeys_.empty()) { stream << "["; addFields(stream, groupingKeys_); @@ -190,9 +251,7 @@ void AggregationNode::addDetails(std::stringstream& stream) const { } for (auto i = 0; i < aggregateNames_.size(); ++i) { - if (i > 0) { - stream << ", "; - } + appendComma(i, stream); const auto& aggregate = aggregates_[i]; stream << aggregateNames_[i] << " := " << aggregate.call->toString(); if (aggregate.distinct) { @@ -208,6 +267,15 @@ void AggregationNode::addDetails(std::stringstream& stream) const { addSortingKeys(aggregate.sortingKeys, aggregate.sortingOrders, stream); } } + + if (!globalGroupingSets_.empty()) { + stream << " global group IDs: [ " << folly::join(", ", globalGroupingSets_) + << " ]"; + } + + if (groupId_.has_value()) { + stream << " Group Id key: " << groupId_.value()->name(); + } } namespace { @@ -253,6 +321,14 @@ folly::dynamic AggregationNode::serialize() const { obj["aggregates"].push_back(aggregate.serialize()); } + obj["globalGroupingSets"] = folly::dynamic::array; + for (const auto& globalGroup : globalGroupingSets_) { + obj["globalGroupingSets"].push_back(globalGroup); + } + + if (groupId_.has_value()) { + obj["groupId"] = ISerializable::serialize(groupId_.value()); + } obj["ignoreNullKeys"] = ignoreNullKeys_; return obj; } @@ -342,6 +418,17 @@ PlanNodePtr AggregationNode::create(const folly::dynamic& obj, void* context) { aggregates.push_back(Aggregate::deserialize(aggregate, context)); } + std::vector globalGroupingSets; + for (const auto& globalSet : obj["globalGroupingSets"]) { + globalGroupingSets.push_back(globalSet.asInt()); + } + + std::optional groupId; + if (obj.count("groupId")) { + groupId = ISerializable::deserialize( + obj["groupId"], context); + } + return std::make_shared( deserializePlanNodeId(obj), stepFromName(obj["step"].asString()), @@ -349,10 +436,111 @@ PlanNodePtr AggregationNode::create(const folly::dynamic& obj, void* context) { preGroupedKeys, aggregateNames, aggregates, + globalGroupingSets, + groupId, obj["ignoreNullKeys"].asBool(), deserializeSingleSource(obj, context)); } +namespace { +RowTypePtr getExpandOutputType( + const std::vector>& projections, + std::vector names) { + VELOX_USER_CHECK(!names.empty()); + VELOX_USER_CHECK(!projections.empty()); + VELOX_USER_CHECK_GT(names.size(), 0); + VELOX_USER_CHECK_GT(projections.size(), 0); + + for (int32_t i = 0; i < projections.size(); i++) { + VELOX_USER_CHECK_EQ(names.size(), projections[i].size()); + } + + std::vector types; + types.reserve(names.size()); + for (const auto& projection : projections[0]) { + types.push_back(projection->type()); + } + + folly::F14FastSet uniqueNames; + for (const auto& name : names) { + auto result = uniqueNames.insert(name); + VELOX_USER_CHECK( + result.second, + "Found duplicate column name in Expand plan node: {}.", + name); + } + + return ROW(std::move(names), std::move(types)); +} +} 
// namespace + +ExpandNode::ExpandNode( + PlanNodeId id, + std::vector> projections, + std::vector names, + PlanNodePtr source) + : PlanNode(std::move(id)), + sources_{source}, + outputType_(getExpandOutputType(projections, std::move(names))), + projections_(std::move(projections)) { + const auto& projectionNames = outputType_->names(); + const auto numColumns = projectionNames.size(); + const auto numRows = projections_.size(); + + for (const auto& rowProjection : projections_) { + for (const auto& columnProjection : rowProjection) { + VELOX_USER_CHECK( + TypedExprs::isFieldAccess(columnProjection) || + TypedExprs::isConstant(columnProjection), + "Unsupported projection expression in Expand plan node. Expected field reference or constant. Got: {} ", + columnProjection->toString()); + } + } + + for (int i = 0; i < numColumns; ++i) { + const auto& type = outputType_->childAt(i); + for (int j = 1; j < numRows; ++j) { + VELOX_USER_CHECK( + projections_[j][i]->type()->equivalent(*type), + "The projections type does not match across different rows in the same column. Got: {}, {}", + projections_[j][i]->type()->toString(), + type->toString()); + } + } +} + +void ExpandNode::addDetails(std::stringstream& stream) const { + for (auto i = 0; i < projections_.size(); ++i) { + appendComma(i, stream); + stream << "["; + addKeys(stream, projections_[i]); + stream << "]"; + } +} + +folly::dynamic ExpandNode::serialize() const { + auto obj = PlanNode::serialize(); + obj["projections"] = ISerializable::serialize(projections_); + obj["names"] = ISerializable::serialize(outputType_->names()); + + return obj; +} + +// static +PlanNodePtr ExpandNode::create(const folly::dynamic& obj, void* context) { + auto source = deserializeSingleSource(obj, context); + auto names = + ISerializable::deserialize>(obj["names"]); + auto projections = + ISerializable::deserialize>>( + obj["projections"], context); + return std::make_shared( + deserializePlanNodeId(obj), + std::move(projections), + std::move(names), + std::move(source)); +} + namespace { RowTypePtr getGroupIdOutputType( const std::vector& groupingKeyInfos, @@ -385,56 +573,8 @@ RowTypePtr getGroupIdOutputType( return ROW(std::move(names), std::move(types)); } -std::vector> getGroupingSets( - const std::vector>& groupingSetFields, - const std::vector& groupingKeyInfos) { - std::unordered_map inputToOutputGroupingKeyMap; - for (const auto& groupKeyInfo : groupingKeyInfos) { - inputToOutputGroupingKeyMap[groupKeyInfo.input->name()] = - groupKeyInfo.output; - } - - // Prestissimo passes grouping keys with their input column name to Velox. - // But Velox expects the output column name for the grouping key. 
- std::vector> groupingSets; - groupingSets.reserve(groupingSetFields.size()); - for (const auto& groupFields : groupingSetFields) { - std::vector groupingKeys; - groupingKeys.reserve(groupFields.size()); - for (const auto& groupingField : groupFields) { - groupingKeys.push_back( - inputToOutputGroupingKeyMap[groupingField->name()]); - } - groupingSets.push_back(groupingKeys); - } - return groupingSets; -} - } // namespace -GroupIdNode::GroupIdNode( - PlanNodeId id, - std::vector> groupingSets, - std::vector groupingKeyInfos, - std::vector aggregationInputs, - std::string groupIdName, - PlanNodePtr source) - : PlanNode(std::move(id)), - sources_{source}, - outputType_(getGroupIdOutputType( - groupingKeyInfos, - aggregationInputs, - groupIdName)), - groupingSets_(getGroupingSets(groupingSets, groupingKeyInfos)), - groupingKeyInfos_(std::move(groupingKeyInfos)), - aggregationInputs_(std::move(aggregationInputs)), - groupIdName_(std::move(groupIdName)) { - VELOX_USER_CHECK_GE( - groupingSets_.size(), - 2, - "GroupIdNode requires two or more grouping sets."); -} - GroupIdNode::GroupIdNode( PlanNodeId id, std::vector> groupingSets, @@ -460,14 +600,10 @@ GroupIdNode::GroupIdNode( void GroupIdNode::addDetails(std::stringstream& stream) const { for (auto i = 0; i < groupingSets_.size(); ++i) { - if (i > 0) { - stream << ", "; - } + appendComma(i, stream); stream << "["; for (auto j = 0; j < groupingSets_[i].size(); j++) { - if (j > 0) { - stream << ", "; - } + appendComma(j, stream); stream << groupingSets_[i][j]; } stream << "]"; @@ -580,9 +716,7 @@ void ProjectNode::addDetails(std::stringstream& stream) const { stream << "expressions: "; for (auto i = 0; i < projections_.size(); i++) { auto& projection = projections_[i]; - if (i > 0) { - stream << ", "; - } + appendComma(i, stream); stream << "(" << names_[i] << ":" << projection->type()->toString() << ", " << projection->toString() << ")"; } @@ -959,10 +1093,51 @@ PlanNodePtr HashJoinNode::create(const folly::dynamic& obj, void* context) { outputType); } +MergeJoinNode::MergeJoinNode( + const PlanNodeId& id, + JoinType joinType, + const std::vector& leftKeys, + const std::vector& rightKeys, + TypedExprPtr filter, + PlanNodePtr left, + PlanNodePtr right, + RowTypePtr outputType) + : AbstractJoinNode( + id, + joinType, + leftKeys, + rightKeys, + std::move(filter), + std::move(left), + std::move(right), + std::move(outputType)) { + VELOX_USER_CHECK( + isSupported(joinType_), + "The join type is not supported by merge join: ", + joinTypeName(joinType_)); +} + folly::dynamic MergeJoinNode::serialize() const { return serializeBase(); } +// static +bool MergeJoinNode::isSupported(core::JoinType joinType) { + switch (joinType) { + case core::JoinType::kInner: + case core::JoinType::kLeft: + case core::JoinType::kRight: + case core::JoinType::kLeftSemiFilter: + case core::JoinType::kRightSemiFilter: + case core::JoinType::kAnti: + case core::JoinType::kFull: + return true; + + default: + return false; + } +} + // static PlanNodePtr MergeJoinNode::create(const folly::dynamic& obj, void* context) { auto sources = deserializeSources(obj, context); @@ -1002,9 +1177,8 @@ NestedLoopJoinNode::NestedLoopJoinNode( sources_({std::move(left), std::move(right)}), outputType_(std::move(outputType)) { VELOX_USER_CHECK( - core::isInnerJoin(joinType_) || core::isLeftJoin(joinType_) || - core::isRightJoin(joinType_) || core::isFullJoin(joinType_), - "{} unsupported, NestedLoopJoin only supports inner and outer join", + isSupported(joinType_), + "The join type is not 
supported by nested loop join: ", joinTypeName(joinType_)); auto leftType = sources_[0]->outputType(); @@ -1036,6 +1210,20 @@ NestedLoopJoinNode::NestedLoopJoinNode( right, outputType) {} +// static +bool NestedLoopJoinNode::isSupported(core::JoinType joinType) { + switch (joinType) { + case core::JoinType::kInner: + case core::JoinType::kLeft: + case core::JoinType::kRight: + case core::JoinType::kFull: + return true; + + default: + return false; + } +} + void NestedLoopJoinNode::addDetails(std::stringstream& stream) const { stream << joinTypeName(joinType_); if (joinCondition_) { @@ -1139,6 +1327,9 @@ void addWindowFunction( std::stringstream& stream, const WindowNode::Function& windowFunction) { stream << windowFunction.functionCall->toString() << " "; + if (windowFunction.ignoreNulls) { + stream << "IGNORE NULLS "; + } auto frame = windowFunction.frame; if (frame.startType == WindowNode::BoundType::kUnboundedFollowing) { VELOX_USER_FAIL("Window frame start cannot be UNBOUNDED FOLLOWING"); @@ -1190,30 +1381,61 @@ WindowNode::WindowNode( sortingKeys_.size(), sortingOrders_.size(), "Number of sorting keys must be equal to the number of sorting orders"); + + std::unordered_set keyNames; + for (const auto& key : partitionKeys_) { + VELOX_USER_CHECK( + keyNames.insert(key->name()).second, + "Partitioning keys must be unique. Found duplicate key: {}", + key->name()); + } + + for (const auto& key : sortingKeys_) { + VELOX_USER_CHECK( + keyNames.insert(key->name()).second, + "Sorting keys must be unique and not overlap with partitioning keys. Found duplicate key: {}", + key->name()); + } + + for (const auto& windowFunction : windowFunctions_) { + if (windowFunction.frame.type == WindowType::kRange) { + if (windowFunction.frame.startValue || windowFunction.frame.endValue) { + // This is RANGE frame with a k limit bound like + // RANGE BETWEEN 5 PRECEDING AND CURRENT ROW. + // Such frames require that the ORDER BY have a single sorting key + // for comparison. 
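+        // For example, "ORDER BY ts RANGE BETWEEN 5 PRECEDING AND CURRENT ROW" + // is valid, while a second sorting key would make the k-range bound + // comparison ambiguous and is rejected below.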
+ VELOX_USER_CHECK_EQ( + sortingKeys_.size(), + 1, + "Window frame of type RANGE PRECEDING or FOLLOWING requires single sorting key in ORDER BY."); + } + } + } } void WindowNode::addDetails(std::stringstream& stream) const { - stream << "partition by ["; + if (inputsSorted_) { + stream << "STREAMING "; + } + if (!partitionKeys_.empty()) { + stream << "partition by ["; addFields(stream, partitionKeys_); + stream << "] "; } - stream << "] "; - - stream << "order by ["; - addSortingKeys(sortingKeys_, sortingOrders_, stream); - stream << "] "; - auto numInputCols = sources_[0]->outputType()->size(); - auto numOutputCols = outputType_->size(); - for (auto i = numInputCols; i < numOutputCols; i++) { - if (i >= numInputCols + 1) { - stream << ", "; - } - stream << outputType_->names()[i] << " := "; - addWindowFunction(stream, windowFunctions_[i - numInputCols]); + if (!sortingKeys_.empty()) { + stream << "order by ["; + addSortingKeys(sortingKeys_, sortingOrders_, stream); + stream << "] "; } - stream << " inputsSorted [" << inputsSorted_ << "]"; + const auto numInputs = inputType()->size(); + for (auto i = 0; i < windowFunctions_.size(); i++) { + appendComma(i, stream); + stream << outputType_->names()[i + numInputs] << " := "; + addWindowFunction(stream, windowFunctions_[i]); + } } namespace { @@ -1385,7 +1607,7 @@ MarkDistinctNode::MarkDistinctNode( sources_{std::move(source)}, outputType_( getMarkDistinctOutputType(sources_[0]->outputType(), markerName_)) { - VELOX_USER_CHECK_GT(markerName_.size(), 0) + VELOX_USER_CHECK_GT(markerName_.size(), 0); VELOX_USER_CHECK_GT(distinctKeys_.size(), 0); } @@ -1521,6 +1743,23 @@ TopNRowNumberNode::TopNRowNumberNode( sortingKeys_.size(), 0, "Number of sorting keys must be greater than zero"); + + VELOX_USER_CHECK_GT(limit, 0, "Limit must be greater than zero"); + + std::unordered_set keyNames; + for (const auto& key : partitionKeys_) { + VELOX_USER_CHECK( + keyNames.insert(key->name()).second, + "Partitioning keys must be unique. Found duplicate key: {}", + key->name()); + } + + for (const auto& key : sortingKeys_) { + VELOX_USER_CHECK( + keyNames.insert(key->name()).second, + "Sorting keys must be unique and not overlap with partitioning keys. 
Found duplicate key: {}", + key->name()); + } } void TopNRowNumberNode::addDetails(std::stringstream& stream) const { @@ -1598,7 +1837,9 @@ PlanNodePtr LocalMergeNode::create(const folly::dynamic& obj, void* context) { std::move(sources)); } -void TableWriteNode::addDetails(std::stringstream& /*unused*/) const {} +void TableWriteNode::addDetails(std::stringstream& stream) const { + stream << insertTableHandle_->connectorInsertTableHandle()->toString(); +} folly::dynamic TableWriteNode::serialize() const { auto obj = PlanNode::serialize(); @@ -1637,7 +1878,7 @@ PlanNodePtr TableWriteNode::create(const folly::dynamic& obj, void* context) { auto outputType = deserializeRowType(obj["outputType"]); auto commitStrategy = connector::stringToCommitStrategy(obj["commitStrategy"].asString()); - auto source = ISerializable::deserialize(obj["sources"]); + auto source = ISerializable::deserialize(obj["sources"], context); return std::make_shared( id, columns, @@ -1668,7 +1909,7 @@ folly::dynamic TableWriteMergeNode::serialize() const { // static PlanNodePtr TableWriteMergeNode::create( const folly::dynamic& obj, - void* /*unused*/) { + void* context) { auto id = obj["id"].asString(); auto outputType = deserializeRowType(obj["outputType"]); std::shared_ptr aggregationNode; @@ -1676,7 +1917,7 @@ PlanNodePtr TableWriteMergeNode::create( aggregationNode = std::const_pointer_cast( ISerializable::deserialize(obj["aggregationNode"])); } - auto source = ISerializable::deserialize(obj["sources"]); + auto source = ISerializable::deserialize(obj["sources"], context); return std::make_shared( id, outputType, aggregationNode, source); } @@ -1851,6 +2092,36 @@ PlanNodePtr PartitionedOutputNode::create( deserializeSingleSource(obj, context)); } +TopNNode::TopNNode( + const PlanNodeId& id, + const std::vector& sortingKeys, + const std::vector& sortingOrders, + int32_t count, + bool isPartial, + const PlanNodePtr& source) + : PlanNode(id), + sortingKeys_(sortingKeys), + sortingOrders_(sortingOrders), + count_(count), + isPartial_(isPartial), + sources_{source} { + VELOX_USER_CHECK(!sortingKeys.empty(), "TopN must specify sorting keys"); + VELOX_USER_CHECK_EQ( + sortingKeys.size(), + sortingOrders.size(), + "Number of sorting keys and sorting orders in TopN must be the same"); + VELOX_USER_CHECK_GT( + count, 0, "TopN must specify greater than zero number of rows to keep"); + folly::F14FastSet sortingKeyNames; + for (const auto& sortingKey : sortingKeys_) { + auto result = sortingKeyNames.insert(sortingKey->name()); + VELOX_USER_CHECK( + result.second, + "TopN must specify unique sorting keys. 
Found duplicate key: {}", + *result.first); + } +} + void TopNNode::addDetails(std::stringstream& stream) const { if (isPartial_) { stream << "PARTIAL "; @@ -1952,13 +2223,13 @@ void PlanNode::toString( bool detailed, bool recursive, size_t indentationSize, - std::function<void( - const PlanNodeId& planNodeId, - const std::string& indentation, - std::stringstream& stream)> addContext) const { + const std::function<void( + const PlanNodeId& planNodeId, + const std::string& indentation, + std::stringstream& stream)>& addContext) const { const std::string indentation(indentationSize, ' '); - stream << indentation << "-- " << name(); + stream << indentation << "-- " << name() << "[" << id() << "]"; if (detailed) { stream << "["; @@ -1972,7 +2243,7 @@ void PlanNode::toString( if (addContext) { auto contextIndentation = indentation + " "; stream << contextIndentation; - addContext(id_, contextIndentation, stream); + addContext(id(), contextIndentation, stream); stream << std::endl; } @@ -2012,6 +2283,7 @@ void PlanNode::registerSerDe() { registry.Register("AssignUniqueIdNode", AssignUniqueIdNode::create); registry.Register("EnforceSingleRowNode", EnforceSingleRowNode::create); registry.Register("ExchangeNode", ExchangeNode::create); + registry.Register("ExpandNode", ExpandNode::create); registry.Register("FilterNode", FilterNode::create); registry.Register("GroupIdNode", GroupIdNode::create); registry.Register("HashJoinNode", HashJoinNode::create); diff --git a/velox/core/PlanNode.h b/velox/core/PlanNode.h index e561c015f0db2..f3ff68e802fb1 100644 --- a/velox/core/PlanNode.h +++ b/velox/core/PlanNode.h @@ -15,20 +15,19 @@ */ #pragma once +#include <fmt/format.h> + #include "velox/connectors/Connector.h" #include "velox/core/Expressions.h" #include "velox/core/QueryConfig.h" -#include "velox/vector/arrow/Abi.h" -#include "velox/vector/arrow/Bridge.h" +struct ArrowArrayStream; namespace facebook::velox::core { typedef std::string PlanNodeId; -/** - * Generic representation of InsertTable - */ +/// Generic representation of InsertTable struct InsertTableHandle { public: InsertTableHandle( @@ -157,17 +156,17 @@ class PlanNode : public ISerializable { /// 'addContext' is not null. /// /// @param addContext Optional lambda to add context for a given plan node. - /// Receives plan node ID, indentation and std::stringstring where to append + /// Receives plan node ID, indentation and std::stringstream where to append /// the context. Use indentation for second and subsequent lines of a - /// mult-line context. Do not use indentation for single-line context. Do not + /// multi-line context. Do not use indentation for single-line context. Do not /// add trailing new-line character for the last or only line of context. std::string toString( bool detailed = false, bool recursive = false, - std::function<void( - const PlanNodeId& planNodeId, - const std::string& indentation, - std::stringstream& stream)> addContext = nullptr) const { + const std::function<void( + const PlanNodeId& planNodeId, + const std::string& indentation, + std::stringstream& stream)>& addContext = nullptr) const { std::stringstream stream; toString(stream, detailed, recursive, 0, addContext); return stream.str(); @@ -210,10 +209,10 @@ class PlanNode : public ISerializable { bool detailed, bool recursive, size_t indentationSize, - std::function<void( - const PlanNodeId& planNodeId, - const std::string& indentation, - std::stringstream& stream)> addContext) const; + const std::function<void( + const PlanNodeId& planNodeId, - const std::string& indentation, + std::stringstream& stream)>& addContext) const; const std::string id_; }; @@ -500,8 +499,6 @@ class AggregationNode : public PlanNode { /// Raw input types used to properly identify aggregate function. These /// might be different from the input types specified in 'call' when /// aggregation step is kIntermediate or kFinal. - /// - /// Note: not used yet. std::vector<TypePtr> rawInputTypes; /// Optional name of input column to use as a mask.
Column type must be @@ -534,6 +531,27 @@ class AggregationNode : public PlanNode { bool ignoreNullKeys, PlanNodePtr source); + /// @param globalGroupingSets Group IDs of the global grouping sets produced + /// by the preceding GroupId node + /// @param groupId Group ID key produced by the preceding GroupId node. Must + /// be set if globalGroupingSets is not empty. Must not be set otherwise. Must + /// be one of the groupingKeys. + + /// GlobalGroupingSets and groupId trigger special handling when the input + /// data set is empty (no rows). In that case, aggregation generates a single + /// row with the default global aggregate value per global grouping set. + AggregationNode( + const PlanNodeId& id, + Step step, + const std::vector& groupingKeys, + const std::vector& preGroupedKeys, + const std::vector& aggregateNames, + const std::vector& aggregates, + const std::vector& globalGroupingSets, + const std::optional& groupId, + bool ignoreNullKeys, + PlanNodePtr source); + const std::vector& sources() const override { return sources_; } @@ -554,6 +572,19 @@ class AggregationNode : public PlanNode { return preGroupedKeys_; } + bool isPreGrouped() const { + return !preGroupedKeys_.empty() && + std::equal( + preGroupedKeys_.begin(), + preGroupedKeys_.end(), + groupingKeys_.begin(), + groupingKeys_.end(), + [](const FieldAccessTypedExprPtr& x, + const FieldAccessTypedExprPtr& y) -> bool { + return (*x == *y); + }); + } + const std::vector& aggregateNames() const { return aggregateNames_; } @@ -566,18 +597,19 @@ class AggregationNode : public PlanNode { return ignoreNullKeys_; } + const std::vector& globalGroupingSets() const { + return globalGroupingSets_; + } + + std::optional groupId() const { + return groupId_; + } + std::string_view name() const override { return "Aggregation"; } - bool canSpill(const QueryConfig& queryConfig) const override { - // NOTE: as for now, we don't allow spilling for distinct aggregation - // (https://github.com/facebookincubator/velox/issues/3263) and pre-grouped - // aggregation (https://github.com/facebookincubator/velox/issues/3264). We - // will add support later to re-enable. - return (isFinal() || isSingle()) && !(aggregates().empty()) && - preGroupedKeys().empty() && queryConfig.aggregationSpillEnabled(); - } + bool canSpill(const QueryConfig& queryConfig) const override; bool isFinal() const { return step_ == Step::kFinal; @@ -600,6 +632,10 @@ class AggregationNode : public PlanNode { const std::vector aggregateNames_; const std::vector aggregates_; const bool ignoreNullKeys_; + + std::optional groupId_; + std::vector globalGroupingSets_; + const std::vector sources_; const RowTypePtr outputType_; }; @@ -699,6 +735,10 @@ class TableWriteNode : public PlanNode { return aggregationNode_; } + bool canSpill(const QueryConfig& queryConfig) const override { + return queryConfig.writerSpillEnabled(); + } + std::string_view name() const override { return "TableWrite"; } @@ -725,17 +765,6 @@ class TableWriteMergeNode : public PlanNode { /// 'outputType' specifies the type to store the metadata of table write /// output which contains the following columns: 'numWrittenRows', 'fragment' /// and 'tableCommitContext'. 
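+/// A TableWriteMergeNode typically consumes the outputs of several parallel +/// TableWriter operators (for example, through an exchange) and combines their +/// 'numWrittenRows', 'fragment' and 'tableCommitContext' rows into one result.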
-#ifdef VELOX_ENABLE_BACKWARD_COMPATIBILITY - TableWriteMergeNode( - const PlanNodeId& id, - RowTypePtr outputType, - PlanNodePtr source) - : PlanNode(id), - aggregationNode_(nullptr), - sources_{std::move(source)}, - outputType_(std::move(outputType)) {} -#endif - TableWriteMergeNode( const PlanNodeId& id, RowTypePtr outputType, @@ -775,6 +804,56 @@ class TableWriteMergeNode : public PlanNode { const RowTypePtr outputType_; }; +/// For each input row, generates N rows with M columns according to +/// specified 'projections'. 'projections' is an N x M matrix of expressions: a +/// vector of N rows each having M columns. Each expression is either a column +/// reference or a constant. Both null and non-null constants are allowed. +/// 'names' is a list of M new column names. The semantic of this operator +/// matches Spark. +class ExpandNode : public PlanNode { + public: + ExpandNode( + PlanNodeId id, + std::vector> projections, + std::vector names, + PlanNodePtr source); + + const RowTypePtr& outputType() const override { + return outputType_; + } + + const RowTypePtr& inputType() const { + return sources_[0]->outputType(); + } + + const std::vector& sources() const override { + return sources_; + } + + const std::vector>& projections() const { + return projections_; + } + + const std::vector& names() const { + return outputType_->names(); + } + + std::string_view name() const override { + return "Expand"; + } + + folly::dynamic serialize() const override; + + static PlanNodePtr create(const folly::dynamic& obj, void* context); + + private: + void addDetails(std::stringstream& stream) const override; + + const std::vector sources_; + const RowTypePtr outputType_; + const std::vector> projections_; +}; + /// Plan node used to implement aggregations over grouping sets. Duplicates the /// aggregation input for each set of grouping keys. The output contains one /// column for each grouping key, followed by aggregation inputs, followed by a @@ -792,25 +871,6 @@ class GroupIdNode : public PlanNode { folly::dynamic serialize() const; }; - /// @param id Plan node ID. - /// @param groupingSets A list of grouping key sets. Grouping keys within the - /// set must be unique, but grouping keys across sets may repeat. - /// @param groupingKeyInfos The names and order of the grouping keys in the - /// output. - /// @param aggregationInputs Columns that contain inputs to the aggregate - /// functions. - /// @param groupIdName Name of the column that will contain the grouping set - /// ID (a zero based integer). - /// @param source Input plan node. - /// NOTE: THIS FUNCTION IS DEPRECATED. PLEASE DO NOT USE. - GroupIdNode( - PlanNodeId id, - std::vector> groupingSets, - std::vector groupingKeyInfos, - std::vector aggregationInputs, - std::string groupIdName, - PlanNodePtr source); - /// @param id Plan node ID. /// @param groupingSets A list of grouping key sets. Grouping keys within the /// set must be unique, but grouping keys across sets may repeat. @@ -850,7 +910,7 @@ class GroupIdNode : public PlanNode { return aggregationInputs_; } - const std::string& groupIdName() { + const std::string& groupIdName() const { return groupIdName_; } @@ -1296,16 +1356,16 @@ FOLLY_ALWAYS_INLINE std::ostream& operator<<( enum class JoinType { // For each row on the left, find all matching rows on the right and return // all combinations. - kInner, + kInner = 0, // For each row on the left, find all matching rows on the right and return // all combinations. 
In addition, return all rows from the left that have no // match on the right with right-side columns filled with nulls. - kLeft, + kLeft = 1, // Opposite of kLeft. For each row on the right, find all matching rows on the // left and return all combinations. In addition, return all rows from the // right that have no match on the left with left-side columns filled with // nulls. - kRight, + kRight = 2, // A "union" of kLeft and kRight. For each row on the left, find all matching // rows on the right and return all combinations. In addition, return all rows // from the left that have no @@ -1313,11 +1373,11 @@ enum class JoinType { // all rows from the // right that have no match on the left with left-side columns filled with // nulls. - kFull, + kFull = 3, // Return a subset of rows from the left side which have a match on the right // side. For this join type, cardinality of the output is less than or equal // to the cardinality of the left side. - kLeftSemiFilter, + kLeftSemiFilter = 4, // Return each row from the left side with a boolean flag indicating whether // there exists a match on the right side. For this join type, cardinality of // the output equals the cardinality of the left side. @@ -1326,11 +1386,11 @@ enum class JoinType { // 'nullAware' boolean specified separately. // // Null-aware join follows IN semantic. Regular join follows EXISTS semantic. - kLeftSemiProject, + kLeftSemiProject = 5, // Opposite of kLeftSemiFilter. Return a subset of rows from the right side // which have a match on the left side. For this join type, cardinality of the // output is less than or equal to the cardinality of the right side. - kRightSemiFilter, + kRightSemiFilter = 6, // Opposite of kLeftSemiProject. Return each row from the right side with a // boolean flag indicating whether there exists a match on the left side. For // this join type, cardinality of the output equals the cardinality of the @@ -1340,7 +1400,7 @@ enum class JoinType { // 'nullAware' boolean specified separately. // // Null-aware join follows IN semantic. Regular join follows EXISTS semantic. - kRightSemiProject, + kRightSemiProject = 7, // Return each row from the left side which has no match on the right side. // The handling of the rows with nulls in the join key depends on the // 'nullAware' boolean specified separately. @@ -1354,7 +1414,8 @@ enum class JoinType { // Regular anti join follows NOT EXISTS semantic: // (1) ignore right-side rows with nulls in the join keys; // (2) unconditionally return left side rows with nulls in the join keys. 
- kAnti, + kAnti = 8, + kNumJoinTypes = 9, }; const char* joinTypeName(JoinType joinType); @@ -1465,6 +1526,10 @@ class AbstractJoinNode : public PlanNode { return joinType_ == JoinType::kAnti; } + bool isPreservingProbeOrder() const { + return isInnerJoin() || isLeftJoin() || isAntiJoin(); + } + const std::vector& leftKeys() const { return leftKeys_; } @@ -1526,7 +1591,7 @@ class HashJoinNode : public AbstractJoinNode { if (nullAware) { VELOX_USER_CHECK( isNullAwareSupported(joinType), - "Null-aware flag is supported only for semi and anti joins"); + "Null-aware flag is supported only for semi project and anti joins"); VELOX_USER_CHECK_EQ( 1, leftKeys_.size(), "Null-aware joins allow only one join key"); @@ -1580,16 +1645,7 @@ class MergeJoinNode : public AbstractJoinNode { TypedExprPtr filter, PlanNodePtr left, PlanNodePtr right, - RowTypePtr outputType) - : AbstractJoinNode( - id, - joinType, - leftKeys, - rightKeys, - std::move(filter), - std::move(left), - std::move(right), - std::move(outputType)) {} + RowTypePtr outputType); std::string_view name() const override { return "MergeJoin"; @@ -1597,6 +1653,9 @@ class MergeJoinNode : public AbstractJoinNode { folly::dynamic serialize() const override; + /// If merge join supports this join type. + static bool isSupported(core::JoinType joinType); + static PlanNodePtr create(const folly::dynamic& obj, void* context); }; @@ -1604,9 +1663,10 @@ class MergeJoinNode : public AbstractJoinNode { /// exec::NestedLoopJoinProbe and exec::NestedLoopJoinBuild. A separate pipeline /// is produced for the build side when generating exec::Operators. /// -/// Nested loop join supports both equal and non-equal joins. Expressions +/// Nested loop join (NLJ) supports both equal and non-equal joins. Expressions /// specified in joinCondition are evaluated on every combination of left/right -/// tuple, to emit result. +/// tuple, to emit result. Results are emitted following the same input order of +/// probe rows for inner and left joins, for each thread of execution. /// /// To create Cartesian product of the left/right's output, use the constructor /// without `joinType` and `joinCondition` parameter. @@ -1648,6 +1708,9 @@ class NestedLoopJoinNode : public PlanNode { folly::dynamic serialize() const override; + /// If nested loop join supports this join type. + static bool isSupported(core::JoinType joinType); + static PlanNodePtr create(const folly::dynamic& obj, void* context); private: @@ -1678,6 +1741,15 @@ class OrderByNode : public PlanNode { sortingKeys.size(), sortingOrders.size(), "Number of sorting keys and sorting orders in OrderBy must be the same"); + // Reject duplicate sorting keys. 
+ std::unordered_set uniqueKeys; + for (const auto& sortKey : sortingKeys) { + VELOX_USER_CHECK_NOT_NULL(sortKey, "Sorting key cannot be null"); + VELOX_USER_CHECK( + uniqueKeys.insert(sortKey->name()).second, + "Duplicate sorting keys are not allowed: {}", + sortKey->name()); + } } const std::vector& sortingKeys() const { @@ -1732,21 +1804,7 @@ class TopNNode : public PlanNode { const std::vector& sortingOrders, int32_t count, bool isPartial, - const PlanNodePtr& source) - : PlanNode(id), - sortingKeys_(sortingKeys), - sortingOrders_(sortingOrders), - count_(count), - isPartial_(isPartial), - sources_{source} { - VELOX_USER_CHECK(!sortingKeys.empty(), "TopN must specify sorting keys"); - VELOX_USER_CHECK_EQ( - sortingKeys.size(), - sortingOrders.size(), - "Number of sorting keys and sorting orders in TopN must be the same"); - VELOX_USER_CHECK_GT( - count, 0, "TopN must specify greater than zero number of rows to keep"); - } + const PlanNodePtr& source); const std::vector& sortingKeys() const { return sortingKeys_; @@ -1797,8 +1855,8 @@ class LimitNode : public PlanNode { // nodes. LimitNode( const PlanNodeId& id, - int32_t offset, - int32_t count, + int64_t offset, + int64_t count, bool isPartial, const PlanNodePtr& source) : PlanNode(id), @@ -1820,11 +1878,11 @@ class LimitNode : public PlanNode { return sources_; } - int32_t offset() const { + int64_t offset() const { return offset_; } - int32_t count() const { + int64_t count() const { return count_; } @@ -1843,8 +1901,8 @@ class LimitNode : public PlanNode { private: void addDetails(std::stringstream& stream) const override; - const int32_t offset_; - const int32_t count_; + const int64_t offset_; + const int64_t count_; const bool isPartial_; const std::vector sources_; }; @@ -1982,7 +2040,7 @@ class AssignUniqueIdNode : public PlanNode { const std::shared_ptr& uniqueIdCounter() const { return uniqueIdCounter_; - }; + } folly::dynamic serialize() const override; @@ -2036,7 +2094,6 @@ class WindowNode : public PlanNode { /// Frame bounds can be CURRENT ROW, UNBOUNDED PRECEDING(FOLLOWING) /// and k PRECEDING(FOLLOWING). K could be a constant or column. /// - /// k PRECEDING(FOLLOWING) is only supported for ROW frames now. /// k has to be of integer or bigint type. struct Frame { WindowType type; @@ -2073,26 +2130,6 @@ class WindowNode : public PlanNode { bool inputsSorted, PlanNodePtr source); -#ifdef VELOX_ENABLE_BACKWARD_COMPATIBILITY - WindowNode( - PlanNodeId id, - std::vector partitionKeys, - std::vector sortingKeys, - std::vector sortingOrders, - std::vector windowColumnNames, - std::vector windowFunctions, - PlanNodePtr source) - : WindowNode( - id, - partitionKeys, - sortingKeys, - sortingOrders, - windowColumnNames, - windowFunctions, - false, - source){}; -#endif - const std::vector& sources() const override { return sources_; } @@ -2103,6 +2140,18 @@ class WindowNode : public PlanNode { return outputType_; } + bool canSpill(const QueryConfig& queryConfig) const override { + // No partitioning keys means the whole input is one big partition. In this + // case, spilling is not helpful because we need to have a full partition in + // memory to produce results. 
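+ // For example, a window partitioned by a user-id column needs only one + // partition in memory at a time to produce output, so the remaining input + // can be spilled; with no partition keys the entire input forms one + // partition that must be fully materialized regardless.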
+ return !partitionKeys_.empty() && !inputsSorted_ && + queryConfig.windowSpillEnabled(); + } + + const RowTypePtr& inputType() const { + return sources_[0]->outputType(); + } + const std::vector<FieldAccessTypedExprPtr>& partitionKeys() const { return partitionKeys_; } @@ -2176,6 +2225,10 @@ class RowNumberNode : public PlanNode { return outputType_; } + bool canSpill(const QueryConfig& queryConfig) const override { + return !partitionKeys_.empty() && queryConfig.rowNumberSpillEnabled(); + } + const std::vector<FieldAccessTypedExprPtr>& partitionKeys() const { return partitionKeys_; } @@ -2265,6 +2318,9 @@ class MarkDistinctNode : public PlanNode { class TopNRowNumberNode : public PlanNode { public: /// @param partitionKeys Partitioning keys. May be empty. + /// @param sortingKeys Sorting keys. May not be empty and may not intersect + /// with 'partitionKeys'. + /// @param sortingOrders Sorting orders, one per sorting key. /// @param rowNumberColumnName Optional name of the column containing row /// numbers. If not specified, the output doesn't include 'row number' column. /// This is used when computing partial results. @@ -2288,6 +2344,14 @@ class TopNRowNumberNode : public PlanNode { return outputType_; } + bool canSpill(const QueryConfig& queryConfig) const override { + return !partitionKeys_.empty() && queryConfig.topNRowNumberSpillEnabled(); + } + + const RowTypePtr& inputType() const { + return sources_[0]->outputType(); + } + const std::vector<FieldAccessTypedExprPtr>& partitionKeys() const { return partitionKeys_; } @@ -2332,3 +2396,21 @@ class TopNRowNumberNode : public PlanNode { }; } // namespace facebook::velox::core + +template <> +struct fmt::formatter<facebook::velox::core::PartitionedOutputNode::Kind> + : formatter<std::string> { + auto format( + facebook::velox::core::PartitionedOutputNode::Kind s, + format_context& ctx) const { + return formatter<std::string>::format( + facebook::velox::core::PartitionedOutputNode::kindString(s), ctx); + } +}; + +template <> +struct fmt::formatter<facebook::velox::core::JoinType> : formatter<int> { + auto format(facebook::velox::core::JoinType s, format_context& ctx) const { + return formatter<int>::format(static_cast<int>(s), ctx); + } +}; diff --git a/velox/core/QueryConfig.cpp b/velox/core/QueryConfig.cpp index 7fb09d3f65797..3d5b25ff94878 100644 --- a/velox/core/QueryConfig.cpp +++ b/velox/core/QueryConfig.cpp @@ -16,102 +16,46 @@ #include +#include "velox/common/config/Config.h" #include "velox/core/QueryConfig.h" +#include "velox/type/tz/TimeZoneMap.h" namespace facebook::velox::core { -double toBytesPerCapacityUnit(CapacityUnit unit) { - switch (unit) { - case CapacityUnit::BYTE: - return 1; - case CapacityUnit::KILOBYTE: - return exp2(10); - case CapacityUnit::MEGABYTE: - return exp2(20); - case CapacityUnit::GIGABYTE: - return exp2(30); - case CapacityUnit::TERABYTE: - return exp2(40); - case CapacityUnit::PETABYTE: - return exp2(50); - default: - VELOX_USER_FAIL("Invalid capacity unit '{}'", (int)unit); - } -} - -CapacityUnit valueOfCapacityUnit(const std::string& unitStr) { - if (unitStr == "B") { - return CapacityUnit::BYTE; - } - if (unitStr == "kB") { - return CapacityUnit::KILOBYTE; - } - if (unitStr == "MB") { - return CapacityUnit::MEGABYTE; - } - if (unitStr == "GB") { - return CapacityUnit::GIGABYTE; - } - if (unitStr == "TB") { - return CapacityUnit::TERABYTE; - } - if (unitStr == "PB") { - return CapacityUnit::PETABYTE; - } - VELOX_USER_FAIL("Invalid capacity unit '{}'", unitStr); +QueryConfig::QueryConfig( + const std::unordered_map<std::string, std::string>& values) + : config_{std::make_unique<config::ConfigBase>( + std::unordered_map<std::string, std::string>(values))} { + validateConfig(); } -// Convert capacity string with unit to the capacity number in the specified -// units
-uint64_t toCapacity(const std::string& from, CapacityUnit to) { - static const RE2 kPattern(R"(^\s*(\d+(?:\.\d+)?)\s*([a-zA-Z]+)\s*$)"); - double value; - std::string unit; - if (!RE2::FullMatch(from, kPattern, &value, &unit)) { - VELOX_USER_FAIL("Invalid capacity string '{}'", from); - } - - return value * - (toBytesPerCapacityUnit(valueOfCapacityUnit(unit)) / - toBytesPerCapacityUnit(to)); +QueryConfig::QueryConfig(std::unordered_map&& values) + : config_{std::make_unique(std::move(values))} { + validateConfig(); } -std::chrono::duration toDuration(const std::string& str) { - static const RE2 kPattern(R"(^\s*(\d+(?:\.\d+)?)\s*([a-zA-Z]+)\s*)"); - - double value; - std::string unit; - if (!RE2::FullMatch(str, kPattern, &value, &unit)) { - VELOX_USER_FAIL("Invalid duration {}", str); +void QueryConfig::validateConfig() { + // Validate if timezone name can be recognized. + if (config_->valueExists(QueryConfig::kSessionTimezone)) { + VELOX_USER_CHECK( + tz::getTimeZoneID( + config_->get(QueryConfig::kSessionTimezone).value(), + false) != -1, + fmt::format( + "session '{}' set with invalid value '{}'", + QueryConfig::kSessionTimezone, + config_->get(QueryConfig::kSessionTimezone).value())); } - if (unit == "ns") { - return std::chrono::duration(value); - } else if (unit == "us") { - return std::chrono::duration(value); - } else if (unit == "ms") { - return std::chrono::duration(value); - } else if (unit == "s") { - return std::chrono::duration(value); - } else if (unit == "m") { - return std::chrono::duration>(value); - } else if (unit == "h") { - return std::chrono::duration>(value); - } else if (unit == "d") { - return std::chrono::duration>(value); - } - VELOX_USER_FAIL("Invalid duration {}", str); } -QueryConfig::QueryConfig( - const std::unordered_map& values) - : config_{std::make_unique(values)} {} - -QueryConfig::QueryConfig(std::unordered_map&& values) - : config_{std::make_unique(std::move(values))} {} - void QueryConfig::testingOverrideConfigUnsafe( std::unordered_map&& values) { - config_ = std::make_unique(std::move(values)); + config_ = std::make_unique(std::move(values)); +} + +std::unordered_map QueryConfig::rawConfigsCopy() + const { + return config_->rawConfigsCopy(); } } // namespace facebook::velox::core diff --git a/velox/core/QueryConfig.h b/velox/core/QueryConfig.h index c243d3cf736c0..304e1fd370bbc 100644 --- a/velox/core/QueryConfig.h +++ b/velox/core/QueryConfig.h @@ -15,29 +15,12 @@ */ #pragma once -#include "velox/core/Config.h" +#include "velox/common/config/Config.h" +#include "velox/vector/TypeAliases.h" namespace facebook::velox::core { -enum class CapacityUnit { - BYTE, - KILOBYTE, - MEGABYTE, - GIGABYTE, - TERABYTE, - PETABYTE -}; - -double toBytesPerCapacityUnit(CapacityUnit unit); - -CapacityUnit valueOfCapacityUnit(const std::string& unitStr); - -/// Convert capacity string with unit to the capacity number in the specified -/// units -uint64_t toCapacity(const std::string& from, CapacityUnit to); - -std::chrono::duration toDuration(const std::string& str); -/// A simple wrapper around velox::Config. Defines constants for query +/// A simple wrapper around velox::ConfigBase. Defines constants for query /// config properties and accessor methods. /// Create per query context. Does not have a singleton instance. /// Does not allow altering properties on the fly. Only at creation time. 
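Because validateConfig() above runs in both QueryConfig constructors, an invalid session timezone now fails at construction time rather than at first use. A small usage sketch (assuming the Velox headers are available; the zone name is deliberately bogus):

#include <iostream>
#include <unordered_map>
#include "velox/core/QueryConfig.h"

using facebook::velox::core::QueryConfig;

int main() {
  std::unordered_map<std::string, std::string> values{
      {QueryConfig::kSessionTimezone, "Not/AZone"}};
  try {
    QueryConfig config{std::move(values)}; // validateConfig() throws here.
  } catch (const std::exception& e) {
    // Expected: session 'session_timezone' set with invalid value 'Not/AZone'.
    std::cerr << e.what() << std::endl;
  }
  return 0;
}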
@@ -48,17 +31,31 @@ class QueryConfig { explicit QueryConfig(std::unordered_map<std::string, std::string>&& values); +#ifdef VELOX_ENABLE_BACKWARD_COMPATIBILITY static constexpr const char* kCodegenEnabled = "codegen.enabled"; - /// Maximum memory that a query can use on a single host. - static constexpr const char* kQueryMaxMemoryPerNode = - "query_max_memory_per_node"; - static constexpr const char* kCodegenConfigurationFilePath = "codegen.configuration_file_path"; static constexpr const char* kCodegenLazyLoading = "codegen.lazy_loading"; + bool codegenEnabled() const { + return get<bool>(kCodegenEnabled, false); + } + + std::string codegenConfigurationFilePath() const { + return get<std::string>(kCodegenConfigurationFilePath, ""); + } + + bool codegenLazyLoading() const { + return get<bool>(kCodegenLazyLoading, true); + } +#endif + + /// Maximum memory that a query can use on a single host. + static constexpr const char* kQueryMaxMemoryPerNode = + "query_max_memory_per_node"; + /// User provided session timezone. Stores a string with the actual timezone /// name, e.g.: "America/Los_Angeles". static constexpr const char* kSessionTimezone = "session_timezone"; @@ -94,30 +91,13 @@ class QueryConfig { /// Flags used to configure the CAST operator: + static constexpr const char* kLegacyCast = "legacy_cast"; + /// If set, the Row conversion is applied so that the cast's row fields are /// matched by name instead of position. static constexpr const char* kCastMatchStructByName = "cast_match_struct_by_name"; - /// If set, cast from float/double/decimal/string to integer truncates the - /// decimal part, otherwise rounds. - static constexpr const char* kCastToIntByTruncate = "cast_to_int_by_truncate"; - - /// If set, cast from string to date allows only ISO 8601 formatted strings: - /// [+-](YYYY-MM-DD). Otherwise, allows all patterns supported by Spark: - /// `[+-]yyyy*` - /// `[+-]yyyy*-[m]m` - /// `[+-]yyyy*-[m]m-[d]d` - /// `[+-]yyyy*-[m]m-[d]d *` - /// `[+-]yyyy*-[m]m-[d]dT*` - /// The asterisk `*` in `yyyy*` stands for any numbers. - /// For the last two patterns, the trailing `*` can represent none or any - /// sequence of characters, e.g: - /// "1970-01-01 123" - /// "1970-01-01 (BC)" - static constexpr const char* kCastStringToDateIsIso8601 = - "cast_string_to_date_is_iso_8601"; - /// Used for backpressure to block local exchange producers when the local /// exchange buffer reaches or exceeds this size. static constexpr const char* kMaxLocalExchangeBufferSize = "max_local_exchange_buffer_size"; @@ -128,6 +108,11 @@ class QueryConfig { static constexpr const char* kMaxExchangeBufferSize = "exchange.max_buffer_size"; + /// Maximum size in bytes to accumulate among all sources of the merge + /// exchange. Enforced approximately, not strictly.
+ static constexpr const char* kMaxMergeExchangeBufferSize = + "merge_exchange.max_buffer_size"; + static constexpr const char* kMaxPartialAggregationMemory = "max_partial_aggregation_memory"; @@ -140,9 +125,17 @@ class QueryConfig { static constexpr const char* kAbandonPartialAggregationMinPct = "abandon_partial_aggregation_min_pct"; + static constexpr const char* kAbandonPartialTopNRowNumberMinRows = + "abandon_partial_topn_row_number_min_rows"; + + static constexpr const char* kAbandonPartialTopNRowNumberMinPct = + "abandon_partial_topn_row_number_min_pct"; + static constexpr const char* kMaxPartitionedOutputBufferSize = "max_page_partitioning_buffer_size"; + static constexpr const char* kMaxOutputBufferSize = "max_output_buffer_size"; + /// Preferred size of batches in bytes to be returned by operators from /// Operator::getOutput. It is used when an estimate of average row size is /// known. Otherwise kPreferredOutputBatchRows is used. @@ -162,6 +155,12 @@ class QueryConfig { /// output rows. static constexpr const char* kMaxOutputBatchRows = "max_output_batch_rows"; + /// TableScan operator will exit the getOutput() method after this many + /// milliseconds even if it has no data to return yet. Zero means 'no time + /// limit'. + static constexpr const char* kTableScanGetOutputTimeLimitMs = + "table_scan_getoutput_time_limit_ms"; + /// If false, the 'group by' code is forced to use generic hash mode /// hashtable. static constexpr const char* kHashAdaptivityEnabled = @@ -185,22 +184,33 @@ class QueryConfig { /// OrderBy spilling flag, only applies if "spill_enabled" flag is set. static constexpr const char* kOrderBySpillEnabled = "order_by_spill_enabled"; - /// The max memory that a final aggregation can use before spilling. If it 0, - /// then there is no limit. - static constexpr const char* kAggregationSpillMemoryThreshold = - "aggregation_spill_memory_threshold"; + /// Window spilling flag, only applies if "spill_enabled" flag is set. + static constexpr const char* kWindowSpillEnabled = "window_spill_enabled"; + + /// If true, the memory arbitrator will reclaim memory from table writer by + /// flushing its buffered data to disk. + static constexpr const char* kWriterSpillEnabled = "writer_spill_enabled"; - /// The max memory that a hash join can use before spilling. If it 0, then - /// there is no limit. - static constexpr const char* kJoinSpillMemoryThreshold = - "join_spill_memory_threshold"; + /// RowNumber spilling flag, only applies if "spill_enabled" flag is set. + static constexpr const char* kRowNumberSpillEnabled = + "row_number_spill_enabled"; - /// The max memory that an order by can use before spilling. If it 0, then - /// there is no limit. - static constexpr const char* kOrderBySpillMemoryThreshold = - "order_by_spill_memory_threshold"; + /// TopNRowNumber spilling flag, only applies if "spill_enabled" flag is set. + static constexpr const char* kTopNRowNumberSpillEnabled = + "topn_row_number_spill_enabled"; - static constexpr const char* kTestingSpillPct = "testing.spill_pct"; + /// The max number of rows to fill and spill for each spill run. This is used + /// to cap the memory used for spilling. If it is zero, then there is no limit + /// and spilling might run out of memory. + /// Based on offline test results, the default value is set to 12 million + /// rows, which use ~128MB of memory when filling a spill run. + static constexpr const char* kMaxSpillRunRows = "max_spill_run_rows"; + + /// The max spill bytes limit set for each query.
This is used to cap the + /// storage used for spilling. If it is zero, then there is no limit and + /// spilling might exhaust the storage or take too long to run. The default + /// value is set to 100 GB. + static constexpr const char* kMaxSpillBytes = "max_spill_bytes"; /// The max allowed spilling level with zero being the initial spilling level. /// This only applies for hash build spilling which might trigger recursive /// spilling. @@ -214,15 +224,6 @@ class QueryConfig { /// The max allowed spill file size. If it is zero, then there is no limit. static constexpr const char* kMaxSpillFileSize = "max_spill_file_size"; - /// The min spill run size limit used to select partitions for spilling. The - /// spiller tries to spill a previously spilled partitions if its data size - /// exceeds this limit, otherwise it spills the partition with most data. - /// If the limit is zero, then the spiller always spill a previously spilled - /// partition if it has any data. This is to avoid spill from a partition with - /// a small amount of data which might result in generating too many small - /// spilled files. - static constexpr const char* kMinSpillRunSize = "min_spill_run_size"; - static constexpr const char* kSpillCompressionKind = "spill_compression_codec"; @@ -233,47 +234,59 @@ class QueryConfig { static constexpr const char* kSpillWriteBufferSize = "spill_write_buffer_size"; + /// Specifies the buffer size in bytes to read from one spilled file. If the + /// underlying filesystem supports async read, we do read-ahead with double + /// buffering, which doubles the buffer used to read from each spill file. + static constexpr const char* kSpillReadBufferSize = "spill_read_buffer_size"; + + /// Config used to create spill files. This config is provided to the + /// underlying file system and is free form. The form should be defined by + /// the underlying file system. + static constexpr const char* kSpillFileCreateConfig = + "spill_file_create_config"; + + /// Default offset spill start partition bit. static constexpr const char* kSpillStartPartitionBit = "spiller_start_partition_bit"; + /// Default number of spill partition bits. + static constexpr const char* kSpillNumPartitionBits = + "spiller_num_partition_bits"; + + /// !!! DEPRECATED: do not use. static constexpr const char* kJoinSpillPartitionBits = "join_spiller_partition_bits"; - static constexpr const char* kAggregationSpillPartitionBits = - "aggregation_spiller_partition_bits"; - - /// If true and spilling has been triggered during the input processing, the - /// spiller will spill all the remaining in-memory state to disk before output - /// processing. This is to simplify the aggregation query OOM prevention in - /// output processing stage. - static constexpr const char* kAggregationSpillAll = "aggregation_spill_all"; - static constexpr const char* kMinSpillableReservationPct = "min_spillable_reservation_pct"; static constexpr const char* kSpillableReservationGrowthPct = "spillable_reservation_growth_pct"; + /// Minimum memory footprint size required to reclaim memory from a file + /// writer by flushing its buffered data to disk. + static constexpr const char* kWriterFlushThresholdBytes = + "writer_flush_threshold_bytes"; + /// If true, array_agg() aggregation function will ignore nulls in the input. static constexpr const char* kPrestoArrayAggIgnoreNulls = "presto.array_agg.ignore_nulls"; - /// If false, size function returns null for null input.
- static constexpr const char* kSparkLegacySizeOfNull = - "spark.legacy_size_of_null"; - // The default number of expected items for the bloomfilter. static constexpr const char* kSparkBloomFilterExpectedNumItems = "spark.bloom_filter.expected_num_items"; - // The default number of bits to use for the bloom filter. + /// The default number of bits to use for the bloom filter. static constexpr const char* kSparkBloomFilterNumBits = "spark.bloom_filter.num_bits"; - // The max number of bits to use for the bloom filter. + /// The max number of bits to use for the bloom filter. static constexpr const char* kSparkBloomFilterMaxNumBits = "spark.bloom_filter.max_num_bits"; + /// The current spark partition id. + static constexpr const char* kSparkPartitionId = "spark.partition_id"; + /// The number of local parallel table writer operators per task. static constexpr const char* kTaskWriterCount = "task_writer_count"; @@ -308,9 +321,93 @@ class QueryConfig { static constexpr const char* kEnableExpressionEvaluationCache = "enable_expression_evaluation_cache"; + // For a given shared subexpression, the maximum distinct sets of inputs we + // cache results for. Lambdas can call the same expression with different + // inputs many times, causing the results we cache to explode in size. Putting + // a limit contains the memory usage. + static constexpr const char* kMaxSharedSubexprResultsCached = + "max_shared_subexpr_results_cached"; + + /// Maximum number of splits to preload. Set to 0 to disable preloading. + static constexpr const char* kMaxSplitPreloadPerDriver = + "max_split_preload_per_driver"; + + /// If not zero, specifies the cpu time slice limit in ms that a driver thread + /// can continuously run without yielding. If it is zero, then there is no + /// limit. + static constexpr const char* kDriverCpuTimeSliceLimitMs = + "driver_cpu_time_slice_limit_ms"; + + /// Maximum number of bytes to use for the normalized key in prefix-sort. Use + /// 0 to disable prefix-sort. + static constexpr const char* kPrefixSortNormalizedKeyMaxBytes = + "prefixsort_normalized_key_max_bytes"; + + /// Minimum number of rows to use prefix-sort. The default value has been + /// derived using micro-benchmarking. + static constexpr const char* kPrefixSortMinRows = "prefixsort_min_rows"; + + /// Enable query tracing flag. + static constexpr const char* kQueryTraceEnabled = "query_trace_enabled"; + + /// Base dir of a query to store tracing data. + static constexpr const char* kQueryTraceDir = "query_trace_dir"; + + /// A comma-separated list of plan node ids whose input data will be traced. + /// Empty string if only want to trace the query metadata. + static constexpr const char* kQueryTraceNodeIds = "query_trace_node_ids"; + + /// Disable optimization in expression evaluation to peel common dictionary + /// layer from inputs. + static constexpr const char* kDebugDisableExpressionWithPeeling = + "debug_disable_expression_with_peeling"; + + /// Disable optimization in expression evaluation to re-use cached results for + /// common sub-expressions. + static constexpr const char* kDebugDisableCommonSubExpressions = + "debug_disable_common_sub_expressions"; + + /// Disable optimization in expression evaluation to re-use cached results + /// between subsequent input batches that are dictionary encoded and have the + /// same alphabet(underlying flat vector). 
+ static constexpr const char* kDebugDisableExpressionWithMemoization = + "debug_disable_expression_with_memoization"; + + /// Disable optimization in expression evaluation to delay loading of lazy + /// inputs unless required. + static constexpr const char* kDebugDisableExpressionWithLazyInputs = + "debug_disable_expression_with_lazy_inputs"; + + /// Temporary flag to control whether selective Nimble reader should be used + /// in this query or not. Will be removed after the selective Nimble reader + /// is fully rolled out. + static constexpr const char* kSelectiveNimbleReaderEnabled = + "selective_nimble_reader_enabled"; + + bool selectiveNimbleReaderEnabled() const { + return get(kSelectiveNimbleReaderEnabled, false); + } + + bool debugDisableExpressionsWithPeeling() const { + return get(kDebugDisableExpressionWithPeeling, false); + } + + bool debugDisableCommonSubExpressions() const { + return get(kDebugDisableCommonSubExpressions, false); + } + + bool debugDisableExpressionsWithMemoization() const { + return get(kDebugDisableExpressionWithMemoization, false); + } + + bool debugDisableExpressionsWithLazyInputs() const { + return get(kDebugDisableExpressionWithLazyInputs, false); + } + uint64_t queryMaxMemoryPerNode() const { - return toCapacity( - get(kQueryMaxMemoryPerNode, "0B"), CapacityUnit::BYTE); + return config::toCapacity( + get(kQueryMaxMemoryPerNode, "0B"), + config::CapacityUnit::BYTE); } uint64_t maxPartialAggregationMemoryUsage() const { @@ -331,30 +428,45 @@ class QueryConfig { return get(kAbandonPartialAggregationMinPct, 80); } - uint64_t aggregationSpillMemoryThreshold() const { - static constexpr uint64_t kDefault = 0; - return get(kAggregationSpillMemoryThreshold, kDefault); + int32_t abandonPartialTopNRowNumberMinRows() const { + return get(kAbandonPartialTopNRowNumberMinRows, 100'000); + } + + int32_t abandonPartialTopNRowNumberMinPct() const { + return get(kAbandonPartialTopNRowNumberMinPct, 80); } - uint64_t joinSpillMemoryThreshold() const { - static constexpr uint64_t kDefault = 0; - return get(kJoinSpillMemoryThreshold, kDefault); + uint64_t maxSpillRunRows() const { + static constexpr uint64_t kDefault = 12UL << 20; + return get(kMaxSpillRunRows, kDefault); } - uint64_t orderBySpillMemoryThreshold() const { - static constexpr uint64_t kDefault = 0; - return get(kOrderBySpillMemoryThreshold, kDefault); + uint64_t maxSpillBytes() const { + static constexpr uint64_t kDefault = 100UL << 30; + return get(kMaxSpillBytes, kDefault); } - // Returns the target size for a Task's buffered output. The - // producer Drivers are blocked when the buffered size exceeds - // this. The Drivers are resumed when the buffered size goes below - // PartitionedOutputBufferManager::kContinuePct % of this. + /// Returns the maximum number of bytes to buffer in PartitionedOutput + /// operator to avoid creating tiny SerializedPages. + /// + /// For PartitionedOutputNode::Kind::kPartitioned, PartitionedOutput operator + /// would buffer up to that number of bytes / number of destinations for each + /// destination before producing a SerializedPage. uint64_t maxPartitionedOutputBufferSize() const { static constexpr uint64_t kDefault = 32UL << 20; return get(kMaxPartitionedOutputBufferSize, kDefault); } + /// Returns the maximum size in bytes for the task's buffered output. + /// + /// The producer Drivers are blocked when the buffered size exceeds + /// this. The Drivers are resumed when the buffered size goes below + /// OutputBufferManager::kContinuePct % of this. 
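+ /// For example, with the default limit of 32MB and a kContinuePct of, say, + /// 90, producers block once 32MB is buffered and resume when the buffered + /// size drains below roughly 28.8MB.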
+ uint64_t maxOutputBufferSize() const { + static constexpr uint64_t kDefault = 32UL << 20; + return get(kMaxOutputBufferSize, kDefault); + } + uint64_t maxLocalExchangeBufferSize() const { static constexpr uint64_t kDefault = 32UL << 20; return get(kMaxLocalExchangeBufferSize, kDefault); @@ -365,17 +477,31 @@ class QueryConfig { return get(kMaxExchangeBufferSize, kDefault); } + uint64_t maxMergeExchangeBufferSize() const { + static constexpr uint64_t kDefault = 128UL << 20; + return get(kMaxMergeExchangeBufferSize, kDefault); + } + uint64_t preferredOutputBatchBytes() const { static constexpr uint64_t kDefault = 10UL << 20; return get(kPreferredOutputBatchBytes, kDefault); } - uint32_t preferredOutputBatchRows() const { - return get(kPreferredOutputBatchRows, 1024); + vector_size_t preferredOutputBatchRows() const { + const uint32_t batchRows = get(kPreferredOutputBatchRows, 1024); + VELOX_USER_CHECK_LE(batchRows, std::numeric_limits::max()); + return batchRows; + } + + vector_size_t maxOutputBatchRows() const { + const uint32_t maxBatchRows = get(kMaxOutputBatchRows, 10'000); + VELOX_USER_CHECK_LE( + maxBatchRows, std::numeric_limits::max()); + return maxBatchRows; } - uint32_t maxOutputBatchRows() const { - return get(kMaxOutputBatchRows, 10'000); + uint32_t tableScanGetOutputTimeLimitMs() const { + return get(kTableScanGetOutputTimeLimitMs, 5'000); } bool hashAdaptivityEnabled() const { @@ -396,28 +522,12 @@ class QueryConfig { return get(kAdaptiveFilterReorderingEnabled, true); } - bool isMatchStructByName() const { - return get(kCastMatchStructByName, false); - } - - bool isCastToIntByTruncate() const { - return get(kCastToIntByTruncate, false); + bool isLegacyCast() const { + return get(kLegacyCast, false); } - bool isIso8601() const { - return get(kCastStringToDateIsIso8601, true); - } - - bool codegenEnabled() const { - return get(kCodegenEnabled, false); - } - - std::string codegenConfigurationFilePath() const { - return get(kCodegenConfigurationFilePath, ""); - } - - bool codegenLazyLoading() const { - return get(kCodegenLazyLoading, true); + bool isMatchStructByName() const { + return get(kCastMatchStructByName, false); } bool adjustTimestampToTimezone() const { @@ -455,14 +565,32 @@ class QueryConfig { return get(kOrderBySpillEnabled, true); } - // Returns a percentage of aggregation or join input batches that - // will be forced to spill for testing. 0 means no extra spilling. - int32_t testingSpillPct() const { - return get(kTestingSpillPct, 0); + /// Returns true if spilling is enabled for Window operator. Must also + /// check the spillEnabled()! + bool windowSpillEnabled() const { + return get(kWindowSpillEnabled, true); + } + + /// Returns 'is writer spilling enabled' flag. Must also check the + /// spillEnabled()! + bool writerSpillEnabled() const { + return get(kWriterSpillEnabled, true); + } + + /// Returns true if spilling is enabled for RowNumber operator. Must also + /// check the spillEnabled()! + bool rowNumberSpillEnabled() const { + return get(kRowNumberSpillEnabled, true); + } + + /// Returns true if spilling is enabled for TopNRowNumber operator. Must also + /// check the spillEnabled()! + bool topNRowNumberSpillEnabled() const { + return get(kTopNRowNumberSpillEnabled, true); } int32_t maxSpillLevel() const { - return get(kMaxSpillLevel, 4); + return get(kMaxSpillLevel, 1); } /// Returns the start partition bit which is used with @@ -470,36 +598,36 @@ class QueryConfig { /// calculate the spilling partition number for join spill or aggregation /// spill. 
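/// For example, with the default start bit of 48 and 3 partition bits, a /// row's spill partition is conceptually (hash >> 48) & 0x7, one of 8 /// partitions; each recursive spill level advances the start bit by the /// number of partition bits.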
uint8_t spillStartPartitionBit() const { - constexpr uint8_t kDefaultStartBit = 29; + constexpr uint8_t kDefaultStartBit = 48; return get<uint8_t>(kSpillStartPartitionBit, kDefaultStartBit); } - /// Returns the number of bits used to calculate the spilling partition - /// number for hash join. The number of spilling partitions will be power of - /// two. + /// Returns the number of bits used to calculate the spill partition number + /// for hash join. The number of spill partitions will be a power of two. /// /// NOTE: as for now, we only support up to 8-way spill partitioning. + /// + /// DEPRECATED. uint8_t joinSpillPartitionBits() const { - constexpr uint8_t kDefaultBits = 2; + constexpr uint8_t kDefaultBits = 3; constexpr uint8_t kMaxBits = 3; return std::min( kMaxBits, get<uint8_t>(kJoinSpillPartitionBits, kDefaultBits)); } - /// Returns the number of bits used to calculate the spilling partition - /// number for hash join. The number of spilling partitions will be power of - /// two. - /// + /// Returns the number of bits used to calculate the spill partition number + /// for hash join and RowNumber. The number of spill partitions will be a + /// power of two. /// NOTE: as for now, we only support up to 8-way spill partitioning. - uint8_t aggregationSpillPartitionBits() const { - constexpr uint8_t kDefaultBits = 0; + uint8_t spillNumPartitionBits() const { + constexpr uint8_t kDefaultBits = 3; constexpr uint8_t kMaxBits = 3; return std::min( - kMaxBits, get<uint8_t>(kAggregationSpillPartitionBits, kDefaultBits)); + kMaxBits, get<uint8_t>(kSpillNumPartitionBits, kDefaultBits)); } - bool aggregationSpillAll() const { - return get<bool>(kAggregationSpillAll, true); + uint64_t writerFlushThresholdBytes() const { + return get<uint64_t>(kWriterFlushThresholdBytes, 96L << 20); } uint64_t maxSpillFileSize() const { @@ -507,11 +635,6 @@ class QueryConfig { return get<uint64_t>(kMaxSpillFileSize, kDefaultMaxFileSize); } - uint64_t minSpillRunSize() const { - constexpr uint64_t kDefaultMinSpillRunSize = 256 << 20; // 256MB. - return get<uint64_t>(kMinSpillRunSize, kDefaultMinSpillRunSize); - } - std::string spillCompressionKind() const { return get<std::string>(kSpillCompressionKind, "none"); } @@ -521,6 +644,15 @@ class QueryConfig { return get(kSpillWriteBufferSize, 1L << 20); } + uint64_t spillReadBufferSize() const { + // The default read buffer size set to 1MB. + return get<uint64_t>(kSpillReadBufferSize, 1L << 20); + } + + std::string spillFileCreateConfig() const { + return get<std::string>(kSpillFileCreateConfig, ""); + } + /// Returns the minimal available spillable memory reservation in percentage /// of the current memory usage. Suppose the current memory usage size of M, /// available memory reservation size of N and min reservation percentage of /// P, @@ -542,9 +674,19 @@ class QueryConfig { return get(kSpillableReservationGrowthPct, kDefaultPct); } - bool sparkLegacySizeOfNull() const { - constexpr bool kDefault{true}; - return get<bool>(kSparkLegacySizeOfNull, kDefault); + /// Returns true if query tracing is enabled. + bool queryTraceEnabled() const { + return get<bool>(kQueryTraceEnabled, false); + } + + std::string queryTraceDir() const { + // The default query trace dir, empty by default. + return get<std::string>(kQueryTraceDir, ""); + } + + std::string queryTraceNodeIds() const { + // The default query trace nodes, empty by default.
+ return get(kQueryTraceNodeIds, ""); } bool prestoArrayAggIgnoreNulls() const { @@ -574,6 +716,14 @@ class QueryConfig { return value; } + int32_t sparkPartitionId() const { + auto id = get(kSparkPartitionId); + VELOX_CHECK(id.has_value(), "Spark partition id is not set."); + auto value = id.value(); + VELOX_CHECK_GE(value, 0, "Invalid Spark partition id."); + return value; + } + bool exprTrackCpuUsage() const { return get(kExprTrackCpuUsage, false); } @@ -592,7 +742,7 @@ class QueryConfig { } bool hashProbeFinishEarlyOnEmptyBuild() const { - return get(kHashProbeFinishEarlyOnEmptyBuild, true); + return get(kHashProbeFinishEarlyOnEmptyBuild, false); } uint32_t minTableRowsForParallelJoinBuild() const { @@ -607,6 +757,38 @@ class QueryConfig { return get(kEnableExpressionEvaluationCache, true); } + uint32_t maxSharedSubexprResultsCached() const { + // 10 was chosen as a default as there are cases where a shared + // subexpression can be called in 2 different places and a particular + // argument may be peeled in one and not peeled in another. 10 is large + // enough to handle this happening for a few arguments in different + // combinations. + // + // For example, when the UDF at the root of a shared subexpression does not + // have default null behavior and takes an input that is dictionary encoded + // with nulls set in the DictionaryVector. That dictionary + // encoding may be peeled depending on whether or not there is a UDF above + // it in the expression tree that has default null behavior and takes the + // same input as an argument. + return get(kMaxSharedSubexprResultsCached, 10); + } + + int32_t maxSplitPreloadPerDriver() const { + return get(kMaxSplitPreloadPerDriver, 2); + } + + uint32_t driverCpuTimeSliceLimitMs() const { + return get(kDriverCpuTimeSliceLimitMs, 0); + } + + int64_t prefixSortNormalizedKeyMaxBytes() const { + return get(kPrefixSortNormalizedKeyMaxBytes, 128); + } + + int32_t prefixSortMinRows() const { + return get(kPrefixSortMinRows, 130); + } + template T get(const std::string& key, const T& defaultValue) const { return config_->get(key, defaultValue); @@ -621,7 +803,11 @@ class QueryConfig { void testingOverrideConfigUnsafe( std::unordered_map&& values); + std::unordered_map rawConfigsCopy() const; + private: - std::unique_ptr config_; + void validateConfig(); + + std::unique_ptr config_; }; } // namespace facebook::velox::core diff --git a/velox/core/QueryCtx.cpp b/velox/core/QueryCtx.cpp index f3ac8bfc57ff5..e0392252ee74f 100644 --- a/velox/core/QueryCtx.cpp +++ b/velox/core/QueryCtx.cpp @@ -13,59 +13,51 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + #include "velox/core/QueryCtx.h" +#include "velox/common/base/SpillConfig.h" +#include "velox/common/config/Config.h" namespace facebook::velox::core { -QueryCtx::QueryCtx( +// static +std::shared_ptr QueryCtx::create( folly::Executor* executor, - std::unordered_map queryConfigValues, - std::unordered_map> connectorConfigs, + QueryConfig&& queryConfig, + std::unordered_map> + connectorConfigs, cache::AsyncDataCache* cache, std::shared_ptr pool, - std::shared_ptr spillExecutor, - const std::string& queryId) - : queryId_(queryId), - connectorConfigs_(connectorConfigs), - cache_(cache), - pool_(std::move(pool)), - executor_(executor), - queryConfig_{std::move(queryConfigValues)}, - spillExecutor_(std::move(spillExecutor)) { - initPool(queryId); + folly::Executor* spillExecutor, + const std::string& queryId) { + std::shared_ptr queryCtx(new QueryCtx( + executor, + std::move(queryConfig), + std::move(connectorConfigs), + cache, + std::move(pool), + spillExecutor, + queryId)); + queryCtx->maybeSetReclaimer(); + return queryCtx; } QueryCtx::QueryCtx( folly::Executor* executor, QueryConfig&& queryConfig, - std::unordered_map> connectorConfigs, + std::unordered_map> + connectorSessionProperties, cache::AsyncDataCache* cache, std::shared_ptr pool, - std::shared_ptr spillExecutor, + folly::Executor* spillExecutor, const std::string& queryId) : queryId_(queryId), - connectorConfigs_(connectorConfigs), - cache_(cache), - pool_(std::move(pool)), executor_(executor), - queryConfig_{std::move(queryConfig)}, - spillExecutor_(std::move(spillExecutor)) { - initPool(queryId); -} - -QueryCtx::QueryCtx( - folly::Executor::KeepAlive<> executorKeepalive, - std::unordered_map queryConfigValues, - std::unordered_map> connectorConfigs, - cache::AsyncDataCache* cache, - std::shared_ptr pool, - const std::string& queryId) - : queryId_(queryId), - connectorConfigs_(connectorConfigs), + spillExecutor_(spillExecutor), cache_(cache), + connectorSessionProperties_(connectorSessionProperties), pool_(std::move(pool)), - executorKeepalive_(std::move(executorKeepalive)), - queryConfig_{std::move(queryConfigValues)} { + queryConfig_{std::move(queryConfig)} { initPool(queryId); } @@ -76,4 +68,78 @@ QueryCtx::QueryCtx( return fmt::format("query.{}.{}", queryId.c_str(), seqNum++); } +void QueryCtx::maybeSetReclaimer() { + VELOX_CHECK_NOT_NULL(pool_); + VELOX_CHECK(!underArbitration_); + if (pool_->reclaimer() != nullptr) { + return; + } + pool_->setReclaimer(QueryCtx::MemoryReclaimer::create(this, pool_.get())); +} + +void QueryCtx::updateSpilledBytesAndCheckLimit(uint64_t bytes) { + const auto numSpilledBytes = numSpilledBytes_.fetch_add(bytes) + bytes; + if (queryConfig_.maxSpillBytes() > 0 && + numSpilledBytes > queryConfig_.maxSpillBytes()) { + VELOX_SPILL_LIMIT_EXCEEDED(fmt::format( + "Query exceeded per-query local spill limit of {}", + succinctBytes(queryConfig_.maxSpillBytes()))); + } +} + +std::unique_ptr QueryCtx::MemoryReclaimer::create( + QueryCtx* queryCtx, + memory::MemoryPool* pool) { + return std::unique_ptr( + new QueryCtx::MemoryReclaimer(queryCtx->shared_from_this(), pool)); +} + +uint64_t QueryCtx::MemoryReclaimer::reclaim( + memory::MemoryPool* pool, + uint64_t targetBytes, + uint64_t maxWaitMs, + memory::MemoryReclaimer::Stats& stats) { + auto queryCtx = ensureQueryCtx(); + if (queryCtx == nullptr) { + return 0; + } + VELOX_CHECK_EQ(pool->name(), pool_->name()); + + const auto leaveGuard = + folly::makeGuard([&]() { queryCtx->finishArbitration(); }); + queryCtx->startArbitration(); + return 
memory::MemoryReclaimer::reclaim(pool, targetBytes, maxWaitMs, stats); +} + +bool QueryCtx::checkUnderArbitration(ContinueFuture* future) { + VELOX_CHECK_NOT_NULL(future); + std::lock_guard l(mutex_); + if (!underArbitration_) { + VELOX_CHECK(arbitrationPromises_.empty()); + return false; + } + arbitrationPromises_.emplace_back("QueryCtx::waitArbitration"); + *future = arbitrationPromises_.back().getSemiFuture(); + return true; +} + +void QueryCtx::startArbitration() { + std::lock_guard l(mutex_); + VELOX_CHECK(!underArbitration_); + VELOX_CHECK(arbitrationPromises_.empty()); + underArbitration_ = true; +} + +void QueryCtx::finishArbitration() { + std::vector promises; + { + std::lock_guard l(mutex_); + VELOX_CHECK(underArbitration_); + underArbitration_ = false; + promises.swap(arbitrationPromises_); + } + for (auto& promise : promises) { + promise.setValue(); + } +} } // namespace facebook::velox::core diff --git a/velox/core/QueryCtx.h b/velox/core/QueryCtx.h index 653b2769e7ec1..89ff229e3a488 100644 --- a/velox/core/QueryCtx.h +++ b/velox/core/QueryCtx.h @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #pragma once #include @@ -23,10 +24,18 @@ #include "velox/vector/DecodedVector.h" #include "velox/vector/VectorPool.h" +namespace facebook::velox { +class Config; +}; + namespace facebook::velox::core { -class QueryCtx { +class QueryCtx : public std::enable_shared_from_this { public: + ~QueryCtx() { + VELOX_CHECK(!underArbitration_); + } + /// QueryCtx is used in different places. When used with `Task::start()`, it's /// required that the caller supplies the executor and ensure its lifetime /// outlives the tasks that use it. In contrast, when used in expression @@ -34,39 +43,14 @@ class QueryCtx { /// mode, executor is not needed. Hence, we don't require executor to always /// be passed in here, but instead, ensure that executor exists when actually /// being used. - // TODO(jtan6): Deprecate this constructor after external dependencies are - // migrated - QueryCtx( - folly::Executor* executor, - std::unordered_map queryConfigValues, - std::unordered_map> - connectorConfigs = {}, - cache::AsyncDataCache* cache = cache::AsyncDataCache::getInstance(), - std::shared_ptr pool = nullptr, - std::shared_ptr spillExecutor = nullptr, - const std::string& queryId = ""); - - QueryCtx( + static std::shared_ptr create( folly::Executor* executor = nullptr, QueryConfig&& queryConfig = QueryConfig{{}}, - std::unordered_map> - connectorConfigs = {}, - cache::AsyncDataCache* cache = cache::AsyncDataCache::getInstance(), - std::shared_ptr pool = nullptr, - std::shared_ptr spillExecutor = nullptr, - const std::string& queryId = ""); - - /// Constructor to block the destruction of executor while this - /// object is alive. - /// - /// This constructor does not keep the ownership of executor. 
- explicit QueryCtx( - folly::Executor::KeepAlive<> executorKeepalive, - std::unordered_map<std::string, std::string> queryConfigValues = {}, - std::unordered_map<std::string, std::shared_ptr<Config>> + std::unordered_map<std::string, std::shared_ptr<config::ConfigBase>> connectorConfigs = {}, cache::AsyncDataCache* cache = cache::AsyncDataCache::getInstance(), std::shared_ptr<memory::MemoryPool> pool = nullptr, + folly::Executor* spillExecutor = nullptr, const std::string& queryId = ""); static std::string generatePoolName(const std::string& queryId); @@ -80,26 +64,31 @@ class QueryCtx { } folly::Executor* executor() const { - if (executor_ != nullptr) { - return executor_; - } - auto executor = executorKeepalive_.get(); - VELOX_CHECK(executor, "Executor was not supplied."); - return executor; + return executor_; + } + + bool isExecutorSupplied() const { + return executor_ != nullptr; } const QueryConfig& queryConfig() const { return queryConfig_; } - Config* getConnectorConfig(const std::string& connectorId) const { - auto it = connectorConfigs_.find(connectorId); - if (it == connectorConfigs_.end()) { + config::ConfigBase* connectorSessionProperties( + const std::string& connectorId) const { + auto it = connectorSessionProperties_.find(connectorId); + if (it == connectorSessionProperties_.end()) { return getEmptyConfig(); } return it->second.get(); } + const std::unordered_map<std::string, std::shared_ptr<config::ConfigBase>>& + connectorSessionProperties() const { + return connectorSessionProperties_; + } + /// Overrides the previous configuration. Note that this function is NOT /// thread-safe and should probably only be used in tests. void testingOverrideConfigUnsafe( @@ -109,48 +98,129 @@ class QueryCtx { // Overrides the previous connector-specific configuration. Note that this // function is NOT thread-safe and should probably only be used in tests. - void setConnectorConfigOverridesUnsafe( + void setConnectorSessionOverridesUnsafe( const std::string& connectorId, std::unordered_map<std::string, std::string>&& configOverrides) { - connectorConfigs_[connectorId] = - std::make_shared<MemConfig>(std::move(configOverrides)); + connectorSessionProperties_[connectorId] = + std::make_shared<config::ConfigBase>(std::move(configOverrides)); } folly::Executor* spillExecutor() const { - return spillExecutor_.get(); + return spillExecutor_; } const std::string& queryId() const { return queryId_; } + /// Checks if the associated query is under memory arbitration or not. The + /// function returns true if it is, and sets a future which is fulfilled when + /// the memory arbitration finishes. + bool checkUnderArbitration(ContinueFuture* future); + + /// Updates the aggregated spill bytes of this query, and throws if the total + /// exceeds the max spill bytes limit. + void updateSpilledBytesAndCheckLimit(uint64_t bytes); + void testingOverrideMemoryPool(std::shared_ptr<memory::MemoryPool> pool) { pool_ = std::move(pool); } + /// Indicates if the query is under memory arbitration or not. + bool testingUnderArbitration() const { + std::lock_guard<std::mutex> l(mutex_); + VELOX_CHECK(underArbitration_ || arbitrationPromises_.empty()); + return underArbitration_; + } + private: - static Config* getEmptyConfig() { - static const std::unique_ptr<Config> kEmptyConfig = - std::make_unique<MemConfig>(); + /// QueryCtx is used in different places. When used with `Task::start()`, it's + /// required that the caller supplies the executor and ensure its lifetime + /// outlives the tasks that use it. In contrast, when used in expression + /// evaluation through `ExecCtx` or 'Task::next()' for single thread execution + /// mode, executor is not needed. Hence, we don't require executor to always + /// be passed in here, but instead, ensure that executor exists when actually + /// being used.
+ QueryCtx( + folly::Executor* executor = nullptr, + QueryConfig&& queryConfig = QueryConfig{{}}, + std::unordered_map> + connectorConfigs = {}, + cache::AsyncDataCache* cache = cache::AsyncDataCache::getInstance(), + std::shared_ptr pool = nullptr, + folly::Executor* spillExecutor = nullptr, + const std::string& queryId = ""); + + class MemoryReclaimer : public memory::MemoryReclaimer { + public: + static std::unique_ptr create( + QueryCtx* queryCtx, + memory::MemoryPool* pool); + + uint64_t reclaim( + memory::MemoryPool* pool, + uint64_t targetBytes, + uint64_t maxWaitMs, + memory::MemoryReclaimer::Stats& stats) override; + + protected: + MemoryReclaimer( + const std::shared_ptr& queryCtx, + memory::MemoryPool* pool) + : queryCtx_(queryCtx), pool_(pool) { + VELOX_CHECK_NOT_NULL(pool_); + } + + // Gets the shared pointer to the associated query ctx to ensure its + // liveness during the query memory reclaim operation. + // + // NOTE: an operator's memory pool can outlive its operator. + std::shared_ptr ensureQueryCtx() const { + return queryCtx_.lock(); + } + + const std::weak_ptr queryCtx_; + memory::MemoryPool* const pool_; + }; + + static config::ConfigBase* getEmptyConfig() { + static const std::unique_ptr kEmptyConfig = + std::make_unique( + std::unordered_map()); return kEmptyConfig.get(); } void initPool(const std::string& queryId) { if (pool_ == nullptr) { - pool_ = memory::defaultMemoryManager().addRootPool( - QueryCtx::generatePoolName(queryId)); + pool_ = memory::memoryManager()->addRootPool( + QueryCtx::generatePoolName(queryId), memory::kMaxMemory); } } + // Setup the memory reclaimer for arbitration if user provided memory pool + // hasn't set it. + void maybeSetReclaimer(); + + // Invoked to start memory arbitration on this query. + void startArbitration(); + // Invoked to stop memory arbitration on this query. + void finishArbitration(); + const std::string queryId_; + folly::Executor* const executor_{nullptr}; + folly::Executor* const spillExecutor_{nullptr}; + cache::AsyncDataCache* const cache_; - std::unordered_map> connectorConfigs_; - cache::AsyncDataCache* cache_; + std::unordered_map> + connectorSessionProperties_; std::shared_ptr pool_; - folly::Executor* executor_; - folly::Executor::KeepAlive<> executorKeepalive_; QueryConfig queryConfig_; - std::shared_ptr spillExecutor_; + std::atomic numSpilledBytes_{0}; + + mutable std::mutex mutex_; + // Indicates if this query is under memory arbitration or not. + bool underArbitration_{false}; + std::vector arbitrationPromises_; }; // Represents the state of one thread of query execution. @@ -159,12 +229,53 @@ class ExecCtx { ExecCtx(memory::MemoryPool* pool, QueryCtx* queryCtx) : pool_(pool), queryCtx_(queryCtx), - exprEvalCacheEnabled_( - !queryCtx || - queryCtx->queryConfig().isExpressionEvaluationCacheEnabled()), + optimizationParams_(queryCtx), vectorPool_( - exprEvalCacheEnabled_ ? std::make_unique(pool) - : nullptr) {} + optimizationParams_.exprEvalCacheEnabled + ? std::make_unique(pool) + : nullptr) {} + + struct OptimizationParams { + explicit OptimizationParams(QueryCtx* queryCtx) { + const core::QueryConfig defaultQueryConfig = core::QueryConfig({}); + + const core::QueryConfig& queryConfig = + queryCtx ? 
queryCtx->queryConfig() : defaultQueryConfig; + + exprEvalCacheEnabled = queryConfig.isExpressionEvaluationCacheEnabled(); + dictionaryMemoizationEnabled = + !queryConfig.debugDisableExpressionsWithMemoization() && + exprEvalCacheEnabled; + peelingEnabled = !queryConfig.debugDisableExpressionsWithPeeling(); + sharedSubExpressionReuseEnabled = + !queryConfig.debugDisableCommonSubExpressions(); + deferredLazyLoadingEnabled = + !queryConfig.debugDisableExpressionsWithLazyInputs(); + maxSharedSubexprResultsCached = + queryConfig.maxSharedSubexprResultsCached(); + } + + /// True if caches in expression evaluation used for performance are + /// enabled, including VectorPool, DecodedVectorPool, SelectivityVectorPool + /// and dictionary memoization. + bool exprEvalCacheEnabled; + /// True if the dictionary memoization optimization is enabled during + /// expression evaluation, which allows the reuse of results between + /// consecutive input batches if they are dictionary encoded and have the + /// same alphabet (underlying flat vector). + bool dictionaryMemoizationEnabled; + /// True if peeling is enabled during expression evaluation. + bool peelingEnabled; + /// True if shared subexpression reuse is enabled during expression + /// evaluation. + bool sharedSubExpressionReuseEnabled; + /// True if loading of lazy inputs is deferred until they need to be + /// accessed during expression evaluation. + bool deferredLazyLoadingEnabled; + /// The maximum number of distinct inputs to cache results for in a + /// given shared subexpression during expression evaluation. + uint32_t maxSharedSubexprResultsCached; + }; velox::memory::MemoryPool* pool() const { return pool_; @@ -181,7 +292,9 @@ class ExecCtx { /// Prefer using LocalSelectivityVector which takes care of returning the /// vector to the pool on destruction. std::unique_ptr<SelectivityVector> getSelectivityVector(int32_t size) { - VELOX_CHECK(exprEvalCacheEnabled_ || selectivityVectorPool_.empty()); + VELOX_CHECK( + optimizationParams_.exprEvalCacheEnabled || + selectivityVectorPool_.empty()); if (selectivityVectorPool_.empty()) { return std::make_unique<SelectivityVector>(size); } @@ -195,7 +308,9 @@ class ExecCtx { // content. The caller is responsible for setting the size and // assigning the contents. std::unique_ptr<SelectivityVector> getSelectivityVector() { - VELOX_CHECK(exprEvalCacheEnabled_ || selectivityVectorPool_.empty()); + VELOX_CHECK( + optimizationParams_.exprEvalCacheEnabled || + selectivityVectorPool_.empty()); if (selectivityVectorPool_.empty()) { return std::make_unique<SelectivityVector>(); } @@ -206,7 +321,7 @@ class ExecCtx { // Returns true if the vector was moved into the pool.
bool releaseDecodedVector(std::unique_ptr<DecodedVector>&& vector) { - if (exprEvalCacheEnabled_) { + if (optimizationParams_.exprEvalCacheEnabled) { decodedVectorPool_.push_back(std::move(vector)); return true; } @@ -264,8 +380,8 @@ class ExecCtx { return 0; } - bool exprEvalCacheEnabled() const { - return exprEvalCacheEnabled_; + const OptimizationParams& optimizationParams() const { + return optimizationParams_; } private: @@ -273,8 +389,9 @@ memory::MemoryPool* const pool_; QueryCtx* const queryCtx_; - const bool exprEvalCacheEnabled_; - // A pool of preallocated DecodedVectors for use by expressions and operators. + const OptimizationParams optimizationParams_; + // A pool of preallocated DecodedVectors for use by expressions and + // operators. std::vector<std::unique_ptr<DecodedVector>> decodedVectorPool_; // A pool of preallocated SelectivityVectors for use by expressions // and operators. diff --git a/velox/core/SimpleFunctionMetadata.h b/velox/core/SimpleFunctionMetadata.h index 73d1978facefa..0d9cc0f8b4bcd 100644 --- a/velox/core/SimpleFunctionMetadata.h +++ b/velox/core/SimpleFunctionMetadata.h @@ -17,13 +17,16 @@ #include #include +#include #include "velox/common/base/Exceptions.h" +#include "velox/common/base/Status.h" #include "velox/core/CoreTypeSystem.h" #include "velox/core/Metaprogramming.h" #include "velox/core/QueryConfig.h" #include "velox/expression/FunctionSignature.h" #include "velox/expression/SignatureBinder.h" +#include "velox/type/SimpleFunctionApi.h" #include "velox/type/Type.h" #include "velox/type/Variant.h" @@ -65,6 +68,20 @@ struct udf_help> { } }; +// Canonical name of the function. +template <typename T, typename = void> +struct udf_canonical_name { + static constexpr exec::FunctionCanonicalName value = + exec::FunctionCanonicalName::kUnknown; +}; + +template <typename T> +struct udf_canonical_name< + T, + util::detail::void_t<decltype(T::canonical_name)>> { + static constexpr exec::FunctionCanonicalName value = T::canonical_name; +}; + // Has the value true, unless a Variadic Type appears anywhere but at the end // of the parameters. template @@ -153,11 +170,26 @@ struct TypeAnalysisResults { } } stats; - // String representaion of the type in the FunctionSignatureBuilder. + void addVariable(exec::SignatureVariable&& variable) { + if (!variablesInformation.count(variable.name())) { + variablesInformation.emplace(variable.name(), variable); + } else { + VELOX_CHECK( + variable == variablesInformation.at(variable.name()), + "Can't assign different properties to the same variable {}", + variable.name()); + } + } + + /// String representation of the type in the FunctionSignatureBuilder. std::ostringstream out; - // Set of generic variables used in the type. - std::set<std::string> variables; + /// Physical type, e.g. BIGINT() for Date and ARRAY(BIGINT()) for + /// Array<Date>. UNKNOWN() if type is generic or opaque. + TypePtr physicalType; + + /// Set of generic variables used in the type.
+ std::map variablesInformation; std::string typeAsString() { return out.str(); @@ -193,20 +225,66 @@ struct TypeAnalysis { results.stats.concreteCount++; results.out << detail::strToLowerCopy( std::string(SimpleTypeTrait::name)); + if constexpr ( + SimpleTypeTrait::typeKind == TypeKind::OPAQUE || + SimpleTypeTrait::typeKind == TypeKind::UNKNOWN) { + results.physicalType = UNKNOWN(); + } else { + results.physicalType = createScalarType(SimpleTypeTrait::typeKind); + } } }; -template -struct TypeAnalysis> { +template +struct TypeAnalysis> { void run(TypeAnalysisResults& results) { if constexpr (std::is_same_v) { results.out << "any"; } else { - auto variableType = fmt::format("__user_T{}", T::getId()); - results.out << variableType; - results.variables.insert(variableType); + auto typeVariableName = fmt::format("__user_T{}", T::getId()); + results.out << typeVariableName; + results.addVariable(exec::SignatureVariable( + typeVariableName, + std::nullopt, + exec::ParameterType::kTypeParameter, + false, + orderable, + comparable)); } results.stats.hasGeneric = true; + results.physicalType = UNKNOWN(); + } +}; + +template +struct TypeAnalysis> { + void run(TypeAnalysisResults& results) { + results.stats.concreteCount++; + + const auto p = P::name(); + const auto s = S::name(); + results.out << fmt::format("decimal({},{})", p, s); + results.addVariable(exec::SignatureVariable( + p, std::nullopt, exec::ParameterType::kIntegerParameter)); + results.addVariable(exec::SignatureVariable( + s, std::nullopt, exec::ParameterType::kIntegerParameter)); + results.physicalType = BIGINT(); + } +}; + +template +struct TypeAnalysis> { + void run(TypeAnalysisResults& results) { + results.stats.concreteCount++; + + const auto p = P::name(); + const auto s = S::name(); + results.out << fmt::format("decimal({},{})", p, s); + results.addVariable(exec::SignatureVariable( + p, std::nullopt, exec::ParameterType::kIntegerParameter)); + results.addVariable(exec::SignatureVariable( + s, std::nullopt, exec::ParameterType::kIntegerParameter)); + results.physicalType = HUGEINT(); } }; @@ -216,9 +294,12 @@ struct TypeAnalysis> { results.stats.concreteCount++; results.out << "map("; TypeAnalysis().run(results); + auto keyType = results.physicalType; results.out << ", "; TypeAnalysis().run(results); + auto valueType = results.physicalType; results.out << ")"; + results.physicalType = MAP(keyType, valueType); } }; @@ -237,8 +318,11 @@ struct TypeAnalysis> { tmp.stats.hasGeneric || results.stats.hasVariadicOfGeneric; results.stats.concreteCount += tmp.stats.concreteCount; - results.variables.insert(tmp.variables.begin(), tmp.variables.end()); + for (auto& [_, variable] : tmp.variablesInformation) { + results.addVariable(std::move(variable)); + } results.out << tmp.typeAsString(); + results.physicalType = tmp.physicalType; } }; @@ -249,6 +333,7 @@ struct TypeAnalysis> { results.out << "array("; TypeAnalysis().run(results); results.out << ")"; + results.physicalType = ARRAY(results.physicalType); } }; @@ -262,6 +347,7 @@ struct TypeAnalysis> { void run(TypeAnalysisResults& results) { results.stats.concreteCount++; results.out << "row("; + std::vector fieldTypes; // This expression applies the lambda for each row child type. 
bool first = true; ( @@ -271,9 +357,11 @@ struct TypeAnalysis> { } first = false; TypeAnalysis().run(results); + fieldTypes.push_back(results.physicalType); }(), ...); results.out << ")"; + results.physicalType = ROW(std::move(fieldTypes)); } }; @@ -282,20 +370,30 @@ struct TypeAnalysis> { void run(TypeAnalysisResults& results) { results.stats.concreteCount++; results.out << T::typeName; + + TypeAnalysisResults tmp; + TypeAnalysis().run(tmp); + results.physicalType = tmp.physicalType; } }; class ISimpleFunctionMetadata { public: + virtual ~ISimpleFunctionMetadata() = default; + // Return the return type of the function if its independent on the input // types, otherwise return null. virtual TypePtr tryResolveReturnType() const = 0; virtual std::string getName() const = 0; virtual bool isDeterministic() const = 0; + virtual bool defaultNullBehavior() const = 0; virtual uint32_t priority() const = 0; virtual const std::shared_ptr signature() const = 0; + virtual const TypePtr& resultPhysicalType() const = 0; + virtual const std::vector& argPhysicalTypes() const = 0; + virtual bool physicalSignatureEquals( + const ISimpleFunctionMetadata& other) const = 0; virtual std::string helpMessage(const std::string& name) const = 0; - virtual ~ISimpleFunctionMetadata() = default; }; template @@ -304,7 +402,11 @@ struct udf_has_name : std::false_type {}; template struct udf_has_name : std::true_type {}; -template +template < + typename Fun, + typename TReturn, + typename ConstantChecker, + typename... Args> class SimpleFunctionMetadata : public ISimpleFunctionMetadata { public: using return_type = TReturn; @@ -356,6 +458,10 @@ class SimpleFunctionMetadata : public ISimpleFunctionMetadata { return udf_is_deterministic(); } + bool defaultNullBehavior() const final { + return defaultNullBehavior_; + } + static constexpr bool isVariadic() { if constexpr (num_args == 0) { return false; @@ -364,10 +470,16 @@ class SimpleFunctionMetadata : public ISimpleFunctionMetadata { } } - explicit SimpleFunctionMetadata() { - auto analysis = analyzeSignatureTypes(); + explicit SimpleFunctionMetadata( + bool defaultNullBehavior, + const std::vector& constraints) + : defaultNullBehavior_{defaultNullBehavior} { + auto analysis = analyzeSignatureTypes(constraints); + buildSignature(analysis); priority_ = analysis.stats.computePriority(); + resultPhysicalType_ = analysis.resultPhysicalType; + argPhysicalTypes_ = analysis.argPhysicalTypes; } ~SimpleFunctionMetadata() override = default; @@ -376,6 +488,33 @@ class SimpleFunctionMetadata : public ISimpleFunctionMetadata { return signature_; } + const TypePtr& resultPhysicalType() const override { + return resultPhysicalType_; + } + + const std::vector& argPhysicalTypes() const override { + return argPhysicalTypes_; + } + + bool physicalSignatureEquals( + const ISimpleFunctionMetadata& other) const override { + if (!resultPhysicalType_->kindEquals(other.resultPhysicalType())) { + return false; + } + + if (argPhysicalTypes_.size() != other.argPhysicalTypes().size()) { + return false; + } + + for (auto i = 0; i < argPhysicalTypes_.size(); ++i) { + if (!argPhysicalTypes_[i]->kindEquals(other.argPhysicalTypes()[i])) { + return false; + } + } + + return true; + } + std::string helpMessage(const std::string& name) const final { // return fmt::format("{}({})", name, signature_->toString()); std::string s{name}; @@ -401,16 +540,21 @@ class SimpleFunctionMetadata : public ISimpleFunctionMetadata { struct SignatureTypesAnalysisResults { std::vector argsTypes; std::string outputType; - 
std::set variables; + std::map variables; TypeAnalysisResults::Stats stats; + TypePtr resultPhysicalType; + std::vector argPhysicalTypes; }; - SignatureTypesAnalysisResults analyzeSignatureTypes() { + SignatureTypesAnalysisResults analyzeSignatureTypes( + const std::vector& constraints) { std::vector argsTypes; TypeAnalysisResults results; TypeAnalysis().run(results); std::string outputType = results.typeAsString(); + const auto resultPhysicalType = results.physicalType; + std::vector argPhysicalTypes; ( [&]() { @@ -419,27 +563,44 @@ class SimpleFunctionMetadata : public ISimpleFunctionMetadata { results.resetTypeString(); TypeAnalysis().run(results); argsTypes.push_back(results.typeAsString()); + argPhysicalTypes.push_back(results.physicalType); }(), ...); + for (const auto& constraint : constraints) { + VELOX_CHECK( + !constraint.constraint().empty(), + "Constraint must be set for variable {}", + constraint.name()); + + results.variablesInformation.erase(constraint.name()); + results.variablesInformation.emplace(constraint.name(), constraint); + } + return SignatureTypesAnalysisResults{ std::move(argsTypes), std::move(outputType), - std::move(results.variables), - std::move(results.stats)}; + std::move(results.variablesInformation), + std::move(results.stats), + resultPhysicalType, + argPhysicalTypes}; } void buildSignature(const SignatureTypesAnalysisResults& analysis) { auto builder = exec::FunctionSignatureBuilder(); builder.returnType(analysis.outputType); - + int32_t position = 0; for (const auto& arg : analysis.argsTypes) { - builder.argumentType(arg); + if (ConstantChecker::isConstant[position++]) { + builder.constantArgumentType(arg); + } else { + builder.argumentType(arg); + } } - for (const auto& variable : analysis.variables) { - builder.typeVariable(variable); + for (const auto& [_, variable] : analysis.variables) { + builder.variable(variable); } if (isVariadic()) { @@ -448,20 +609,34 @@ class SimpleFunctionMetadata : public ISimpleFunctionMetadata { signature_ = builder.build(); } + const bool defaultNullBehavior_; exec::FunctionSignaturePtr signature_; uint32_t priority_; + TypePtr resultPhysicalType_; + std::vector argPhysicalTypes_; }; // wraps a UDF object to provide the inheritance // this is basically just boilerplate-avoidance -template -class UDFHolder final - : public core::SimpleFunctionMetadata { +template < + typename Fun, + typename Exec, + typename TReturn, + typename ConstantChecker, + typename... TArgs> +class UDFHolder { Fun instance_; public: + using return_type = TReturn; + using arg_types = std::tuple; + template + using type_at = typename std::tuple_element::type; + static constexpr int num_args = std::tuple_size::value; + using udf_struct_t = Fun; - using Metadata = core::SimpleFunctionMetadata; + using Metadata = + core::SimpleFunctionMetadata; template using exec_resolver = typename Exec::template resolver; @@ -491,15 +666,19 @@ class UDFHolder final DECLARE_METHOD_RESOLVER(callAscii_method_resolver, callAscii); DECLARE_METHOD_RESOLVER(initialize_method_resolver, initialize); - // Check which flavor of the call() method is provided by the UDF object. UDFs - // are required to provide at least one of the following methods: + // Check which flavor of the call()/callNullable()/callNullFree() method is + // provided by the UDF object. UDFs are required to provide at least one of + // the following methods: // - // - bool|void call(...) - // - bool|void callNullable(...) - // - bool|void callNullFree(...) + // - bool|void|Status call(...) 
+ // - bool|void|Status callNullable(...) + // - bool|void|Status callNullFree(...) // - // Each of these methods can return either bool or void. Returning void means - // that the UDF is assumed never to return null values. + // Each of these methods can return bool, void or Status. Returning void + // means that the UDF is assumed never to return null values. Returning + // Status conveys the success or error outcome of the function call, and it + // implies that the result is not null. If you need to return null as the + // result, please use the bool return type. // // Optionally, UDFs can also provide the following methods: // @@ -513,17 +692,29 @@ class UDFHolder final bool, exec_return_type, const exec_arg_type&...>::value; + static constexpr bool udf_has_call_return_void = util::has_method< Fun, call_method_resolver, void, exec_return_type, const exec_arg_type&...>::value; - static constexpr bool udf_has_call = - udf_has_call_return_bool | udf_has_call_return_void; + + static constexpr bool udf_has_call_return_status = util::has_method< + Fun, + call_method_resolver, + Status, + exec_return_type, + const exec_arg_type&...>::value; + + static constexpr bool udf_has_call = udf_has_call_return_bool | + udf_has_call_return_void | udf_has_call_return_status; + static_assert( - !(udf_has_call_return_bool && udf_has_call_return_void), - "Provided call() methods need to return either void OR bool."); + udf_has_call_return_void + udf_has_call_return_bool + + udf_has_call_return_status <= + 1, + "Provided call() methods need to only return void, bool OR Status."); // callNullable(): static constexpr bool udf_has_callNullable_return_bool = util::has_method< @@ -538,11 +729,21 @@ void, exec_return_type, const exec_arg_type*...>::value; + static constexpr bool udf_has_callNullable_return_status = util::has_method< + Fun, + callNullable_method_resolver, + Status, + exec_return_type, + const exec_arg_type*...>::value; static constexpr bool udf_has_callNullable = - udf_has_callNullable_return_bool | udf_has_callNullable_return_void; + udf_has_callNullable_return_bool | udf_has_callNullable_return_void | + udf_has_callNullable_return_status; + static_assert( - !(udf_has_callNullable_return_bool && udf_has_callNullable_return_void), - "Provided callNullable() methods need to return either void OR bool."); + udf_has_callNullable_return_void + udf_has_callNullable_return_bool + + udf_has_callNullable_return_status <= + 1, + "Provided callNullable() methods need to only return void, bool OR Status."); // callNullFree(): static constexpr bool udf_has_callNullFree_return_bool = util::has_method< @@ -557,11 +758,21 @@ void, exec_return_type, const exec_no_nulls_arg_type&...>::value; + static constexpr bool udf_has_callNullFree_return_status = util::has_method< + Fun, + callNullFree_method_resolver, + Status, + exec_return_type, + const exec_no_nulls_arg_type&...>::value; static constexpr bool udf_has_callNullFree = - udf_has_callNullFree_return_bool | udf_has_callNullFree_return_void; + udf_has_callNullFree_return_bool | udf_has_callNullFree_return_void | + udf_has_callNullFree_return_status; + static_assert( - !(udf_has_callNullFree_return_bool && udf_has_callNullFree_return_void), - "Provided callNullFree() methods need to return either void OR bool."); + udf_has_callNullFree_return_void + udf_has_callNullFree_return_bool + + udf_has_callNullFree_return_status <= + 1, + "Provided callNullFree() methods need to only return void, bool OR Status."); // callAscii(): static constexpr
bool udf_has_callAscii_return_bool = util::has_method< @@ -593,9 +804,22 @@ class UDFHolder final Fun, initialize_method_resolver, void, + const std::vector&, const core::QueryConfig&, const exec_arg_type*...>::value; + // TODO Remove + static constexpr bool udf_has_legacy_initialize = util::has_method< + Fun, + initialize_method_resolver, + void, + const core::QueryConfig&, + const exec_arg_type*...>::value; + + static_assert( + !udf_has_legacy_initialize, + "Legacy initialize method! Upgrade."); + static_assert( udf_has_call || udf_has_callNullable || udf_has_callNullFree, "UDF must implement at least one of `call`, `callNullable`, or `callNullFree` functions.\n" @@ -649,23 +873,41 @@ class UDFHolder final template using exec_type_at = typename std::tuple_element::type; - explicit UDFHolder() : Metadata(), instance_{} {} + explicit UDFHolder() : instance_{} {} + + exec::FunctionCanonicalName getCanonicalName() const { + return udf_canonical_name::value; + } + + bool isDeterministic() const { + return udf_is_deterministic(); + } + + static constexpr bool isVariadic() { + if constexpr (num_args == 0) { + return false; + } else { + return isVariadicType>::value; + } + } FOLLY_ALWAYS_INLINE void initialize( + const std::vector& inputTypes, const core::QueryConfig& config, const typename exec_resolver::in_type*... constantArgs) { if constexpr (udf_has_initialize) { - return instance_.initialize(config, constantArgs...); + return instance_.initialize(inputTypes, config, constantArgs...); } } - FOLLY_ALWAYS_INLINE bool call( + FOLLY_ALWAYS_INLINE Status call( exec_return_type& out, + bool& notNull, const typename exec_resolver::in_type&... args) { if constexpr (udf_has_call) { - return callImpl(out, args...); + return callImpl(out, notNull, args...); } else if constexpr (udf_has_callNullable) { - return callNullableImpl(out, (&args)...); + return callNullableImpl(out, notNull, (&args)...); } else { VELOX_UNREACHABLE( "call should never be called if the UDF does not " @@ -673,18 +915,20 @@ class UDFHolder final } } - FOLLY_ALWAYS_INLINE bool callNullable( + FOLLY_ALWAYS_INLINE Status callNullable( exec_return_type& out, + bool& notNull, const typename exec_resolver::in_type*... args) { if constexpr (udf_has_callNullable) { - return callNullableImpl(out, args...); + return callNullableImpl(out, notNull, args...); } else if constexpr (udf_has_call) { // Default null behavior. const bool isAllSet = (args && ...); if (LIKELY(isAllSet)) { - return callImpl(out, (*args)...); + return callImpl(out, notNull, (*args)...); } else { - return false; + notNull = false; + return Status::OK(); } } else { VELOX_UNREACHABLE( @@ -693,21 +937,23 @@ class UDFHolder final } } - FOLLY_ALWAYS_INLINE bool callAscii( + FOLLY_ALWAYS_INLINE Status callAscii( exec_return_type& out, + bool& notNull, const typename exec_resolver::in_type&... args) { if constexpr (udf_has_callAscii) { - return callAsciiImpl(out, args...); + return callAsciiImpl(out, notNull, args...); } else { - return call(out, args...); + return call(out, notNull, args...); } } - FOLLY_ALWAYS_INLINE bool callNullFree( + FOLLY_ALWAYS_INLINE Status callNullFree( exec_return_type& out, + bool& notNull, const exec_no_nulls_arg_type&... 
args) { if constexpr (udf_has_callNullFree) { - return callNullFreeImpl(out, args...); + return callNullFreeImpl(out, notNull, args...); } else { VELOX_UNREACHABLE( "callNullFree should never be called if the UDF does not implement callNullFree."); @@ -716,52 +962,73 @@ class UDFHolder final // Helper functions to handle void vs bool return type. - FOLLY_ALWAYS_INLINE bool callImpl( + FOLLY_ALWAYS_INLINE Status callImpl( typename Exec::template resolver::out_type& out, + bool& notNull, const typename Exec::template resolver::in_type&... args) { static_assert(udf_has_call); - if constexpr (udf_has_call_return_bool) { + + if constexpr (udf_has_call_return_status) { + notNull = true; return instance_.call(out, args...); + } else if constexpr (udf_has_call_return_bool) { + notNull = instance_.call(out, args...); + return Status::OK(); } else { instance_.call(out, args...); - return true; + notNull = true; + return Status::OK(); } } - FOLLY_ALWAYS_INLINE bool callNullableImpl( + FOLLY_ALWAYS_INLINE Status callNullableImpl( exec_return_type& out, + bool& notNull, const typename Exec::template resolver::in_type*... args) { static_assert(udf_has_callNullable); - if constexpr (udf_has_callNullable_return_bool) { + + if constexpr (udf_has_callNullable_return_status) { + notNull = true; return instance_.callNullable(out, args...); + } else if constexpr (udf_has_callNullable_return_bool) { + notNull = instance_.callNullable(out, args...); + return Status::OK(); } else { instance_.callNullable(out, args...); - return true; + notNull = true; + return Status::OK(); } } - FOLLY_ALWAYS_INLINE bool callAsciiImpl( + FOLLY_ALWAYS_INLINE Status callAsciiImpl( typename Exec::template resolver::out_type& out, + bool& notNull, const typename Exec::template resolver::in_type&... args) { static_assert(udf_has_callAscii); if constexpr (udf_has_callAscii_return_bool) { - return instance_.callAscii(out, args...); + notNull = instance_.callAscii(out, args...); } else { instance_.callAscii(out, args...); - return true; + notNull = true; } + return Status::OK(); } - FOLLY_ALWAYS_INLINE bool callNullFreeImpl( + FOLLY_ALWAYS_INLINE Status callNullFreeImpl( typename Exec::template resolver::out_type& out, + bool& notNull, const exec_no_nulls_arg_type&... 
args) { static_assert(udf_has_callNullFree); - if constexpr (udf_has_callNullFree_return_bool) { + if constexpr (udf_has_callNullFree_return_status) { + notNull = true; return instance_.callNullFree(out, args...); + } else if constexpr (udf_has_callNullFree_return_bool) { + notNull = instance_.callNullFree(out, args...); } else { instance_.callNullFree(out, args...); - return true; + notNull = true; } + return Status::OK(); } }; diff --git a/velox/core/tests/CMakeLists.txt b/velox/core/tests/CMakeLists.txt index 5065c7f2705e4..6092bb4b98f63 100644 --- a/velox/core/tests/CMakeLists.txt +++ b/velox/core/tests/CMakeLists.txt @@ -26,11 +26,12 @@ add_test(velox_core_test velox_core_test) target_link_libraries( velox_core_test - PRIVATE velox_core - velox_exception - velox_exec_test_lib - velox_presto_types - velox_type - velox_vector_test_lib - gtest - gtest_main) + PRIVATE + velox_core + velox_exception + velox_exec_test_lib + velox_presto_types + velox_type + velox_vector_test_lib + GTest::gtest + GTest::gtest_main) diff --git a/velox/core/tests/ConstantTypedExprTest.cpp b/velox/core/tests/ConstantTypedExprTest.cpp index b5ad1dad19d4e..cba32ed7da05b 100644 --- a/velox/core/tests/ConstantTypedExprTest.cpp +++ b/velox/core/tests/ConstantTypedExprTest.cpp @@ -47,12 +47,8 @@ TEST(ConstantTypedExprTest, null) { EXPECT_FALSE(*makeNull(HYPERLOGLOG()) == *makeNull(VARBINARY())); EXPECT_FALSE(*makeNull(VARBINARY()) == *makeNull(HYPERLOGLOG())); - EXPECT_FALSE( - *makeNull(TIMESTAMP_WITH_TIME_ZONE()) == - *makeNull(ROW({BIGINT(), SMALLINT()}))); - EXPECT_FALSE( - *makeNull(ROW({BIGINT(), SMALLINT()})) == - *makeNull(TIMESTAMP_WITH_TIME_ZONE())); + EXPECT_FALSE(*makeNull(TIMESTAMP_WITH_TIME_ZONE()) == *makeNull(BIGINT())); + EXPECT_FALSE(*makeNull(BIGINT()) == *makeNull(TIMESTAMP_WITH_TIME_ZONE())); EXPECT_TRUE(*makeNull(DOUBLE()) == *makeNull(DOUBLE())); EXPECT_TRUE(*makeNull(ARRAY(DOUBLE())) == *makeNull(ARRAY(DOUBLE()))); diff --git a/velox/core/tests/PlanFragmentTest.cpp b/velox/core/tests/PlanFragmentTest.cpp index ca48ea07e0982..5116970d695c3 100644 --- a/velox/core/tests/PlanFragmentTest.cpp +++ b/velox/core/tests/PlanFragmentTest.cpp @@ -27,6 +27,10 @@ using facebook::velox::exec::test::PlanBuilder; namespace { class PlanFragmentTest : public testing::Test { protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + void SetUp() override { rowType_ = ROW({"c0", "c1", "c2"}, {BIGINT(), BIGINT(), BIGINT()}); rowTypeWithProjection_ = ROW( @@ -63,7 +67,7 @@ class PlanFragmentTest : public testing::Test { {QueryConfig::kOrderBySpillEnabled, orderBySpillEnabled ? 
"true" : "false"}, }); - return std::make_shared(nullptr, std::move(configData)); + return QueryCtx::create(nullptr, QueryConfig{std::move(configData)}); } RowTypePtr rowType_; @@ -74,9 +78,10 @@ class PlanFragmentTest : public testing::Test { RowTypePtr probeTypeWithProjection_; std::vector emptyProbeVectors_; std::shared_ptr probeValueNode_; - std::shared_ptr pool_{memory::addDefaultLeafMemoryPool()}; + std::shared_ptr pool_{ + memory::memoryManager()->addLeafPool()}; }; -}; // namespace +} // namespace TEST_F(PlanFragmentTest, orderByCanSpill) { struct { @@ -100,7 +105,8 @@ TEST_F(PlanFragmentTest, orderByCanSpill) { for (const auto& testData : testSettings) { SCOPED_TRACE(testData.debugString()); - const std::vector sortingKeys{nullptr}; + const std::vector sortingKeys{ + std::make_shared(BIGINT(), "c0")}; const std::vector sortingOrders{{true, true}}; auto orderBy = std::make_shared( "orderBy", sortingKeys, sortingOrders, false, valueNode_); @@ -144,7 +150,7 @@ TEST_F(PlanFragmentTest, aggregationCanSpill) { std::string debugString() const { return fmt::format( "aggregationStep:{} isSpillEnabled:{} isAggregationSpillEnabled:{} isDistinct:{} hasPreAggregation:{} expectedCanSpill:{}", - aggregationStep, + AggregationNode::stepName(aggregationStep), isSpillEnabled, isAggregationSpillEnabled, isDistinct, @@ -154,7 +160,7 @@ TEST_F(PlanFragmentTest, aggregationCanSpill) { } testSettings[] = { {AggregationNode::Step::kSingle, false, true, false, false, false}, {AggregationNode::Step::kSingle, true, false, false, false, false}, - {AggregationNode::Step::kSingle, true, true, true, false, false}, + {AggregationNode::Step::kSingle, true, true, true, false, true}, {AggregationNode::Step::kSingle, true, true, false, true, false}, {AggregationNode::Step::kSingle, true, true, false, false, true}, {AggregationNode::Step::kIntermediate, false, true, false, false, false}, @@ -167,11 +173,11 @@ TEST_F(PlanFragmentTest, aggregationCanSpill) { {AggregationNode::Step::kPartial, true, true, true, false, false}, {AggregationNode::Step::kPartial, true, true, false, true, false}, {AggregationNode::Step::kPartial, true, true, false, false, false}, - {AggregationNode::Step::kSingle, false, true, false, false, false}, - {AggregationNode::Step::kSingle, true, false, false, false, false}, - {AggregationNode::Step::kSingle, true, true, true, false, false}, - {AggregationNode::Step::kSingle, true, true, false, true, false}, - {AggregationNode::Step::kSingle, true, true, false, false, true}}; + {AggregationNode::Step::kFinal, false, true, false, false, false}, + {AggregationNode::Step::kFinal, true, false, false, false, false}, + {AggregationNode::Step::kFinal, true, true, true, false, true}, + {AggregationNode::Step::kFinal, true, true, false, true, false}, + {AggregationNode::Step::kFinal, true, true, false, false, true}}; for (const auto& testData : testSettings) { SCOPED_TRACE(testData.debugString()); @@ -317,3 +323,14 @@ TEST_F(PlanFragmentTest, hashJoin) { testData.expectedCanSpill); } } + +TEST_F(PlanFragmentTest, executionStrategyToString) { + ASSERT_EQ( + executionStrategyToString(core::ExecutionStrategy::kUngrouped), + "UNGROUPED"); + ASSERT_EQ( + executionStrategyToString(core::ExecutionStrategy::kGrouped), "GROUPED"); + ASSERT_EQ( + executionStrategyToString(static_cast(999)), + "UNKNOWN: 999"); +} diff --git a/velox/core/tests/PlanNodeTest.cpp b/velox/core/tests/PlanNodeTest.cpp index 6acd586f10080..961d1dd86c221 100644 --- a/velox/core/tests/PlanNodeTest.cpp +++ b/velox/core/tests/PlanNodeTest.cpp @@ 
-15,6 +15,7 @@ */ #include +#include "velox/common/base/tests/GTestUtils.h" #include "velox/core/PlanNode.h" using namespace ::facebook::velox; @@ -97,3 +98,17 @@ TEST(TestPlanNode, sortOrder) { } } } + +TEST(TestPlanNode, duplicateSortKeys) { + auto sortingKeys = std::vector{ + std::make_shared(BIGINT(), "c0"), + std::make_shared(BIGINT(), "c1"), + std::make_shared(BIGINT(), "c0"), + }; + auto sortingOrders = + std::vector{{true, true}, {false, false}, {true, true}}; + VELOX_ASSERT_USER_THROW( + std::make_shared( + "orderBy", sortingKeys, sortingOrders, false, nullptr), + "Duplicate sorting keys are not allowed: c0"); +} diff --git a/velox/core/tests/QueryConfigTest.cpp b/velox/core/tests/QueryConfigTest.cpp index 27b938b6e0d7c..c89d44d2fdfad 100644 --- a/velox/core/tests/QueryConfigTest.cpp +++ b/velox/core/tests/QueryConfigTest.cpp @@ -21,74 +21,39 @@ namespace facebook::velox::core::test { -TEST(TestQueryConfig, emptyConfig) { - std::unordered_map configData; - auto queryCtx = std::make_shared(nullptr, std::move(configData)); +class QueryConfigTest : public testing::Test { + protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } +}; + +TEST_F(QueryConfigTest, emptyConfig) { + auto queryCtx = QueryCtx::create(nullptr, QueryConfig{{}}); const QueryConfig& config = queryCtx->queryConfig(); - ASSERT_FALSE(config.codegenEnabled()); - ASSERT_EQ(config.codegenConfigurationFilePath(), ""); - ASSERT_FALSE(config.isCastToIntByTruncate()); + ASSERT_FALSE(config.isLegacyCast()); } -TEST(TestQueryConfig, setConfig) { - std::string path = "/tmp/CodeGenConfig"; +TEST_F(QueryConfigTest, setConfig) { + std::string path = "/tmp/setConfig"; std::unordered_map configData( - {{QueryConfig::kCodegenEnabled, "true"}, - {QueryConfig::kCodegenConfigurationFilePath, path}}); - auto queryCtx = std::make_shared(nullptr, std::move(configData)); + {{QueryConfig::kLegacyCast, "true"}}); + auto queryCtx = QueryCtx::create(nullptr, QueryConfig{std::move(configData)}); const QueryConfig& config = queryCtx->queryConfig(); - ASSERT_TRUE(config.codegenEnabled()); - ASSERT_EQ(config.codegenConfigurationFilePath(), path); - ASSERT_FALSE(config.isCastToIntByTruncate()); + ASSERT_TRUE(config.isLegacyCast()); } -TEST(TestQueryConfig, memConfig) { - const std::string tz = "timezone1"; - const std::unordered_map configData( - {{QueryConfig::kCodegenEnabled, "true"}, - {QueryConfig::kSessionTimezone, tz}}); - - { - MemConfig cfg{configData}; - MemConfig cfg2{}; - auto configDataCopy = configData; - MemConfig cfg3{std::move(configDataCopy)}; - ASSERT_TRUE(cfg.Config::get(QueryConfig::kCodegenEnabled)); - ASSERT_TRUE(cfg3.Config::get(QueryConfig::kCodegenEnabled)); - ASSERT_EQ( - tz, - cfg.Config::get(QueryConfig::kSessionTimezone).value()); - ASSERT_FALSE(cfg.Config::get("missing-entry").has_value()); - ASSERT_EQ(configData, cfg.values()); - ASSERT_EQ(configData, cfg.valuesCopy()); - } - - { - MemConfigMutable cfg{configData}; - MemConfigMutable cfg2{}; - auto configDataCopy = configData; - MemConfigMutable cfg3{std::move(configDataCopy)}; - ASSERT_TRUE(cfg.Config::get(QueryConfig::kCodegenEnabled).value()); - ASSERT_TRUE(cfg3.Config::get(QueryConfig::kCodegenEnabled).value()); - ASSERT_EQ( - tz, - cfg.Config::get(QueryConfig::kSessionTimezone).value()); - ASSERT_FALSE(cfg.Config::get("missing-entry").has_value()); - ASSERT_NO_THROW(cfg.setValue(QueryConfig::kCodegenEnabled, "false")); - ASSERT_FALSE(cfg.Config::get(QueryConfig::kCodegenEnabled).value()); - const std::string tz2 = 
"timezone2"; - ASSERT_NO_THROW(cfg.setValue(QueryConfig::kSessionTimezone, tz2)); - ASSERT_EQ( - tz2, - cfg.Config::get(QueryConfig::kSessionTimezone).value()); - ASSERT_THROW(cfg.values(), VeloxException); - ASSERT_EQ(configData, cfg3.valuesCopy()); - } +TEST_F(QueryConfigTest, invalidConfig) { + std::unordered_map configData( + {{QueryConfig::kSessionTimezone, "invalid"}}); + VELOX_ASSERT_USER_THROW( + QueryCtx::create(nullptr, QueryConfig{std::move(configData)}), + "session 'session_timezone' set with invalid value 'invalid'"); } -TEST(TestQueryConfig, taskWriterCountConfig) { +TEST_F(QueryConfigTest, taskWriterCountConfig) { struct { std::optional numWriterCounter; std::optional numPartitionedWriterCounter; @@ -126,7 +91,8 @@ TEST(TestQueryConfig, taskWriterCountConfig) { QueryConfig::kTaskPartitionedWriterCount, std::to_string(testConfig.numPartitionedWriterCounter.value())); } - auto queryCtx = std::make_shared(nullptr, std::move(configData)); + auto queryCtx = + QueryCtx::create(nullptr, QueryConfig{std::move(configData)}); const QueryConfig& config = queryCtx->queryConfig(); ASSERT_EQ(config.taskWriterCount(), testConfig.expectedWriterCounter); ASSERT_EQ( @@ -135,29 +101,33 @@ TEST(TestQueryConfig, taskWriterCountConfig) { } } -TEST(TestQueryConfig, enableExpressionEvaluationCacheConfig) { - std::shared_ptr rootPool_{ - memory::defaultMemoryManager().addRootPool()}; - std::shared_ptr pool_{rootPool_->addLeafChild("leaf")}; +TEST_F(QueryConfigTest, enableExpressionEvaluationCacheConfig) { + std::shared_ptr rootPool{ + memory::memoryManager()->addRootPool()}; + std::shared_ptr pool{rootPool->addLeafChild("leaf")}; auto testConfig = [&](bool enableExpressionEvaluationCache) { std::unordered_map configData( {{core::QueryConfig::kEnableExpressionEvaluationCache, enableExpressionEvaluationCache ? "true" : "false"}}); auto queryCtx = - std::make_shared(nullptr, std::move(configData)); + core::QueryCtx::create(nullptr, QueryConfig{std::move(configData)}); const core::QueryConfig& config = queryCtx->queryConfig(); ASSERT_EQ( config.isExpressionEvaluationCacheEnabled(), enableExpressionEvaluationCache); - auto execCtx = std::make_shared(pool_.get(), queryCtx.get()); - ASSERT_EQ(execCtx->exprEvalCacheEnabled(), enableExpressionEvaluationCache); + auto execCtx = std::make_shared(pool.get(), queryCtx.get()); + ASSERT_EQ( + execCtx->optimizationParams().exprEvalCacheEnabled, + enableExpressionEvaluationCache); ASSERT_EQ( execCtx->vectorPool() != nullptr, enableExpressionEvaluationCache); auto evalCtx = std::make_shared(execCtx.get()); - ASSERT_EQ(evalCtx->cacheEnabled(), enableExpressionEvaluationCache); + ASSERT_EQ( + evalCtx->dictionaryMemoizationEnabled(), + enableExpressionEvaluationCache); // Test ExecCtx::selectivityVectorPool_. 
auto rows = execCtx->getSelectivityVector(100); @@ -178,63 +148,58 @@ TEST(TestQueryConfig, enableExpressionEvaluationCacheConfig) { testConfig(false); } -TEST(TestQueryConfig, capacityConversion) { - folly::Random::DefaultGenerator rng; - rng.seed(1); - - std::unordered_map<CapacityUnit, std::string> unitStrLookup{ - {CapacityUnit::BYTE, "B"}, - {CapacityUnit::KILOBYTE, "kB"}, - {CapacityUnit::MEGABYTE, "MB"}, - {CapacityUnit::GIGABYTE, "GB"}, - {CapacityUnit::TERABYTE, "TB"}, - {CapacityUnit::PETABYTE, "PB"}}; - - std::vector<std::pair<CapacityUnit, uint64_t>> units{ - {CapacityUnit::BYTE, 1}, - {CapacityUnit::KILOBYTE, 1024}, - {CapacityUnit::MEGABYTE, 1024 * 1024}, - {CapacityUnit::GIGABYTE, 1024 * 1024 * 1024}, - {CapacityUnit::TERABYTE, 1024ll * 1024 * 1024 * 1024}, - {CapacityUnit::PETABYTE, 1024ll * 1024 * 1024 * 1024 * 1024}}; - for (int32_t i = 0; i < units.size(); i++) { - for (int32_t j = 0; j < units.size(); j++) { - // We use this diffRatio to prevent float conversion overflow when - // converting from one unit to another. - uint64_t diffRatio = i < j ? units[j].second / units[i].second - : units[i].second / units[j].second; - uint64_t randNumber = folly::Random::rand64(rng); - uint64_t testNumber = i > j ? randNumber / diffRatio : randNumber; - ASSERT_EQ( - toCapacity( - std::string( - std::to_string(testNumber) + unitStrLookup[units[i].first]), - units[j].first), - (uint64_t)(testNumber * (units[i].second / units[j].second))); - } - } -} +TEST_F(QueryConfigTest, expressionEvaluationRelatedConfigs) { + // Verify that the expression evaluation related configs are propagated + // correctly to ExprCtx which is used during expression evaluation. Each + // config is individually set and verified. + std::shared_ptr<memory::MemoryPool> rootPool{ + memory::memoryManager()->addRootPool()}; + std::shared_ptr<memory::MemoryPool> pool{rootPool->addLeafChild("leaf")}; + + auto testConfig = + [&](std::unordered_map<std::string, std::string> configData) { + auto queryCtx = + core::QueryCtx::create(nullptr, QueryConfig{std::move(configData)}); + const auto& queryConfig = queryCtx->queryConfig(); + auto execCtx = + std::make_shared<core::ExecCtx>(pool.get(), queryCtx.get()); + auto evalCtx = std::make_shared<exec::EvalCtx>(execCtx.get()); + + ASSERT_EQ( + evalCtx->peelingEnabled(), + !queryConfig.debugDisableExpressionsWithPeeling()); + ASSERT_EQ( + evalCtx->sharedSubExpressionReuseEnabled(), + !queryConfig.debugDisableCommonSubExpressions()); + ASSERT_EQ( + evalCtx->dictionaryMemoizationEnabled(), + !queryConfig.debugDisableExpressionsWithMemoization()); + ASSERT_EQ( + evalCtx->deferredLazyLoadingEnabled(), + !queryConfig.debugDisableExpressionsWithLazyInputs()); + }; + + auto createConfig = [&](bool debugDisableExpressionsWithPeeling, + bool debugDisableCommonSubExpressions, + bool debugDisableExpressionsWithMemoization, + bool debugDisableExpressionsWithLazyInputs) -> auto { + std::unordered_map<std::string, std::string> configData( + {{core::QueryConfig::kDebugDisableExpressionWithPeeling, + std::to_string(debugDisableExpressionsWithPeeling)}, + {core::QueryConfig::kDebugDisableCommonSubExpressions, + std::to_string(debugDisableCommonSubExpressions)}, + {core::QueryConfig::kDebugDisableExpressionWithMemoization, + std::to_string(debugDisableExpressionsWithMemoization)}, + {core::QueryConfig::kDebugDisableExpressionWithLazyInputs, + std::to_string(debugDisableExpressionsWithLazyInputs)}}); + return configData; + }; -TEST(TestQueryConfig, durationConversion) { - folly::Random::DefaultGenerator rng; - rng.seed(1); - - std::vector<std::pair<std::string, uint64_t>> units{ - {"ns", 1}, - {"us", 1000}, - {"ms", 1000 * 1000}, - {"s", 1000ll * 1000 * 1000}, - {"m", 1000ll * 1000 * 1000 * 60}, - {"h", 1000ll
* 1000 * 1000 * 60 * 60}, - {"d", 1000ll * 1000 * 1000 * 60 * 60 * 24}}; - for (uint32_t i = 0; i < units.size(); i++) { - auto testNumber = folly::Random::rand32(rng) % 10000; - auto duration = - toDuration(std::string(std::to_string(testNumber) + units[i].first)); - ASSERT_EQ( - testNumber * units[i].second, - std::chrono::duration_cast(duration).count()); - } + testConfig({}); // Verify default config. + testConfig(createConfig(true, false, false, false)); + testConfig(createConfig(false, true, false, false)); + testConfig(createConfig(false, false, true, false)); + testConfig(createConfig(false, false, false, true)); } } // namespace facebook::velox::core::test diff --git a/velox/core/tests/TypeAnalysisTest.cpp b/velox/core/tests/TypeAnalysisTest.cpp index a00896fc3a0e9..6e361d4bfe0c6 100644 --- a/velox/core/tests/TypeAnalysisTest.cpp +++ b/velox/core/tests/TypeAnalysisTest.cpp @@ -17,6 +17,8 @@ #include #include "velox/core/SimpleFunctionMetadata.h" +#include "velox/expression/FunctionSignature.h" +#include "velox/functions/prestosql/types/JsonType.h" #include "velox/type/Type.h" // Test for simple function type analysis. @@ -70,10 +72,18 @@ class TypeAnalysisTest : public testing::Test { } template - void testVariables(const std::set& expected) { + void testPhysicalType(const TypePtr& expected) { TypeAnalysisResults results; (TypeAnalysis().run(results), ...); - ASSERT_EQ(expected, results.variables); + ASSERT_EQ(expected->toString(), results.physicalType->toString()); + } + + template + void testVariables( + const std::map& expected) { + TypeAnalysisResults results; + (TypeAnalysis().run(results), ...); + ASSERT_EQ(expected, results.variablesInformation); } template @@ -99,6 +109,11 @@ TEST_F(TypeAnalysisTest, hasGeneric) { testHasGeneric, Array>>(true); testHasGeneric>, Array>>(true); + testHasGeneric>, Array>>(true); + testHasGeneric>, Array>>(true); + testHasGeneric>, Array>>(true); + testHasGeneric>, Array>>(true); + testHasGeneric, Any>>(true); testHasGeneric>(true); testHasGeneric(true); @@ -130,6 +145,9 @@ TEST_F(TypeAnalysisTest, hasVariadicOfGeneric) { testHasVariadicOfGeneric>(false); testHasVariadicOfGeneric>(true); + testHasVariadicOfGeneric>>(true); + testHasVariadicOfGeneric>>(true); + testHasVariadicOfGeneric, int32_t>(true); testHasVariadicOfGeneric>>(true); testHasVariadicOfGeneric>>>>( @@ -143,6 +161,9 @@ TEST_F(TypeAnalysisTest, countConcrete) { testCountConcrete(3); testCountConcrete(0); testCountConcrete>(0); + testCountConcrete>(0); + testCountConcrete>(0); + testCountConcrete>(0); testCountConcrete>(1); testCountConcrete>>(1); @@ -165,6 +186,10 @@ TEST_F(TypeAnalysisTest, testStringType) { testStringType({"bigint"}); testStringType({"double"}); testStringType({"real"}); + testStringType({"date"}); + + testStringType>({"decimal(i1,i5)"}); + testStringType>({"decimal(i1,i5)"}); testStringType>({"array(integer)"}); testStringType>({"map(any, integer)"}); @@ -181,21 +206,127 @@ TEST_F(TypeAnalysisTest, testStringType) { "map(array(integer), __user_T2)", }); + testStringType, Orderable>>({ + "integer", + "bigint", + "map(array(integer), __user_T2)", + }); + testStringType, Comparable>>({ + "integer", + "bigint", + "map(array(integer), __user_T2)", + }); testStringType>({"array(integer)"}); testStringType>({"map(bigint, double)"}); testStringType>>( {"row(any, double, __user_T1)"}); + + testStringType({"json"}); + testStringType>({"array(json)"}); } TEST_F(TypeAnalysisTest, testVariables) { testVariables({}); testVariables>({}); testVariables({}); - 
testVariables>({"__user_T1"}); + + testVariables>( + {{"__user_T1", + exec::SignatureVariable( + "__user_T1", + std::nullopt, + exec::ParameterType::kTypeParameter, + false, + false, + false)}}); + + testVariables>( + {{"__user_T1", + exec::SignatureVariable( + "__user_T1", + std::nullopt, + exec::ParameterType::kTypeParameter, + false, + true /*orderableTypesOnly*/, + true)}}); + + testVariables>( + {{"__user_T1", + exec::SignatureVariable( + "__user_T1", + std::nullopt, + exec::ParameterType::kTypeParameter, + false, + true /*orderableTypesOnly*/, + true /*comparableTypesOnly*/)}}); + + testVariables>( + {{"__user_T1", + exec::SignatureVariable( + "__user_T1", + std::nullopt, + exec::ParameterType::kTypeParameter, + false, + false /*orderableTypesOnly*/, + true /*comparableTypesOnly*/)}}); + testVariables>({}); testVariables>({}); - testVariables, Map, Generic>>( - {"__user_T2", "__user_T5"}); + testVariables, Map, Orderable>>( + {{"__user_T5", + exec::SignatureVariable( + "__user_T5", + std::nullopt, + exec::ParameterType::kTypeParameter, + false, + false /*orderableTypesOnly*/, + false /*comparableTypesOnly*/)}, + {"__user_T2", + exec::SignatureVariable( + "__user_T2", + std::nullopt, + exec::ParameterType::kTypeParameter, + false, + true /*orderableTypesOnly*/, + true /*comparableTypesOnly*/)}}); + + testVariables>({ + {"i1", + exec::SignatureVariable( + "i1", + std::nullopt, + exec::ParameterType::kIntegerParameter, + false, + false /*orderableTypesOnly*/, + false /*comparableTypesOnly*/)}, + {"i5", + exec::SignatureVariable( + "i5", + std::nullopt, + exec::ParameterType::kIntegerParameter, + false, + false /*orderableTypesOnly*/, + false /*comparableTypesOnly*/)}, + }); + + testVariables>({ + {"i2", + exec::SignatureVariable( + "i2", + std::nullopt, + exec::ParameterType::kIntegerParameter, + false, + false /*orderableTypesOnly*/, + false /*comparableTypesOnly*/)}, + {"i6", + exec::SignatureVariable( + "i6", + std::nullopt, + exec::ParameterType::kIntegerParameter, + false, + false /*orderableTypesOnly*/, + false /*comparableTypesOnly*/)}, + }); } TEST_F(TypeAnalysisTest, testRank) { @@ -210,12 +341,15 @@ TEST_F(TypeAnalysisTest, testRank) { testRank(3); testRank, Any, Variadic>(3); testRank, Generic>(3); + testRank, Comparable>(3); + testRank, Generic>(3); testRank, int32_t>(3); testRank, Any, Any>(3); testRank>(4); testRank, Any, Variadic>>(4); + testRank, Any, Variadic>>>(4); } TEST_F(TypeAnalysisTest, testPriority) { @@ -239,5 +373,45 @@ TEST_F(TypeAnalysisTest, testPriority) { getPriority>>(), getPriority>()); } + +TEST_F(TypeAnalysisTest, physicalType) { + testPhysicalType(BOOLEAN()); + testPhysicalType(INTEGER()); + testPhysicalType(BIGINT()); + testPhysicalType(REAL()); + testPhysicalType(DOUBLE()); + testPhysicalType(INTEGER()); + testPhysicalType(TIMESTAMP()); + testPhysicalType(VARCHAR()); + testPhysicalType(VARBINARY()); + + testPhysicalType>(BIGINT()); + testPhysicalType>(HUGEINT()); + + testPhysicalType>(ARRAY(INTEGER())); + testPhysicalType>(ARRAY(INTEGER())); + testPhysicalType>>(ARRAY(ARRAY(REAL()))); + testPhysicalType>>(ARRAY(UNKNOWN())); + testPhysicalType>>>(ARRAY(ARRAY(UNKNOWN()))); + testPhysicalType>>(ARRAY(BIGINT())); + + testPhysicalType>(MAP(INTEGER(), VARCHAR())); + testPhysicalType>>(MAP(INTEGER(), ARRAY(INTEGER()))); + testPhysicalType, Generic>>(MAP(UNKNOWN(), UNKNOWN())); + testPhysicalType, Array>>>( + MAP(UNKNOWN(), ARRAY(UNKNOWN()))); + testPhysicalType>>( + MAP(INTEGER(), HUGEINT())); + + testPhysicalType, Variadic>>( + ROW({INTEGER(), 
ARRAY(DOUBLE()), BOOLEAN()})); + + testPhysicalType(VARCHAR()); + testPhysicalType>(ARRAY(VARCHAR())); + + testPhysicalType(UNKNOWN()); + testPhysicalType>(ROW({INTEGER(), UNKNOWN()})); +} + } // namespace } // namespace facebook::velox::core diff --git a/velox/core/tests/TypedExprSerdeTest.cpp b/velox/core/tests/TypedExprSerdeTest.cpp index 81d9a1eb18a04..d8110fd613119 100644 --- a/velox/core/tests/TypedExprSerdeTest.cpp +++ b/velox/core/tests/TypedExprSerdeTest.cpp @@ -23,6 +23,10 @@ namespace facebook::velox::core::test { class TypedExprSerDeTest : public testing::Test, public velox::test::VectorTestBase { protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + TypedExprSerDeTest() { Type::registerSerDe(); @@ -54,6 +58,7 @@ TEST_F(TypedExprSerDeTest, fieldAccess) { std::shared_ptr expression = std::make_shared(BIGINT(), "a"); testSerde(expression); + ASSERT_EQ(expression->toString(), "\"a\""); expression = std::make_shared( VARCHAR(), @@ -61,6 +66,7 @@ TEST_F(TypedExprSerDeTest, fieldAccess) { ROW({"a", "b"}, {VARCHAR(), BOOLEAN()}), "ab"), 0); testSerde(expression); + ASSERT_EQ(expression->toString(), "\"ab\"[a]"); } TEST_F(TypedExprSerDeTest, constant) { @@ -113,19 +119,11 @@ TEST_F(TypedExprSerDeTest, call) { TEST_F(TypedExprSerDeTest, cast) { auto expression = std::make_shared( - BIGINT(), - std::vector{ - std::make_shared(VARCHAR(), "a"), - }, - false); + BIGINT(), std::make_shared(VARCHAR(), "a"), false); testSerde(expression); expression = std::make_shared( - VARCHAR(), - std::vector{ - std::make_shared(BIGINT(), "a"), - }, - true); + VARCHAR(), std::make_shared(BIGINT(), "a"), true); testSerde(expression); } diff --git a/velox/docs/README.md b/velox/docs/README.md index 284a6b002abdd..a46cfaec06894 100644 --- a/velox/docs/README.md +++ b/velox/docs/README.md @@ -10,16 +10,9 @@ To install Sphinx: `easy_install -U sphinx` `sphinx-quickstart` command was used to generate the initial Makefile and config. -### Building PyVelox Components - -If you're using a conda environment, it can be easily installed by `conda install` command. - -Pandoc is also used to generate `.rst` files from existing markdown files. Refer to installation -instructions [here](https://pandoc.org/installing.html). - To build the documentation, e.g. generate HTML files from .rst files: -Run the `./scripts/gen-docs.sh` script from the base directory. +Run `make html` from velox/docs. Navigate to `velox/docs/_build/html/index.html` in your browser to view the documentation. @@ -39,7 +32,7 @@ directory to the top-level docs folder and push to gh-pages branch. git checkout -b update-docs main # Generate the documentation. -./scripts/gen-docs.sh +cd velox/docs && make html # Copy documentation files to the top-level docs folder. cp -R _build/html/* ../../docs diff --git a/velox/docs/conf.py b/velox/docs/conf.py index 119f90866aa4c..f2563eb218cd9 100644 --- a/velox/docs/conf.py +++ b/velox/docs/conf.py @@ -54,7 +54,6 @@ "sphinx.ext.doctest", "sphinx.ext.mathjax", "sphinx.ext.viewcode", - "sphinx.ext.imgmath", "sphinx.ext.todo", "sphinx.ext.intersphinx", "sphinx.ext.autosummary", diff --git a/velox/docs/configs.rst b/velox/docs/configs.rst index 9a9b24f13bae2..bdcd28d054a8e 100644 --- a/velox/docs/configs.rst +++ b/velox/docs/configs.rst @@ -27,16 +27,26 @@ Generic Configuration - 10000 - Max number of rows that could be return by operators from Operator::getOutput. 
It is used when an estimate of average row size is known and preferred_output_batch_bytes is used to compute the number of output rows. + * - table_scan_getoutput_time_limit_ms + - integer + - 5000 + - TableScan operator will exit getOutput() method after this many milliseconds even if it has no data to return yet. Zero means 'no time limit'. * - abandon_partial_aggregation_min_rows - integer - 100,000 - - Min number of rows when we check if a partial aggregation is not reducing the cardinality well and might be - a subject to being abandoned. + - Number of input rows to receive before starting to check whether to abandon partial aggregation. * - abandon_partial_aggregation_min_pct - integer - 80 - - If a partial aggregation's number of output rows constitues this or highler percentage of the number of input rows, - then this partial aggregation will be a subject to being abandoned. + - Abandons partial aggregation if number of groups equals or exceeds this percentage of the number of input rows. + * - abandon_partial_topn_row_number_min_rows + - integer + - 100,000 + - Number of input rows to receive before starting to check whether to abandon partial TopNRowNumber. + * - abandon_partial_topn_row_number_min_pct + - integer + - 80 + - Abandons partial TopNRowNumber if number of output rows equals or exceeds this percentage of the number of input rows. * - session_timezone - string - @@ -72,11 +82,27 @@ Generic Configuration - Size of buffer in the exchange client that holds data fetched from other nodes before it is processed. A larger buffer can increase network throughput for larger clusters and thus decrease query processing time at the expense of reducing the amount of memory available for other usage. + * - merge_exchange.max_buffer_size + - integer + - 128MB + - The aggregate buffer size (in bytes) across all exchange clients generated by the merge exchange operator, + responsible for storing data retrieved from various nodes prior to processing. It is divided + equally among all clients and has an upper and lower limit of 32MB and 1MB, respectively, per + client. Enforced approximately, not strictly. A larger size can increase network throughput + for larger clusters and thus decrease query processing time at the expense of reducing the + amount of memory available for other usage. * - max_page_partitioning_buffer_size - integer - 32MB - - The target size for a Task's buffered output. The producer Drivers are blocked when the buffered size exceeds this. - The Drivers are resumed when the buffered size goes below PartitionedOutputBufferManager::kContinuePct (90)% of this. + - The maximum size in bytes for the task's buffered output when output is partitioned using hash of partitioning keys. See PartitionedOutputNode::Kind::kPartitioned. + The producer Drivers are blocked when the buffered size exceeds this. + The Drivers are resumed when the buffered size goes below OutputBufferManager::kContinuePct (90)% of this. + * - max_output_buffer_size + - integer + - 32MB + - The maximum size in bytes for the task's buffered output. + The producer Drivers are blocked when the buffered size exceeds this. + The Drivers are resumed when the buffered size goes below OutputBufferManager::kContinuePct (90)% of this. * - min_table_rows_for_parallel_join_build - integer - 1000 @@ -92,6 +118,25 @@ Generic Configuration - true - Whether to enable caches in expression evaluation. If set to true, optimizations including vector pools and evalWithMemo are enabled. 
+ * - max_shared_subexpr_results_cached + - integer + - 10 + - For a given shared subexpression, the maximum distinct sets of inputs we cache results for. Lambdas can call + the same expression with different inputs many times, causing the results we cache to explode in size. Putting + a limit on it contains the memory usage. + * - driver_cpu_time_slice_limit_ms + - integer + - 0 + - If it is not zero, specifies the time limit that a driver can continuously + run on a thread before yielding. If it is zero, there is no limit. + * - prefixsort_normalized_key_max_bytes + - integer + - 128 + - Maximum number of bytes to use for the normalized key in prefix-sort. Use 0 to disable prefix-sort. + * - prefixsort_min_rows + - integer + - 130 + - Minimum number of rows to use prefix-sort. The default value has been derived using micro-benchmarking. .. _expression-evaluation-conf: @@ -114,29 +159,31 @@ Expression Evaluation Configuration - false - Whether to track CPU usage for individual expressions (supported by call and cast expressions). Can be expensive when processing small batches, e.g. < 10K rows. + * - legacy_cast + - bool + - false + - Enables legacy CAST semantics if set to true. CAST(timestamp AS varchar) uses 'T' as separator between date and + time (instead of a space), and the year part is not padded. * - cast_match_struct_by_name - bool - false - This flag makes the Row conversion to by applied in a way that the casting row field are matched by name instead of position. - * - cast_to_int_by_truncate + * - debug_disable_expression_with_peeling - bool - false - - This flags forces the cast from float/double/decimal/string to integer to be performed by truncating the decimal part instead of rounding. - * - cast_string_to_date_is_iso_8601 + - Disable optimization in expression evaluation to peel common dictionary layer from inputs. Should only be used for debugging. + * - debug_disable_common_sub_expressions - bool - - true - - If set, cast from string to date allows only ISO 8601 formatted strings: ``[+-](YYYY-MM-DD)``. - Otherwise, allows all patterns supported by Spark: - * ``[+-]yyyy*`` - * ``[+-]yyyy*-[m]m`` - * ``[+-]yyyy*-[m]m-[d]d`` - * ``[+-]yyyy*-[m]m-[d]d *`` - * ``[+-]yyyy*-[m]m-[d]dT*`` - The asterisk ``*`` in ``yyyy*`` stands for any numbers. - For the last two patterns, the trailing ``*`` can represent none or any sequence of characters, e.g: - * "1970-01-01 123" - * "1970-01-01 (BC)" - Regardless of this setting's value, leading spaces will be trimmed. + - false + - Disable optimization in expression evaluation to re-use cached results for common sub-expressions. Should only be used for debugging. + * - debug_disable_expression_with_memoization + - bool + - false + - Disable optimization in expression evaluation to re-use cached results between subsequent input batches that are dictionary encoded and have the same alphabet (underlying flat vector). Should only be used for debugging. + * - debug_disable_expression_with_lazy_inputs + - bool + - false + - Disable optimization in expression evaluation to delay loading of lazy inputs unless required. Should only be used for debugging. Memory Management ----------------- @@ -182,29 +229,37 @@ Spilling - Spill memory to disk to avoid exceeding memory limits for the query. * - aggregation_spill_enabled - boolean - - false - - When `spill_enabled` is true, determines whether to spill memory to disk for aggregations to avoid exceeding - memory limits for the query.
+ - true + - When `spill_enabled` is true, determines whether HashAggregation operator can spill to disk under memory pressure. * - join_spill_enabled - boolean - - false - - When `spill_enabled` is true, determines whether to spill memory to disk for hash joins to avoid exceeding memory - limits for the query. + - true + - When `spill_enabled` is true, determines whether HashBuild and HashProbe operators can spill to disk under memory pressure. * - order_by_spill_enabled - boolean - - false - - When `spill_enabled` is true, determines whether to spill memory to disk for order by to avoid exceeding memory - limits for the query. + - true + - When `spill_enabled` is true, determines whether OrderBy operator can spill to disk under memory pressure. + * - window_spill_enabled + - boolean + - true + - When `spill_enabled` is true, determines whether Window operator can spill to disk under memory pressure. + * - row_number_spill_enabled + - boolean + - true + - When `spill_enabled` is true, determines whether RowNumber operator can spill to disk under memory pressure. + * - topn_row_number_spill_enabled + - boolean + - true + - When `spill_enabled` is true, determines whether TopNRowNumber operator can spill to disk under memory pressure. + * - writer_spill_enabled + - boolean + - true + - When `writer_spill_enabled` is true, determines whether TableWriter operator can flush the buffered data to disk + under memory pressure. * - aggregation_spill_memory_threshold - integer - 0 - Maximum amount of memory in bytes that a final aggregation can use before spilling. 0 means unlimited. - * - aggregation_spill_all - - boolean - - false - - If true and spilling has been triggered during the input processing, the spiller will spill all the remaining - - in-memory state to disk before output processing. This is to simplify the aggregation query OOM prevention in - - output processing stage. * - join_spill_memory_threshold - integer - 0 @@ -213,6 +268,10 @@ - integer - 0 - Maximum amount of memory in bytes that an order by can use before spilling. 0 means unlimited. + * - writer_flush_threshold_bytes + - integer + - 96MB + - Minimum memory footprint size required to reclaim memory from a file writer by flushing its buffered data to disk. * - min_spillable_reservation_pct - integer - 5 @@ -230,20 +289,41 @@ reservation grows along a series of powers of (1 + N / 100). If the memory reservation fails, it starts spilling. * - max_spill_level - integer - - 4 + - 1 - The maximum allowed spilling level with zero being the initial spilling level. Applies to hash join build spilling which might use recursive spilling when the build table is very large. -1 means unlimited. In this case an extremely large query might run out of spilling partition bits. The max spill level can be used to prevent a query from using too much io and cpu resources. + * - max_spill_run_rows + - integer + - 12582912 + - The max number of rows to fill and spill for each spill run. This is used to cap the memory used for spilling. + If it is zero, then there is no limit and spilling might run out of memory. Based on offline test results, the + default value is set to 12 million rows which uses ~128MB of memory to fill a spill run. + The relation between spill rows and memory usage is as follows: + * ``12 million rows: 128 MB`` + * ``30 million rows: 256 MB`` + * ``60 million rows: 512 MB`` * - max_spill_file_size - integer - 0 - The maximum allowed spill file size. Zero means unlimited.
+   * - max_spill_bytes
+     - integer
+     - 107374182400
+     - The max spill bytes limit set for each query. This is used to cap the storage used for spilling.
+       If it is zero, then there is no limit and spilling might exhaust the storage or take too long to run.
+       The default value is set to 100 GB.
   * - spill_write_buffer_size
     - integer
     - 4MB
     - The maximum size in bytes to buffer the serialized spill data before writing to disk for IO efficiency. If set
      to zero, buffering is disabled.
+   * - spill_read_buffer_size
+     - integer
+     - 1MB
+     - The buffer size in bytes to read from one spilled file. If the underlying filesystem supports async
+       read, we do read-ahead with double buffering, which doubles the buffer used to read from each spill file.
   * - min_spill_run_size
     - integer
     - 256MB
@@ -262,22 +342,17 @@ Spilling
     - integer
     - 29
     - The start partition bit which is used with `spiller_partition_bits` together to calculate the spilling
      partition number.
-   * - join_spiller_partition_bits
-     - integer
-     - 2
-     - The number of bits (N) used to calculate the spilling partition number for hash join: 2 ^ N. At the moment the maximum
-       value is 3, meaning we only support up to 8-way spill partitioning.
-   * - aggregation_spiller_partition_bits
+   * - spiller_num_partition_bits
     - integer
-    - 0
-    - The number of bits (N) used to calculate the spilling partition number for hash aggregation: 2 ^ N. At the moment the
-      maximum value is 3, meaning we only support up to 8-way spill partitioning.
+    - 3
+    - The number of bits (N) used to calculate the spilling partition number for hash join and RowNumber: 2 ^ N. At the moment the maximum
+      value is 3, meaning we only support up to 8-way spill partitioning.
   * - testing.spill_pct
     - integer
     - 0
     - Percentage of aggregation or join input batches that will be forced to spill for testing. 0 means no extra spilling.

-Table Writer
+Table Scan
 ------------
 .. list-table::
   :widths: 20 10 10 70
   :header-rows: 1
@@ -287,17 +362,13 @@ Table Writer
     - Type
     - Default Value
     - Description
-   * - task_writer_count
+   * - max_split_preload_per_driver
     - integer
-    - 1
-    - The number of parallel table writer threads per task.
-   * - task_partitioned_writer_count
-     - integer
-     - task_writer_count
-     - The number of parallel table writer threads per task for bucketed table writes. If not set, use 'task_writer_count' as default.
+    - 2
+    - Maximum number of splits to preload per driver. Set to 0 to disable preloading.

-Codegen Configuration
----------------------
+Table Writer
+------------
 .. list-table::
   :widths: 20 10 10 70
   :header-rows: 1
@@ -306,34 +377,36 @@ Codegen Configuration
     - Type
     - Default Value
     - Description
-   * - codegen.enabled
-     - boolean
-     - false
-     - Along with `codegen.configuration_file_path` enables codegen in task execution path.
-   * - codegen.configuration_file_path
-     - string
-     -
-     - A path to the file contaning codegen options.
-   * - codegen.lazy_loading
-     - boolean
-     - true
-     - Triggers codegen initialization tests upon loading if false. Otherwise skips them.
+   * - task_writer_count
+     - integer
+     - 1
+     - The number of parallel table writer threads per task.
+   * - task_partitioned_writer_count
+     - integer
+     - task_writer_count
+     - The number of parallel table writer threads per task for bucketed table writes. If not set, use 'task_writer_count' as default.

 Hive Connector
 --------------
+Hive Connector config is initialized on Velox runtime startup and is shared among queries as the default config.
+Each query can override the config by setting corresponding query session properties, such as in Prestissimo.
+
 .. list-table::
-   :widths: 20 10 10 70
+   :widths: 20 20 10 10 70
   :header-rows: 1

-   * - Property Name
+   * - Configuration Property Name
+     - Session Property Name
     - Type
     - Default Value
     - Description
-   * - max_partitions_per_writers
+   * - hive.max-partitions-per-writers
+     -
     - integer
     - 100
     - Maximum number of (bucketed) partitions per single table writer instance.
-   * - insert_existing_partitions_behavior
+   * - insert-existing-partitions-behavior
+     - insert_existing_partitions_behavior
     - string
     - ERROR
     - **Allowed values:** ``OVERWRITE``, ``ERROR``. The behavior on inserting into existing partitions. This property only derives
@@ -341,23 +414,133 @@ Hive Connector
      sets the update mode to indicate overwriting a partition if exists. ``ERROR`` sets the update mode to indicate
      error throwing if writing to an existing partition.
   * - hive.immutable-partitions
+     -
     - bool
     - false
     - True if appending data to an existing unpartitioned table is allowed. Currently this configuration does not
      support appending to existing partitions.
-   * - file_column_names_read_as_lower_case
+   * - file-column-names-read-as-lower-case
+     -
     - bool
     - false
     - True if reading the source file column names as lower case, and the planner should guarantee that input column
      names and filters are also lower case to achieve case-insensitive reads.
+   * - partition_path_as_lower_case
+     -
+     - bool
+     - true
+     - If true, the partition directory will be converted to lowercase when executing a table write operation.
+   * - allow-null-partition-keys
+     - allow_null_partition_keys
+     - bool
+     - true
+     - Determines whether null values for partition keys are allowed or not. If not, the write fails with a "Partition key must
+       not be null" error message when writing data with a null partition key.
+       The null check for partitioning keys should be used only when partitions are generated dynamically during query execution.
+       For queries that write to fixed partitions, this check should happen much earlier, before the Velox execution even starts.
+   * - ignore_missing_files
+     -
+     - bool
+     - false
+     - If true, splits that refer to missing files don't generate errors and are processed as empty splits.
   * - max-coalesced-bytes
+     -
     - integer
-    - 512KB
+    - 128MB
     - Maximum size in bytes to coalesce requests to be fetched in a single request.
   * - max-coalesced-distance-bytes
+     -
     - integer
-    - 128MB
+    - 512KB
     - Maximum distance in bytes between chunks to be fetched that may be coalesced into a single request.
+   * - load-quantum
+     -
+     - integer
+     - 8MB
+     - Defines the size of each coalesced load request. E.g. in a Parquet scan, if it is bigger than the row group size,
+       the whole row group can be fetched together; otherwise, the row group will be fetched column chunk by column chunk.
+   * - num-cached-file-handles
+     -
+     - integer
+     - 20000
+     - Maximum number of entries in the file handle cache. The value must be non-negative. A zero value
+       indicates infinite cache capacity.
+   * - file-handle-cache-enabled
+     -
+     - bool
+     - true
+     - Enables caching of file handles if true. Disables caching if false. The file handle cache should be
+       disabled if files are not immutable, i.e. the file content may change while the file path stays the same.
+   * - sort-writer-max-output-rows
+     - sort_writer_max_output_rows
+     - integer
+     - 1024
+     - Maximum number of rows for the sort writer in one batch of output. This is to limit the memory usage of the sort writer.
+   * - sort-writer-max-output-bytes
+     - sort_writer_max_output_bytes
+     - string
+     - 10MB
+     - Maximum number of bytes for the sort writer in one batch of output. This is to limit the memory usage of the sort writer.
+   * - file-preload-threshold
+     -
+     - integer
+     - 8MB
+     - Usually Velox fetches the metadata first and then fetches the rest of the file. But if the file is very small, Velox can fetch the whole file directly to avoid multiple IO requests.
+       This parameter controls the file size threshold below which the whole file is fetched.
+   * - footer-estimated-size
+     -
+     - integer
+     - 1MB
+     - Defines the estimated footer size in the ORC and Parquet formats. The footer data includes the version, schema, and metadata for every column, which may or may not need to be fetched later.
+       This parameter controls the size fetched when the footer is read. A bigger value can decrease the number of IO requests but may fetch more unneeded metadata.
+   * - hive.orc.writer.stripe-max-size
+     - orc_optimized_writer_max_stripe_size
+     - string
+     - 64M
+     - Maximum stripe size in the ORC writer.
+   * - hive.orc.writer.dictionary-max-memory
+     - orc_optimized_writer_max_dictionary_memory
+     - string
+     - 16M
+     - Maximum dictionary memory that can be used in the ORC writer.
+   * - hive.orc.writer.integer-dictionary-encoding-enabled
+     - orc_optimized_writer_integer_dictionary_encoding_enabled
+     - bool
+     - true
+     - Whether or not dictionary encoding of integer types should be used by the ORC writer.
+   * - hive.orc.writer.string-dictionary-encoding-enabled
+     - orc_optimized_writer_string_dictionary_encoding_enabled
+     - bool
+     - true
+     - Whether or not dictionary encoding of string types should be used by the ORC writer.
+   * - hive.parquet.writer.timestamp-unit
+     - hive.parquet.writer.timestamp_unit
+     - tinyint
+     - 9
+     - Timestamp unit used when writing timestamps into Parquet through the Arrow bridge.
+       Valid values are 0 (second), 3 (millisecond), 6 (microsecond), 9 (nanosecond).
+   * - hive.orc.writer.linear-stripe-size-heuristics
+     - orc_writer_linear_stripe_size_heuristics
+     - bool
+     - true
+     - Enables history-based stripe size estimation after compression.
+   * - hive.orc.writer.min-compression-size
+     - orc_writer_min_compression_size
+     - integer
+     - 1024
+     - Minimum number of items in an encoded stream.
+   * - hive.orc.writer.compression-level
+     - orc_optimized_writer_compression_level
+     - tinyint
+     - 3 for ZSTD and 4 for ZLIB
+     - The compression level to use with ZLIB and ZSTD.
+   * - cache.no_retention
+     - cache.no_retention
+     - bool
+     - false
+     - If true, evict a query's scanned data from the in-memory cache right after the access,
+       and also skip staging it to the SSD cache. This helps prevent cache space pollution
+       from one-time table scans by large batch queries when running mixed with interactive
+       queries that have high data locality.

``Amazon S3 Configuration``

@@ -372,7 +555,7 @@ Hive Connector
     - Description
   * - hive.s3.use-instance-credentials
     - bool
-    - true
+    - false
     - Use the EC2 metadata service to retrieve API credentials. This works with IAM roles in EC2.
   * - hive.s3.aws-access-key
     - string
@@ -408,7 +591,34 @@ Hive Connector
     - string
     - velox-session
     - Session name associated with the IAM role.
-
+   * - hive.s3.use-proxy-from-env
+     - bool
+     - false
+     - Utilize the configuration of the environment variables http_proxy, https_proxy, and no_proxy for use with the S3 API.
+   * - hive.s3.connect-timeout
+     - string
+     -
+     - Socket connect timeout.
+   * - hive.s3.socket-timeout
+     - string
+     -
+     - Socket read timeout.
+   * - hive.s3.max-connections
+     - integer
+     -
+     - Maximum concurrent TCP connections for a single HTTP client.
+   * - hive.s3.max-attempts
+     - integer
+     -
+     - Maximum number of connection attempts for a single HTTP client; works together with retry-mode. By default, it is 3 for standard/adaptive mode
+       and 10 for legacy mode.
+   * - hive.s3.retry-mode
+     - string
+     -
+     - **Allowed values:** "standard", "adaptive", "legacy". By default it is empty, and the S3 client is created with the default RetryStrategy.
+       Legacy mode only enables throttled retry for transient errors.
+       Standard mode is built on top of legacy mode and has throttled retry enabled for throttling errors apart from transient errors.
+       Adaptive retry mode dynamically limits the rate of AWS requests to maximize the success rate.

``Google Cloud Storage Configuration``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. list-table::
@@ -431,6 +641,31 @@ Hive Connector
     - string
     -
     - The GCS service account configuration as json string.
+   * - hive.gcs.max-retry-count
+     - integer
+     -
+     - The maximum retry count for transient errors in GCS.
+   * - hive.gcs.max-retry-time
+     - string
+     -
+     - The maximum time allowed for retrying transient errors in GCS.
+
+``Azure Blob Storage Configuration``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. list-table::
+   :widths: 30 10 10 60
+   :header-rows: 1
+
+   * - Property Name
+     - Type
+     - Default Value
+     - Description
+   * - fs.azure.account.key.<storage-account>.dfs.core.windows.net
+     - string
+     -
+     - The credentials to access the specific Azure Blob Storage account; replace <storage-account> with the name of your Azure Storage account.
+       This property aligns with how Spark configures Azure account key credentials for accessing Azure storage. By setting this property multiple
+       times with different storage account names, you can access multiple Azure storage accounts.

 Presto-specific Configuration
 -----------------------------
@@ -474,3 +709,31 @@ Spark-specific Configuration
     - 4194304
     - The maximum number of bits to use for the bloom filter in :spark:func:`bloom_filter_agg` function, the value of
      this config can not exceed the default value.
+   * - spark.partition_id
+     - integer
+     -
+     - The current task's Spark partition ID. It's set by the query engine (Spark) prior to task execution.
+
+Tracing
+--------
+.. list-table::
+   :widths: 30 10 10 70
+   :header-rows: 1
+
+   * - Property Name
+     - Type
+     - Default Value
+     - Description
+   * - query_trace_enabled
+     - bool
+     - true
+     - If true, enables query tracing.
+   * - query_trace_dir
+     - string
+     -
+     - The root directory to store the tracing data and metadata for a query.
+   * - query_trace_node_ids
+     - string
+     -
+     - A comma-separated list of plan node ids whose input data will be traced. If it is empty, we only trace the
+       query metadata, which includes the query plan, configs, etc.
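+
+All of the properties above are plain string key-value pairs. As a hypothetical
+illustration (the exact QueryConfig/QueryCtx construction varies across Velox
+versions), the sketch below builds the kind of map that query configuration
+consumes, using property names documented on this page.
+
+.. code-block:: c++
+
+  #include <string>
+  #include <unordered_map>
+
+  // Hypothetical helper: keys are configuration property names from this page.
+  std::unordered_map<std::string, std::string> makeQueryConfigValues() {
+    return {
+        {"spill_enabled", "true"},
+        {"aggregation_spill_enabled", "true"},
+        {"max_spill_bytes", "107374182400"}, // 100 GB, the default above.
+        {"query_trace_enabled", "false"},
+    };
+  }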
diff --git a/velox/docs/develop.rst b/velox/docs/develop.rst
index 1d0fc590f00d3..eedaed1eb8894 100644
--- a/velox/docs/develop.rst
+++ b/velox/docs/develop.rst
@@ -11,10 +11,12 @@ This guide is intended for Velox contributors and developers of Velox-based appl
    develop/vectors
    develop/scalar-functions
    develop/aggregate-functions
+   develop/view-and-writer-types
    develop/lambda-functions
    develop/expression-evaluation
    develop/dictionary-encoding
    develop/arena
+   develop/hash-table
    develop/aggregations
    develop/connectors
    develop/joins
@@ -22,8 +24,11 @@ This guide is intended for Velox contributors and developers of Velox-based appl
    develop/operators
    develop/task
    develop/simd
+   develop/memory
    develop/spilling
    develop/serde
+   develop/timestamp
    develop/testing
    develop/debugging
    develop/TpchBenchmark
+   develop/window
diff --git a/velox/docs/develop/TpchBenchmark.rst b/velox/docs/develop/TpchBenchmark.rst
index 778d27248bff1..23ba28b5d02a5 100644
--- a/velox/docs/develop/TpchBenchmark.rst
+++ b/velox/docs/develop/TpchBenchmark.rst
@@ -16,7 +16,7 @@ following command line to do the build with S3 support:

.. code:: shell

-  $ make release EXTRA_CMAKE_FLAGS="-DVELOX_BUILD_BENCHMARKS=ON -DVELOX_ENABLE_S3=ON"
+  $ make benchmarks-build EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_S3=ON"

----

diff --git a/velox/docs/develop/aggregate-functions.rst b/velox/docs/develop/aggregate-functions.rst
index ed5007b336202..8c60890dfcf47 100644
--- a/velox/docs/develop/aggregate-functions.rst
+++ b/velox/docs/develop/aggregate-functions.rst
@@ -109,24 +109,337 @@ accordingly. Data is stored in the following order:

.. image:: images/aggregation-layout.png
  :width: 600

-Aggregate class
----------------
-
-To add an aggregate function,
+To add an aggregate function, there are two options: implementing it as a
+simple function or as a vector function. The simple-function interface allows
+the author to write methods that process input data one row at a time and not
+handle input vector encodings themselves. However, the simple-function
+interface currently has certain limitations, such as not allowing for advanced
+performance optimization on constant inputs. Aggregation functions that
+require such functionalities can be implemented through the vector-function
+interface. With the vector-function interface, the author writes methods that
+process one vector at a time and handle input vector encodings themselves.

* Prepare:
   * Figure out what are the input, intermediate and final types.
  * Figure out what are partial and final calculations.
  * Design the accumulator. Make sure the same accumulator can accept both raw
    inputs and intermediate results.
-   * Create a new class that extends velox::exec::Aggregate base class
+   * If implementing a simple function, create a class for the function according
+     to the instructions below; if implementing a vector function,
+     create a new class that extends velox::exec::Aggregate base class
    (see velox/exec/Aggregate.h) and implement virtual methods.
* Register the new function using exec::registerAggregateFunction(...).
* Add tests.
* Write documentation.

+Simple Function Interface
+-------------------------
+
+This section describes the main concepts and the simple interface of
+aggregation functions. Examples of aggregation functions implemented through
+the simple-function interface can be found at velox/exec/tests/SimpleAverageAggregate.cpp
+and velox/exec/tests/SimpleArrayAggAggregate.cpp.
+
+A simple aggregation function is implemented as a class as follows.
+
+.. 
code-block:: c++
+
+  // array_agg(T) -> array(T) -> array(T)
+  class ArrayAggAggregate {
+   public:
+    // Type(s) of input vector(s) wrapped in Row.
+    using InputType = Row<Generic<T1>>;
+    using IntermediateType = Array<Generic<T1>>;
+    using OutputType = Array<Generic<T1>>;
+
+    // Optional. Default is true.
+    static constexpr bool default_null_behavior_ = false;
+
+    // Optional.
+    static bool toIntermediate(
+        exec::out_type<Array<Generic<T1>>>& out,
+        exec::optional_arg_type<Generic<T1>> in);
+
+    struct AccumulatorType { ... };
+  };
+
+The author declares the function's input type, intermediate type, and output
+type in the simple aggregation function class. The input type must be the
+function's argument type(s) wrapped in a Row<> even if the function only takes
+one argument. This is needed for the SimpleAggregateAdapter to parse input
+types for arbitrary aggregation functions properly.
+
+The author can define an optional flag `default_null_behavior_` indicating
+whether the aggregation function has default-null behavior. This flag is true
+by default. Next, the class can have an optional method `toIntermediate()`
+that converts the aggregation function's raw input directly to its intermediate
+states. Finally, the author must define a struct named `AccumulatorType` in
+the aggregation function class. We explain each part in more detail below.
+
+Default-Null Behavior
+^^^^^^^^^^^^^^^^^^^^^
+
+When adding raw inputs or intermediate states to accumulators, aggregation
+functions of default-null behavior ignore the input values that are nulls. For
+raw inputs that consist of multiple columns, an entire row is ignored if at
+least one column is null at this row. Below is an example.
+
+.. code-block:: sql
+
+  SELECT sum(c0) FROM (values (cast(NULL as bigint), 10), (NULL, 20), (NULL, 30)) AS t(c0, c1); -- NULL
+
+When generating intermediate or final output results from accumulators,
+aggregation functions of default-null behavior produce nulls for groups of no
+input row or only null rows. Another example is given below.
+
+.. code-block:: sql
+
+  SELECT sum(c0) FROM (values (1, 10), (2, 20), (3, 30)) AS t(c0, c1) WHERE c1 > 40; -- NULL
+
+Most aggregation functions have default-null behavior. An example is in
+SimpleAverageAggregate.cpp. On the other hand, SimpleArrayAggAggregate.cpp has
+an example of non-default-null behavior.
+
+This flag affects the C++ function signatures of `toIntermediate()` and methods
+in the `AccumulatorType` struct.
+
+toIntermediate
+^^^^^^^^^^^^^^
+
+The author can optionally define a static method `toIntermediate()` that
+converts a raw input to an intermediate state. If defined, this function is
+used in query plans that abandon the partial aggregation step. If the aggregation function has
+default-null behavior, the toIntermediate() function has an out-parameter
+of the type `exec::out_type<IntermediateType>&` followed by in-parameters of
+the type `exec::arg_type<T>` for each `T` wrapped inside InputType. If the
+aggregation function has non-default null behavior, the in-parameters of
+toIntermediate() are of the type `exec::optional_arg_type<T>` instead.
+
+When `T` is a primitive type except Varchar and Varbinary, `exec::arg_type<T>`
+is simply `T` itself and `exec::out_type<T>` is `T&`. `exec::optional_arg_type<T>`
+is `std::optional<T>`.
+
+When `T` is Varchar, Varbinary, or a complex type, `exec::arg_type<T>`,
+`exec::optional_arg_type<T>`, and `exec::out_type<T>` are the corresponding
+view and writer types of `T`. A detailed explanation can be found in :doc:`view-and-writer-types`.
+
+.. 
list-table::
+   :widths: 25 25
+   :header-rows: 1
+
+   * - Default-Null Behavior
+     - Non-Default-Null Behavior
+   * - static bool SimpleAverageAggregate::toIntermediate(
+       exec::out_type<Row<double, int64_t>>& out,
+       exec::arg_type<double> in);
+     - static bool SimpleArrayAggAggregate::toIntermediate(
+       exec::out_type<Array<Generic<T1>>>& out,
+       exec::optional_arg_type<Generic<T1>> in);
+
+AccumulatorType of Default-Null Behavior
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For aggregation functions of default-null behavior, the author defines an
+`AccumulatorType` struct as follows.
+
+.. code-block:: c++
+
+  struct AccumulatorType {
+    // Author defines data members
+    ...
+
+    // Optional. Default is true.
+    static constexpr bool is_fixed_size_ = false;
+
+    // Optional. Default is false.
+    static constexpr bool use_external_memory_ = true;
+
+    // Optional. Default is false.
+    static constexpr bool is_aligned_ = true;
+
+    explicit AccumulatorType(HashStringAllocator* allocator);
+
+    void addInput(HashStringAllocator* allocator, exec::arg_type<T1> value1, ...);
+
+    void combine(
+        HashStringAllocator* allocator,
+        exec::arg_type<IntermediateType> other);
+
+    bool writeIntermediateResult(exec::out_type<IntermediateType>& out);
+
+    bool writeFinalResult(exec::out_type<OutputType>& out);
+
+    // Optional. Called during destruction.
+    void destroy(HashStringAllocator* allocator);
+  };
+
+The author defines an optional flag `is_fixed_size_` indicating whether
+every accumulator takes a fixed amount of memory. This flag is true by default.
+Next, the author defines another optional flag `use_external_memory_`
+indicating whether the accumulator uses memory that is not tracked by Velox.
+This flag is false by default. Then, the author can define an optional flag
+`is_aligned_` indicating whether the accumulator requires aligned
+access. This flag is false by default.
+
+The author defines a constructor that takes a single argument of
+`HashStringAllocator*`. This constructor is called before aggregation starts to
+initialize all accumulators.
+
+The author can also optionally define a `destroy` function that is called when
+*this* accumulator object is destructed.
+
+Notice that `writeIntermediateResult` and `writeFinalResult` are expected to not
+modify contents in the accumulator.
+
+addInput
+""""""""
+
+This method adds raw input values to *this* accumulator. It receives a
+`HashStringAllocator*` followed by `exec::arg_type<Ti>`-typed values, one for
+each argument type `Ti` wrapped in InputType.
+
+With default-null behavior, raw-input rows where at least one column is null are
+ignored before `addInput` is called. After `addInput` is called, *this*
+accumulator is assumed to be non-null.
+
+combine
+"""""""
+
+This method adds an input intermediate state to *this* accumulator. It receives
+a `HashStringAllocator*` and one `exec::arg_type<IntermediateType>` value. With
+default-null behavior, nulls among the input intermediate states are ignored
+before `combine` is called. After `combine` is called, *this* accumulator is
+assumed to be non-null.
+
+writeIntermediateResult
+"""""""""""""""""""""""
+
+This method writes *this* accumulator out to an intermediate state vector. It
+has an out-parameter of the type `exec::out_type<IntermediateType>&`. This
+method returns true if it writes a non-null value to `out`, or returns false
+meaning a null should be written to the intermediate state vector. Accumulators
+that are nulls (i.e., no value has been added to them) automatically become
+nulls in the intermediate state vector without `writeIntermediateResult` being
+called.
+
+writeFinalResult
+""""""""""""""""
+
+This method writes *this* accumulator out to a final result vector. It
+has an out-parameter of the type `exec::out_type<OutputType>&`. This
+method returns true if it writes a non-null value to `out`, or returns false
+meaning a null should be written to the final result vector. Accumulators
+that are nulls (i.e., no value has been added to them) automatically become
+nulls in the final result vector without `writeFinalResult` being called.
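+
+To make the default-null contract concrete, below is a minimal hypothetical
+sketch of an accumulator for a sum(bigint) aggregate (not an actual Velox
+source file). Recall that for a primitive type like bigint, `exec::arg_type<int64_t>`
+is `int64_t` and `exec::out_type<int64_t>` is `int64_t&`.
+
+.. code-block:: c++
+
+  // Hypothetical sum(bigint) accumulator with default-null behavior.
+  // InputType = Row<int64_t>, IntermediateType = int64_t, OutputType = int64_t.
+  struct AccumulatorType {
+    int64_t sum{0};
+
+    // A single int64_t, so the accumulator is fixed-size.
+    static constexpr bool is_fixed_size_ = true;
+
+    explicit AccumulatorType(HashStringAllocator* /*allocator*/) {}
+
+    // Null inputs were already filtered out when this is called.
+    void addInput(HashStringAllocator* /*allocator*/, int64_t value) {
+      sum += value;
+    }
+
+    // Merges an intermediate sum into this accumulator.
+    void combine(HashStringAllocator* /*allocator*/, int64_t other) {
+      sum += other;
+    }
+
+    bool writeIntermediateResult(int64_t& out) {
+      out = sum;
+      return true; // The written value is non-null.
+    }
+
+    bool writeFinalResult(int64_t& out) {
+      out = sum;
+      return true;
+    }
+  };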
+
+AccumulatorType of Non-Default-Null Behavior
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For aggregation functions of non-default-null behavior, the author defines an
+`AccumulatorType` struct as follows.
+
+.. code-block:: c++
+
+  struct AccumulatorType {
+    // Author defines data members
+    ...
+
+    // Optional. Default is true.
+    static constexpr bool is_fixed_size_ = false;
+
+    // Optional. Default is false.
+    static constexpr bool use_external_memory_ = true;
+
+    // Optional. Default is false.
+    static constexpr bool is_aligned_ = true;
+
+    explicit AccumulatorType(HashStringAllocator* allocator);
+
+    bool addInput(HashStringAllocator* allocator, exec::optional_arg_type<T1> value1, ...);
+
+    bool combine(
+        HashStringAllocator* allocator,
+        exec::optional_arg_type<IntermediateType> other);
+
+    bool writeIntermediateResult(bool nonNullGroup, exec::out_type<IntermediateType>& out);
+
+    bool writeFinalResult(bool nonNullGroup, exec::out_type<OutputType>& out);
+
+    // Optional.
+    void destroy(HashStringAllocator* allocator);
+  };
+
+The definition of `is_fixed_size_`, `use_external_memory_`,
+`is_aligned_`, the constructor, and the `destroy` method are exactly
+the same as those for default-null behavior.
+
+On the other hand, the C++ function signatures of `addInput`, `combine`,
+`writeIntermediateResult`, and `writeFinalResult` are different.
+
+Same as the case for default-null behavior, `writeIntermediateResult` and
+`writeFinalResult` are expected to not modify contents in the accumulator.
+
+addInput
+""""""""
+
+This method receives a `HashStringAllocator*` followed by
+`exec::optional_arg_type<Ti>` values, one for each argument type `Ti` wrapped
+in InputType.
+
+This method is called on all raw-input rows even if some columns may be null.
+It returns a boolean indicating whether *this* accumulator is non-null after the
+call. All accumulators are initialized to *null* before aggregation starts. An
+accumulator that is originally null can be turned to non-null. But an
+accumulator that's already non-null remains non-null regardless of the return
+value of `addInput`.
+
+combine
+"""""""
+
+This method receives a `HashStringAllocator*` and an
+`exec::optional_arg_type<IntermediateType>` value. This method is called on
+all intermediate states even if some are nulls. Same as `addInput`, this method
+returns a boolean indicating whether *this* accumulator is non-null after the call.
+
+writeIntermediateResult
+"""""""""""""""""""""""
+
+This method has an out-parameter of the type `exec::out_type<IntermediateType>&`
+and a boolean flag `nonNullGroup` indicating whether *this* accumulator is
+non-null. This method returns true if it writes a non-null value to `out`, or
+returns false, meaning a null should be written to the intermediate state vector.
+
+writeFinalResult
+""""""""""""""""
+
+This method writes *this* accumulator out to a final result vector. It has an
+out-parameter of the type `exec::out_type<OutputType>&` and a boolean flag
+`nonNullGroup` indicating whether *this* accumulator is non-null. This method
+returns true if it writes a non-null value to `out`, or returns false, meaning a
+null should be written to the final result vector.
+
+Limitations
+^^^^^^^^^^^
+
+The simple aggregation function interface currently has three limitations.
+
+1. All values read or written by the aggregation function must be part of the
+   accumulators. This means that there cannot be function-level states kept
+   outside of accumulators.
+
+2. Optimizations on constant inputs are not supported. I.e., constant input
+   arguments are processed once per row in the same way as non-constant inputs.
+
+3. Aggregation pushdown to table scan is not supported yet. We're planning to
+   add this support.
+
+Vector Function Interface
+-------------------------
+
+Aggregation functions that cannot use the simple-function interface can be written as vector functions.
+
 Accumulator size
-----------------
+^^^^^^^^^^^^^^^^

The implementation of the velox::exec::Aggregate interface can start with
*accumulatorFixedWidthSize()* method.
@@ -158,7 +471,21 @@ location of the accumulator.

    // @param offset Offset in bytes from the start of the row of the accumulator
    // @param nullByte Offset in bytes from the start of the row of the null flag
    // @param nullMask The specific bit in the nullByte that stores the null flag
-    void setOffsets(int32_t offset, int32_t nullByte, uint8_t nullMask)
+    // @param initializedByte Offset in bytes from the start of the row of the
+    // initialized flag
+    // @param initializedMask The specific bit in the initializedByte that stores
+    // the initialized flag
+    // @param rowSizeOffset The offset of a uint32_t row size from the start of
+    // the row. Only applies to accumulators that store variable size data out of
+    // line. Fixed length accumulators do not use this. 0 if the row does not have
+    // a size field.
+    void setOffsets(
+        int32_t offset,
+        int32_t nullByte,
+        uint8_t nullMask,
+        int32_t initializedByte,
+        int8_t initializedMask,
+        int32_t rowSizeOffset)

The base class implements the setOffsets method by storing the offsets in member variables.

@@ -167,9 +494,20 @@ The base class implements the setOffsets method by storing the offsets in member
    // Byte position of null flag in group row.
    int32_t nullByte_;
    uint8_t nullMask_;
+    // Byte position of the initialized flag in group row.
+    int32_t initializedByte_;
+    uint8_t initializedMask_;
    // Offset of fixed length accumulator state in group row.
    int32_t offset_;

+    // Offset of uint32_t row byte size of row. 0 if there are no
+    // variable width fields or accumulators on the row. The size is
+    // capped at 4G and will stay at 4G and not wrap around if growing
+    // past this. This serves to track the batch size when extracting
+    // rows. A size in excess of 4G would finish the batch in any case,
+    // so larger values need not be represented.
+    int32_t rowSizeOffset_ = 0;
+
Typically, an aggregate function doesn’t use the offsets directly. Instead, it uses helper methods from the base class.

To access the accumulator:
@@ -194,25 +532,25 @@ To manipulate the null flags:

    inline bool clearNull(char* group);

Initialization
--------------
+^^^^^^^^^^^^^^

-Once you have accumulatorFixedWidthSize(), the next method to implement is initializeNewGroups().
+Once you have accumulatorFixedWidthSize(), the next method to implement is initializeNewGroupsInternal().

.. code-block:: c++

    // Initializes null flags and accumulators for newly encountered groups.
    // @param groups Pointers to the start of the new group rows.
    // @param indices Indices into 'groups' of the new entries.
-    virtual void initializeNewGroups(
+    virtual void initializeNewGroupsInternal(
        char** groups,
        folly::Range<const vector_size_t*> indices) = 0;

This method is called by the HashAggregation operator every time it encounters
new combinations of the grouping keys. This method should initialize the
accumulators for the new groups. For example, partial “count” and “sum”
aggregates would set the accumulators to zero. Many aggregate functions would
set null flags to true by calling the exec::Aggregate::setAllNulls(groups,
indices) helper method.

GroupBy aggregation
--------------------
+^^^^^^^^^^^^^^^^^^^

-At this point you have accumulatorFixedWidthSize() and initializeNewGroups() methods implemented. Now, we can proceed to implementing the end-to-end group-by aggregation. We need the following pieces:
+At this point you have accumulatorFixedWidthSize() and initializeNewGroupsInternal() methods implemented. Now, we can proceed to implementing the end-to-end group-by aggregation. We need the following pieces:

* Logic for adding raw input to the accumulator:
   * addRawInput() method.
@@ -306,6 +644,7 @@ After implementing the addRawInput() method, we proceed to adding logic for extr

.. code-block:: c++

    // Extracts partial results (used for partial and intermediate aggregations).
+    // This method is expected to not modify contents in accumulators.
    // @param groups Pointers to the start of the group rows.
    // @param numGroups Number of groups to extract results from.
    // @param result The result vector to store the results in.
@@ -326,7 +665,8 @@ Next, we implement the extractValues() method that extracts final results from t

.. code-block:: c++

-    // Extracts final results (used for final and single aggregations).
+    // Extracts final results (used for final and single aggregations). This method
+    // is expected to not modify contents in accumulators.
    // @param groups Pointers to the start of the group rows.
    // @param numGroups Number of groups to extract results from.
    // @param result The result vector to store the results in.
@@ -393,7 +733,7 @@ implement toIntermediate() method which simply returns the input unmodified.

GroupBy aggregation code path is done. We proceed to global aggregation.

Global aggregation
------------------
+^^^^^^^^^^^^^^^^^^

Global aggregation is similar to group-by aggregation, but there is only one
group and one accumulator. After implementing group-by aggregation, the only
@@ -485,11 +825,11 @@ input type and result type.

.. code-block:: c++

-  bool registerApproxPercentile(const std::string& name) {
-    std::vector<std::unique_ptr<exec::AggregateFunctionSignature>> signatures;
+  exec::AggregateRegistrationResult registerApproxPercentile(const std::string& name) {
+    std::vector<std::unique_ptr<exec::AggregateFunctionSignature>> signatures;
    ...
-    exec::registerAggregateFunction(
+    return exec::registerAggregateFunction(
        name,
        std::move(signatures),
        [name](
@@ -514,12 +854,38 @@ input type and result type.
            ...
          }
        });
-    return true;
  }

  static bool FB_ANONYMOUS_VARIABLE(g_AggregateFunction) =
      registerApproxPercentile(kApproxPercentile);

+If the aggregation function is implemented through the simple-function
+interface, use `SimpleAggregateAdapter` when creating the
+unique pointers. Below is an example.
+
+.. code-block:: c++
+
+  exec::AggregateRegistrationResult registerSimpleArrayAggAggregate(
+      const std::string& name) {
+    ...
+
+    return exec::registerAggregateFunction(
+        name,
+        std::move(signatures),
+        [name](
+            core::AggregationNode::Step /*step*/,
+            const std::vector<TypePtr>& argTypes,
+            const TypePtr& resultType,
+            const core::QueryConfig& /*config*/)
+            -> std::unique_ptr<exec::Aggregate> {
+          VELOX_CHECK_EQ(
+              argTypes.size(), 1, "{} takes at most one argument", name);
+          return std::make_unique<SimpleAggregateAdapter<SimpleArrayAggAggregate>>(
+              resultType);
+        });
+  }
+
+
Use FunctionSignatureBuilder to create FunctionSignature instances which
describe supported signatures. Each signature includes zero or more input
types, an intermediate result type, and a final result type.
@@ -610,8 +976,7 @@ The following query plans are being tested.
  final aggregation with forced spilling. Query runs using 4 threads.

Query run with forced spilling is enabled only for group-by aggregations and
-only if `allowInputShuffle_` flag is enabled by calling allowInputShuffle
-() method from the SetUp(). Spill testing requires multiple batches of input.
+only if aggregate functions are not order-sensitive. Spill testing requires multiple batches of input.
To split input data into multiple batches we add local exchange with
round-robin repartitioning before the partial aggregation. This changes the order
in which aggregation inputs are processed, hence, query results with spilling
diff --git a/velox/docs/develop/arena.rst b/velox/docs/develop/arena.rst
index f6abb785903f3..5e601509012bd 100644
--- a/velox/docs/develop/arena.rst
+++ b/velox/docs/develop/arena.rst
@@ -2,7 +2,7 @@ Arena Allocation
================

-exec::HashStringAllocator implements an arena backed by MappedMemory::Allocation
+exec::HashStringAllocator implements an arena backed by memory::Allocation
and supports contiguous and non-contiguous allocations. It is used to store
variable width accumulators for the aggregate functions, hash tables for joins
and aggregations. It is also used to back byte streams used to serialize and
@@ -75,19 +75,19 @@ non-contiguous allocation, it frees all the blocks in that allocation.

    // Allocates 'size' contiguous bytes preceded by a Header. Returns
    // the address of Header.
-    Header* FOLLY_NONNULL allocate(int32_t size);
+    Header* allocate(int32_t size);

    // Adds the allocation of 'header' and any extensions (if header has
    // kContinued set) to the free list.
-    void free(Header* FOLLY_NONNULL header);
+    void free(Header* header);

StlAllocator, an allocator backed by HashStringAllocator that can be used
with STL containers, is implemented using the above allocate() and free()
methods.

NewWrite(), extendWrite() and finishWrite() methods allow for serializing
-variable width data whose size is not known in advance using ByteStream. When
-using ByteStream, the underlying data may come from multiple non-contiguous
-blocks. ByteStream transparently manages allocation of additional blocks by
+variable width data whose size is not known in advance using ByteOutputStream. When
+using ByteOutputStream, the underlying data may come from multiple non-contiguous
+blocks. ByteOutputStream transparently manages allocation of additional blocks by
calling HashStringAllocator::newRange() method.

.. code-block:: c++

@@ -97,27 +97,27 @@ calling HashStringAllocator::newRange() method.
    // kMinContiguous bytes of contiguous space. finishWrite finalizes
    // the allocation information after the write is done.
    // Returns the position at the start of the allocated block.
-    Position newWrite(ByteStream& stream, int32_t preferredSize = kMinContiguous);
+    Position newWrite(ByteOutputStream& stream, int32_t preferredSize = kMinContiguous);

    // Completes a write prepared with newWrite or
    // extendWrite. Up to 'numReserveBytes' unused bytes, if available, are left
    // after the end of the write to accommodate another write. Returns the
    // position immediately after the last written byte.
-    Position finishWrite(ByteStream& stream, int32_t numReserveBytes);
+    Position finishWrite(ByteOutputStream& stream, int32_t numReserveBytes);

    // Sets 'stream' to write starting at 'position'. If new ranges have to
    // be allocated when writing, headers will be updated accordingly.
-    void extendWrite(Position position, ByteStream& stream);
+    void extendWrite(Position position, ByteOutputStream& stream);

-The prepareRead() method allows deserializing the data using ByteStream.
+The prepareRead() method allows deserializing the data using ByteInputStream.

.. code-block:: c++

    // Sets 'stream' to range over the data in the range of 'header' and
    // possible continuation ranges.
    static void prepareRead(
-        const Header* FOLLY_NONNULL header,
-        ByteStream& stream);
+        const Header* header,
+        ByteInputStream& stream);

Examples of Usage
-----------------
@@ -139,22 +139,22 @@ The accumulator calls finishWrite() after writing the value.

.. code-block:: c++

    // Write first value
-    ByteStream stream(allocator);
+    ByteOutputStream stream(allocator);
    auto begin = allocator->newWrite(stream);
    // ... write to the stream
    allocator->finishWrite(stream);

    // Update the value
-    ByteStream stream(allocator);
+    ByteOutputStream stream(allocator);
    allocator->extendWrite(begin, stream);
    // ... write to the stream
    allocator->finishWrite(stream);

-The accumulator uses prepareRead() to read the data back using ByteStream.
+The accumulator uses prepareRead() to read the data back using ByteInputStream.

.. code-block:: c++

-    ByteStream stream;
+    ByteInputStream stream;
    exec::HashStringAllocator::prepareRead(begin, stream);
    // … read from the stream
@@ -174,13 +174,13 @@ write.

.. code-block:: c++

    // Write first value
-    ByteStream stream(allocator);
+    ByteOutputStream stream(allocator);
    auto begin = allocator->newWrite(stream);
    // ... write to the stream
    auto current = allocator->finishWrite(stream);

    // Update the value
-    ByteStream stream(allocator);
+    ByteOutputStream stream(allocator);
    allocator->extendWrite(current, stream);
    // ... write to the stream
    allocator->finishWrite(stream);
diff --git a/velox/docs/develop/connectors.rst b/velox/docs/develop/connectors.rst
index fd1bacde3bc38..2d7ac8a011f33 100644
--- a/velox/docs/develop/connectors.rst
+++ b/velox/docs/develop/connectors.rst
@@ -82,6 +82,9 @@ Storage Adapters
Hive Connector allows reading and writing files from a variety of distributed
storage systems. The supported storage APIs are S3, HDFS, GCS, and Linux FS.

+If a file is not found when reading, the `openFileForRead` API throws `VeloxRuntimeError` with `error_code::kFileNotFound`.
+This behavior is necessary to support the `ignore_missing_files` configuration property.
+
S3 is supported using the `AWS SDK for C++ `_ library. S3 supported schemes
are `s3://` (Amazon S3, Minio), `s3a://` (Hadoop 3.x), `s3n://` (Deprecated in
Hadoop 3.x), `oss://` (Alibaba cloud storage), and `cos://`, `cosn://` (Tencent cloud storage).
@@ -92,4 +95,29 @@ are `hdfs://`.

GCS is supported using the `Google Cloud Platform C++ Client Libraries `_.
GCS supported schemes
-are `gs://`.
\ No newline at end of file
+are `gs://`.
+
+ABS (Azure Blob Storage) is supported using the
+`Azure SDK for C++ `_ library. ABS supported schemes are `abfs(s)://`.
+
+S3 Storage adapter using a proxy
+********************************
+
+By default, the C++ AWS S3 client does not honor the configuration of the
+environment variables http_proxy, https_proxy, and no_proxy.
+The Java AWS S3 client supports this.
+The environment variables can be specified as lower case, upper case or both.
+In order to enable the use of a proxy, the Hive connector configuration variable
+`hive.s3.use-proxy-from-env` must be set to `true`. By default, the value
+is `false`.
+
+This is the behavior when the proxy settings are enabled:
+
+1. http_proxy/HTTP_PROXY, https_proxy/HTTPS_PROXY and no_proxy/NO_PROXY
+   environment variables are read. If lower case and upper case variables are set,
+   lower case variables take precedence.
+2. The no_proxy/NO_PROXY content is scanned for exact and suffix matches.
+3. IP addresses, domains, subdomains, or IP ranges (CIDR) can be specified in no_proxy/NO_PROXY.
+4. The no_proxy/NO_PROXY list is comma separated.
+5. Use . or \*. to indicate domain suffix matching, e.g. `.foobar.com` will
+   match `test.foobar.com` or `foo.foobar.com`.
diff --git a/velox/docs/develop/debugging/print-plan-with-stats.rst b/velox/docs/develop/debugging/print-plan-with-stats.rst
index 5cbe05f621bbb..52bd5e6bb7f99 100644
--- a/velox/docs/develop/debugging/print-plan-with-stats.rst
+++ b/velox/docs/develop/debugging/print-plan-with-stats.rst
@@ -133,6 +133,7 @@ Here is the output for the join query from above.
              skippedStrides       sum: 0, count: 1, min: 0, max: 0
              storageReadBytes     sum: 150.25KB, count: 1, min: 150.25KB, max: 150.25KB
              totalScanTime        sum: 0ns, count: 1, min: 0ns, max: 0ns
+             totalRemainingFilterTime sum: 0ns, count: 1, min: 0ns, max: 0ns
              queryThreadIoLatency sum: 0, count: 1, min: 0, max: 0
    -> Project[expressions: (u_c0:INTEGER, ROW["c0"]), (u_c1:BIGINT, ROW["c1"])]
       Output: 100 rows (1.31KB), Cpu time: 21.50us, Blocked wall time: 0ns, Peak memory: 0B, Threads: 1
@@ -174,6 +175,7 @@ And this is the output for the aggregation query from above.
            skippedStrides       sum: 0, count: 1, min: 0, max: 0
            storageReadBytes     sum: 61.53KB, count: 1, min: 61.53KB, max: 61.53KB
            totalScanTime        sum: 0ns, count: 1, min: 0ns, max: 0ns
+            totalRemainingFilterTime sum: 0ns, count: 1, min: 0ns, max: 0ns
            queryThreadIoLatency sum: 0, count: 1, min: 0, max: 0

Common operator statistics
@@ -247,6 +249,7 @@ groups were skipped via stats-based pruning.
            skippedStrides       sum: 0, count: 1, min: 0, max: 0
            storageReadBytes     sum: 150.25KB, count: 1, min: 150.25KB, max: 150.25KB
            totalScanTime        sum: 0ns, count: 1, min: 0ns, max: 0ns
+            totalRemainingFilterTime sum: 0ns, count: 1, min: 0ns, max: 0ns
            queryThreadIoLatency sum: 0, count: 1, min: 0, max: 0

HashBuild operator reports range and number of distinct values for the join keys.
diff --git a/velox/docs/develop/expression-evaluation.rst b/velox/docs/develop/expression-evaluation.rst
index 37204af58da6e..ec9ef94a76bc8 100644
--- a/velox/docs/develop/expression-evaluation.rst
+++ b/velox/docs/develop/expression-evaluation.rst
@@ -241,8 +241,8 @@ This is calculated by Expr::computeMetadata() virtual methods and stored in
member variables of the exec::Expr class.

* *distinctFields_* - List of distinct input columns.
-* *multiplyReferencedFields_* - Subset of distinctFields_ that are used as inputs by multiple subexpressions.
-* *sameAsParentDistinctFields_* - True if distinctFields_ matches one of the parent's distinctFields_ (parents to refer expressions that have this expression as input).
+* *multiplyReferencedFields_* - Subset of `distinctFields_` that are used as inputs by multiple subexpressions.
+* *sameAsParentDistinctFields_* - True if `distinctFields_` matches one of the parent's `distinctFields_` (parents refer to expressions that have this expression as input).
* *propagatesNulls_* - Boolean indicating whether a null in any of the input columns causes this expression to always return null for the row.
* *deterministic_* - Boolean indicating whether this expression and all its children are deterministic.
* *hasConditionals_* - Boolean indicating whether this expression or any of its children is an IF, SWITCH, AND or OR expression.
@@ -313,7 +313,7 @@ vector has 1’000 entries. These are represented with an indices buffer of
1000 values in the range of [0, 2] and an inner flat vector of size 3: [red,
green, blue]. When evaluating **upper(color)** expression, Expr::peelEncodings
() method is used to peel off a dictionary and produce a new set of inputs:
-inner flat vector or size 3 and a set of indices into that vector: [0, 1, 2].
+inner flat vector of size 3 and a set of indices into that vector: [0, 1, 2].
Then, “upper” function is applied to 3 values - [red, green, blue] - to
produce another flat vector of size 3: [RED, GREEN, BLUE]. Finally, the
result is wrapped in a dictionary vector using the original indices to
produce a
diff --git a/velox/docs/develop/hash-table.rst b/velox/docs/develop/hash-table.rst
new file mode 100644
index 0000000000000..1d82a1427ca39
--- /dev/null
+++ b/velox/docs/develop/hash-table.rst
@@ -0,0 +1,148 @@
+.. role:: raw-html(raw)
+   :format: html
+
+.. role:: m(math)
+
+==========
+Hash table
+==========
+
+The hash table used in Velox is similar to the
+`F14 hash table `_.
+The main difference is that the Velox hash table allows vectorized inserts and lookups, while F14 doesn't.
+
+
+Layout
+------
+
+The hash table is implemented as an array of buckets. It is a linear data structure.
+Each bucket uses 128 bytes (2 * 64 = 2 cache lines) and contains 16 slots.
+Each hash table entry occupies one slot. The hash table’s capacity is the total number of slots: total
+number of buckets * 16. The hash table’s capacity is always a power of 2.
+
+Each slot consists of 2 pieces: a tag (7 bits) and a pointer (6 bytes).
+There are a total of 16 tags and 16 pointers in a bucket. These are stored tags first, followed by
+pointers. Each tag occupies 1 byte (only 7 bits are used). 16 tags occupy 16 bytes. Each pointer
+occupies 6 bytes. 16 pointers occupy 96 bytes. There are 16 bytes left unused at the end of the bucket.
+These are referred to as padding.
+
+.. image:: images/ht-layout.png
+    :align: center
+
+A hash table is never full. There are always some empty slots. Velox allows the hash table to fill up to
+7/8 of capacity before resizing.
+On resize the hash table’s capacity doubles.
+
+Individual buckets may be completely empty, partially filled or full. Buckets are filled left to right.
+If a bucket is partially full, then the first N tags and N pointers are filled and the rest are free (N < 16).
+
+
+Inserting an entry
+------------------
+
+To insert a new entry we need to figure out which slot to put it in.
+A slot is identified by bucket and offset within the bucket. First, we compute a hash of the entry.
+Then, we compute a tag and a bucket number from the hash.
+
+We use 7 bits of the hash for the tag: bits 38-44 inclusive. We use N bits of the hash for the bucket
+starting from bit 8.
+
+The number of bits used for the bucket is decided based on the hash table capacity.
+Remember that capacity is always a power of 2: :m:`2^n`. Each bucket stores 16 entries, hence, we
+need :m:`2^{(n-4)}` buckets to store :m:`2^n` entries. Hence, we need to use n-4 bits of the hash for the bucket.
+
+Let's say we have a hash table that can store a million entries: :m:`2^{20}` = 1,048,576. Here, n = 20 and
+N = n - 4 = 16. We will use 16 bits for the bucket. A code sketch of this computation appears at
+the end of this section.
+
+Given hash number:
+
+01011011 11001010 01111100 01101001 10110111 10010100 11111000 11001110
+
+we compute the tag as 11110001 (hash bits 38-44, which are 1110001, with the
+top bit of the tag byte set) and the bucket offset as 1,374,336
+(00000000 00000000 00000000 00000000 00000000 00010100 11111000 10000000).
+Bucket offset is the number of bytes from the start of the hash table.
+
+.. code-block:: c++
+
+    bucket offset = bucket number * 128
+    bucket number = bucket offset / 128
+
+The bucket offset is used to get to the bucket, in this case bucket number 10737.
+The candidate bucket can be empty, partially filled or full.
+
+**The bucket is empty.**
+
+.. image:: images/ht-empty.png
+    :align: center
+
+In this case, we simply insert our entry into the first slot of the bucket.
+We store only the tag (7 bits of the hash value) in the hash table. The hash number itself is not stored.
+The pointer refers to memory outside of the hash table where the complete value is stored.
+This is usually a row in a RowContainer. The hash table can be viewed as an index on top of a RowContainer
+that helps locate records with matching keys faster. The hash table itself doesn’t store data or keys.
+
+.. image:: images/ht-insert1.png
+    :align: center
+
+**The bucket is partially filled.**
+
+For example, the bucket has one slot occupied (like above).
+In this case there is a possibility that the new entry is a duplicate of the one already stored.
+Hence, we compare the tag of the new entry with tags stored in the bucket.
+If none match, this entry is not a duplicate, so we store it in the next available slot in the bucket.
+
+.. image:: images/ht-insert2.png
+    :align: center
+
+However, if one or more existing tags match the tag of the new entry, we follow the pointers to compare
+the keys to determine whether there is a match or not. If there is no match we insert a new entry.
+Otherwise, there is a duplicate. The row is chained to the row list pointed to by the matching entry
+and no new entry is inserted.
+
+**The bucket is full.**
+
+.. image:: images/ht-full.png
+    :align: center
+
+First, we need to check if the new entry is a duplicate of one of the 16 entries stored in the bucket.
+We compare tags and if necessary follow the pointers to compare the keys.
+If there is a match, the row is chained to the row list pointed to by the matching entry
+and no new entry is inserted. If there is no match, we go to the
+next bucket and repeat the process. In rare cases, we may end up checking many buckets until we find a
+duplicate existing entry or an empty slot for the new entry. That’s why it is important to ensure that
+the hash table is never full and there are enough gaps in the form of empty slots.
+
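+To make the computation above concrete, here is a small self-contained sketch.
+It follows the bit positions described in this section; the authoritative
+version lives in Velox's ``HashTable`` implementation, so treat the exact
+constants as illustrative.
+
+.. code-block:: c++
+
+    #include <cstdint>
+
+    struct TagAndBucket {
+      uint8_t tag;            // 7 hash bits plus a set top bit.
+      uint64_t bucketOffset;  // Bytes from the start of the table.
+    };
+
+    // Capacity is 2^n slots, i.e. 2^(n-4) buckets of 16 slots each.
+    TagAndBucket tagAndBucket(uint64_t hash, int32_t n) {
+      // Tag: hash bits 38-44, with the top bit of the byte always set.
+      uint8_t tag = 0x80 | static_cast<uint8_t>((hash >> 38) & 0x7F);
+      // Bucket: n - 4 hash bits starting from bit 8.
+      uint64_t bucketNumber = (hash >> 8) & ((1ULL << (n - 4)) - 1);
+      return {tag, bucketNumber * 128};  // Each bucket is 128 bytes.
+    }
+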
+Resizing
+--------
+
+If the hash table fills up beyond 7/8
+of capacity, it needs to be resized. Each resize doubles the capacity.
+A new hash table is allocated and all existing entries are inserted using the “Inserting an entry” process.
+Since we know that all entries are unique, the “Inserting an entry” process can be simplified to
+eliminate the logic for checking whether a new entry is a duplicate of an existing one. Hence, to
+insert an entry, we compute a hash, extract the tag and bucket number, go to the bucket and insert the
+entry if there is space. If the bucket is full, we proceed to the next bucket and continue until we
+find a bucket with an empty slot. We insert the new entry there.
+
+Use Cases
+---------
+
+The main use cases for the hash table are `Join `_ and
+`Aggregation `_ operators.
+
+The HashBuild operator builds the hash table to store unique values of the join keys found on the build
+side of the join. The HashProbe operator looks up entries in the hash table using join keys from the
+probe side. The HashProbe operator does not insert new entries into the hash table and never triggers
+a resize. The pointers in the hash table refer to rows in a RowContainer that store individual
+rows for the build side of the join.
+
+The HashAggregation operator stores unique grouping keys in the hash table. The pointers in the hash table
+refer to rows in a RowContainer that store grouping keys along with accumulators of the aggregate
+functions.
+
+Implementation
+--------------
+
+The hash table is implemented by the ``HashTable`` class in namespace ``facebook::velox::exec``.
diff --git a/velox/docs/develop/images/empty_frames.png b/velox/docs/develop/images/empty_frames.png
new file mode 100644
index 0000000000000..017c41155f476
Binary files /dev/null and b/velox/docs/develop/images/empty_frames.png differ
diff --git a/velox/docs/develop/images/ht-empty.png b/velox/docs/develop/images/ht-empty.png
new file mode 100644
index 0000000000000..f8c17b4288e1f
Binary files /dev/null and b/velox/docs/develop/images/ht-empty.png differ
diff --git a/velox/docs/develop/images/ht-full.png b/velox/docs/develop/images/ht-full.png
new file mode 100644
index 0000000000000..d4611b42efe69
Binary files /dev/null and b/velox/docs/develop/images/ht-full.png differ
diff --git a/velox/docs/develop/images/ht-insert1.png b/velox/docs/develop/images/ht-insert1.png
new file mode 100644
index 0000000000000..58867e918d800
Binary files /dev/null and b/velox/docs/develop/images/ht-insert1.png differ
diff --git a/velox/docs/develop/images/ht-insert2.png b/velox/docs/develop/images/ht-insert2.png
new file mode 100644
index 0000000000000..515f06be3614a
Binary files /dev/null and b/velox/docs/develop/images/ht-insert2.png differ
diff --git a/velox/docs/develop/images/ht-layout.png b/velox/docs/develop/images/ht-layout.png
new file mode 100644
index 0000000000000..ad47e9d727455
Binary files /dev/null and b/velox/docs/develop/images/ht-layout.png differ
diff --git a/velox/docs/develop/images/memory-arbitration.png b/velox/docs/develop/images/memory-arbitration.png
new file mode 100644
index 0000000000000..cb712ecf5c99d
Binary files /dev/null and b/velox/docs/develop/images/memory-arbitration.png differ
diff --git a/velox/docs/develop/images/memory-function.png b/velox/docs/develop/images/memory-function.png
new file mode 100644
index 0000000000000..daa5b1aaab57c
Binary files /dev/null and b/velox/docs/develop/images/memory-function.png differ
diff --git a/velox/docs/develop/images/memory-manager.png b/velox/docs/develop/images/memory-manager.png
new file mode 100644
index 0000000000000..9f95b7fabe68d
Binary files /dev/null and b/velox/docs/develop/images/memory-manager.png differ
diff --git a/velox/docs/develop/images/memory-pool.png 
b/velox/docs/develop/images/memory-pool.png new file mode 100644 index 0000000000000..c70d8270a22eb Binary files /dev/null and b/velox/docs/develop/images/memory-pool.png differ diff --git a/velox/docs/develop/images/memory-system.png b/velox/docs/develop/images/memory-system.png new file mode 100644 index 0000000000000..cff7d16d30ec8 Binary files /dev/null and b/velox/docs/develop/images/memory-system.png differ diff --git a/velox/docs/develop/images/size-class.png b/velox/docs/develop/images/size-class.png new file mode 100644 index 0000000000000..3513989891fa8 Binary files /dev/null and b/velox/docs/develop/images/size-class.png differ diff --git a/velox/docs/develop/joins.rst b/velox/docs/develop/joins.rst index 1bedfc6626373..e879f51a2264f 100644 --- a/velox/docs/develop/joins.rst +++ b/velox/docs/develop/joins.rst @@ -110,7 +110,8 @@ data by probing the hash table and continues execution as specified by downstream plan nodes. HashJoinNode is translated into two separate operators: HashProbe and HashBuild. HashProbe operator becomes part of the probe-side pipeline. HashBuild operator is installed as the last operator of the -build-side pipeline. The output of the HashBuild operator is a hash table which +build-side pipeline. The output of the HashBuild operator is a +`hash table `_ which HashProbe operator gets access to via a special mechanism: JoinBridge. .. image:: images/join-pipelines.png @@ -118,7 +119,7 @@ HashProbe operator gets access to via a special mechanism: JoinBridge. :align: center Both HashBuild and HashAggregation operators use the same data structure for the -hash table: velox::exec::HashTable. The payload, the non-join key columns +hash table: `velox::exec::HashTable `_. The payload, the non-join key columns referred to as dependent columns, are stored row-wise in the RowContainer. Using the hash table in join and aggregation allows for a future optimization @@ -183,7 +184,7 @@ join key values on the build side are unique it is possible to replace the join completely with the pushed down filter. Velox detects such opportunities and turns the join into a no-op after pushing the filter down. -Dynamic filter pushdown optimization is enabled for inner, left semi, and +Dynamic filter pushdown optimization is enabled for inner, left semi, and right semi joins. Broadcast Join @@ -197,7 +198,7 @@ the join is executed using broadcast or partitioned strategy has no effect on the join execution itself. The only difference is that broadcast execution allows for dynamic filter pushdown while partitioned execution does not. -PartitionedOutput operator and PartitionedOutputBufferManager support +PartitionedOutput operator and OutputBufferManager support broadcasting the results of the plan evaluation. This functionality is enabled by setting boolean flag "broadcast" in the PartitionedOutputNode to true. @@ -362,11 +363,11 @@ right side whose values need to match, and an optional filter to apply to join results. To execute a plan with a merge join, Velox creates two separate pipelines. One -pipeline processes the right side data and puts it into JoinMergeSource. The +pipeline processes the right side data and puts it into MergeJoinSource. The other pipeline processes the data on the left side, joins it with the right side data and continues execution as specified by downstream plan nodes. MergeJoinNode is translated into MergeJoin operator and a CallbackSink backed -by JoinMergeSource. MergeJoin operator becomes part of the left-side +by MergeJoinSource. 
MergeJoin operator becomes part of the left-side
 pipeline. CallbackSink is installed at the end of the right-side pipeline.
 
 .. image:: images/merge-join-pipelines.png
diff --git a/velox/docs/develop/lambda-functions.rst b/velox/docs/develop/lambda-functions.rst
index 19dd2e258395c..7f66c7c826603 100644
--- a/velox/docs/develop/lambda-functions.rst
+++ b/velox/docs/develop/lambda-functions.rst
@@ -200,32 +200,16 @@ the signature of the "filter" function:
 Testing
 -------
 
-Testing framework doesn't support Presto SQL lambda expressions, e.g. one cannot
-evaluate "filter(a, x - >x >= b)" expression directly. Instead, use the
-registerLambda helper method of the FunctionBaseTest class to register lambda
-expression and give it a name, then use that name to specify the lambda
-parameter. Here is an example that evaluates "filter(a, x ->x >= b)" expression
-in a test:
+The testing framework fully supports evaluating lambda expressions. Just write
+an expression as you would in Presto SQL:
 
 .. code-block:: c++
 
-   auto rowType = ROW({"a", "b"}, {ARRAY(BIGINT()), BIGINT()});
+   auto result = evaluate("filter(a, x -> (x >= b))", data);
 
-   registerLambda("lambda", ROW({"x"}, {BIGINT()}), rowType, "x >= b"));
+In the above, 'data' is expected to have a column "a" of type array and a
+column "b" whose type matches the array element type. For example, "a" can be
+an array(integer) and "b" an integer.
 
-   auto result =
-       evaluate("filter(a, function('lambda'))", data);
-
-The first argument to registerLambda is the name for the lambda. This name can
-later be used to refer to the lambda in a function call.
-
-The second argument is the signature of the lambda, e.g. the list of lambda
-parameters along with their names and types.
-
-The third argument is the type of the input data to the overall expression. This
-is used to resolve the types of captures.
-
-The last argument is the lambda body as SQL expression.
-
-To specify lambda expression as an argument of a lambda function use function
-(‘’) syntax.
+The only caveat is that you need to put the lambda body in parentheses:
+`x -> (x >= b)` works, but `x -> x >= b` doesn't.
diff --git a/velox/docs/develop/memory.rst b/velox/docs/develop/memory.rst
new file mode 100644
index 0000000000000..c5ed4a63fe279
--- /dev/null
+++ b/velox/docs/develop/memory.rst
@@ -0,0 +1,905 @@
+=================
+Memory Management
+=================
+
+Background
+----------
+
+The Velox memory system is built on top of the `mmap `_ system call to avoid
+the `memory fragmentation issue `_ that comes with std::malloc. It provides
+basic memory allocation functions for query execution as well as advanced
+memory management functions such as fair memory sharing, a transparent file
+cache and server out-of-memory (OOM) prevention.
+
+Velox provides large contiguous and non-contiguous buffer allocation
+functions to optimize the query memory allocation patterns. For example, a
+query can allocate a large contiguous buffer for a hash table
+(*HashTable::allocateTables*) by using mmap to allocate physical memory from
+the OS directly. For small buffer allocations, a query can allocate a large
+chunk of non-contiguous memory, and then use a `memory arena technique `_ like
+*StreamArena* or *HashStringAllocator* to provide the small allocations on top
+of it to reduce the number of expensive actual memory allocations.
+
+Velox provides fair memory sharing among running queries by adjusting their
+memory capacities at runtime in response to their memory usage changes. This
+process is called memory arbitration.
+Memory arbitration ensures that the total allocated memory capacity of all the
+queries is within the system configured query memory limit. It also prevents
+each individual query from exceeding the user-configured per-query memory
+limit. When a query tries to allocate more memory than its current capacity,
+memory arbitration either increases the query’s capacity by reclaiming used
+memory from the other queries with larger capacities, or reclaims memory from
+the query itself if it exceeds its per-query memory limit, to free up space
+within its current capacity. The memory reclaim is achieved through techniques
+like `disk spilling `_.
+
+Velox provides a transparent file cache to accelerate table scans through hot
+data reuse and prefetch. The file cache is integrated with the memory system to
+achieve dynamic memory sharing between the file cache and query memory. When a
+query fails to allocate memory, we retry the allocation after shrinking the
+file cache. Therefore, the file cache size is automatically adjusted in
+response to the query memory usage changes.
+
+Velox provides server out-of-memory (OOM) prevention by managing the physical
+memory allocation on its own through mmap. This allows us to enforce explicit
+control on the `Resident Set Size (RSS) `_ of Velox memory usage. The memory
+allocator in Velox handles all the memory allocations from both the file cache
+and query memory. It ensures the total allocated memory won’t exceed the
+system memory limit configured for Velox to use. To further handle spiky
+memory usage from the non-Velox components, Velox provides a generic server
+memory pushback mechanism that automatically shrinks the file cache and
+returns unused Velox memory back to the OS when the server is detected to be
+under low memory pressure.
+
+Overall Design
+--------------
+
+The Velox memory system consists of the following major memory components:
+the memory manager (*MemoryManager*), the memory allocator (*MemoryAllocator*),
+the file cache (*AsyncDataCache*), the memory arbitrator (*MemoryArbitrator*)
+and a number of memory pools (*MemoryPool*). The memory manager creates all
+the other components. It creates the memory allocator and memory arbitrator
+when initializing the memory system, and creates the memory pools on-demand
+for query execution.
+
+.. image:: images/memory-system.png
+    :width: 600
+    :align: center
+
+When a query starts execution, it first creates a root memory pool (query pool)
+from the memory manager, and then creates a tree of child memory pools from the
+root according to the query plan: a child memory pool for each query task
+(task pool), a grandchild memory pool for each plan node (node pool) and a
+great-grandchild memory pool for each operator instance (operator pool). During
+the query execution, it allocates memory from the leaf operator pools and
+propagates the memory usage up to the root query pool. If the aggregated memory
+usage at the root exceeds the current query memory capacity, the query pool
+sends a request to the memory arbitrator to grow its capacity. The memory
+arbitrator either grows the requestor query pool’s capacity by reclaiming used
+memory from the other queries with the largest capacities in the system, or
+reclaims memory from the requestor query pool itself to free up space within
+its current capacity if it exceeds the per-query memory limit or has the
+largest capacity in the system. The used memory reclaim is achieved through
+techniques like `disk spilling `_.
+If the memory arbitration succeeds, the leaf
+operator pool can proceed with the actual memory allocation from the memory
+allocator. However, if the memory arbitration fails, then the query with the
+largest capacity in the system is chosen to fail with a query memory capacity
+exceeded error (local OOM). The failed query may or may not be the requestor
+query itself.
+
+The memory allocator does the actual memory allocation from its own managed
+memory space in units of machine pages (4KB). It tracks the amount of allocated
+memory, and returns an error if an allocation request exceeds the system memory
+limit. This enables explicit control on the RSS of Velox memory usage to help
+prevent server OOMs.
+
+The file cache provides the in-memory hot data cache and prefetch functions
+when a user query accesses remote storage. It allocates memory from the
+memory allocator directly, and this memory is not counted in query memory
+usage. To prevent memory allocation failures caused by excessive file cache
+memory usage, the file cache retries a failed allocation after shrinking
+itself. This achieves dynamic memory sharing between the file cache and query
+memory in response to the user query workload changes.
+
+.. image:: images/memory-function.png
+    :width: 800
+    :align: center
+    :alt: Memory Management Functions
+
+To summarize, the memory manager manages the memory pools and coordinates the
+accesses between different memory components. The memory pool tracks a query’s
+memory usage and interacts with the memory arbitrator to adjust the memory
+capacity allocations among running queries to achieve fair memory sharing. The
+memory allocator manages the physical memory allocations to prevent server OOM,
+and interacts with the file cache to achieve dynamic memory sharing between
+query memory and the file cache to maximize memory efficiency. The rest of the
+document describes each memory component in detail.
+
+Memory Manager
+--------------
+
+.. image:: images/memory-manager.png
+    :width: 600
+    :align: center
+    :alt: Memory Manager
+
+The memory manager is created on server startup with the provided
+*MemoryManagerOptions*. It creates a memory allocator instance to manage the
+physical memory allocations for both query memory allocated through memory
+pools and cache memory allocated through the file cache. It ensures the total
+allocated memory is within the system memory limit (specified by
+*MemoryManagerOptions::allocatorCapacity*). The memory manager also creates a
+memory arbitrator instance to arbitrate the memory capacity among running
+queries. It ensures the total allocated query memory capacity is within the
+query memory limit (specified by *MemoryManagerOptions::arbitratorCapacity*).
+The memory arbitrator also prevents each individual query from exceeding its
+per-query memory limit (specified by *QueryConfig::query_max_memory_per_node*)
+by reclaiming overused memory through `disk spilling `_ (refer to the
+`memory arbitrator section <#memory-arbitrator>`_ for details).
+
+After setting up the Velox memory system, the memory manager manages the memory
+pools for query execution. When a query starts, it creates a root query pool
+from the memory manager, and then creates a tree of child pools from the query
+pool according to the query plan (see the `memory pool section <#memory-pool>`_
+for detail) for memory allocations and usage tracking.
+
+The memory manager keeps track of all the live query pools for the memory
+arbitration process.
+When a query pool sends a request to the memory manager to
+grow its capacity (*MemoryManager::growPool*), the memory manager forwards the
+request to the memory arbitrator with the list of live query pools as the
+arbitration candidates. The memory arbitrator reclaims the used memory from the
+candidates with the largest capacity first, and increases the requestor pool’s
+capacity with the freed memory space accordingly. If the requestor pool already
+has the largest capacity among all the candidates, then the memory arbitrator
+reclaims memory from the requestor itself to free up space within its current
+capacity. See the `memory arbitration process section
+<#memory-arbitration-process>`_ for a detailed description of the memory
+arbitration process.
+
+The memory manager doesn’t have ownership of user-created query pools; it only
+tracks their liveness through the *MemoryManager::dropPool* method, which is
+invoked by the query pool’s destructor to remove itself from the tracked list
+(*MemoryManager::pools_*). The *QueryCtx* object owns the query pool, which
+stays alive until the query finishes.
+
+The memory manager creates and owns a system root pool for Velox internal
+operations such as `disk spilling `_. The difference between the system root
+pool and a user-created query root pool is that there is no per-query memory
+limit for the system root pool, so it doesn’t participate in the memory
+arbitration. The reason is that the system operations are not executed on
+behalf of a particular user query. Take `disk spilling `_, for example: it is
+triggered by memory arbitration to free up used memory from the queries. We
+don’t expect significant memory usage during a system operation, and
+eventually the memory allocator guarantees the actual allocated memory is
+within the system memory limit, whether it is for a system operation or for
+user query execution. In practice, we should reserve some space from the
+memory allocator to compensate for such system memory usage. We can do that
+by configuring the query memory limit
+(*MemoryManagerOptions::arbitratorCapacity*) to be smaller than the system
+memory limit (*MemoryManagerOptions::allocatorCapacity*) (refer to the
+`OOM prevention section <#server-oom-prevention>`_ for detail).
+
+Memory System Setup
+^^^^^^^^^^^^^^^^^^^
+
+Here is the code block from Prestissimo that initializes the Velox memory
+system:
+
+.. code-block:: c++
+   :linenos:
+
+   void PrestoServer::initializeVeloxMemory() {
+     auto* systemConfig = SystemConfig::instance();
+     const uint64_t memoryGb = systemConfig->systemMemoryGb();
+     MemoryManagerOptions options;
+     options.allocatorCapacity = memoryGb << 30;
+     options.useMmapAllocator = systemConfig->useMmapAllocator();
+     if (!systemConfig->memoryArbitratorKind().empty()) {
+       options.arbitratorKind = systemConfig->memoryArbitratorKind();
+       const uint64_t queryMemoryGb = systemConfig->queryMemoryGb();
+       options.queryMemoryCapacity = queryMemoryGb << 30;
+       ...
+     }
+     memory::initializeMemoryManager(options);
+
+     if (systemConfig->asyncDataCacheEnabled()) {
+       ...
+       cache_ = std::make_shared<cache::AsyncDataCache>(
+           memoryManager()->allocator(), memoryBytes, std::move(ssd));
+     }
+     ...
+   }
+
+* L5: sets the memory allocator capacity (system memory limit) from
+  the Prestissimo system config
+* L6: sets the memory allocator type from the Prestissimo system config. If
+  *useMmapAllocator* is true, we use *MmapAllocator*, otherwise we use
+  *MallocAllocator*.
+  The `Memory Allocator section <#memory-allocator>`_ describes these two
+  types of allocators
+* L8: sets the memory arbitrator kind from the Prestissimo system config.
+  Currently, we only support the *“SHARED”* arbitrator kind (see the
+  `memory arbitrator section <#memory-arbitrator>`_).
+  The *“NOOP”* arbitrator kind will be deprecated soon (`#8220 `_)
+* L10: sets the memory arbitrator capacity (query memory limit) from the
+  Prestissimo system config
+* L13: creates the process-wide memory manager, which creates the memory
+  allocator and arbitrator inside, based on the MemoryManagerOptions
+  initialized in the previous steps
+* L15-19: creates the file cache if it is enabled in the Prestissimo system
+  config
+
+Memory Pool
+-----------
+
+The memory pool provides memory allocation functions for query execution. It
+also tracks a query’s memory usage for per-query memory limit enforcement.
+As shown in the Query Memory Pool Hierarchy figure, a query creates a tree of
+memory pools that mirrors the query plan to enable fine-grained tracking of
+memory usage and to figure out which task(s) or operator(s) use most of the
+memory. At the root of the tree, *QueryCtx* creates a root query pool from the
+memory manager. Each query task creates a child task pool from the query pool.
+A query task executes a fragment of the query plan (e.g. an execution stage in
+a distributed query execution plan in Prestissimo). Each plan node in a task’s
+plan fragment creates a child node pool from the task pool
+(*Task::getOrAddNodePool*). Each plan node belongs to one or more task
+execution pipelines. Each pipeline might have multiple driver instances running
+in parallel. Each driver instance consists of a pipeline of query operators,
+and an operator is an instantiation of a query plan node in a driver. Hence
+each operator creates a child operator pool from the node pool
+(*Task::addOperatorPool*).
+
+.. image:: images/memory-pool.png
+    :width: 500
+    :align: center
+    :alt: Memory Pool
+
+A query allocates memory from the operator pools at the leaves of the tree and
+propagates the memory usage all the way up to the query pool at the root of the
+tree to check if the memory usage has exceeded the per-query memory limit or
+not. The memory allocation always happens at the leaf operator pools, the
+intermediate pools (node pool and task pool) only aggregate the memory usage,
+and it is the root query pool that enforces the per-query memory limit. Given
+that, we introduce two memory pool types (defined by *MemoryPool::Kind*) to
+simplify the memory pool management: one is the *LEAF* type, which only allows
+memory allocations, and the other is the *AGGREGATE* type, which aggregates the
+memory usage from all its children but is not allowed to allocate memory
+directly. Hence, the operator pool is of *LEAF* type and all the others are of
+*AGGREGATE* type. We only enforce the memory limit check at the root query
+pool.
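+
+For illustration, here is a minimal sketch (not code from the Velox
+repository; the pool names are made up) of how a query system could assemble
+such a pool tree using the pool management APIs listed in the
+`memory pool APIs section <#memory-pool-apis>`_ below:
+
+.. code-block:: c++
+
+   using namespace facebook::velox;
+
+   // Root query pool created from the memory manager (AGGREGATE type).
+   auto queryPool = memory::memoryManager()->addRootPool("query_0");
+
+   // Intermediate AGGREGATE pools mirroring the task and plan node structure.
+   auto taskPool = queryPool->addAggregateChild("task_0");
+   auto nodePool = taskPool->addAggregateChild("node_0");
+
+   // LEAF pool used by a single operator instance for actual allocations.
+   auto operatorPool = nodePool->addLeafChild("op_0");
+
+   // Allocations are only allowed on LEAF pools; the usage propagates up
+   // to the root query pool where the per-query limit is enforced.
+   void* buffer = operatorPool->allocate(1024);
+   operatorPool->free(buffer, 1024);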
+Memory Usage Tracking
+^^^^^^^^^^^^^^^^^^^^^
+
+To track query memory usage, a leaf operator pool needs to propagate the memory
+usage all the way up to the root query pool and check the memory limit for
+every allocation, but this would be slow. Hence, the memory pool uses a memory
+reservation mechanism to track the query memory usage. A memory reservation is
+made in 1MB or larger chunks to avoid excessive locking, propagating and
+checking of memory usage for every single allocation (see the
+*MemoryPool::quantizedSize* description below). A leaf operator pool maintains
+two counters for memory reservation: one is the actual used memory
+(*MemoryPoolImpl::usedReservationBytes_*) and the other is the memory reserved
+from the root query pool (*MemoryPoolImpl::reservationBytes_*). The difference
+between the two counters is the available memory for a leaf operator pool to
+use.
+
+The intermediate pools only use *reservationBytes_* to count the aggregated
+memory reservations held by all their child pools. The root query pool has two
+additional counters for the memory limit check: one is its current memory
+capacity (*MemoryPoolImpl::capacity_*), which is the amount of memory available
+for the query to use. The memory arbitrator sets this based on how many queries
+are running, the total query memory limit and how much memory each query needs.
+The other is the max capacity (*MemoryPool::maxCapacity_*), which is the max
+capacity that a query can grow up to. It is set by the user and is fixed during
+a query’s lifetime (*QueryConfig::kQueryMaxMemoryPerNode*). The memory
+arbitrator can’t set a query’s *capacity_* beyond its *maxCapacity_* limit.
+
+When the root query pool receives a new memory reservation request, it
+increases *reservationBytes_* and checks if it is within its current
+*capacity_* limit. If it is, the root query pool accepts the request. If not,
+the root query pool asks the memory arbitrator (via the memory manager) to grow
+its capacity through the memory arbitration (see the `memory arbitrator section
+<#memory-arbitrator>`_ for details). If the memory arbitration fails, the root
+query pool fails the request with a query memory capacity exceeded error (local
+OOM error).
+
+*MemoryPool::reserve* and *MemoryPool::release* are the two methods used by the
+memory pool for memory reservation. The memory reservation is thread-safe and
+*MemoryPool::reserveThreadSafe* is the main function that implements the memory
+reservation logic:
+
+#. The leaf memory pool calls *MemoryPool::reservationSizeLocked* to calculate
+   the new required reservation (*incrementBytes*). It is based on the memory
+   allocation size and the available memory reservation
+   (*reservationBytes_ - usedReservationBytes_*).
+
+#. If *incrementBytes* is zero, the leaf memory pool has sufficient available
+   reservation, so it doesn’t need a new reservation and just updates
+   *usedReservationBytes_* to reflect the new memory usage.
+
+#. If *incrementBytes* is not zero, the leaf memory pool needs to call
+   *MemoryPool::incrementReservationThreadSafe* (see below) to propagate the
+   increment all the way up to the root memory pool to check if the new
+   reservation request exceeds the query’s current capacity or not. If not, it
+   accepts the reservation by incrementing *reservationBytes_* accordingly.
+
+   Note that if *MemoryPool::incrementReservationThreadSafe* fails, it throws
+   an exception to fail the memory allocation request with a local OOM error.
+
+#. The leaf memory pool goes back to step-1 to check if there is sufficient
+   available reservation for the allocation request after the reservation
+   succeeds.
+
+   Note that concurrent allocation requests to the same leaf memory pool
+   might steal away the reservation made in step-3, so we have to check again.
+   We don’t hold the leaf memory pool’s lock while making a reservation from
+   the root memory pool, which could be a blocking operation if memory
+   arbitration is involved. Therefore, there could be a race condition if
+   there are two concurrent memory reservation requests from the same leaf
+   memory pool. But we don’t expect this to happen very often in practice.
+   The sketch after this list summarizes the flow.
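+
+Condensed into pseudo-C++, the reservation flow looks roughly as follows. This
+is an illustrative sketch, not the actual Velox implementation; the member and
+function names are simplified from the description above:
+
+.. code-block:: c++
+
+   // Sketch of the leaf-pool reservation loop (steps 1-4 above).
+   void MemoryPoolImpl::reserveThreadSafe(uint64_t size) {
+     for (;;) {
+       // Step 1: compute the new reservation needed, given the available
+       // reservation (reservationBytes_ - usedReservationBytes_).
+       const uint64_t incrementBytes = reservationSizeLocked(size);
+       if (incrementBytes == 0) {
+         // Step 2: sufficient reservation is already available.
+         usedReservationBytes_ += size;
+         return;
+       }
+       // Step 3: propagate the increment up to the root pool. Throws a query
+       // memory capacity exceeded error (local OOM) if the root rejects the
+       // reservation; returns false if a conflicting concurrent reservation
+       // was detected at the root.
+       incrementReservationThreadSafe(incrementBytes);
+       // Step 4: loop back to step 1 and re-check. Even on success, a
+       // concurrent request on the same leaf pool may have stolen the
+       // reservation we just made.
+     }
+   }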
+As mentioned above, to reduce the CPU cost of frequent concurrent memory
+reservations to the root memory pool, the leaf memory pool does quantized
+memory reservation. It rounds up the actual reservation bytes to the next
+quantized reservation value (*MemoryPool::quantizedSize*):
+
+- round up to the next MB if size < 16MB
+- round up to the next 4MB if size < 64MB
+- round up to the next 8MB if size >= 64MB
+
+With the quantized reservation, we never reserve less than 1MB of memory. Even
+if we only need 1KB, we’ll have to reserve 1MB, and if there is not enough
+memory available the query will fail. It also means that if we run at a
+concurrency of 15, each driver thread will reserve at least 1MB, and therefore
+the query would require at least 15MB of memory even if it uses just a few KB.
+
+The implementation of *MemoryPool::incrementReservationThreadSafe*:
+
+#. A non-root memory pool calls its parent pool’s
+   *incrementReservationThreadSafe* method recursively to propagate the
+   reservation request all the way up to the root memory pool
+
+#. Check the *MemoryPool::incrementReservationThreadSafe* result from the
+   parent pool:
+
+   a. If the function returns true, the reservation succeeds at the root
+      memory pool and we proceed to accept the reservation (step-3)
+   b. If the function returns false, then the reservation succeeds but a
+      conflict with other concurrent reservation requests was detected at the
+      root memory pool. We need to retry from the leaf memory pool again by
+      returning false to *MemoryPoolImpl::reserveThreadSafe*
+   c. If the memory reservation fails at the root memory pool, the function
+      expects a query memory capacity exceeded exception to be thrown and the
+      memory allocation fails
+
+#. Call *MemoryPool::maybeIncrementReservation* to try to increment the
+   reservation and check the result:
+
+   a. For a non-root memory pool, this should always succeed as we only check
+      capacity at the root memory pool
+   b. For a root memory pool, the function might return false if the
+      reservation request exceeds its current capacity, in which case we go to
+      step-4 to request memory arbitration
+
+#. The root memory pool calls *MemoryManager::growPool* to grow its capacity.
+   This triggers the memory arbitration process inside the memory arbitrator
+
+#. If *MemoryManager::growPool* returns true, then we succeeded in growing the
+   memory capacity (or reducing the memory usage within its current capacity).
+   The function calls *MemoryPool::maybeIncrementReservation* again to check if
+   the memory reservation can be satisfied or not. If not, then there must be a
+   concurrent memory reservation request that took away the grown memory
+   capacity. In this case we return false to retry from the leaf memory pool
+   again (step2-b). Otherwise, we return true (step2-a)
+
+#. If *MemoryManager::growPool* returns false, then we failed to grow capacity
+   from the memory arbitrator and throw a query memory capacity exceeded error
+   (step2-c)
+
+Memory Pool APIs
+^^^^^^^^^^^^^^^^
+
+The memory pool has three sets of APIs for memory pool management, memory
+allocation and memory arbitration. The following is a list of the major APIs
+in each of the three sets.
+
+Memory Pool Management
+""""""""""""""""""""""
+
+.. code-block:: c++
+
+   /// Creates a root memory pool with specified 'name' and 'maxCapacity'.
+   /// 'reclaimer' is provided for the memory arbitration process.
+   std::shared_ptr<MemoryPool> MemoryManager::addRootPool(
+       const std::string& name = "",
+       int64_t maxCapacity = kMaxMemory,
+       std::unique_ptr<MemoryReclaimer> reclaimer = nullptr);
+
+   /// Creates an aggregate child memory pool, which allows creating child
+   /// memory pools from it and is used to aggregate the memory usage from its
+   /// child pools. An aggregate memory pool is not allowed to allocate memory
+   /// directly.
+   virtual std::shared_ptr<MemoryPool> MemoryPool::addAggregateChild(
+       const std::string& name);
+
+   /// Creates a leaf child memory pool, which allows allocating memory but is
+   /// not allowed to create child pools.
+   virtual std::shared_ptr<MemoryPool> MemoryPool::addLeafChild(
+       const std::string& name);
+
+   /// Creates a new instance of MemoryPool for an operator, stores it in the
+   /// task to ensure lifetime and returns a raw pointer.
+   velox::memory::MemoryPool* Task::addOperatorPool(
+       const core::PlanNodeId& planNodeId,
+       int pipelineId,
+       uint32_t driverId,
+       const std::string& operatorType);
+
+   /// Creates a new instance of MemoryPool for a plan node, stores it in the
+   /// task to ensure lifetime and returns a raw pointer.
+   memory::MemoryPool* Task::getOrAddNodePool(
+       const core::PlanNodeId& planNodeId);
+
+Memory Allocation
+"""""""""""""""""
+
+The memory pool provides three types of memory allocations. If a user needs a
+large buffer allocation and the allocated buffer doesn’t need to be
+contiguous, then it can use *MemoryPool::allocateNonContiguous* to allocate a
+number of variable-sized buffers (see the `non-contiguous allocation section
+<#non-contiguous-allocation>`_ for details). Velox uses this allocation for
+*RowContainer*, *StreamArena*/*HashStringAllocator*, *AsyncDataCache* etc. If a
+user needs a large contiguous buffer allocation with size > 1MB, then it can
+use *MemoryPool::allocateContiguous* to allocate a large chunk of physical
+memory from the OS directly through mmap (see the `contiguous allocation
+section <#contiguous-allocation>`_ for details). Velox uses this allocation
+for *HashTable*. For any other ad hoc allocations, we can use
+*MemoryPool::allocate*. The memory allocator determines how to allocate memory
+based on the actual allocation size (see the `small allocation section
+<#small-allocation>`_ for details).
+
+.. code-block:: c++
+
+   /// Allocates a buffer with specified 'size'. If the memory allocation is
+   /// smaller than a predefined threshold, then we delegate the allocation to
+   /// std::malloc (MmapAllocator::Options::maxMallocBytes).
+   virtual void* MemoryPool::allocate(int64_t size) = 0;
+
+   /// Frees an allocated buffer.
+   virtual void MemoryPool::free(void* p, int64_t size) = 0;
+
+   /// Allocates one or more runs that add up to at least 'numPages', with the
+   /// smallest run being at least 'minSizeClass' pages. 'minSizeClass' must
+   /// be <= the size of the largest size class (see the non-contiguous
+   /// allocation section for the size class definition). The new memory is
+   /// returned in 'out' on success and any memory formerly referenced by
+   /// 'out' is freed. The function throws if the allocation fails, in which
+   /// case 'out' references no memory and any partially allocated memory is
+   /// freed.
+   virtual void MemoryPool::allocateNonContiguous(
+       MachinePageCount numPages,
+       Allocation& out,
+       MachinePageCount minSizeClass = 0) = 0;
+
+   /// Frees non-contiguous 'allocation'. 'allocation' is empty on return.
+   virtual void MemoryPool::freeNonContiguous(Allocation& allocation) = 0;
+
+   /// Makes a large contiguous mmap of 'numPages'. The new mapped pages are
+   /// returned in 'out' on success. Any formerly mapped pages referenced by
+   /// 'out' are unmapped in all cases, even if the allocation fails.
+   virtual void MemoryPool::allocateContiguous(
+       MachinePageCount numPages,
+       ContiguousAllocation& out) = 0;
+
+   /// Frees contiguous 'allocation'. 'allocation' is empty on return.
+   virtual void MemoryPool::freeContiguous(ContiguousAllocation& allocation) = 0;
+
+Memory Arbitration
+""""""""""""""""""
+
+The `memory arbitrator section <#memory-arbitrator>`_ below discusses how these
+memory arbitration related methods are used in the memory arbitration and
+reclaim process.
+
+.. code-block:: c++
+
+   /// Returns the number of bytes that haven't been reserved for use, and can
+   /// be freed by reducing this memory pool's limit.
+   virtual uint64_t MemoryPool::freeBytes() const = 0;
+
+   /// Invoked to bump up the memory pool's capacity by 'bytes'. The function
+   /// returns the memory pool's new capacity after the grow.
+   virtual uint64_t MemoryPool::grow(uint64_t bytes) = 0;
+
+   /// Invoked to free up to the specified amount of unused memory reservations
+   /// by reducing this memory pool's capacity without actually freeing up any
+   /// used memory. The function returns the actually freed memory bytes. If
+   /// 'targetBytes' is zero, the function frees all the unused memory
+   /// reservation bytes.
+   virtual uint64_t MemoryPool::shrink(uint64_t targetBytes = 0) = 0;
+
+   /// Invoked by the memory arbitrator to enter memory arbitration processing.
+   /// It is a noop if 'reclaimer_' is not set, otherwise it invokes the
+   /// reclaimer's corresponding method.
+   virtual void MemoryPool::enterArbitration();
+
+   /// Invoked by the memory arbitrator to leave memory arbitration processing.
+   /// It is a noop if 'reclaimer_' is not set, otherwise it invokes the
+   /// reclaimer's corresponding method.
+   virtual void MemoryPool::leaveArbitration();
+
+   /// Estimates the number of reclaimable bytes. If the 'reclaimer' is not
+   /// set, the function returns std::nullopt. Otherwise, it invokes the
+   /// corresponding method of the reclaimer.
+   virtual std::optional<uint64_t> MemoryPool::reclaimableBytes() const = 0;
+
+   /// Invoked by the memory arbitrator to reclaim memory from this memory pool
+   /// with the specified reclaim target bytes. If 'targetBytes' is zero, then
+   /// it tries to reclaim all the reclaimable memory from the memory pool. It
+   /// is a noop if the reclaimer is not set, otherwise it invokes the
+   /// reclaimer's corresponding method.
+   virtual uint64_t MemoryPool::reclaim(uint64_t targetBytes);
+
+Memory Arbitrator
+-----------------
+
+The memory arbitrator is used to arbitrate the memory capacity across running
+queries to achieve fair memory sharing and to prevent a query from exceeding
+its memory limit. To arbitrate memory capacity between running queries, the
+memory arbitrator needs to be able to reclaim the used memory from a query
+through techniques such as `disk spilling `_, and then transfer the freed
+memory between queries by adjusting their memory pool capacities accordingly
+(see the `memory arbitration process section <#memory-arbitration-process>`_
+for detail).
+
+The *MemoryArbitrator* is defined to support different implementations for
+different query systems. As for now, we implement *SharedArbitrator* for both
+Prestissimo and Prestissimo-on-Spark. `Gluten `_ implements its own memory
+arbitrator to integrate with the `Spark memory system `_.
+*SharedArbitrator*
+ensures the total allocated memory capacity is within the query memory limit
+(*MemoryManagerOptions::arbitratorCapacity*), and also ensures each individual
+query’s capacity is within the per-query memory limit
+(*MemoryPool::maxCapacity_*). When a query needs to grow its capacity,
+*SharedArbitrator* either reclaims used memory from the query itself if it has
+exceeded its max memory capacity, or increases its capacity by reclaiming used
+memory from the other queries with the largest memory capacities in the
+system.
+
+Memory Arbitration Process
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. image:: images/memory-arbitration.png
+    :width: 800
+    :align: center
+    :alt: Memory Arbitration Process
+
+The end-to-end memory arbitration process in *SharedArbitrator* works as
+follows:
+
+#. The query operator A allocates memory from its leaf operator pool (operator
+   pool A)
+#. The operator pool A sends the memory reservation request to the root query
+   pool (query pool A)
+#. The query pool A is the root memory pool and it checks if the memory
+   reservation request is within the current capacity or not
+   (*MemoryPoolImpl::capacity_*). Let’s assume the request has exceeded the
+   current capacity and triggers memory arbitration
+#. The query pool A sends a request to the memory manager to grow its capacity
+   for the new reservation (*MemoryManager::growPool*)
+#. The memory manager forwards the request to the memory arbitrator
+   (*MemoryArbitrator::growCapacity*) with the requestor memory pool plus the
+   list of the root query pools as the memory arbitration candidates. The
+   memory manager keeps the candidate query pools alive during the memory
+   arbitration process
+#. The memory arbitrator serializes the memory arbitration processing, handling
+   one request at a time to ensure a consistent view of the memory capacity
+   allocated among queries. The memory arbitrator might receive concurrent
+   arbitration requests from different queries or even from different driver
+   threads of the same query. For each memory arbitration request:
+
+   a. The memory arbitrator invokes the *MemoryPool::enterArbitration* method
+      of the requestor memory pool before starting memory arbitration. The
+      requestor memory pool here is the operator pool A, which initiated the
+      memory reservation request. It calls the
+      *MemoryReclaimer::enterArbitration* method of the associated operator
+      reclaimer (*Operator::MemoryReclaimer*). The operator reclaimer puts the
+      driver thread into the suspension state (*Task::enterSuspended*). To
+      reclaim memory from a query task, we need to first pause the task to
+      stop all its driver threads to avoid any concurrent updates to its
+      operator states during the memory reclamation. If the query task of the
+      requestor memory pool is chosen to reclaim memory, then we have to put
+      its driver thread into the suspension state, otherwise the query task
+      will never be paused as the requestor driver thread is under the memory
+      arbitration process. Note that a suspended driver thread is not counted
+      as running in the task pause processing.
+
+   b. The memory arbitrator calls *SharedArbitrator::ensureCapacity* to check
+      if the requestor query pool exceeds its max memory capacity limit with
+      the new reservation or not (*MemoryPool::maxCapacity_*). If not, it
+      proceeds to step-6-c. Otherwise, the memory arbitrator tries to reclaim
+      used memory from the requestor pool itself.
+      If memory reclamation has freed up sufficient memory from the requestor
+      pool for the new reservation within its current capacity, then the
+      memory arbitration succeeds. If the requestor pool still exceeds the max
+      memory capacity limit, then the memory arbitration fails. Otherwise it
+      proceeds to step-6-c.
+
+   c. The memory arbitrator runs the fast path
+      (*SharedArbitrator::reclaimFreeMemoryFromCandidates*) to reclaim the
+      unused memory reservations from the candidate query pools without
+      actually freeing the used memory. It first tries to reclaim from itself
+      and then from the candidate pools which have the most free capacity
+      (*MemoryPool::freeBytes*) until it reaches the memory reclaim target.
+      Note that we set the memory reclaim target to a large value
+      (*MemoryManagerOptions::memoryPoolTransferCapacity*), which could be
+      more than the actually needed size, to avoid frequent memory
+      arbitrations.
+
+   d. If the memory arbitrator hasn’t reclaimed enough free memory on the fast
+      path, it runs the slow path
+      (*SharedArbitrator::reclaimUsedMemoryFromCandidates*) to reclaim the
+      used memory from the candidate pools with the most reclaimable memory
+      (see the `memory reclaim process section <#memory-reclaim-process>`_ for
+      the detailed memory reclaim process within a query).
+
+   e. If the memory arbitrator has reclaimed enough memory, it grants the
+      reclaimed memory to the requestor pool by increasing its memory capacity
+      (*MemoryPool::grow*). If not, the memory arbitrator has to call
+      *SharedArbitrator::handleOOM* to send a memory pool abort
+      (*MemoryPool::abort*) request to the candidate memory pool with the
+      largest capacity as the victim, freeing up memory so that the other
+      running queries have enough memory to proceed. The memory pool abort
+      fails the query execution and waits for its completion to release all
+      the held memory resources.
+
+   f. If the victim query pool is the requestor pool itself, then the memory
+      arbitration fails. Otherwise, it goes back to step-6-c to retry the
+      memory arbitration one more time before giving up.
+
+   g. The memory arbitrator invokes the *MemoryPool::leaveArbitration* method
+      of the requestor memory pool at the end of memory arbitration. The
+      operator reclaimer moves its driver thread out of the suspension state
+      (*Task::leaveSuspended*).
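+
+The control flow above can be condensed into the following sketch. This is
+illustrative pseudo-C++ only, not the actual *SharedArbitrator* code; the
+*abortLargestCandidate* helper is made up for brevity:
+
+.. code-block:: c++
+
+   // Condenses steps 6-a to 6-g of the arbitration process.
+   bool growCapacity(
+       MemoryPool* requestor,
+       const std::vector<MemoryPool*>& candidates,
+       uint64_t targetBytes) {
+     requestor->enterArbitration();  // 6-a: suspends the driver thread.
+     // 6-b (not shown): reclaim from the requestor itself if it exceeded
+     // its max capacity.
+     bool success = false;
+     for (int attempt = 0; attempt < 2 && !success; ++attempt) {
+       // 6-c: fast path - take unused reservations (MemoryPool::shrink).
+       uint64_t freed =
+           reclaimFreeMemoryFromCandidates(candidates, targetBytes);
+       if (freed < targetBytes) {
+         // 6-d: slow path - reclaim used memory, e.g. through disk spilling.
+         freed +=
+             reclaimUsedMemoryFromCandidates(candidates, targetBytes - freed);
+       }
+       if (freed >= targetBytes) {
+         requestor->grow(freed);  // 6-e: grant the reclaimed memory.
+         success = true;
+       } else if (!abortLargestCandidate(candidates, requestor)) {
+         // 6-e/6-f: abort the candidate with the largest capacity; fail if
+         // the victim is the requestor itself, otherwise retry once.
+         break;
+       }
+     }
+     requestor->leaveArbitration();  // 6-g: resumes the driver thread.
+     return success;
+   }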
+Memory Reclaim Process
+^^^^^^^^^^^^^^^^^^^^^^
+
+Here is the memory reclaim process within a query:
+
+#. The memory arbitrator invokes the *MemoryPool::reclaim* method of a
+   candidate query pool with a reclaim target in bytes, which calls the
+   corresponding method of the associated memory reclaimer object
+   (*MemoryReclaimer::reclaim*). The query pool uses the default
+   implementation, which sorts its child task pools based on the reclaimable
+   bytes (*MemoryPool::reclaimableBytes*), and reclaims from the task with the
+   most reclaimable bytes until it reaches the reclaim target
+
+#. The query pool invokes the reclaim method of the task pool, which in turn
+   calls into the associated task reclaimer (*Task::MemoryReclaimer*). The
+   latter first pauses the task execution (*Task::requestPause*), and then
+   sorts its child node pools based on the reclaimable bytes and reclaims
+   memory from the node pools with the most reclaimable bytes. After reaching
+   the reclaim target or having reclaimed from all the node pools, the task
+   reclaimer resumes the task execution (*Task::resume*)
+
+#. The task pool invokes the reclaim method of the node pool, which reclaims
+   memory from the child operator pool with the most reclaimable bytes
+
+#. The node pool eventually calls the operator pool to do the actual memory
+   reclamation (*Operator::MemoryReclaimer*). Currently we support memory
+   reclamation through disk spilling and table writer flush.
+   *Operator::reclaim* is added to support memory reclamation, with a default
+   implementation that does nothing. Only spillable operators override that
+   method: *OrderBy*, *HashBuild*, *HashAggregation*, *RowNumber*,
+   *TopNRowNumber*, *Window* and *TableWriter*. As for now, we simply spill
+   everything from the spillable operator’s row container to free up memory.
+   After we add memory compaction support for row containers, we could
+   leverage the fine-grained disk spilling features in Velox to only spill
+   and free the required amounts of memory.
+
+Note that the memory arbitrator can’t reclaim memory from a spillable operator
+that has triggered memory arbitration in the middle of data processing, even
+after its query task execution has been stopped. To prevent this, we added
+*Operator::nonReclaimableSection_* to indicate whether an operator is in a
+non-reclaimable section or not, and the memory arbitrator can’t reclaim memory
+from an operator which is in a non-reclaimable section. The driver execution
+framework puts a running operator in the non-reclaimable section by default.
+A spillable operator chooses to clear the non-reclaimable section at specific
+call sites, such as the memory reservation (*MemoryPool::maybeReserve*) before
+the actual data processing, to allow the memory arbitrator to reclaim memory.
+
+Memory Allocator
+----------------
+
+The memory allocator manages the physical memory allocations for both query
+memory allocated through memory pools and cache memory allocated directly from
+the file cache. The memory allocator ensures the total allocated memory is
+always within the system memory limit. *MemoryAllocator* defines the memory
+allocator interface. We have two allocator implementations: *MallocAllocator*
+delegates all the memory allocations to std::malloc, which is simple and
+reliable. We provide it as the default option, but we believe it suffers from
+RSS variation caused by memory fragmentation. Therefore we built
+*MmapAllocator* to manage the physical memory allocations using mmap and have
+explicit control over RSS. We haven't yet confirmed whether *MmapAllocator*
+works better than *MallocAllocator*, but we are able to run a sizable
+Prestissimo workload using it. We will compare that workload using the two
+allocators to determine which one is better in the future. Users can choose
+the allocator for their application by setting
+*MemoryManagerOptions::useMmapAllocator* (see the `memory system setup section
+<#memory-system-setup>`_ for an example).
+
+Non-Contiguous Allocation
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. image:: images/size-class.png
+    :width: 500
+    :align: center
+    :alt: Size Class
+
+A non-contiguous allocation is defined as an *Allocation* object which consists
+of a number of PageRun(s). Each page run contains a contiguous buffer, and the
+buffers from different page runs don’t have to be contiguous. *MmapAllocator*
+defines the *MmapAllocator::SizeClass* data structure (similar to the one used
+in `Umbra `_) to manage the non-contiguous allocation. A *SizeClass* object
+provides allocation of fixed-size buffers (class pages) whose size is a power
+of two times the machine page size. *MmapAllocator* creates 9 different
+*SizeClass* objects with class page sizes ranging from 1 machine page (4KB) to
+256 machine pages (1MB).
+To allocate a large number of machine pages, *MmapAllocator* calls
+*MemoryAllocator::allocationSize* to build the allocation plan
+(*MemoryAllocator::SizeMix*), which consists of a list of chosen *SizeClass*
+objects and the number of class pages to allocate from each of them.
+
+*MemoryAllocator::allocationSize* generates the allocation plan by searching
+from the largest fitting *SizeClass* down to the minimum *SizeClass* specified
+by the user. If the minimum *SizeClass* is not 1, there could be wasted memory
+in the last allocated class page. As in the example in the diagram, for an
+allocation request of 150 pages and a minimum *SizeClass* of 4, we choose to
+allocate 2 class pages from *SizeClass/64*, 1 from *SizeClass/16* and 2 from
+*SizeClass/4*. The total number of allocated machine pages is 152. There are
+two machine pages wasted in the last allocated class page from *SizeClass/4*.
+The memory allocator allocates memory from each of the chosen *SizeClass*
+objects based on the allocation plan. The allocation result is returned in an
+*Allocation* object which consists of 4 page runs: two runs from *SizeClass/64*
+(the two allocated class pages are not contiguous in memory), one from
+*SizeClass/16* and one from *SizeClass/4* (the two allocated class pages are
+contiguous in memory).
+
+Each *SizeClass* object sets up its own memory space using mmap, with the same
+size as the system memory limit. This set-up memory space doesn’t cause any
+memory allocation from the OS (i.e. it has no backing memory) until the user
+writes into the allocated memory space. The *SizeClass* object divides its own
+memory space into a number of class pages, and uses the
+*SizeClass::pageAllocated_* bitmap to track whether a class page is allocated
+or not. It uses another bitmap, *SizeClass::pageMapped_*, to track whether a
+class page has backing memory or not (a mapped class page). To ensure the RSS
+of Velox memory usage is within the system memory limit, we assume an
+allocated class page always has backing memory, and a freed class page also
+has backing memory until we call madvise to free it back to the OS. To free a
+class page, we just clear the allocation bit in the *pageAllocated_* bitmap,
+but we don’t call madvise to free the backing memory immediately, as madvise
+is an expensive OS call. We also expect that a freed class page is very likely
+to be reused again. Given that, we only remove the backing memory of freed
+class pages for a new allocation if the total number of mapped class pages
+reaches the system memory limit. *numMappedFreePages_* is used to track the
+number of freed class pages that still have backing memory in each *SizeClass*
+object. *SizeClass::adviseAway* implements the lazy backing memory free
+control logic.
+
+We apply two optimizations to accelerate the free class page lookup. One is to
+use an aggregated bitmap (*mappedFreeLookup_*) to track the free class pages
+in groups. Each bit in *mappedFreeLookup_* corresponds to 512 bits (8 words)
+in *pageAllocated_*. If a bit is set in *mappedFreeLookup_*, then at least one
+of the corresponding 512 bits in *pageAllocated_* is not set. The other is to
+use SIMD instructions to operate on the bitmap to further accelerate the
+lookup.
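+
+The two-level lookup can be sketched as follows. This is an illustrative
+sketch of the idea, not the actual Velox code (which uses SIMD instead of the
+word-by-word scan shown here); the function name is made up:
+
+.. code-block:: c++
+
+   // Finds a free class page using the aggregated bitmap. Each bit of
+   // 'mappedFreeLookup' covers a group of 8 64-bit words (512 bits) of
+   // 'pageAllocated'; a set bit means the group has at least one free page.
+   int64_t findFreeClassPage(
+       const std::vector<uint64_t>& mappedFreeLookup,
+       const std::vector<uint64_t>& pageAllocated) {
+     for (size_t i = 0; i < mappedFreeLookup.size(); ++i) {
+       if (mappedFreeLookup[i] == 0) {
+         continue; // No free page in the groups covered by this word.
+       }
+       // Index of the first group that contains a free page.
+       const int64_t group = i * 64 + __builtin_ctzll(mappedFreeLookup[i]);
+       // Scan the 8 words of the group for a cleared (free) bit.
+       for (int64_t word = 0; word < 8; ++word) {
+         const uint64_t bits = pageAllocated[group * 8 + word];
+         if (bits != ~0ULL) {
+           return (group * 8 + word) * 64 + __builtin_ctzll(~bits);
+         }
+       }
+     }
+     return -1; // No free class page.
+   }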
+The simplified *MmapAllocator::allocateNonContiguous* implementation:
+
+.. code-block:: c++
+
+   bool MmapAllocator::allocateNonContiguous(
+       MachinePageCount numPages,
+       Allocation& out,
+       ReservationCallback reservationCB,
+       MachinePageCount minSizeClass) override;
+
+#. calls *MemoryAllocator::allocationSize* with *numPages* and *minSizeClass*.
+   *numPages* specifies the number of machine pages to allocate. *minSizeClass*
+   specifies the minimum class page size to allocate from. The function returns
+   the number of class pages to allocate from each chosen *SizeClass* in
+   *MemoryAllocator::SizeMix*. The sum of machine pages to allocate from all
+   *SizeClass* objects should be no less than the requested *numPages*.
+
+#. increases the memory allocator’s memory usage and checks if it exceeds the
+   system memory limit (*MemoryAllocator::capacity_*). If it does, the
+   allocation fails and the memory usage update is reverted. Otherwise, it
+   proceeds to make a reservation in the memory pool in step-3.
+
+   * *MmapAllocator* uses *MemoryAllocator::numAllocated_* to count the
+     allocated memory in units of machine pages
+   * *MmapAllocator* allocations are wrapped by *AsyncDataCache::makeSpace*,
+     which retries a failed allocation by shrinking the file cache a number of
+     times before giving up. Each retry takes a backoff delay and makes it
+     harder to evict from the cache
+   * *AsyncDataCache::makeSpace* not only retries the allocations from the
+     memory pool but also those from the file cache itself. In the latter
+     case, old cache entries will be evicted to make space for the new cache
+     data
+
+#. invokes *reservationCB* to increase the memory pool’s reservation and check
+   if the new allocation exceeds the query memory limit or not. If it does, we
+   revert the memory usage update made in step-2 and re-throw the query memory
+   capacity exceeded exception caught from *reservationCB*. The
+   *reservationCB* is null if the allocation is from the file cache.
+
+#. allocates class pages from each of the chosen *SizeClass* objects. If any
+   one of the *SizeClass* allocations fails, the entire allocation fails. We
+   free the successful *SizeClass* allocations, and revert the memory pool
+   reservation (step-3) and memory usage (step-2) updates.
+
+#. The class page allocations return the number of machine pages needed to set
+   up backing memory. This refers to the allocated class pages which don’t
+   have backing memory and whose corresponding bits in *SizeClass::pageMapped_*
+   are not set. We call *MmapAllocator::ensureEnoughMappedPages* to ensure
+   that, with this new allocation, the total number of mapped class pages that
+   have backing memory doesn’t exceed the system memory limit. If it does, we
+   call *MmapAllocator::adviseAway* to remove the backing memory of freed
+   class pages. If the *MmapAllocator::adviseAway* call fails, we fail the
+   allocation and revert all the changes made in the previous steps.
+
+#. calls *MmapAllocator::markAllMapped* to set all the allocated class pages
+   as mapped in *SizeClass::pageMapped_*, and the allocation succeeds.
+
+Contiguous Allocation
+^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+   virtual bool MemoryAllocator::allocateContiguous(
+       MachinePageCount numPages,
+       Allocation* collateral,
+       ContiguousAllocation& allocation,
+       ReservationCallback reservationCB = nullptr) = 0;
+
+A contiguous allocation is defined as a *ContiguousAllocation* object which
+contains a large contiguous buffer. It is used for very large contiguous
+buffer allocations (>1MB), like allocating a hash table. Its implementation is
+very simple: it calls mmap to allocate a contiguous chunk of physical memory
+from the OS directly. Similar to non-contiguous allocation, it needs to call
+*MmapAllocator::ensureEnoughMappedPages* to ensure the size of the mapped
+memory space is within the system memory limit. To free a contiguous
+allocation, the memory allocator calls munmap to return the physical memory
+back to the OS right away.
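+
+A minimal usage sketch of the memory pool APIs from above (illustrative;
+assuming 'pool' is a LEAF memory pool set up as described in the
+`memory pool section <#memory-pool>`_):
+
+.. code-block:: c++
+
+   using namespace facebook::velox;
+
+   // Allocates 512 machine pages (2MB) as a single contiguous mmap'd buffer,
+   // e.g. for a hash table.
+   memory::ContiguousAllocation allocation;
+   pool->allocateContiguous(512, allocation);
+
+   // ... use the mapped memory via allocation.data() ...
+
+   // Returns the physical memory to the OS right away via munmap.
+   pool->freeContiguous(allocation);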
+Small Allocation
+^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+   void* MmapAllocator::allocateBytes(
+       uint64_t bytes,
+       uint16_t alignment = kMinAlignment) override;
+
+*MmapAllocator::allocateBytes* allocates memory in three different ways based
+on the actual allocation size ('bytes'). If the allocation size is smaller
+than a configured threshold (*MmapAllocator::Options::maxMallocBytes*),
+*MmapAllocator* delegates the allocation to std::malloc. If the allocation
+size is within the class page size range (<= 1MB), it allocates the buffer as
+a class page from one of the *SizeClass* objects. Otherwise, it allocates the
+buffer as a large contiguous allocation.
+
+We don’t expect many small memory allocations from the query systems using
+*MmapAllocator*. In Prestissimo, only very few small memory allocations are
+delegated to std::malloc. The large in-memory state such as *RowContainer* and
+*HashTable* uses either contiguous or non-contiguous allocations. As for now,
+we don’t cap the memory allocations delegated to std::malloc in
+*MmapAllocator*. We provide an option
+(*MmapAllocator::Options::smallAllocationReservePct*) for the query system to
+reserve a small amount of memory capacity in *MmapAllocator* to compensate for
+these ad-hoc small allocations in practice.
+
+Server OOM Prevention
+---------------------
+
+The memory allocator ensures all the memory usage from Velox doesn’t exceed
+the system memory limit. This is critical to prevent the server from running
+out of memory, as we expect Velox to use a significant portion of the server
+memory in operation. For instance, Prestissimo at Meta configures 80% of the
+server memory for Velox and the remaining 20% for the non-Velox components
+such as the program binary, the http streaming shuffle and the remote storage
+client etc.
+
+However, the memory capacity enforcement in Velox itself is not sufficient to
+prevent the server from running out of memory in the face of spiky memory
+usage from non-Velox components. For instance, we found in Prestissimo that
+the http streaming shuffle in a large Prestissimo setup (>400 workers) can
+cause very high spiky memory usage that easily leads to Prestissimo worker
+OOMs. In a large cluster, each worker (*PrestoExchangeSource*) might receive
+the streaming data from a large number of sources at the same time. The memory
+profiles collected at times close to OOM show that >50% of the non-Velox
+memory is allocated by http proxygen. To prevent server OOMs caused by the
+http streaming shuffle, we added throttle control in the Prestissimo streaming
+shuffle to limit the number of sources to read from at a time to cap the
+streaming shuffle memory usage.
+
+In addition to building a throttle mechanism specific to each non-Velox
+component, we also provide a generic server memory pushback mechanism in Meta
+Prestissimo to collaborate with Velox to handle the spiky memory usage from
+non-Velox components. A *PeriodicMemoryChecker* runs in the background to
+check the system memory usage periodically. Whenever the system memory usage
+exceeds a certain threshold, it tries to free up memory from Velox by
+shrinking the file cache (*AsyncDataCache::shrink*), and returns the freed
+cache memory back to the OS. This way we can automatically shrink the file
+cache in response to the transient spiky memory usage from non-Velox
+components in a query system.
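+
+The pushback loop can be sketched as follows. This is illustrative only:
+*PeriodicMemoryChecker* is Prestissimo-specific, and the threshold constants
+and helper functions below are made up:
+
+.. code-block:: c++
+
+   // Illustrative sketch of a periodic memory pushback loop.
+   void periodicMemoryCheck(facebook::velox::cache::AsyncDataCache* cache) {
+     while (running()) {
+       if (currentSystemRss() > kPushbackThresholdBytes) {
+         // Free up Velox memory by shrinking the file cache; the freed cache
+         // memory is then returned to the OS.
+         cache->shrink(kPushbackBytes);
+       }
+       std::this_thread::sleep_for(std::chrono::seconds(1));
+     }
+   }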
diff --git a/velox/docs/develop/operators.rst b/velox/docs/develop/operators.rst
index b1431e3101403..b7cac0c915e2b 100644
--- a/velox/docs/develop/operators.rst
+++ b/velox/docs/develop/operators.rst
@@ -50,6 +50,7 @@ TableWriteNode         TableWrite
 TableWriteMergeNode    TableWriteMerge
 PartitionedOutputNode  PartitionedOutput
 ExchangeNode           Exchange          Y
+ExpandNode             Expand
 MergeExchangeNode      MergeExchange     Y
 ValuesNode             Values            Y
 LocalMergeNode         LocalMerge
@@ -167,6 +168,10 @@ individual measures are sorted and de-duplicated.
     - One or more measures to compute. Each measure specifies an expression, e.g. count(1), sum(a), avg(b), optional boolean input column that's used to mask out rows for this particular measure, optional list of input columns to sort by before computing the measure, an optional flag to indicate that inputs must be deduplicated before computing the measure. Expressions must be in the form of aggregate function calls over input columns directly, e.g. sum(c) is ok, but sum(c + d) is not.
   * - ignoreNullKeys
     - A boolean flag indicating whether the aggregation should drop rows with nulls in any of the grouping keys. Used to avoid unnecessary processing for an aggregation followed by an inner join on the grouping keys.
+  * - globalGroupingSets
+    - If the AggregationNode is over a GroupIdNode, then some groups could be global groups, which have only the GroupId grouping key values. These represent global aggregate values.
+  * - groupId
+    - GroupId is the grouping key in the AggregationNode for the groupId column generated by an underlying GroupIdNode. It must be of BIGINT type.
 
 Properties of individual measures.
 
@@ -200,7 +205,7 @@ accumulators. This requires more memory as compared to when inputs do not need
 to be de-duplicated.
 
 Furthermore, many aggregate functions produce same results on sorted and
-unsorted inputs, e.g. func:`min`, func:`max`, :func:`count`, :func:`sum`.
+unsorted inputs, e.g. :func:`min`, :func:`max`, :func:`count`, :func:`sum`.
 The query planner should avoid generating plans that request sorted inputs
 for such aggregate functions. Some examples of aggregate functions that are
 sensitive to the order of inputs include :func:`array_agg` and :func:`min_by`
@@ -214,6 +219,211 @@ aggregate functions.
 
 Finally, note that computing measures over sorted input is only possible if
 aggregation step is 'single'. Such computations cannot be split into partial +
 final.
 
+To illustrate the need for globalGroupingSets and groupId, let's examine the
+following SQL query:
+
+.. code-block:: sql
+
+    SELECT orderkey, sum(total_quantity) FROM orders GROUP BY CUBE (orderkey);
+
+This is equivalent to the following SQL with GROUPING SETS:
+
+.. code-block:: sql
+
+    SELECT orderkey, sum(total_quantity) FROM orders GROUP BY GROUPING SETS ((orderkey), ());
+
+The SQL gives sub-totals of total_quantity for each orderkey along with the
+global sum (from the empty grouping set).
+
+The optimizer plans the above query as an Aggregation over a GroupId node.
+
+Let's say the orders table has 5 rows:
+
+.. code-block::
+
+    orderkey   total_quantity
+    1          5
+    2          6
+    2          7
+    3          8
+    4          9
+
+After GroupId is applied for the grouping sets ((orderkey), ()), the table has
+the following 10 rows:
+
+.. code-block::
+
+    orderkey   total_quantity   group_id
+    1          5                0
+    2          6                0
+    2          7                0
+    3          8                0
+    4          9                0
+    null       5                1
+    null       6                1
+    null       7                1
+    null       8                1
+    null       9                1
+
+A subsequent aggregation with grouping keys (orderkey, group_id) gives the
+sub-totals for the query:
+
+.. code-block::
+
+    orderkey   total_quantity   group_id
+    1          5                0
+    2          13               0
+    3          8                0
+    4          9                0
+    null       35               1
+
+If there were no input rows for this GROUP BY CUBE, then the expected result
+is a single row with the default value for the global aggregation. For the
+above query that would be:
+
+.. code-block::
+
+    orderkey   total_quantity   group_id
+    null       null             1
+
+To generate this special row the AggregationNode needs the groupId for the
+global grouping set (1 in this case), and it returns a single row for it with
+the aggregate's default value.
+
+Note: Presto allows multiple global grouping sets in a single SQL query.
+
+.. code-block:: sql
+
+    SELECT orderkey, sum(total_quantity) FROM orders GROUP BY GROUPING SETS ((), ());
+
+Hence, globalGroupingSets is a vector of groupIds.
+
+.. _ExpandNode:
+
+ExpandNode
+~~~~~~~~~~~
+
+For each input row, generates N rows with M columns according to the specified
+'projections'. 'projections' is an N x M matrix of expressions: a vector of N
+rows, each having M columns. Each expression is either a column reference or a
+constant. Both null and non-null constants are allowed. 'names' is a list of M
+new column names. The semantics of this operator match Spark's. The expand
+functionality could also be implemented using Project and Unnest, but the
+performance is suboptimal because of the array constructor created within the
+Project operation.
+
+.. list-table::
+   :widths: 10 30
+   :align: left
+   :header-rows: 1
+
+   * - Property
+     - Description
+   * - projections
+     - A vector of N rows, each having M columns. Each expression is either a column reference or a constant.
+   * - names
+     - A list of new column names.
+
+ExpandNode is typically used to compute GROUPING SETS, CUBE, ROLLUP and
+COUNT DISTINCT.
+
+To illustrate how ExpandNode works, let's examine the following SQL query:
+
+.. code-block:: sql
+
+    SELECT l_orderkey, l_partkey, count(l_suppkey) FROM lineitem GROUP BY ROLLUP(l_orderkey, l_partkey);
+
+In the planning phase, Spark generates an Expand operator with the following
+projection list:
+
+.. code-block::
+
+    [l_suppkey, l_orderkey, l_partkey, 0],
+    [l_suppkey, l_orderkey, null, 1],
+    [l_suppkey, null, null, 3]
+
+Note: The last column serves as a special group ID, indicating the grouping
+set to which each row belongs. In Spark, this ID is calculated using a
+bitmask. If a certain column is selected, the bit value is assigned as 0;
+otherwise, it is assigned as 1. Therefore, the binary representation of the
+first row is (000), resulting in 0. The binary representation of the second
+row is (001), resulting in 1. The binary representation of the third row is
+(011), resulting in 3.
+
+For example, if the input rows are:
+
+.. code-block::
+
+    l_suppkey   l_orderkey   l_partkey
+    93          1            673
+    75          2            674
+    38          3            22
+
+After the computation by the ExpandNode, each input row generates 3 rows of
+data. So there will be a total of 9 rows:
+
+.. code-block::
+
+    l_suppkey   l_orderkey   l_partkey   grouping_id_0
+    93          1            673         0
+    93          1            null        1
+    93          null         null        3
+    75          2            674         0
+    75          2            null        1
+    75          null         null        3
+    38          3            22          0
+    38          3            null        1
+    38          null         null        3
+
+The Aggregation operator that follows groups these 9 rows by (l_orderkey,
+l_partkey, grouping_id_0) and computes count(l_suppkey):
+
+.. code-block::
+
+    l_orderkey   l_partkey   count(l_suppkey)
+    1            673         1
+    null         null        3
+    1            null        1
+    2            null        1
+    2            674         1
+    3            null        1
+    3            22          1
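+
+Conceptually, the Expand operator applies each of the N projection rows to
+every input row. The following pseudo-C++ sketch (not the actual Velox
+implementation; columns are simplified to nullable integers) shows the row
+generation:
+
+.. code-block:: c++
+
+   // Each projection element is either a reference to an input column or a
+   // constant (null constants are allowed).
+   struct ProjectionElement {
+     bool isConstant;
+     int32_t inputChannel;                 // Used if !isConstant.
+     std::optional<int64_t> constantValue; // Used if isConstant.
+   };
+
+   using Row = std::vector<std::optional<int64_t>>;
+
+   // Produces N output rows (one per projection row) for a single input row.
+   std::vector<Row> expandRow(
+       const Row& input,
+       const std::vector<std::vector<ProjectionElement>>& projections) {
+     std::vector<Row> output;
+     for (const auto& projection : projections) { // N projection rows.
+       Row row;
+       for (const auto& element : projection) {   // M output columns.
+         row.push_back(
+             element.isConstant ? element.constantValue
+                                : input[element.inputChannel]);
+       }
+       output.push_back(std::move(row));
+     }
+     return output;
+   }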
code-block:: sql + + SELECT COUNT(DISTINCT l_suppkey), COUNT(DISTINCT l_partkey) FROM lineitem; + +In the planning phase, Spark generates an Expand operator with the following projection list: + +.. code-block:: + + [l_suppkey, null, 1], + [null, l_partkey, 2] + +For example, if the input rows are: + +.. code-block:: + + l_suppkey l_partkey 93 673 75 674 38 22 + +After the computation by the ExpandNode, each input row generates 2 rows of data, for a total of 6 rows: + +.. code-block:: + + l_suppkey l_partkey grouping_id_0 93 null 1 null 673 2 75 null 1 null 674 2 38 null 1 null 22 2 + +The aggregation operator that follows groups these rows by (l_suppkey, l_partkey, grouping_id_0) and produces: + +.. code-block:: + + l_suppkey l_partkey grouping_id_0 93 null 1 75 null 1 38 null 1 null 673 2 null 674 2 null 22 2 + +A second aggregation operator then computes the global count(l_suppkey) and count(l_partkey), producing the final result: + +.. code-block:: + + COUNT(DISTINCT l_suppkey) COUNT(DISTINCT l_partkey) 3 3 + .. _GroupIdNode: GroupIdNode ~~~~~~~~~~~ @@ -233,8 +443,7 @@ followed by the group ID column. The type of group ID column is BIGINT. * - Property - Description * - groupingSets - - List of grouping key sets. Keys within each set must be unique, but keys can repeat across the sets. - - Grouping keys are specified with their output names. + - List of grouping key sets. Keys within each set must be unique, but keys can repeat across the sets. Grouping keys are specified with their output names. * - groupingKeyInfos - The names and order of the grouping key columns in the output. * - aggregationInputs @@ -353,7 +562,9 @@ NestedLoopJoinNode NestedLoopJoinNode represents an implementation that iterates through each row from the left side of the join and, for each row, iterates through all rows from the right side of the join, comparing them based on the join condition to find matching rows -and emitting results. Nested loop join supports non-equality join. +and emitting results. Nested loop join supports non-equality joins, and emits output +rows in the same order as the probe input (for inner and left outer joins) for each +thread of execution. .. list-table:: :widths: 10 30 :align: left :header-rows: 1 * - Property - Description * - sortingKeys @@ -383,7 +594,7 @@ identified sort fields as well as a sorting order. * - Property - Description * - sortingKeys - - List of one of more input columns to sort by. + - List of one or more input columns to sort by. Sorting keys must be unique. * - sortingOrders - Sorting order for each of the sorting keys. The supported orders are: ascending nulls first, ascending nulls last, descending nulls first, descending nulls last. * - isPartial @@ -406,7 +617,7 @@ operations. * - Property - Description * - sortingKeys - - List of one of more input columns to sort by. + - List of one or more input columns to sort by. Must not be empty and must not contain duplicates. * - sortingOrders - Sorting order for each of the sorting keys. See OrderBy for the list of supported orders. * - count @@ -523,7 +734,7 @@ distribution fields. * - keys - Zero or more input fields to use for calculating a partition for each row. * - numPartitions - - Number of partitions to split the data into.g + - Number of partitions to split the data into. * - replicateNullsAndAny - Boolean flag indicating whether rows with nulls in the keys should be sent to all partitions and, in case there are no such rows, whether a single arbitrarily chosen row should be sent to all partitions.
Used to provide global-scope information necessary to implement anti join semantics on a single node. * - partitionFunctionFactory @@ -707,7 +918,7 @@ next batch of input. This operator accumulates state: a hash table mapping partition keys to total number of rows seen in this partition so far. Returning the row numbers as -a column in the output is optional. This operator doesn't support spilling yet. +a column in the output is optional. This operator supports spilling. This operator is equivalent to a WindowNode followed by FilterNode(row_number <= limit), but it uses less memory and CPU and makes @@ -752,9 +963,9 @@ FilterNode(row_number <= limit), but it uses less memory and CPU. * - Property - Description * - partitionKeys - - Partition by columns for the window functions. + - Partition by columns for the window functions. May be empty. * - sortingKeys - - Order by columns for the window functions. + - Order by columns for the window functions. Must not be empty and must not overlap with 'partitionKeys'. * - sortingOrders - Sorting order for each sorting key above. The supported sort orders are asc nulls first, asc nulls last, desc nulls first and desc nulls last. * - rowNumberColumnName diff --git a/velox/docs/develop/scalar-functions.rst b/velox/docs/develop/scalar-functions.rst index 52c3dd795e2a0..b1e79bbd0aa68 100644 --- a/velox/docs/develop/scalar-functions.rst +++ b/velox/docs/develop/scalar-functions.rst @@ -15,8 +15,10 @@ ceil function can be implemented as: .. code-block:: c++ -template <typename T> +template <typename TExec> struct CeilFunction { + VELOX_DEFINE_FUNCTION_TYPES(TExec); + template <typename T> FOLLY_ALWAYS_INLINE void call(T& result, const T& a) { result = std::ceil(a); @@ -36,22 +38,25 @@ function to be called on different input types, e.g. float and double. Note that template instantiation will only happen during function registration, described in the "Registration" section below. -Please avoid using the obsolete VELOX_UDF_BEGIN/VELOX_UDF_END macros. +Do not use legacy VELOX_UDF_BEGIN and VELOX_UDF_END macros. The "call" function (or one of its variations) may return (a) void indicating the function never returns null values, or (b) boolean indicating whether -the result of the computation is null. True means the result is not null; -false means the result is null. If "ceil(0)" were to return null, the function -above could be re-written as follows: +the result of the computation is null. The meaning of the returned boolean is +"result was set", i.e. true means a non-null result was populated, false means +no (null) result. If "ceil(0)" were to return a null, the function could be +re-written as follows: .. code-block:: c++ -template <typename T> +template <typename TExec> struct NullableCeilFunction { + VELOX_DEFINE_FUNCTION_TYPES(TExec); + template <typename T> FOLLY_ALWAYS_INLINE bool call(T& result, const T& a) { result = std::ceil(a); - return a != 0; + return a != 0; // Return NULL if input is zero. } }; @@ -60,28 +65,56 @@ The argument list must start with an output parameter “result” followed by the function arguments. The “result” argument must be a reference. Function arguments must be const references. The C++ types of the function arguments and the result argument must match :doc:`Velox types`.
-Since the result argument must be a reference, some of the types listed below -have a different result argument type: ========== ============================== ============================= Velox Type C++ Argument Type C++ Result Type ========== ============================== ============================= -VARCHAR StringView out_type<Varchar> -VARBINARY StringView out_type<Varbinary> +BOOLEAN arg_type<bool> out_type<bool> +TINYINT arg_type<int8_t> out_type<int8_t> +SMALLINT arg_type<int16_t> out_type<int16_t> +INTEGER arg_type<int32_t> out_type<int32_t> +BIGINT arg_type<int64_t> out_type<int64_t> +REAL arg_type<float> out_type<float> +DOUBLE arg_type<double> out_type<double> +TIMESTAMP arg_type<Timestamp> out_type<Timestamp> +DATE arg_type<Date> out_type<Date> +VARCHAR arg_type<Varchar> out_type<Varchar> +VARBINARY arg_type<Varbinary> out_type<Varbinary> ARRAY arg_type<Array<E>> out_type<Array<E>> MAP arg_type<Map<K, V>> out_type<Map<K, V>> ROW arg_type<Row<T...>> out_type<Row<T...>> ========== ============================== ============================= -arg_type and out_type templates are defined by using the -VELOX_DEFINE_FUNCTION_TYPES(TExecParams) macro in the class definition. These -types provide interfaces similar to std::string, std::vector, std::unordered_map -and std::tuple. The underlying implementations are optimized to read and write -from and to the columnar representation without extra copying. +arg_type and out_type templates are defined by the +VELOX_DEFINE_FUNCTION_TYPES(TExec) macro in the struct definition. For +primitive types, arg_type<T> is the same as out_type<T> and the same as T. +This holds for boolean, integers, floating point types and timestamp. +For DATE, arg_type<Date> is the same as out_type<Date> and is defined as int32_t. + +A signature of a function that takes an integer and a double and returns +a double would look like this: + +.. code-block:: c++ + + void call(arg_type<double>& result, const arg_type<int32_t>& a, const arg_type<double>& b) + +Which is equivalent to + +.. code-block:: c++ + + void call(double& result, const int32_t& a, const double& b) + +For strings, arg_type<Varchar> is defined as StringView, while out_type<Varchar> +is defined as StringWriter. + +arg_type and out_type for Varchar, Array, Map and Row provide interfaces similar +to std::string, std::vector, std::unordered_map and std::tuple. The underlying +implementations are optimized to read and write from and to the columnar +representation without extra copying. More explanation and the APIs of the arg_type +and out_type for string and complex types can be found in :doc:`view-and-writer-types`. Note: Do not pay too much attention to complex type mappings at the moment. -They are included here for completeness, but require a whole separate -discussion. +They are included here for completeness. Null Behavior ^^^^^^^^^^^^^ @@ -95,7 +128,7 @@ an artificial example of a ceil function that returns 0 for null input: .. code-block:: c++ -template <typename T> +template <typename TExec> struct CeilFunction { template <typename T> FOLLY_ALWAYS_INLINE void callNullable(T& result, const T* a) { @@ -133,9 +166,9 @@ an array: .. code-block:: c++ -template <typename TExecParams> +template <typename TExec> struct ArrayMinFunction { -VELOX_DEFINE_FUNCTION_TYPES(TExecParams); +VELOX_DEFINE_FUNCTION_TYPES(TExec); template <typename TInput> FOLLY_ALWAYS_INLINE bool callNullFree( @@ -173,7 +206,7 @@ An example of such function is rand(): .. code-block:: c++ -template <typename T> +template <typename TExec> struct RandFunction { static constexpr bool is_deterministic = false; @@ -205,9 +238,9 @@ Here is an example of a trim function: .. code-block:: c++ -template <typename TExecParams> +template <typename TExec> struct TrimFunction { -VELOX_DEFINE_FUNCTION_TYPES(TExecParams); +VELOX_DEFINE_FUNCTION_TYPES(TExec); // ASCII input always produces ASCII result. static constexpr bool is_default_ascii_behavior = true; @@ -241,11 +274,39 @@ not go away prematurely.
The output types can be scalar strings (varchar and varbinaries), but also complex types containing strings, such as arrays, maps, and rows. +The setNoCopy method of the out_type template can be used to set the result +to a string in the input argument without copying. The setEmpty method +can be used to set the result to an empty string. + .. code-block:: c++ // Results refer to strings in the first argument. static constexpr int32_t reuse_strings_from_arg = 0; + +Here is an example of a zero-copy function: + +.. code-block:: c++ + + template + struct TrimFunction { + VELOX_DEFINE_FUNCTION_TYPES(TExec); + + // Results refer to strings in the first argument. + static constexpr int32_t reuse_strings_from_arg = 0; + + FOLLY_ALWAYS_INLINE void call( + out_type& result, + const arg_type& input) { + if (input.size() == 0) { + result.setEmpty(); + return; + } + result.setNoCopy(stringImpl::trimUnicodeWhiteSpace(input)); + } + }; + + Access to Session Properties and Constant Inputs ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -267,13 +328,14 @@ properties and using it when processing inputs. .. code-block:: c++ - template + template struct HourFunction { - VELOX_DEFINE_FUNCTION_TYPES(TExecParams); + VELOX_DEFINE_FUNCTION_TYPES(TExec); - const date::time_zone* timeZone_ = nullptr; + const tz::TimeZone* timeZone_ = nullptr; FOLLY_ALWAYS_INLINE void initialize( + const std::vector& inputTypes, const core::QueryConfig& config, const arg_type* /*timestamp*/) { timeZone_ = getTimeZoneFromConfig(config); @@ -296,14 +358,15 @@ individual rows. .. code-block:: c++ - template + template struct DateTruncFunction { - VELOX_DEFINE_FUNCTION_TYPES(TExecParams); + VELOX_DEFINE_FUNCTION_TYPES(TExec); - const date::time_zone* timeZone_ = nullptr; + const tz::TimeZone* timeZone_ = nullptr; std::optional unit_; FOLLY_ALWAYS_INLINE void initialize( + const std::vector& inputTypes, const core::QueryConfig& config, const arg_type* unitString, const arg_type* /*timestamp*/) { @@ -359,356 +422,68 @@ we need to call registerFunction again: We need to call registerFunction for each signature we want to support. -Codegen -^^^^^^^ - -To allow the function to be used in the codegen, extract the “kernel” of the -function into a header file and call that from the “call” or “callNullable”. -Here is an example with ceil function. - -.. code-block:: c++ - - #include "velox/functions/prestosql/ArithmeticImpl.h" - - template - struct CeilFunction { - template - FOLLY_ALWAYS_INLINE bool call(T& result, const T& a) { - result = ceil(a); - return true; - } - }; - -velox/functions/prestosql/ArithmeticImpl.h: +Here is a mapping from Velox types to C++ types that should be used for +argument and return types during registration. + +========== ===================== +Velox Type C++ Type +========== ===================== +BOOLEAN bool +TINYINT int8_t +SMALLINT int16_t +INTEGER int32_t +BIGINT int64_t +REAL float +DOUBLE double +TIMESTAMP Timestamp +DATE Date +VARCHAR Varchar +VARBINARY Varbinary +ARRAY Array +MAP Map +ROW Row +========== ===================== + +For example, to register array_min function for string inputs: .. code-block:: c++ - template - T ceil(const T& arg) { - T results = std::ceil(arg); - return results; - } - -Make sure the header files that define the “kernels” are free of dependencies -as much as possible to allow for faster compilation in codegen. 
+ registerFunction>({"array_min"}); -Complex Types -^^^^^^^^^^^^^ - -Inputs (View Types) -******************* -Input complex types are represented in the simple function interface using light-weight lazy -access abstractions that enable efficient direct access to the underlying data in Velox -vectors. -As mentioned earlier, the helper aliases arg_type and null_free_arg_type can be used in function's signatures to -map Velox types to the corresponding input types. The table below shows the actual types that are -used to represent inputs of different complex types. - -============================== ========================= ============================== - C++ Argument Type C++ Actual Argument Type Corresponding `std` type -============================== ========================= ============================== -arg_type> NullableArrayView> std::vector> -arg_type> NullableMapView std::map> -arg_type> NullableRowView std::tuple... -null_free_arg_type> NullFreeArrayView std::vector -null_free_arg_type> NullFreeMapView std::map -null_free_arg_type>> NullFreeRowView std::tuple -============================== ========================= ============================== - -The view types are designed to have interfaces similar to those of std::containers, in fact in most cases -they can be used as a drop in replacement. The table above shows the mapping between the Velox type and -the corresponding std type. For example: a *Map, Array>* corresponds to const -*std::map, std::vector>*. - -All views types are cheap to copy objects, for example the size of ArrayView is 16 bytes at max. - -**OptionalAccessor**: - -OptionalAccessor is an *std::optional* like object that provides lazy access to the nullity and -value of the underlying Velox vector at a specific index. Currently, it is used to represent elements of nullable input arrays -and values of nullable input maps. Note that keys in the map are assumed to be always not nullable in Velox. - -The object supports the following methods: - -- arg_type value() : unchecked access to the underlying value. - -- arg_type operator \*() : unchecked access to the underlying value. - -- bool has_value() : return true if the value is not null. - -- bool operator() : return true if the value is not null. - -The nullity and the value accesses are decoupled, and hence if someone knows inputs are null-free, -accessing the value does not have the overhead of checking the nullity. So is checking the nullity. -Note that, unlike std::container, function calls to value() and operator* are r-values (temporaries) and not l-values, -they can bind to const references and l-values but not references. - -OptionalAccessor is assignable to and comparable with std::optional> for primitive types. -The following expressions are valid, where array[0] is an optional accessor. +To register array_min function for arrays of any type, use Generic for the element type: .. code-block:: c++ - std::optional = array[0]; - if(array[0] == std::nullopt) ... - if(std::nullopt == array[0]) ... - if(array[0]== std::optional{1}) ... + registerFunction, Array>>({"array_min"}); -**NullableArrayView and NullFreeArrayView** - -NullableArrayView and NullFreeArrayView have interfaces similar to that of *std::vector>* and *std::vector*, -the code below shows the function arraySum, a range loop is used to iterate over the values. +Since array_min needs to sort the elements to find the smallest, the element +type needs to be orderable. You can restrict array elements to orderable types +using Orderable. .. 
code-block:: c++ - template - struct ArraySum { - VELOX_DEFINE_FUNCTION_TYPES(T); - - bool call(const int64_t& output, const arg_type>& array) { - output = 0; - for(const auto& element : array) { - if (element.has_value()) { - output += element.value(); - } - } - return true; - } - }; - - -ArrayView supports the following: - -- size_t size() : return the number of elements in the array. + registerFunction, Array>>({"array_min"}); -- operator[](size_t index) : access element at index. It returns either null_free_arg_type or OptionalAccessor. - -- ArrayView::Iterator begin() : iterator to the first element. - -- ArrayView::Iterator end() : iterator indicating end of iteration. - -- bool mayHaveNulls() : constant time check on the underlying vector nullity. When it returns false, there are definitely no nulls, a true does not guarantee null existence. - -- ArrayView::SkipNullsContainer SkipNulls() : return an iterable container that provides direct access to non-null values in the underlying array. For example, the function above can be written as: +You can use multiple generic types in a function signature. For example, to register +map_top_n function: .. code-block:: c++ - template - struct ArraySum { - VELOX_DEFINE_FUNCTION_TYPES(T); + registerFunction< + MapTopNFunction, + Map, Orderable>, // result map type + Map, Orderable>, // input map type + int64_t // type of N argument + >({"map_top_n"}); - bool call(const int64_t& output, const arg_type>& array) { - output = 0; - for (const auto& value : array.skipNulls()) { - output += value; - } - return true; - } - }; +Generic types must use T1, T2, T3... naming. -The skipNulls iterator will check the nullity at each index and skip nulls, a more performant implementation -would skip reading the nullity when mayHaveNulls() is false. +Finally, you can specify that an argument must be constant using Constant. +For example, to specify rand signature with a constant seed argument: .. code-block:: c++ - template - struct ArraySum { - VELOX_DEFINE_FUNCTION_TYPES(T); - - bool call(const int64_t& output, const arg_type>& array) { - output = 0; - if (array.mayHaveNulls()) { - for(const auto& value : array.skipNulls()) { - output += value; - } - return true; - } - - // No nulls, skip reading nullity. - for (const auto& element : array) { - output += element.value(); - } - return true; - } - }; - -Note: calls to operator[], iterator de-referencing, and iterator pointer de-referencing are r-values (temporaries), -versus l-values in STD containers. Hence those can be bound to const references or l-values but not normal references. - -**NullableMapView and NullFreeMapView** - -NullableMapView and NullFreeMapView has an interfaces similar to std::map> and std::map, -the code below shows an example function mapSum, sums up the keys and values. - -.. code-block:: c++ - - template - struct MapSum{ - bool call(const int64_t& output, const arg_type>& map) { - output = 0; - for (const auto& [key, value] : map) { - output += key; - if (value.has_value()) { - value += value.value(); - } - } - return true; - } - }; - -MapView supports the following: - -- MapView::Element begin() : iterator to the first map element. - -- MapView::Element end() : iterator that indicates end of iteration. - -- size_t size() : number of elements in the map. - -- MapView::Iterator find(const key_t& key): performs a linear search for the key, and returns iterator to the element if found otherwise returns end(). Only supported for primitive key types. 
- -- MapView::Iterator operator[](const key_t& key): same as find, throws an exception if element not found. - -- MapView::Element - -MapView::Element is the type returned by dereferencing MapView::Iterator. It has two members: - -- first : arg_type | null_free_arg_type - -- second: OptionalAccessor | null_free_arg_type - -- MapView::Element participates in struct binding: auto [v, k] = \*map.begin(); - -Note: iterator de-referencing and iterator pointer de-referencing result in temporaries. Hence those can be bound to -const references or value variables but not normal references. - - -**Temporaries lifetime C++** - -While c++ allows temporaries(r-values) to bound to const references by extending their lifetime, one must be careful and -know that only the assigned temporary lifetime is extended but not all temporaries in the RHS expression chain. -In other words, the lifetime of any temporary within an expression is not extended. - -For example, for the expression const auto& x = map.begin()->first. -c++ does not extend the lifetime of the result of map.begin() since it's not what is being -assigned. And in such a case, the assignment has undefined behavior. - -.. code-block:: c++ - - // Safe assignments. single rhs temporary. - const auto& a = array[0]; - const auto& b = *a; - const auto& c = map.begin(); - const auto& d = c->first; - - // Unsafe assignments. (undefined behaviours) - const auto& a = map.begin()->first; - const auto& b = **it; - - // Safe and cheap to assign to value. - const auto a = map.begin()->first; - const auto b = **it; - -Note that in the range-loop, the range expression is assigned to a universal reference. Thus, the above concern applies to it. - -.. code-block:: c++ - - // Unsafe range loop. - for(const auto& e : **it){..} - - // Safe range loop. - auto itt = *it; - for(const auto& e : *itt){..} - -.. _outputs-write: - -Outputs (Writer Types) -********************** - -Outputs of complex types are represented using special writers that are designed in a way that -minimizes data copying by writing directly to Velox vectors. - -**ArrayWriter** - -- out_type& add_item() : add non-null item and return the writer of the added value. -- add_null(): add null item. -- reserve(vector_size_t size): make sure space for `size` items is allocated in the underlying vector. -- vector_size_t size(): get the length of the array. -- resize(vector_size_t size): change the size of the array reserving space for the new elements if needed. - -- void add_items(const T& data): append data from any container with std::vector-like interface. -- void copy_from(const T& data): assign data to match that of any container with std::vector-like interface. - -- void add_items(const NullFreeArrayView& data): append data from array view (faster than item by item). -- void copy_from(const NullFreeArrayView& data): assign data from array view (faster than item by item). - -- void add_items(const NullableArrayView& data): append data from array view (faster than item by item). -- void copy_from(const NullableArrayView& data): assign data from array view (faster than item by item). - -When V is primitive, the following functions are available, making the writer usable as std::vector. - -- push_back(std::optional): add item or null. -- PrimitiveWriter operator[](vector_size_t index): return a primitive writer that is assignable to std::optional for the item at index (should be called after a resize). 
-- PrimitiveWriter back(): return a primitive writer that is assignable to std::optional for the item at index length -1. - - -**MapWriter** - -- reserve(vector_size_t size): make sure space for `size` entries is allocated in the underlying vector. -- std::tuple&, out_type&> add_item(): add non-null item and return the writers of key and value as tuple. -- out_type& add_null(): add null item and return the key writer. -- vector_size_t size(): return the length of the array. - -- void add_items(const T& data): append data from any container with std::vector> like interface. -- void copy_from(const NullFreeMapView& data): assign data from array view (faster than item by item). -- void copy_from(const NullableMapView& data): assign data from array view (faster than item by item). - -When K and V are primitives, the following functions are available, making the writer usable as std::vector>. - -- resize(vector_size_t size): change the size. -- emplace(K, std::optional): add element to the map. -- std::tuple> operator[](vector_size_t index): returns pair of writers for element at index. Key writer is assignable to K. while value writer is assignable to std::optional. - -**RowWriter** - -- template set_null_at(): set null for row item at index I. -- template get_writer_at(): set not null for row item at index I, and return writer to to the row element at index I. - -When all types T... are primitives, the following functions are available. - -- void operator=(const std::tuple& inputs): assignable to std::tuple. -- void operator=(const std::tuple...>& inputs): assignable to std::tuple...>. -- void copy_from(const std::tuple& inputs): similar as the above. - -When a given Ti is primitive, the following is valid. - -- PrimitiveWriter exec::get(RowWriter): return a primitive writer for item at index I that is assignable to std::optional. - -**PrimitiveWriter** - -Assignable to std::optional allows writing null or value to the primitive. Returned by complex writers when writing nullable -primitives. - -**StringWriter<>**: - -- void reserve(size_t newCapacity) : Reserve a space for the output string with size of at least newCapacity. -- void resize(size_t newCapacity) : Set the size of the string. -- char* data(): returns pointer to the first char of the string, can be written to directly (safe to write to index at capacity()-1). -- vector_size_t capacity(): returns the capacity of the string. -- vector_size_t size(): returns the size of the string. -- operator+=(const T& input): append data from char* or any type with data() and size(). -- append(const T& input): append data from char* or any type with data() and size(). -- copy_from(const T& input): append data from char* or any type with data() and size(). - -When Zero-copy optimization is enabled (see zero-copy-string-result section above), the following functions can be used. - -- void setEmpty(): set to empty string. -- void setNoCopy(const StringView& value): set string to an input string without performing deep copy. - - -Limitations -*********** -1. If a function throws an exception while writing a complex type, then the output of the -row being written as well as the output of the next row are undefined. Hence, it's recommended -to avoid throwing exceptions after writing has started for a complex output within the function. + registerFunction>({"rand"}); Variadic Arguments ^^^^^^^^^^^^^^^^^^ @@ -768,7 +543,10 @@ Vector Functions Simple functions process a single row and produce a single value as a result. 
Vector functions process a batch of rows and produce a vector of results. -Some of the defining features of these functions are: +When implementing a function, prefer a simple function unless a vector function +implementation provides a significant performance gain that can be demonstrated +with a benchmark. +Some of the defining features of vector functions are: - take vectors as inputs and produce vectors as a result; - have access to vector encodings and metadata; @@ -805,15 +583,9 @@ The “rows” parameter specifies the set of rows in the incoming batch to process. This set may not include all the rows. By default, a vector function is assumed to have the default null behavior, i.e. null in any input produces a null result. In this case, the expression evaluation engine will exclude -rows with nulls from the “rows” specified in the call to “apply”. If a -function has a different behavior for null inputs, it must override the -isDefaultNullBehavior method to return false. - -.. code-block:: c++ - - bool isDefaultNullBehavior() const override { - return false; - } +rows with nulls from the “rows” specified in the call to “apply”. If a function +has a different behavior for null inputs, it must specify that during registration. +See :ref:`vector function registration` for more details. In this case, the “rows” parameter will include rows with null inputs and the function will need to handle these. By default, the function can assume that @@ -866,14 +638,9 @@ of the function arguments. These vectors are not necessarily flat and may be dictionary or constant encoded. However, a deterministic function that takes a single argument and has default null behavior is guaranteed to receive its only input as a flat or constant vector. By default, a function is assumed to -be deterministic. If that’s not the case, the function must override -isDeterministic method to return false. - -.. code-block:: c++ - - bool isDeterministic() const override { - return false; - } +be deterministic. If that’s not the case, it must specify the non-deterministic +behavior during registration. See :ref:`vector function registration` +for more details. Note that :ref:`decoded-vector` can be used to get a flat vector-like interface to any vector. A helper class exec::DecodedArgs can be used to decode multiple arguments. @@ -999,8 +766,8 @@ and eliminate the overhead of calling DecodedVector::valueAt template. .. code-block:: c++ if (base->isIdentityMapping() && exp->isIdentityMapping()) { - auto baseValues = base->values(); - auto expValues = exp->values(); + auto baseValues = base->data(); + auto expValues = exp->data(); rows.applyToSelected([&](int row) { rawResults[row] = std::pow(baseValues[row], expValues[row]); }); @@ -1017,13 +784,13 @@ exponent. .. code-block:: c++ if (base->isIdentityMapping() && exp->isIdentityMapping()) { - auto baseValues = base->values(); - auto expValues = exp->values(); + auto baseValues = base->data(); + auto expValues = exp->data(); rows.applyToSelected([&](int row) { rawResults[row] = std::pow(baseValues[row], expValues[row]); }); } else if (base->isIdentityMapping() && exp->isConstantMapping()) { - auto baseValues = base->values(); + auto baseValues = base->data(); auto expValue = exp->valueAt(0); rows.applyToSelected([&](int row) { rawResults[row] = std::pow(baseValues[row], expValue); @@ -1084,6 +851,8 @@ Simple functions are compatible with the TRY expression by default.
The framework wraps the “call” and “callNullable” methods in a try-catch and reports errors using context.setError. +.. _Registration: + Registration ^^^^^^^^^^^^ @@ -1095,12 +864,22 @@ Use exec::registerVectorFunction to register a stateless vector function. const std::string& name, std::vector<FunctionSignaturePtr> signatures, std::unique_ptr<VectorFunction> func, + VectorFunctionMetadata metadata = {}, bool overwrite = true) exec::registerVectorFunction takes a name, a list of supported signatures -and unique_ptr to an instance of the function. An optional “overwrite” flag -specifies whether to overwrite a function if a function with the specified -name already exists. +and unique_ptr to an instance of the function. It takes an optional 'metadata' +parameter that specifies whether a function is deterministic, has default null +behavior, and other properties. A helper VectorFunctionMetadataBuilder class +makes it easy to construct 'metadata'. For example, + +.. code-block:: c++ + + VectorFunctionMetadataBuilder().defaultNullBehavior(false).build(); + + +An optional “overwrite” flag specifies whether to overwrite a function if a function +with the specified name already exists. Use exec::registerStatefulVectorFunction to register a stateful vector function. @@ -1115,6 +894,7 @@ to a vector function over an equivalent simple function. const std::string& name, std::vector<FunctionSignaturePtr> signatures, VectorFunctionFactory factory, + VectorFunctionMetadata metadata = {}, bool overwrite = true) exec::registerStatefulVectorFunction takes a name, a list of supported @@ -1182,15 +962,14 @@ argument in order. The concat function takes an arbitrary number of varchar inputs and returns a varchar. FunctionSignatureBuilder allows specifying that the last argument may -appear zero or more times by calling the variableArity() method. +appear zero or more times by calling the variableArity("varchar") method. .. code-block:: c++ // varchar... -> varchar exec::FunctionSignatureBuilder() .returnType("varchar") - .argumentType("varchar") - .variableArity() + .variableArity("varchar") .build() The map_keys function takes any map and returns an array of map keys. diff --git a/velox/docs/develop/simd.rst b/velox/docs/develop/simd.rst index 7c8eb09582480..69653d16f90c9 100644 --- a/velox/docs/develop/simd.rst +++ b/velox/docs/develop/simd.rst @@ -178,4 +178,4 @@ mask. Note when the data type is 16 bits long, we need to do the process in 2 batches (``loadIndices(0)`` and ``loadIndices(1)``), because the indices are 32 bits -long and one SIME vector is not large enough to contain all the indices needed. +long and one SIMD vector is not large enough to contain all the indices needed. diff --git a/velox/docs/develop/spilling.rst b/velox/docs/develop/spilling.rst index 2f67e74f09f1a..6a96afe07c265 100644 --- a/velox/docs/develop/spilling.rst +++ b/velox/docs/develop/spilling.rst @@ -7,7 +7,8 @@ Background Spilling in Velox allows a query to succeed using a limited amount of memory when some operators are accumulating large state. For example, a hash -aggregation operator stores the intermediate aggregation state in a hash table, +aggregation operator stores the intermediate aggregation state in a +`hash table `_, and it starts to produce the results after processing all the input. In high cardinality workloads (large number of groups) the size of the hash table exceeds the query’s memory limit. @@ -167,7 +168,7 @@ partition to create a sorted reader to restore the spilled partition state.
std::unique_ptr<TreeOfLosers<SpillMergeStream>> Spiller::startMerge( int32_t partition); -**unsorted spill restore**: Used by order by hash build and hash probe +**unsorted spill restore**: Used by hash build and hash probe operators. The operator first calls Spiller::finishSpill() to mark the completion of spilling. The Spiller collects metadata for the spilled partitions and returns these to the operator. The operator processes the @@ -209,22 +210,6 @@ spillable operators. The latter in turn frees up memory by spilling out (part) of its memory state to disk. The integration of spilling with the memory management system is under development. -Velox can be configured to trigger spilling if the spillable operator's memory -usage exceeds a configurable limit: - -.. code-block:: c++ - - uint64_t QueryConfig::aggregationSpillMemoryThreshold() const; - - uint64_t QueryConfig::orderBySpillMemoryThreshold() const; - - uint64_t QueryConfig::joinSpillMemoryThreshold() const; - -This allows us to run queries using limited amount of memory without the memory -arbitration support. Note that the spilling itself can’t totally prevent out of -memory as the last memory allocation that exceeds the memory limit, can be made -from any operator in a query plan not always from the spillable one. - Spill Parameters ---------------- Spill File Size @@ -368,14 +353,7 @@ other to ensure all operators spill the same set of partitions. If operators spill independently, it is possible to end up with all partitions being spilled. To build a hash table, we need all rows from one or more partitions. Unlike hash aggregation and order by, the hash join spilling is explicitly -controlled by the hash build operators. A SpillOperatorGroup object coordinates -the spilling on all the operators. The SpillOperatorGroup object is shared by -all the hash build operators. It implements a recurring barrier function. When -spilling gets triggered, the object starts a barrier to stop all the hash build -operators executions. The last operator reaching the barrier acts as the -coordinator. It collects spillable stats from the Spillers of all the -operators, chooses a set of partitions to spill, and runs spilling on all the -Spillers with the selected partitions. +controlled by the hash build operators. .. image:: images/spill-hash-join-probe.png :width: 400 diff --git a/velox/docs/develop/testing.rst b/velox/docs/develop/testing.rst index 1cce1fdae7fe1..ce18009466880 100644 --- a/velox/docs/develop/testing.rst +++ b/velox/docs/develop/testing.rst @@ -7,3 +7,7 @@ Testing Tools testing/fuzzer testing/join-fuzzer + testing/memory-arbitration-fuzzer + testing/row-number-fuzzer + testing/writer-fuzzer + testing/spark-query-runner diff --git a/velox/docs/develop/testing/async-data-cache-fuzzer.rst b/velox/docs/develop/testing/async-data-cache-fuzzer.rst new file mode 100644 index 0000000000000..1259064724d57 --- /dev/null +++ b/velox/docs/develop/testing/async-data-cache-fuzzer.rst @@ -0,0 +1,63 @@ +============ +Cache Fuzzer +============ + +Cache fuzzer is designed to test the correctness and the reliability of the +in-memory async data cache and the durable local SSD cache, and their +interactions such as staging from the async data cache to the SSD cache, and loading +cache-miss data from the SSD cache back into the async data cache. + +During each iteration, the fuzzer performs the following actions step by step: +1. Creating a set of data files with varying sizes on the local file system as source data files. +2.
Setting up the async data cache with and without SSD using a specific configuration. 3. Performing parallel random reads from the source data files created in step 1. + +How to run +---------- + +Use velox_cache_fuzzer_test binary to run cache fuzzer: + +:: + + velox/exec/tests/velox_cache_fuzzer_test + +By default, the fuzzer will go through 10 iterations. Use --steps +or --duration_sec flag to run the fuzzer for longer. Use --seed to +reproduce fuzzer failures. + +Here is a full list of supported command line arguments. + +* ``–-steps``: How many iterations to run. Each iteration performs the data + generation, cache setup and parallel-read steps described above. Default is 10. + +* ``–-duration_sec``: For how long to run in seconds. If both ``-–steps`` + and ``-–duration_sec`` are specified, –duration_sec takes precedence. + +* ``–-seed``: The seed to generate random expressions and input vectors with. + +* ``–-num_threads``: Number of read threads. + +* ``–-read_iteration_sec``: For how long each read thread should run (in seconds). + +* ``–-num_source_files``: Number of data files to be created. + +* ``–-min_source_file_bytes``: Minimum source file size in bytes. + +* ``–-max_source_file_bytes``: Maximum source file size in bytes. + +* ``–-memory_cache_bytes``: Memory cache size in bytes. + +* ``–-ssd_cache_bytes``: SSD cache size in bytes. + +* ``–-num_ssd_cache_shards``: Number of SSD cache shards. + +* ``–-ssd_checkpoint_interval_bytes``: Checkpoint after every + ``--ssd_checkpoint_interval_bytes``/``--num_ssd_cache_shards`` written into + each file. 0 means no checkpointing. + +* ``–-enable_checksum``: Enable checksum write to SSD. + +* ``–-enable_checksum_read_verification``: Enable checksum read verification + from SSD. + +If running from CLion IDE, add ``--logtostderr=1`` to see the full output. diff --git a/velox/docs/develop/testing/fuzzer.rst b/velox/docs/develop/testing/fuzzer.rst index cb883f69e4b1d..97dd226698a84 100644 --- a/velox/docs/develop/testing/fuzzer.rst +++ b/velox/docs/develop/testing/fuzzer.rst @@ -1,6 +1,9 @@ -================================= -Expression and Aggregation Fuzzer -================================= +========================================== +Expression, Aggregation, and Window Fuzzer +========================================== + +Expression Fuzzer +----------------- Velox allows users to define UDFs (user-defined functions) and UDAFs (user-defined aggregate functions) and provides fuzzer tools to test the @@ -18,13 +21,19 @@ expression fuzzer evaluates each expression twice and asserts the results to be the same: using regular evaluation path and using simplified evaluation that flattens all input vectors before evaluating an expression. +Aggregation Fuzzer +------------------ + The Aggregation Fuzzer tests the HashAggregation operator, the StreamingAggregation operator and UDAFs by generating random aggregations and evaluating these on random input vectors. The Aggregation Fuzzer tests global aggregations (no grouping keys), group-by aggregations (one or more grouping keys), distinct aggregations (no aggregates), -aggregations with and without masks. +aggregations with and without masks, aggregations over sorted and distinct inputs. + +The Aggregation Fuzzer includes testing of spilling and abandoning partial +aggregation. The results of aggregations using functions supported by DuckDB are compared with DuckDB results. @@ -52,11 +61,72 @@ using OrderBy and StreamingAggregation. Fuzzer iterations alternate between generating plans using Values or TableScan nodes.
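To make these logically equivalent plans concrete, here is a minimal sketch assuming the test-only PlanBuilder utility from velox/exec/tests/utils/PlanBuilder.h; the column names c0/c1 and the sum aggregate are illustrative, not the fuzzer's actual code. Both plans must produce the same result for the same input:

.. code-block:: c++

    #include "velox/exec/tests/utils/PlanBuilder.h"

    using facebook::velox::RowVectorPtr;
    using facebook::velox::core::PlanNodePtr;
    using facebook::velox::exec::test::PlanBuilder;

    // Single-step aggregation: one HashAggregation operator computes the
    // final result directly.
    PlanNodePtr makeSingleAggPlan(const std::vector<RowVectorPtr>& input) {
      return PlanBuilder()
          .values(input)
          .singleAggregation({"c0"}, {"sum(c1)"})
          .planNode();
    }

    // Two-step aggregation: partial accumulators are merged by a final
    // aggregation; the results must match the single-step plan.
    PlanNodePtr makeTwoStepAggPlan(const std::vector<RowVectorPtr>& input) {
      return PlanBuilder()
          .values(input)
          .partialAggregation({"c0"}, {"sum(c1)"})
          .finalAggregation()
          .planNode();
    }
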
+Many functions work well with random input data. However, some functions have +restrictions on the input values, and random data tends to violate them, causing +failures and preventing the fuzzer from exercising the aggregation beyond the +initial sanity checks. + +For example, the “min” function has 2 signatures: + +.. code-block:: + + min(x) → [same as x] + Returns the minimum value of all input values. + + min(x, n) → array<[same as x]> + Returns n smallest values of all input values of x. n must be a positive integer and not exceed 10,000. + +The second signature, let's call it min_n, has 2 arguments. The first argument +is the value and the second is a constant number of minimum values to return. +Most of the time, the randomly generated value for the second argument doesn’t fall +into the [1, 10’000] range and the aggregation fails: + +.. code-block:: + + VeloxUserError + Error Source: USER + Error Code: INVALID_ARGUMENT + Reason: (3069436511015786487 vs. 10000) second argument of max/min must be less than or equal to 10000 + Retriable: False + Expression: newN <= 10'000 + Function: checkAndSetN + File: /Users/mbasmanova/cpp/velox-1/velox/functions/prestosql/aggregates/MinMaxAggregates.cpp + Line: 574 + +Similarly, the approx_distinct function has a signature that allows specifying the maximum +standard error in the range of [0.0040625, 0.26000]. Random values for 'e' have +a near-zero chance of falling into this range. + +To enable effective testing of these functions, Aggregation Fuzzer allows +registering custom input generators for individual functions. + When testing aggregate functions whose results depend on the order of inputs (e.g. map_agg, map_union, arbitrary, etc.), the Fuzzer verifies that all plans succeed or fail with compatible user exceptions. When plans succeed, the Fuzzer verifies that number of result rows is the same across all plans. +Additionally, the Fuzzer tests order-sensitive functions using aggregations over +sorted inputs. When inputs are sorted, the results are deterministic and therefore +can be verified. + +The Fuzzer also supports specifying custom result verifiers. For example, array_agg +results can be verified by first sorting the result arrays. Similarly, map_agg +results can be partially verified by transforming result maps into sorted arrays +of map keys. approx_distinct can be verified by comparing the results with +count(distinct). + +A custom verifier may work by comparing results of executing two logically +equivalent Velox plans, or results of executing a Velox plan and an equivalent query +in a reference DB. These verifiers first transform the results to make them +deterministic, then compare them. This is used to verify array_agg, set_agg, +set_union, map_agg, and similar functions. + +A custom verifier may also work by analyzing the results of a single execution +of a Velox plan. For example, approx_distinct verifies the results by +computing count(distinct) on input data and checking whether the results +of approx_distinct are within the expected error bound. The verifier for approx_percentile +works similarly. + At the end of the run, Fuzzer prints out statistics that show what has been tested: + Total aggregations verified against DuckDB: 2537 (44.63%) Total failed aggregations: 1061 (18.67%) +.. _window-fuzzer: + +Window Fuzzer +------------- + +The Window fuzzer tests the Window operator with window and aggregation +functions by generating random window queries and evaluating them on +random input vectors.
Results of the window queries can be compared to Presto as the source of truth. + +For each window operation, the fuzzer generates multiple logically equivalent +plans and verifies that results match. These plans include: + +- Values -> Window +- TableScan -> PartitionBy -> Window +- Values -> OrderBy -> Window (streaming) +- TableScan -> OrderBy -> Window (streaming) + +Window fuzzer currently doesn't use any custom result verifiers. Functions +that require custom result verifiers are left unverified. + How to integrate --------------------------------------- @@ -89,23 +180,25 @@ aggregate functions supported by the engine, and call ``AggregationFuzzerRunner::run()`` defined in `AggregationFuzzerRunner.h`_. See `AggregationFuzzerTest.cpp`_. -.. _AggregationFuzzerRunner.h: https://github.com/facebookincubator/velox/blob/main/velox/exec/tests/AggregationFuzzer.h +.. _AggregationFuzzerRunner.h: https://github.com/facebookincubator/velox/blob/main/velox/exec/fuzzer/AggregationFuzzer.h -.. _AggregationFuzzerTest.cpp: https://github.com/facebookincubator/velox/blob/main/velox/exec/tests/AggregationFuzzerTest.cpp +.. _AggregationFuzzerTest.cpp: https://github.com/facebookincubator/velox/blob/main/velox/functions/prestosql/fuzzer/AggregationFuzzerTest.cpp Aggregation Fuzzer allows you to indicate functions whose results depend on the -order of inputs and optionally provide an expression to apply to the result to -make it stable. For example, the results of array_agg can be stabilized by -applying array_sort on top: array_sort(array_map(x)) and the results of map_agg -can be stabilized using array_sort(map_keys(map_agg(k, v))). Order-dependent -functions are tested to ensure no crashes or failures. The results of -order-dependent functions with stabilizing expressions are further verified for -correctness by ensuring that results of logically equivalent plans match. +order of inputs and optionally provide custom result verifiers. The Fuzzer +also allows you to provide custom input generators for individual functions. + +Integration with the Window Fuzzer is similar to the Aggregation Fuzzer. See +`WindowFuzzerRunner.h`_ and `WindowFuzzerTest.cpp`_. + +.. _WindowFuzzerRunner.h: https://github.com/facebookincubator/velox/blob/main/velox/exec/fuzzer/WindowFuzzer.h + +.. _WindowFuzzerTest.cpp: https://github.com/facebookincubator/velox/blob/main/velox/functions/prestosql/fuzzer/WindowFuzzerTest.cpp How to run ---------------------------- -Fuzzers support a number of powerful command line arguments. +All fuzzers support a number of powerful command line arguments. * ``–-steps``: How many iterations to run. Each iteration generates and evaluates one expression or aggregation. Default is 10. @@ -119,7 +212,11 @@ Fuzzers support a number of powerful command line arguments. * ``–-batch_size``: The size of input vectors to generate. Default is 100. -There are also arguments that toggle certain fuzzer features: +* ``--null_ratio``: Chance of adding a null constant to the plan, or null value in a vector (expressed as double from 0 to 1). Default is 0.1. + +* ``--max_num_varargs``: The maximum number of variadic arguments fuzzer will generate for functions that accept variadic arguments. Fuzzer will generate up to max_num_varargs arguments for the variadic list in addition to the required arguments by the function. Default is 10. + +Below are arguments that toggle certain fuzzer features in Expression Fuzzer: * ``--retry_with_try``: Retry failed expressions by wrapping them in a try() statement. Default is false.
@@ -131,6 +228,8 @@ There are also arguments that toggle certain fuzzer features: * ``--velox_fuzzer_enable_complex_types``: Enable testing of function signatures with complex argument or return types. Default is false. +* ``--velox_fuzzer_enable_decimal_type``: Enable testing of function signatures with decimal argument or return type. Default is false. + * ``--lazy_vector_generation_ratio``: Specifies the probability with which columns in the input row vector will be selected to be wrapped in lazy encoding (expressed as double from 0 to 1). Default is 0.0. * ``--velox_fuzzer_enable_column_reuse``: Enable generation of expressions where one input column can be used by multiple subexpressions. Default is false. @@ -141,21 +240,19 @@ There are also arguments that toggle certain fuzzer features: * ``--max_expression_trees_per_step``: This sets an upper limit on the number of expression trees to generate per step. These trees would be executed in the same ExprSet and can re-use already generated columns and subexpressions (if re-use is enabled). Default is 1. -In addition, Aggregation Fuzzer also supports tuning parameters: - -* ``--num_batches``: The number of input vectors of size `--batch_size` to generate. Default is 10. +* ``--velox_fuzzer_max_level_of_nesting``: Max levels of expression nesting. Default is 10 and minimum is 1. -* ``--max_num_varargs``: The maximum number of variadic arguments fuzzer will generate for functions that accept variadic arguments. Fuzzer will generate up to max_num_varargs arguments for the variadic list in addition to the required arguments by the function. Default is 10. +In addition, Aggregation Fuzzer supports the tuning parameter: -* ``--null_ratio``: Chance of adding a null constant to the plan, or null value in a vector (expressed as double from 0 to 1). Default is 0.1. +* ``--num_batches``: The number of input vectors of size `--batch_size` to generate. Default is 10. -* ``--velox_fuzzer_max_level_of_nesting``: Max levels of expression nesting. Default is 10 and minimum is 1. +Window Fuzzer supports verifying window query results against a reference DB: -* ``--num_batches``: The number of input vectors of size `--batch_size` to generate. Default is 10. +* ``--enable_window_reference_verification``: When true, the results of the window aggregation are compared to reference DB results. Default is false. If running from CLion IDE, add ``--logtostderr=1`` to see the full output. -An example set of arguments to run the fuzzer with all features enabled is as follows: +An example set of arguments to run the expression fuzzer with all features enabled is as follows: ``--duration_sec 60 --enable_variadic_signatures --lazy_vector_generation_ratio 0.2 @@ -169,6 +266,21 @@ An example set of arguments to run the fuzzer with all features enabled is as fo --repro_persist_path= --logtostderr=1`` + +`WindowFuzzerTest.cpp`_ and `AggregationFuzzerTest.cpp`_ allow results to be +verified against Presto. To set up Presto as a reference DB, please follow these +`instructions`_. The following flags control the connection to the Presto +cluster: ``--presto_url``, the HTTP server URL along with its port number, +and ``--req_timeout_ms``, which sets the request timeout in milliseconds. The +timeout is set to 1000 ms by default but can be increased if this time is +insufficient for certain queries.
Example command: + +:: + + velox/functions/prestosql/fuzzer:velox_window_fuzzer_test --enable_window_reference_verification --presto_url="http://127.0.0.1:8080" --req_timeout_ms=2000 --duration_sec=60 --logtostderr=1 --minloglevel=0 + +.. _instructions: https://github.com/facebookincubator/velox/issues/8111 + How to reproduce failures ------------------------------------- @@ -176,7 +288,7 @@ When a Fuzzer test fails, a seed number and the evaluated expression are printed to the log. An example is given below. Developers can use ``--seed`` with this seed number to rerun the exact same expression with the same inputs, and use a debugger to investigate the issue. For the example below, the command -to reproduce the error would be ``velox/expression/tests/velox_expression_fuzzer_test --seed 1188545576``. +to reproduce the error would be ``velox/expression/fuzzer/velox_expression_fuzzer_test --seed 1188545576``. :: @@ -235,6 +347,8 @@ ExpressionRunner supports the following flags: * ``--sql_path`` path to expression SQL that was created by the Fuzzer +* ``--registry`` function registry to use for evaluating expression. One of "presto" (default) or "spark". + * ``--complex_constant_path`` optional path to complex constants that aren't accurately expressible in SQL (Array, Map, Structs, ...). This is used with the SQL file to reproduce the exact expression; not needed when the expression doesn't contain complex constants. * ``--lazy_column_list_path`` optional path for the file stored on-disk which contains a vector of column indices that specify which columns of the input row vector should be wrapped in lazy. This is used when the failing test included input columns that were lazy vectors. @@ -255,6 +369,10 @@ ExpressionRunner supports the following flags: * ``--store_result_path`` optional directory path for storing the results of evaluating SQL expression or query in 'common', 'simplified' or 'query' modes. +* ``--findMinimalSubExpression`` optional. Whether to find the minimal failing subexpression on a result mismatch. Set to false by default. + +* ``--useSeperatePoolForInput`` optional. If true (default), the expression evaluator and input vectors use different memory pools. This helps trigger code-paths that can depend on vectors having different pools. For example, when copying a flat string vector, copies of the strings stored in the string buffers need to be created. If, however, the pools were the same between the vectors, the buffers could simply be shared between them instead. + Example command: :: diff --git a/velox/docs/develop/testing/join-fuzzer.rst b/velox/docs/develop/testing/join-fuzzer.rst index be7d61a467bfc..3e16434e986d4 100644 --- a/velox/docs/develop/testing/join-fuzzer.rst +++ b/velox/docs/develop/testing/join-fuzzer.rst @@ -15,7 +15,7 @@ combined with randomly generated payload. When generating the join plan node, fuzzer shuffles join keys and output columns and randomly drops some columns from the output. -The fuzzer runs the query plan and compares the results with DuckDB. +The fuzzer runs the query plan and compares the results with the reference (DuckDB or Presto) to get the expected result. The fuzzer then generates a set of different but logically equivalent plans, runs them and verifies that results are the same. Each plan runs twice: with @@ -42,7 +42,7 @@ Use velox_join_fuzzer_test binary to run join fuzzer: velox/exec/tests/velox_join_fuzzer_test -By default, the fuzzer will go through 10 interations. Use --steps +By default, the fuzzer will go through 10 iterations.
Use --steps or --duration_sec flag to run the fuzzer for longer. Use --seed to reproduce fuzzer failures. @@ -65,4 +65,8 @@ Here is a full list of supported command line arguments. * ``--enable_spill``: Whether to test with spilling or not. Default is true. +* ``--arbitrator_capacity``: Arbitrator capacity in bytes. Default is 6L << 30. + +* ``--allocator_capacity``: Allocator capacity in bytes. Default is 8L << 30. + If running from CLion IDE, add ``--logtostderr=1`` to see the full output. diff --git a/velox/docs/develop/testing/memory-arbitration-fuzzer.rst b/velox/docs/develop/testing/memory-arbitration-fuzzer.rst new file mode 100644 index 0000000000000..2895138a6faff --- /dev/null +++ b/velox/docs/develop/testing/memory-arbitration-fuzzer.rst @@ -0,0 +1,57 @@ +========================= +MemoryArbitration Fuzzer +========================= + +The MemoryArbitrationFuzzer is a test tool designed to automatically generate and execute multiple query plans +in parallel with a tight total memory budget. It aims to stress memory arbitration processing and validate that there +are no crashes or hangs in concurrent query execution mode. Each query either succeeds or fails with an expected error. +It works as follows: + +1. Data Generation: It starts by generating a random set of input data, also known as a vector. This data can + have a variety of encodings and data layouts to ensure thorough testing. +2. Plan Generation: Generate multiple plans with different query shapes. Currently, it supports HashJoin and + HashAggregation plans. +3. Query Execution: Create multiple threads; each thread randomly picks a plan, with spilling enabled or not, and repeatedly + runs it for ${iteration_duration_sec} seconds. Each query thread expects queries to succeed or fail with + query OOM or abort errors; otherwise it throws (see the sketch at the end of this page). +4. Iteration: This process is repeated multiple times to ensure reliability and robustness. + +How to run +---------- + +Use velox_memory_arbitration_fuzzer_test binary to run this fuzzer: + +:: + + velox/exec/tests/velox_memory_arbitration_fuzzer_test --seed 123 --duration_sec 60 + +By default, the fuzzer will go through 10 iterations. Use --steps +or --duration_sec flag to run the fuzzer for longer. Use --seed to +reproduce fuzzer failures. + +Here is a full list of supported command line arguments. + +* ``–-steps``: How many iterations to run. Each iteration runs one round of + concurrent query execution. Default is 10. + +* ``–-duration_sec``: For how long to run in seconds. If both ``-–steps`` + and ``-–duration_sec`` are specified, –duration_sec takes precedence. + +* ``–-seed``: The seed to generate random expressions and input vectors with. + +* ``–-v``: Verbose logging (from `Google Logging Library `_). + +* ``–-batch_size``: The size of input vectors to generate. Default is 100. + +* ``--num_batches``: The number of input vectors of size `--batch_size` to + generate. Default is 5. + +* ``--iteration_duration_sec``: For how long it should run (in seconds) per iteration. + +* ``--arbitrator_capacity``: Arbitrator capacity in bytes. + +* ``--allocator_capacity``: Allocator capacity in bytes. + +* ``--num_threads``: Number of threads running queries in parallel per iteration. + +If running from CLion IDE, add ``--logtostderr=1`` to see the full output.
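The success-or-expected-failure rule from step 3 above can be sketched as follows. This is a hypothetical helper, not the fuzzer's actual code; it assumes the error code constants defined in velox/common/base/VeloxException.h:

.. code-block:: c++

    #include "velox/common/base/VeloxException.h"

    using facebook::velox::VeloxException;
    namespace error_code = facebook::velox::error_code;

    // Returns true if a query failure is acceptable under memory pressure:
    // the memory capacity was exceeded or the query was aborted by the
    // arbitrator. Any other error indicates a bug, so the query thread
    // re-throws it.
    bool isExpectedFailure(const VeloxException& e) {
      return e.errorCode() == error_code::kMemCapExceeded ||
          e.errorCode() == error_code::kMemAborted;
    }
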
diff --git a/velox/docs/develop/testing/row-number-fuzzer.rst b/velox/docs/develop/testing/row-number-fuzzer.rst
new file mode 100644
index 0000000000000..381c4813e9855
--- /dev/null
+++ b/velox/docs/develop/testing/row-number-fuzzer.rst
@@ -0,0 +1,59 @@
+================
+RowNumber Fuzzer
+================
+
+The RowNumberFuzzer is a testing tool that automatically generates equivalent query plans and then executes these plans
+to validate the consistency of the results. It works as follows:
+
+1. Data Generation: It starts by generating a random set of input data, also known as a vector. This data can
+   have a variety of encodings and data layouts to ensure thorough testing.
+2. Plan Generation: Generates two equivalent query plans: one is row-number over a ValuesNode as the base plan,
+   and the other is row-number over a TableScanNode as the alternate plan.
+3. Query Execution: Executes those equivalent query plans using the generated data and asserts that the results are
+   consistent across different plans.
+   i. Execute the base plan, compare the result with the reference (DuckDB or Presto) and use it as the expected result.
+   #. Execute the alternate plan multiple times with and without spill, and compare each result with the
+      expected result.
+4. Iteration: This process is repeated multiple times to ensure reliability and robustness.
+
+How to run
+----------
+
+Use velox_row_number_fuzzer_test binary to run rowNumber fuzzer:
+
+::
+
+    velox/exec/tests/velox_row_number_fuzzer_test --seed 123 --duration_sec 60
+
+By default, the fuzzer will go through 10 iterations. Use --steps
+or --duration-sec flag to run fuzzer for longer. Use --seed to
+reproduce fuzzer failures.
+
+Here is a full list of supported command line arguments.
+
+* ``--steps``: How many iterations to run. Each iteration generates and
+  evaluates one expression or aggregation. Default is 10.
+
+* ``--duration_sec``: For how long to run in seconds. If both ``--steps``
+  and ``--duration_sec`` are specified, --duration_sec takes precedence.
+
+* ``--seed``: The seed to generate random expressions and input vectors with.
+
+* ``--v=1``: Verbose logging (from `Google Logging Library `_).
+
+* ``--batch_size``: The size of input vectors to generate. Default is 100.
+
+* ``--num_batches``: The number of input vectors of size `--batch_size` to
+  generate. Default is 5.
+
+* ``--enable_spill``: Whether to test with spilling or not. Default is true.
+
+* ``--presto_url``: The PrestoQueryRunner URL along with its port number.
+
+* ``--req_timeout_ms``: Timeout in milliseconds of an HTTP request to the PrestoQueryRunner.
+
+* ``--arbitrator_capacity``: Arbitrator capacity in bytes. Default is 6L << 30.
+
+* ``--allocator_capacity``: Allocator capacity in bytes. Default is 8L << 30.
+
+If running from CLion IDE, add ``--logtostderr=1`` to see the full output.
diff --git a/velox/docs/develop/testing/spark-query-runner.rst b/velox/docs/develop/testing/spark-query-runner.rst
new file mode 100644
index 0000000000000..b72880fede115
--- /dev/null
+++ b/velox/docs/develop/testing/spark-query-runner.rst
@@ -0,0 +1,75 @@
+==================
+Spark Query Runner
+==================
+
+Introduction
+------------
+
+The Spark Query Runner is a tool designed to facilitate the testing of Velox.
+It helps ensure the correctness of Velox's computation against Spark and
+provides a method for identifying potential issues in Velox's implementation.
+Spark Query Runner is designed to run against Spark-3.5.1.
+
+How It Works
+------------
+
+The Spark Query Runner operates by executing given SQL queries on Spark and
+returning results in Velox data format, which allows the comparison of results
+between Velox and Spark.
+
+Since Spark 3.4, Spark Connect has introduced a decoupled client-server architecture
+for Spark that allows remote connectivity to Spark clusters. From the client
+perspective, Spark Connect mostly behaves as any other gRPC client, which is polyglot
+and cross-platform. During execution, the Spark Connect endpoint embedded on the
+Spark Server receives and parses queries into Spark’s logical plan operators.
+From there, the standard Spark execution process kicks in, ensuring that Spark
+Connect leverages all of Spark’s optimizations and enhancements. Results are
+streamed back to the client through gRPC as Arrow-encoded row batches.
+
+In the Spark Query Runner, we use Spark Connect to submit queries to Spark and fetch
+the results back to Velox. The steps for this process are as follows:
+
+1. Provide the Spark SQL query to be executed. The query could be generated from a Velox
+   plan node or written manually.
+2. Create a protobuf message `ExecutePlanRequest` from the SQL query. The protocols
+   used by Spark Connect are defined in `Apache Spark `_.
+3. Submit the message to SparkConnectService through gRPC API `ExecutePlan`.
+4. Fetch Spark's results from the execution response. Results are in Arrow IPC stream format,
+   and can be read as Arrow RecordBatch by `arrow::ipc::RecordBatchReader`.
+5. Convert the Arrow RecordBatch to a Velox vector for the comparison with Velox's results.
+
+Usage
+-----
+
+To use the Spark Query Runner, you will need to deploy an executable Spark and start the
+Spark Connect server with the command below.
+
+.. code-block::
+
+   "$SPARK_HOME"/sbin/start-connect-server.sh --jars "$JAR_PATH"/spark-connect_2.12-3.5.1.jar
+
+The jar of Spark Connect can be downloaded from the `maven repository `_.
+If the Spark Connect server started successfully, you will see a log message like the one below. The server will
+be started at `localhost:15002`.
+
+.. code-block::
+
+   INFO SparkConnectServer: Spark Connect server started at: 0:0:0:0:0:0:0:0%0:15002
+
+Another option is to use the Spark Query Runner in the docker image `ghcr.io/facebookincubator/velox-dev:spark-server`
+provided by Velox. It includes an executable Spark and the start script. You can download
+the image and run the command below to start the Spark Connect server in it.
+
+.. code-block::
+
+   bash /opt/start-spark.sh
+
+You can then provide the Spark Query Runner with the SQL query and the data to run the
+query on. The tool will execute the query on Spark and return results in Velox data format.
+
+Currently, using Spark as the reference DB is only supported in the aggregate fuzzer test, in which
+the results from Velox and Spark are compared to check for any differences. If the results
+match, it indicates that Velox is producing the correct output. If the results differ, it
+suggests a potential issue in Velox that needs to be investigated. You can trigger its test
+referring to :doc:`Fuzzer `.
diff --git a/velox/docs/develop/testing/writer-fuzzer.rst b/velox/docs/develop/testing/writer-fuzzer.rst
new file mode 100644
index 0000000000000..ab78732a1e549
--- /dev/null
+++ b/velox/docs/develop/testing/writer-fuzzer.rst
@@ -0,0 +1,48 @@
+=============
+Writer Fuzzer
+=============
+
+Writer fuzzer tests table write plans with up to 5 regular columns, up to
+3 partition keys, up to 3 bucket columns, and up to 3 sorted columns.
+
+At each iteration, fuzzer randomly generates a table write plan with different
+table properties including un-partitioned and partitioned, non-bucketed and bucketed,
+sorted and unsorted.
+
+The fuzzer then generates inputs, runs the query plan and compares the
+results with PrestoDB.
+As of now, we compare:
+
+1. How many rows were written.
+2. Output directories have the same directory layout and hierarchy.
+3. The same data were written by Velox and PrestoDB, including bucket numbers.
+4. Data in sorted columns is in the same order if the table is sorted.
+
+How to run
+----------
+
+Use velox_writer_fuzzer_test binary to run writer fuzzer:
+
+::
+
+    velox/exec/tests/velox_writer_fuzzer_test
+
+By default, the fuzzer will go through 10 iterations. Use --steps
+or --duration-sec flag to run fuzzer for longer. Use --seed to
+reproduce fuzzer failures.
+
+Here is a full list of supported command line arguments.
+
+* ``--steps``: How many iterations to run. Each iteration generates and
+  evaluates one table writer plan. Default is 10.
+
+* ``--duration_sec``: For how long to run in seconds. If both ``--steps``
+  and ``--duration_sec`` are specified, --duration_sec takes precedence.
+
+* ``--seed``: The seed to generate random expressions and input vectors with.
+
+* ``--batch_size``: The size of input vectors to generate. Default is 100.
+
+* ``--num_batches``: The number of input vectors of size `--batch_size` to
+  generate. Default is 5.
+
+If running from CLion IDE, add ``--logtostderr=1`` to see the full output.
diff --git a/velox/docs/develop/timestamp.rst b/velox/docs/develop/timestamp.rst
new file mode 100644
index 0000000000000..b29577dea25e9
--- /dev/null
+++ b/velox/docs/develop/timestamp.rst
@@ -0,0 +1,241 @@
+=================================
+Timestamp and Timezone Management
+=================================
+
+Concepts
+--------
+
+Following ANSI SQL semantics, TIMESTAMP is a data type that represents a
+reading of a wall clock and a calendar, e.g., ``2024-04-09 18:25:00``. Note that
+a TIMESTAMP does not represent an absolute point in time, as the exact same
+wall clock time may be read at different instants in time depending on where
+one is situated on Earth. For example, ``2024-04-09 18:25:00`` in California
+and in China were perceived at different absolute points in time, about 15
+hours apart.
+
+To represent absolute points in time, SQL defines a TIMESTAMP WITH TIMEZONE
+type, which conceptually represents a pair of a wall time and calendar read
+(say, ``2024-04-09 18:25:00``), and a timezone (``PDT``, or
+``America/Los_Angeles``). With these two values, one can unambiguously
+represent an absolute instant in time.
+
+Naturally, a TIMESTAMP WITH TIMEZONE can be cast into a TIMESTAMP by just
+ignoring the timezone and keeping the timestamp wall time, and a TIMESTAMP can
+be cast into a TIMESTAMP WITH TIMEZONE by associating a timezone to it. The
+timezone can either be explicitly specified by the users, or implicitly taken
+from the user system or session information.
+
+Physical Representation
+-----------------------
+
+Representing timestamps in memory as a string or a set of values for year,
+month, day, hour, and so on, is inefficient. Therefore, timestamps are usually
+stored in a columnar layout as a 64-bit integer representing the number of
+seconds elapsed since ``1970-01-01 00:00:00``. Negative values represent time
+prior to that.
+
+However, note that the physical representation of the timestamp is orthogonal
+to its logical meaning.
For example, the timestamp represented by the ``0``
+integer was perceived at different absolute points in time depending on the
+observer’s timezone, and does not necessarily imply that it was observed in the
+UTC timezone. When a timestamp represents the number of seconds in UTC
+specifically (at that exact absolute instant in time), it may be called a *unix
+epoch* or *unix time*.
+
+Velox Classes and APIs
+----------------------
+
+Velox provides a few classes and APIs to allow developers to store, process,
+and convert timestamps across timezones:
+
+**Timestamp:** In Velox, timestamps are represented by the `Timestamp
+`_
+class. The Timestamp class stores two 64-bit integers, one containing the
+number of seconds from ``1970-01-01 00:00:00``, and one containing the
+nanoseconds offset in that particular second, in order to provide nanosecond
+precision. A few more observations:
+
+* While “seconds” can be negative to represent time before ``1970-01-01
+  00:00:00``, “nanoseconds” are always positive.
+
+* Although Velox supports nanosecond precision, engines like Presto and Spark
+  may only need millisecond or microsecond precision.
+
+* The Timestamp class only offers a physical representation of timestamps, but
+  does not carry logical information about its timezone. In other words, it
+  cannot, by itself, represent an absolute point in time.
+
+**Timezone IDs:** To physically represent timezones, Velox provides the
+`TimezoneMap.h `_
+API. This API provides a 1:1 mapping from each available timezone to a
+monotonically increasing integer (a timezone ID), such that this integer can be
+used to efficiently represent timezones, preventing the use of inefficient
+timezone string names like ``America/Los_Angeles``. Considering there are about
+2k valid timezone definitions, 12 bits are enough to represent timezone IDs.
+
+Timezone IDs in Velox are based on the id map used by Presto and are
+`available here `_.
+They are automatically generated using `this script `_.
+While timezone IDs are an implementation detail and ideally should not leak
+outside of Velox execution, they are exposed if data containing
+TimestampWithTimezones are serialized, for example.
+
+**TimestampWithTimezone:** To represent an absolute point in time, Velox provides
+`TimestampWithTimezone `_.
+This abstraction implements the TIMESTAMP WITH TIMEZONE SQL semantic discussed
+above, and is based on Presto’s implementation - therefore it only supports
+millisecond precision.
+
+TimestampWithTimezone physically packs two integers in a single 64-bit word, using
+12 bits for the timezone ID, and 52 bits for a millisecond-precision timestamp.
+
+Note that to accelerate timestamp conversion functions, the timestamps stored
+in a TimestampWithTimezone are **always relative to UTC** - they are unix epochs.
+This means that converting a TimestampWithTimezone across timezones is
+efficiently done by just overwriting the 12 timezone ID bits, and that comparisons
+can be done by just comparing the 52 timestamp bits (ignoring the timezone ID).
+
+However, unpacking/converting a TimestampWithTimezone into an absolute time
+definition requires a
+`timezone conversion `_.
+
+Conversions Across Timezones
+----------------------------
+
+A common operation required when processing timestamps and timezones is finding
+the wall clock and calendar read in a specific timezone given an absolute point
+in time described by a wall clock and calendar read in a different timezone.
+For example, at the exact point in time when UTC hits ``1970-01-01 00:00:00``,
+what was the wall clock read in China?
+
+Timezone conversions are tricky since they are non-linear and depend on
+daylight savings time schedules and other local regulations, and these change
+over time. To enable such conversions, `IANA `_
+publishes an authoritative global database for timezone conversions, which is
+periodically pushed to systems using packages like tzdata for Linux.
+
+In Velox, timezone conversions are done using std::chrono. Starting in C++20,
+std::chrono `supports conversion of timestamp across timezones `_.
+To support older versions of the C++ standard, in Velox we vendor an
+implementation of this API at `velox/external/date/ `_.
+This class handles timezone conversions by leveraging APIs provided by the
+operating system, based on the tzdata database installed locally. If systems
+happen to have inconsistent or older versions of the tzdata database, Velox’s
+conversions may produce inconsistent results.
+
+On Linux, you can check the tzdata installed in your system by:
+
+.. code-block:: bash
+
+   $ rpm -qa | grep tzdata
+   tzdata-2024a-1.fc38.noarch
+
+Timezone conversions are done using special methods in the Timestamp class:
+``Timestamp::toGMT()`` and ``Timestamp::toTimezone()``. They can take either a
+timezone ID or a tz::TimeZone pointer. Providing a tz::TimeZone is
+generally more efficient, but std::chrono does not handle time zone offsets
+such as ``+09:00``. Timezone offsets are only supported in the API version
+that takes a timezone ID.
+
+Casts
+-----
+
+This section describes examples of timestamp casts following ANSI SQL
+semantics, using `Presto as a reference implementation `_,
+using ``set session legacy_timestamp = false;`` (see the section below for
+details).
+
+Timestamp literals are created based on whether time zone information is found
+in the string or not:
+
+::
+
+    SELECT typeof(TIMESTAMP '1970-01-01 00:00:00'); -- timestamp
+    SELECT typeof(TIMESTAMP '1970-01-01 00:00:00 UTC'); -- timestamp with time zone
+
+Converting a TimestampWithTimezone into a Timestamp works by dropping the
+timezone information and returning only the timestamp portion:
+
+::
+
+    SELECT cast(TIMESTAMP '1970-01-01 00:00:00 UTC' as timestamp); -- 1970-01-01 00:00:00.000
+    SELECT cast(TIMESTAMP '1970-01-01 00:00:00 America/New_York' as timestamp); -- 1970-01-01 00:00:00.000
+
+To convert a Timestamp into a TimestampWithTimezone, one needs to specify a
+timezone. In Presto, the session timezone is used by default:
+
+::
+
+    SELECT current_timezone(); -- America/Los_Angeles
+    SELECT cast(TIMESTAMP '1970-01-01 00:00:00' as timestamp with time zone); -- 1970-01-01 00:00:00.000 America/Los_Angeles
+
+Conversion across TimestampWithTimezone can be done using the AT TIME ZONE
+construct.
+
+The semantic of this operation is: at the absolute point in time described by
+the source TimestampWithTimezone (``1970-01-01 00:00:00 UTC``), what would be
+the clock/calendar read at the target timezone (Los Angeles)?
+
+::
+
+    SELECT TIMESTAMP '1970-01-01 00:00:00 UTC' AT TIME ZONE 'America/Los_Angeles'; -- 1969-12-31 16:00:00.000 America/Los_Angeles
+    SELECT TIMESTAMP '1970-01-01 00:00:00 UTC' AT TIME ZONE 'UTC'; -- 1970-01-01 00:00:00.000 UTC
+
+Strings can be converted into Timestamp and TimestampWithTimezone:
+
+::
+
+    SELECT cast('1970-01-01 00:00:00' as timestamp); -- 1970-01-01 00:00:00.000
+    SELECT cast('1970-01-01 00:00:00 America/Los_Angeles' as timestamp with time zone); -- 1970-01-01 00:00:00.000 America/Los_Angeles
+
+One can also convert a TimestampWithTimezone into a unix epoch/time. The
+semantic of this operation is: at the absolute point in time described by the
+timestamp with timezone taken as a parameter, what was the unix epoch? Remember
+that unix epoch is the number of seconds since ``1970-01-01 00:00:00`` in UTC:
+
+::
+
+    SELECT to_unixtime(TIMESTAMP '1970-01-01 00:00:00 UTC'); -- 0.0
+    SELECT to_unixtime(TIMESTAMP '1970-01-01 00:00:00 America/Los_Angeles'); -- 28800.0
+
+The opposite conversion can be achieved using ``from_unixtime()``. The function
+may take an optional second parameter to specify the timezone, having the same
+semantic as AT TIME ZONE described above:
+
+::
+
+    SELECT from_unixtime(0); -- 1970-01-01 00:00:00.000
+    SELECT from_unixtime(0, 'UTC'); -- 1970-01-01 00:00:00.000 UTC
+    SELECT from_unixtime(0, 'America/Los_Angeles'); -- 1969-12-31 16:00:00.000 America/Los_Angeles
+
+Presto Cast Legacy Behavior
+---------------------------
+
+For historical reasons, Presto used to interpret a TIMESTAMP as an absolute
+point in time at the user’s time zone, instead of a timezone-less wall clock
+reading as ANSI SQL defines it. More information
+`can be found here `_.
+
+Although this has been fixed in newer versions, a ``legacy_timestamp`` session
+flag was added to preserve backwards compatibility. When this flag is set,
+timestamps have a different semantic:
+
+::
+
+    SET SESSION legacy_timestamp = true;
+    SELECT cast(TIMESTAMP '1970-01-01 00:00:00 UTC' as timestamp); -- 1969-12-31 16:00:00.000
+    SELECT cast('1970-01-01 00:00:00 UTC' as timestamp); -- 1969-12-31 16:00:00.000
+
+To support the two timestamp semantics, the
+``core::QueryConfig::kAdjustTimestampToTimezone`` query flag was added to Velox.
+When this flag is set, Velox will convert the timestamp into the user’s session
+time zone to follow the expected semantic, although this is not ANSI SQL compliant.
+
+Other Resources
+---------------
+
+* https://github.com/prestodb/presto/issues/7122
+* https://github.com/a0x8o/presto/blob/master/presto-docs/src/main/sphinx/language/timestamp.rst
+* https://github.com/facebookincubator/velox/issues/8037
diff --git a/velox/docs/develop/types.rst b/velox/docs/develop/types.rst
index 5613ba7397854..80c94020f23a7 100644
--- a/velox/docs/develop/types.rst
+++ b/velox/docs/develop/types.rst
@@ -64,6 +64,7 @@ unsigned integer for nanoseconds. Nanoseconds represent the high-precision part of
the timestamp, which is less than 1 second. Valid range of nanoseconds is [0, 10^9).
Timestamps before the epoch are specified using negative values for the seconds.
Examples:
+
* Timestamp(0, 0) represents 1970-01-01 00:00:00 (epoch).
* Timestamp(10*24*60*60 + 125, 0) represents 1970-01-11 00:02:05
  (10 days 125 seconds after epoch).
* Timestamp(19524*24*60*60 + 500, 38726411) represents 2023-06-16 08:08:20.038726411
@@ -72,6 +73,33 @@ Examples:
* Timestamp(-5000*24*60*60 - 1000, 123456) represents 1956-04-24 07:43:20.000123456
  (5000 days 1000 seconds before epoch plus 123456 nanoseconds).
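+
+The same examples can be written directly against the Timestamp class described
+in timestamp.rst. A minimal sketch, assuming the two-argument
+(seconds, nanoseconds) constructor implied by the examples above:
+
+.. code-block:: c++
+
+   #include "velox/type/Timestamp.h"
+
+   using facebook::velox::Timestamp;
+
+   // Epoch: 1970-01-01 00:00:00.
+   Timestamp epoch(0, 0);
+
+   // 10 days and 125 seconds after epoch: 1970-01-11 00:02:05.
+   Timestamp tenDaysAfter(10 * 24 * 60 * 60 + 125, 0);
+
+   // 1000 seconds before epoch plus 123456 nanoseconds; note that
+   // "seconds" is negative while "nanoseconds" stays positive.
+   Timestamp beforeEpoch(-1000, 123456);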
+Floating point types (REAL, DOUBLE) have special values negative infinity, positive infinity, and
+not-a-number (NaN).
+
+For NaN the semantics are different from the C++ standard floating point semantics:
+
+* The different types of NaN (+/-, signaling/quiet) are treated as canonical NaN (+, quiet).
+* `NaN = NaN` returns true.
+* NaN is treated as a normal numerical value in join and group-by keys.
+* When sorting, NaN values are considered larger than any other value. When sorting in ascending order, NaN values appear last. When sorting in descending order, NaN values appear first.
+* For a number N: `N > NaN` is false and `NaN > N` is true.
+
+For negative infinity and positive infinity the following C++ standard floating point semantics apply:
+
+Let N be a positive finite number.
+
+* +inf * N = +inf
+* -inf * N = -inf
+* +inf * -N = -inf
+* -inf * -N = +inf
+* +inf * 0 = NaN
+* -inf * 0 = NaN
+* +inf = +inf returns true.
+* -inf = -inf returns true.
+* Positive infinity and negative infinity are treated as normal numerical values in join and group-by keys.
+* Positive infinity sorts lower than NaN and higher than any other value.
+* Negative infinity sorts lower than any other value.
+
Logical Types
~~~~~~~~~~~~~
Logical types are backed by a physical type and include additional semantics.
@@ -96,8 +124,8 @@ point in a number. For example, the number `123.45` has a precision of `5` and a
scale of `2`. DECIMAL types are backed by `BIGINT` and `HUGEINT` physical types,
which store the unscaled value. For example, the unscaled value of decimal
`123.45` is `12345`. `BIGINT` is used up to 18 precision, and has a range of
-[:math:`-10^{18} + 1, +10^{18} - 1`]. `HUGEINT` is used starting from 19 precision
-upto 38 precision, with a range of [:math:`-10^{38} + 1, +10^{38} - 1`].
+:math:`[-10^{18} + 1, +10^{18} - 1]`. `HUGEINT` is used starting from 19 precision
+up to 38 precision, with a range of :math:`[-10^{38} + 1, +10^{38} - 1]`.

All three values (precision, scale, unscaled value) are required to represent a
decimal value.
@@ -109,6 +137,14 @@ the existing physical types. For example, Presto Types described below are implemented
by extending the physical types. An OPAQUE type must be used when there is no
physical type available to back the logical type.

+When extending an existing physical type, if different compare and/or hash semantics are
+needed instead of those provided by the underlying native C++ type, this can be achieved by
+doing the following:
+
+* Pass `true` for the `providesCustomComparison` argument in the custom type's base class's constructor.
+* Override the `compare` and `hash` functions inherited from the `TypeBase` class (you must implement both).
+
+Note that this is currently only supported for custom types that extend physical types that
+are primitive and fixed width.
+
Complex Types
~~~~~~~~~~~~~
Velox supports the ARRAY, MAP, and ROW complex types.
@@ -133,10 +169,60 @@ Presto Type Physical Type
======================== =====================
HYPERLOGLOG              VARBINARY
JSON                     VARCHAR
-TIMESTAMP WITH TIME ZONE ROW
+TIMESTAMP WITH TIME ZONE BIGINT
+UUID                     HUGEINT
+IPADDRESS                HUGEINT
======================== =====================

TIMESTAMP WITH TIME ZONE represents a time point in milliseconds precision
-from UNIX epoch with timezone information. Its physical type contains one 64-bit
-signed integer for milliseconds and another 16-bit signed integer for timezone ID.
-Valid range of timezone ID is [1, 1680], its definition can be found in ``TimeZoneDatabase.cpp``.
\ No newline at end of file
+from UNIX epoch with timezone information. Its physical type is BIGINT.
+The high 52 bits of the bigint store a signed integer for milliseconds in UTC.
+Supported range of milliseconds is [0xFFF8000000000000L, 0x7FFFFFFFFFFFF]
+(or [-69387-04-22T03:45:14.752, 73326-09-11T20:14:45.247]). The low 12 bits
+store the timezone ID. Supported range of timezone ID is [1, 1680].
+The definition of timezone IDs can be found in ``TimeZoneDatabase.cpp``.
+
+IPADDRESS represents an IPV6 address, or an IPV4 address formatted as an IPV6 address. Its physical
+type is HUGEINT. The format that the address is stored in is defined as part of `(RFC 4291#section-2.5.5.2) `_.
+As Velox runs on little-endian systems and the standard is network byte (big-endian)
+order, we reverse the bytes to allow for masking and other bit operations
+used in IPADDRESS/IPPREFIX related functions. This type can be used to
+create IPPREFIX networks as well as to check IPADDRESS validity within
+IPPREFIX networks.
+
+Spark Types
+~~~~~~~~~~~~
+The `data types `_ in Spark have some semantic differences compared to those in
+Presto. These differences require us to implement the same functions
+separately for each system in Velox, such as min, max and collect_set. The
+key differences are listed below.
+
+* Spark operates on timestamps with "microsecond" precision while Presto operates with
+  "millisecond" precision.
+  Example::
+
+    SELECT min(ts)
+    FROM (
+        VALUES
+            (cast('2014-03-08 09:00:00.123456789' as timestamp)),
+            (cast('2014-03-08 09:00:00.012345678' as timestamp))
+    ) AS t(ts);
+    -- 2014-03-08 09:00:00.012345
+
+* In function comparisons, nested null values are handled as values.
+  Example::
+
+    SELECT equalto(ARRAY[1, null], ARRAY[1, null]); -- true
+
+    SELECT min(a)
+    FROM (
+        VALUES
+            (ARRAY[1, 2]),
+            (ARRAY[1, null])
+    ) AS t(a);
+    -- ARRAY[1, null]
+
+* MAP type is not comparable and not orderable in Spark. In Presto, MAP type is
+  also not orderable, but it is comparable if both key and value types are
+  comparable. The implication is that MAP type cannot be used as a join, group
+  by or order by key in Spark.
diff --git a/velox/docs/develop/vectors.rst b/velox/docs/develop/vectors.rst
index b2b1f16d4edda..1a3cb0b0b6a41 100644
--- a/velox/docs/develop/vectors.rst
+++ b/velox/docs/develop/vectors.rst
@@ -175,10 +175,10 @@ std::shared_ptr using the FlatVectorPtr alias.

template <typename T>
using FlatVectorPtr = std::shared_ptr<FlatVector<T>>;

-The following diagram shows a flat vector of type INTEGER with 11 values. This
+The following diagram shows a flat vector of type INTEGER with 12 values. This
vector is represented as FlatVector<int32_t>. The `values_` buffer has space for
-at least 11 consecutive entries of 4 bytes each. Nulls buffer has space for at
-least 11 consecutive entries of 1 bit each. Values in positions 2,7, 11 are
+at least 12 consecutive entries of 4 bytes each. Nulls buffer has space for at
+least 12 consecutive entries of 1 bit each. Values in positions 2, 7, 11 are
null, e.g. bits 2, 7, 11 in `nulls_` buffer are 0. The rest of the bits in the
`nulls_` buffer are 1. Entries 2, 7, 11 in `values_` buffer contain garbage.

@@ -215,9 +215,9 @@ Strings in the string buffers appear not necessarily in order and there can be
gaps between individual strings. A single vector may use one or more string
buffers.

-The following diagram shows a vector of type VARCHAR with 7 values. This vector
+The following diagram shows a vector of type VARCHAR with 8 values. This vector
is represented as FlatVector<StringView>.
`values_` buffer has space for at least
-7 entries 16 bytes each. `stringBuffers_` array has one entry containing a
+8 entries 16 bytes each. `stringBuffers_` array has one entry containing a
concatenation of non-inlined strings. Each entry in `values_` buffer uses 4
bytes to store the size of the string.

@@ -277,12 +277,13 @@ characters.

    bool isNull_ = false;
    BufferPtr stringBuffer_;

-BaseVector::wrapInConstant() static method can be used to create a constant
+BaseVector::createConstant() static method can be used to create a constant
vector from a scalar value.

.. code-block:: c++

    static std::shared_ptr<BaseVector> createConstant(
+       const TypePtr& type,
        variant value,
        vector_size_t size,
        velox::memory::MemoryPool* pool);

@@ -322,9 +323,9 @@ Multiple dictionary vectors can refer to the same base vector. We are saying
that the dictionary vector wraps the base vector.

Here is a dictionary of type INTEGER that represents a result of a filter: n % 2
-= 0. The base vector contains 11 entries. Only 5 of these entries passed the
-filter, hence, the size of the dictionary vector is 5. The indices buffer
-contains 5 entries referring to positions in the original vector that passed
+= 0. The base vector contains 12 entries. Only 6 of these entries passed the
+filter, hence, the size of the dictionary vector is 6. The indices buffer
+contains 6 entries referring to positions in the original vector that passed
the filter.

.. image:: images/dictionary-subset2.png

@@ -385,8 +386,9 @@ ArrayVector
~~~~~~~~~~~

ArrayVector stores values of type ARRAY. In addition to nulls buffer, it
-contains offsets and sizes buffers and an elements vector. Offsets and sizes
-are 32-bit integers.
+contains offsets and sizes buffers and an elements vector. Offsets and sizes are
+32-bit integers. The non-null non-empty ranges formed by offsets and sizes in a
+vector are not allowed to overlap with each other.

.. code-block:: c++

@@ -442,8 +444,9 @@ MapVector
~~~~~~~~~

MapVector stores values of type MAP. In addition to nulls buffer, it contains
-offsets and sizes buffers, keys and values vectors. Offsets and sizes are
-32-bit integers.
+offsets and sizes buffers, keys and values vectors. Offsets and sizes are 32-bit
+integers. The non-null non-empty ranges formed by offsets and sizes in a vector
+are not allowed to overlap with each other.

.. code-block:: c++

diff --git a/velox/docs/develop/view-and-writer-types.rst b/velox/docs/develop/view-and-writer-types.rst
new file mode 100644
index 0000000000000..5c8bf89ee1169
--- /dev/null
+++ b/velox/docs/develop/view-and-writer-types.rst
@@ -0,0 +1,335 @@
+=====================
+View and Writer Types
+=====================
+
+View types and writer types are used as the input and output parameter types,
+respectively, for complex and string types in the simple function interface of
+both scalar and aggregate functions.
+
+Inputs (View Types)
+-------------------
+
+Input complex types are represented in the simple function interface using lightweight lazy
+access abstractions that enable efficient direct access to the underlying data in Velox
+vectors.
+As mentioned earlier, the helper aliases arg_type and null_free_arg_type can be used in function signatures to
+map Velox types to the corresponding input types. The table below shows the actual types that are
+used to represent inputs of different complex types.
+
+============================== ========================= ===============================
+ C++ Argument Type              C++ Actual Argument Type  Corresponding `std` type
+============================== ========================= ===============================
+arg_type<Array<E>>             NullableArrayView<E>      std::vector<std::optional<E>>
+arg_type<Map<K, V>>            NullableMapView<K, V>     std::map<K, std::optional<V>>
+arg_type<Row<T...>>            NullableRowView<T...>     std::tuple<std::optional<T>...>
+null_free_arg_type<Array<E>>   NullFreeArrayView<E>      std::vector<E>
+null_free_arg_type<Map<K, V>>  NullFreeMapView<K, V>     std::map<K, V>
+null_free_arg_type<Row<T...>>  NullFreeRowView<T...>     std::tuple<T...>
+============================== ========================= ===============================
+
+The view types are designed to have interfaces similar to those of std::containers; in fact, in most cases
+they can be used as a drop-in replacement. The table above shows the mapping between the Velox type and
+the corresponding std type. For example, a *Map<Row<int64_t, int64_t>, Array<float>>* corresponds to const
+*std::map<std::tuple<int64_t, int64_t>, std::vector<float>>*.
+
+All view types are cheap-to-copy objects; for example, the size of ArrayView is at most 16 bytes.
+
+**OptionalAccessor<E>**:
+
+OptionalAccessor is an *std::optional*-like object that provides lazy access to the nullity and
+value of the underlying Velox vector at a specific index. Currently, it is used to represent elements of nullable input arrays
+and values of nullable input maps. Note that keys in the map are assumed to be always not nullable in Velox.
+
+The object supports the following methods:
+
+- arg_type<E> value() : unchecked access to the underlying value.
+
+- arg_type<E> operator \*() : unchecked access to the underlying value.
+
+- bool has_value() : return true if the value is not null.
+
+- operator bool() : return true if the value is not null.
+
+The nullity and the value accesses are decoupled; hence, if someone knows the inputs are null-free,
+accessing the value does not have the overhead of checking the nullity, and the same holds for checking the nullity alone.
+Note that, unlike std::container, function calls to value() and operator* return r-values (temporaries) and not l-values;
+they can bind to const references and value variables but not to non-const references.
+
+OptionalAccessor<E> is assignable to and comparable with std::optional<arg_type<E>> for primitive types.
+The following expressions are valid, where array[0] is an optional accessor.
+
+.. code-block:: c++
+
+    std::optional<int64_t> v = array[0];
+    if(array[0] == std::nullopt) ...
+    if(std::nullopt == array[0]) ...
+    if(array[0] == std::optional<int64_t>{1}) ...
+
+**NullableArrayView and NullFreeArrayView**
+
+NullableArrayView and NullFreeArrayView have interfaces similar to those of *std::vector<std::optional<V>>* and *std::vector<V>*;
+the code below shows the function arraySum, where a range loop is used to iterate over the values.
+
+.. code-block:: c++
+
+    template <typename T>
+    struct ArraySum {
+      VELOX_DEFINE_FUNCTION_TYPES(T);
+
+      bool call(int64_t& output, const arg_type<Array<int64_t>>& array) {
+        output = 0;
+        for(const auto& element : array) {
+          if (element.has_value()) {
+            output += element.value();
+          }
+        }
+        return true;
+      }
+    };
+
+ArrayView supports the following:
+
+- size_t **size** () : return the number of elements in the array.
+
+- **operator[]** (size_t index) : access the element at index. It returns either null_free_arg_type<V> or OptionalAccessor<V>.
+
+- ArrayView::Iterator **begin** () : iterator to the first element.
+
+- ArrayView::Iterator **end** () : iterator indicating end of iteration.
+
+- bool **mayHaveNulls** () : constant-time check on the underlying vector nullity. When it returns false, there are definitely no nulls; a true does not guarantee null existence.
+
+- ArrayView::SkipNullsContainer **skipNulls** () : return an iterable container that provides direct access to non-null values in the underlying array. For example, the function above can be written as:
+
+.. code-block:: c++
+
+    template <typename T>
+    struct ArraySum {
+      VELOX_DEFINE_FUNCTION_TYPES(T);
+
+      bool call(int64_t& output, const arg_type<Array<int64_t>>& array) {
+        output = 0;
+        for (const auto& value : array.skipNulls()) {
+          output += value;
+        }
+        return true;
+      }
+    };
+
+The skipNulls iterator will check the nullity at each index and skip nulls; a more performant implementation
+would skip reading the nullity when mayHaveNulls() is false.
+
+.. code-block:: c++
+
+    template <typename T>
+    struct ArraySum {
+      VELOX_DEFINE_FUNCTION_TYPES(T);
+
+      bool call(int64_t& output, const arg_type<Array<int64_t>>& array) {
+        output = 0;
+        if (array.mayHaveNulls()) {
+          for(const auto& value : array.skipNulls()) {
+            output += value;
+          }
+          return true;
+        }
+
+        // No nulls, skip reading nullity.
+        for (const auto& element : array) {
+          output += element.value();
+        }
+        return true;
+      }
+    };
+
+Note: calls to operator[], iterator de-referencing, and iterator pointer de-referencing return r-values (temporaries),
+versus l-values in STD containers. Hence those can be bound to const references or value variables but not normal references.
+
+**NullableMapView and NullFreeMapView**
+
+NullableMapView and NullFreeMapView have interfaces similar to std::map<K, std::optional<V>> and std::map<K, V>;
+the code below shows an example function mapSum that sums up the keys and values.
+
+.. code-block:: c++
+
+    template <typename T>
+    struct MapSum {
+      bool call(int64_t& output, const arg_type<Map<int64_t, int64_t>>& map) {
+        output = 0;
+        for (const auto& [key, value] : map) {
+          output += key;
+          if (value.has_value()) {
+            output += value.value();
+          }
+        }
+        return true;
+      }
+    };
+
+MapView supports the following:
+
+- MapView::Iterator **begin** () : iterator to the first map element.
+
+- MapView::Iterator **end** () : iterator that indicates end of iteration.
+
+- size_t **size** () : number of elements in the map.
+
+- MapView::Iterator **find** (const key_t& key): performs a linear search for the key, and returns an iterator to the element if found, otherwise returns end(). Only supported for primitive key types.
+
+- MapView::Iterator **operator[]** (const key_t& key): same as find; throws an exception if the element is not found.
+
+- MapView::Element
+
+MapView::Element is the type returned by dereferencing MapView::Iterator. It has two members:
+
+- first : arg_type<K> | null_free_arg_type<K>
+
+- second: OptionalAccessor<V> | null_free_arg_type<V>
+
+- MapView::Element participates in structured binding: auto [k, v] = \*map.begin();
+
+Note: iterator de-referencing and iterator pointer de-referencing result in temporaries. Hence those can be bound to
+const references or value variables but not normal references.
+
+Generic input types are implemented using GenericView, which supports the following:
+
+- uint64_t **hash** () const : returns a hash of the value; used to define std::hash<GenericView>(); allows GenericViews to be stored in folly::F14 sets and maps as well as STL's sets and maps.
+- bool **isNull** () const : returns true if the value is NULL
+- bool **operator==** (const GenericView& other) const : equality comparison with another GenericView
+- std::optional<int64_t> **compare** (const GenericView& other, const CompareFlags flags) const : comparison with another GenericView
+- TypeKind **kind** () const : returns the TypeKind of the value
+- const TypePtr& **type** () const : returns the Velox type of the value
+- std::string **toString** () const : returns a string representation of the value for logging and debugging
+- template <typename ToType> typename VectorReader<ToType>::exec_in_t **castTo** () const : cast to a concrete view type
+- template <typename ToType> std::optional<typename VectorReader<ToType>::exec_in_t> **tryCastTo** () const : best-effort attempt to cast to a concrete view type
+
+**Temporaries lifetime C++**
+
+While C++ allows temporaries (r-values) to be bound to const references by extending their lifetime, one must be careful and
+know that only the assigned temporary's lifetime is extended, not that of all temporaries in the RHS expression chain.
+In other words, the lifetime of any other temporary within an expression is not extended.
+
+For example, consider the expression const auto& x = map.begin()->first.
+C++ does not extend the lifetime of the result of map.begin() since it is not what is being
+assigned, and in such a case the assignment has undefined behavior.
+
+.. code-block:: c++
+
+    // Safe assignments. single rhs temporary.
+    const auto& a = array[0];
+    const auto& b = *a;
+    const auto& c = map.begin();
+    const auto& d = c->first;
+
+    // Unsafe assignments. (undefined behavior)
+    const auto& a = map.begin()->first;
+    const auto& b = **it;
+
+    // Safe and cheap to assign to value.
+    const auto a = map.begin()->first;
+    const auto b = **it;
+
+Note that in the range-loop, the range expression is assigned to a universal reference. Thus, the above concern applies to it.
+
+.. code-block:: c++
+
+    // Unsafe range loop.
+    for(const auto& e : **it){..}
+
+    // Safe range loop.
+    auto itt = *it;
+    for(const auto& e : *itt){..}
+
+.. _outputs-write:
+
+Outputs (Writer Types)
+----------------------
+
+Outputs of complex types are represented using special writers that are designed in a way that
+minimizes data copying by writing directly to Velox vectors.
+
+**ArrayWriter<V>**
+
+- out_type<V>& **add_item** () : add a non-null item and return the writer of the added value.
+- **add_null** (): add a null item.
+- **reserve** (vector_size_t size): make sure space for `size` items is allocated in the underlying vector.
+- vector_size_t **size** (): return the length of the array.
+- **resize** (vector_size_t size): change the size of the array, reserving space for the new elements if needed.
+
+- void **add_items** (const T& data): append data from any container with a std::vector-like interface.
+- void **copy_from** (const T& data): assign data to match that of any container with a std::vector-like interface.
+
+- void **add_items** (const NullFreeArrayView<V>& data): append data from an array view (faster than item by item).
+- void **copy_from** (const NullFreeArrayView<V>& data): assign data from an array view (faster than item by item).
+
+- void **add_items** (const NullableArrayView<V>& data): append data from an array view (faster than item by item).
+- void **copy_from** (const NullableArrayView<V>& data): assign data from an array view (faster than item by item).
+
+When V is primitive, the following functions are available, making the writer usable as std::vector<V>.
+
+- **push_back** (std::optional<V>): add an item or null.
+- PrimitiveWriter<V> **operator[]** (vector_size_t index): return a primitive writer that is assignable to std::optional<V> for the item at index (should be called after a resize).
+- PrimitiveWriter<V> **back** (): return a primitive writer that is assignable to std::optional<V> for the item at index length - 1.
+
+
+**MapWriter<K, V>**
+
+- **reserve** (vector_size_t size): make sure space for `size` entries is allocated in the underlying vector.
+- std::tuple<out_type<K>&, out_type<V>&> **add_item** (): add a non-null item and return the writers of key and value as a tuple.
+- out_type<K>& **add_null** (): add a null item and return the key writer.
+- vector_size_t **size** (): return the length of the map.
+
+- void **add_items** (const T& data): append data from any container with a std::vector<std::tuple<K, V>>-like interface.
+- void **copy_from** (const NullFreeMapView<K, V>& data): assign data from a map view (faster than item by item).
+- void **copy_from** (const NullableMapView<K, V>& data): assign data from a map view (faster than item by item).
+
+When K and V are primitives, the following functions are available, making the writer usable as std::vector<std::tuple<K, std::optional<V>>>.
+
+- **resize** (vector_size_t size): change the size.
+- **emplace** (K, std::optional<V>): add an element to the map.
+- std::tuple<K&, PrimitiveWriter<V>> **operator[]** (vector_size_t index): returns a pair of writers for the element at index. The key writer is assignable to K, while the value writer is assignable to std::optional<V>.
+
+**RowWriter<T...>**
+
+- template <size_t I> **set_null_at** (): set null for the row item at index I.
+- template <size_t I> **get_writer_at** (): set not null for the row item at index I, and return the writer of the row element at index I.
+
+When all types T... are primitives, the following functions are available.
+
+- void **operator=** (const std::tuple<T...>& inputs): assignable to std::tuple<T...>.
+- void **operator=** (const std::tuple<std::optional<T>...>& inputs): assignable to std::tuple<std::optional<T>...>.
+- void **copy_from** (const std::tuple<T...>& inputs): similar to the above.
+
+When a given Ti is primitive, the following is valid.
+
+- PrimitiveWriter<Ti> exec::get<I>(RowWriter<T...>): return a primitive writer for the item at index I that is assignable to std::optional<Ti>.
+
+**PrimitiveWriter<V>**
+
+Assignable to std::optional<V>; allows writing null or a value to the primitive. Returned by complex writers when writing nullable
+primitives.
+
+**StringWriter<>**
+
+- void **reserve** (size_t newCapacity) : reserve space for the output string with a size of at least newCapacity.
+- void **resize** (size_t newCapacity) : set the size of the string.
+- char* **data** (): returns a pointer to the first char of the string; can be written to directly (safe to write to index capacity() - 1).
+- vector_size_t **capacity** (): returns the capacity of the string.
+- vector_size_t **size** (): returns the size of the string.
+- **operator+=** (const T& input): append data from char* or any type with data() and size().
+- **append** (const T& input): append data from char* or any type with data() and size().
+- **copy_from** (const T& input): assign data from char* or any type with data() and size().
+
+When zero-copy optimization is enabled (see the zero-copy-string-result section above), the following functions can be used.
+
+- void **setEmpty** (): set to the empty string.
+- void **setNoCopy** (const StringView& value): set the string to an input string without performing a deep copy.
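+
+As an illustration of the writer API above, a simple function can build its string
+result through the StringWriter. A minimal sketch, assuming a hypothetical function
+struct named RepeatTwice written against the simple function interface:
+
+.. code-block:: c++
+
+    template <typename T>
+    struct RepeatTwice {
+      VELOX_DEFINE_FUNCTION_TYPES(T);
+
+      void call(out_type<Varchar>& result, const arg_type<Varchar>& input) {
+        // Reserve capacity for both copies up front, then append the input
+        // twice using the append API listed above.
+        result.reserve(2 * input.size());
+        result.append(input);
+        result.append(input);
+      }
+    };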
+
+**GenericWriter**
+
+- TypeKind **kind** () const : returns the TypeKind of the value
+- const TypePtr& **type** () const : returns the Velox type of the value
+- void **copy_from** (const GenericView& view) : assign data from another GenericView
+- template <typename ToType> typename VectorWriter<ToType>::exec_out_t& **castTo** () : cast to a concrete writer type
+- template <typename ToType> typename VectorWriter<ToType>::exec_out_t* **tryCastTo** () : best-effort attempt to cast to a concrete writer type
diff --git a/velox/docs/develop/window.rst b/velox/docs/develop/window.rst
new file mode 100644
index 0000000000000..6abd407504054
--- /dev/null
+++ b/velox/docs/develop/window.rst
@@ -0,0 +1,320 @@
+================
+Window functions
+================
+
+Velox supports window function evaluation using the Window operator. In this guide
+we will discuss some intricate design problems in this operator.
+
+This doc assumes familiarity with Window functions described in
+:doc:`../functions/presto/window`.
+
+
+Window frames
+-------------
+
+Window functions can optionally include a FRAME clause. The FRAME clause
+can be thought of as a spec for a sliding window of rows which bound
+the window function computation for the given row.
+
+Not all window functions are bound by the FRAME clause.
+
+- Aggregates computed as window functions and value functions :func:`first_value`,
+  :func:`last_value` and :func:`nth_value` honor window frames.
+- Rank functions :func:`row_number`, :func:`rank`, :func:`dense_rank`,
+  :func:`percent_rank`, :func:`ntile`, :func:`cume_dist` and value functions
+  :func:`lead` and :func:`lag` are not affected by window frames.
+
+A frame can be ROWS type or RANGE type, and it runs from frame_start to
+frame_end. A FRAME clause is one of
+
+.. code-block::
+
+    {RANGE|ROWS} frame_start
+    {RANGE|ROWS} BETWEEN frame_start AND frame_end
+
+frame_start and frame_end can be any of:
+
+.. code-block::
+
+    UNBOUNDED PRECEDING
+    expression PRECEDING
+    CURRENT ROW
+    expression FOLLOWING
+    UNBOUNDED FOLLOWING
+
+**ROWS mode**
+
+ROWS mode can be interpreted as indices of the rows in the order in which they
+appear in the window partition. This ordering is determined by the ORDER BY
+clause. In ROWS mode, CURRENT ROW refers to the present row at which the
+function is being evaluated. Each consecutive row has an increasing frame number.
+The frame numbers start at 0 and increase by 1 for each row.
+
+**RANGE mode**
+
+In RANGE mode, all peer rows have the same frame number.
+Rows are peers if they have the same values for the ORDER BY field.
+A frame start of CURRENT ROW refers to the first peer row of the current row,
+while a frame end of CURRENT ROW refers to the last peer row of the current row.
+If no ORDER BY is specified, all rows are considered peers of the current row.
+
+**Misc**
+
+UNBOUNDED PRECEDING and UNBOUNDED FOLLOWING always refer to the first and
+last rows of the partition in either mode.
+
+
+Window frame indices
+--------------------
+
+As window functions are evaluated for each row, the Window operator provides
+each function a buffer of frame_start and frame_end indices at each
+WindowFunction::apply call.
+
+**Note**: It's possible during calculations that the frame indices are values
+before or after the partition rows. In this case, the frame indices get bound
+to the first and last rows of the partition.
+
+As an example, for frame *ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING*, the
+frame indices will be as follows:
+
+.. code-block::
+
+    row_index  partition_col  order_by_col  frame_start  frame_end
+    0          1              1             0            2
+    1          1              2             0            3
+    2          1              2             0            4
+    3          1              3             1            5
+    4          1              4             2            6
+    5          1              4             3            7
+    6          1              4             4            7
+    7          1              5             5            7
+
+On the other hand, for frame *RANGE BETWEEN 2 PRECEDING AND 2 FOLLOWING*,
+the frame indices will be as follows:
+
+.. code-block::
+
+    row_index  partition_col  order_by_col  frame_start  frame_end
+    0          1              1             0            3
+    1          1              2             0            6
+    2          1              2             0            6
+    3          1              3             0            7
+    4          1              4             1            7
+    5          1              4             1            7
+    6          1              4             1            7
+    7          1              5             3            7
+
+k Range frames
+--------------
+
+K range window frames are a special value-based window frame.
+
+An example of a k range frame is *RANGE BETWEEN 5 PRECEDING AND 2 FOLLOWING*.
+This frame includes all rows whose order_by keys are values between
+*(current_row order_by key - 5)* to *(current_row order_by key + 2)*.
+
+Elaborating with an example table:
+
+.. code-block::
+
+    row_index  partition_col  order_by_col  start_frame  end_frame  frame_start  frame_end
+    0          1              2             -3           4          0            1
+    1          1              3             -2           5          0            2
+    2          1              5             0            7          0            3
+    3          1              5             0            7          0            3
+    4          1              9             4            11         2            5
+    5          1              10            5            12         2            5
+    6          1              15            10           17         5            6
+    7          1              21            16           23         7            7
+
+There are some more aspects to consider when calculating the frame indices.
+
+One of the subtle nuances is related to *PRECEDING* and *FOLLOWING* usage.
+
+- *PRECEDING* range means search in the rows from the current row to the partition start.
+- *FOLLOWING* range means search in the rows from the current row to the partition end.
+
+This implies:
+
+- If an ORDER BY clause is *ASCENDING*, preceding rows have values < current row
+  and following rows have values > current row.
+  So the frame *RANGE BETWEEN 5 PRECEDING AND 2 FOLLOWING* is for values
+  between [order_by - 5] and [order_by + 2].
+  The previous table is an example of such a frame.
+
+- However, if the ORDER BY clause is *DESCENDING*, then preceding rows have
+  values > current row and following rows have values < current row.
+
+  So for the same frame *RANGE BETWEEN 5 PRECEDING AND 2 FOLLOWING* with
+  descending order the values are between [order_by + 5] and [order_by - 2].
+
+Flipping the previous example for a descending order will result in the following
+table.
+
+.. code-block::
+
+    row_index  partition_col  order_by_col  start_frame  end_frame  frame_start  frame_end
+    0          1              21            26           19         0            0
+    1          1              15            20           13         1            1
+    2          1              10            15           8          1            2
+    3          1              9             14           7          2            3
+    4          1              5             10           3          2            6
+    5          1              5             10           3          2            6
+    6          1              3             8            1          4            7
+    7          1              2             7            -1         4            7
+
+**k** in the range frames can be a constant, column reference or expression (for example, for date ranges
+the bound can be a date + some interval). Velox defers computing the start_value and end_value bounds
+to a prior project node and expects the user to send these computed values in the column reference
+for k range frames. Even if k is a constant value, the user is expected to compute the start_value
+and end_value columns for the WindowNode.
+
+In WindowNode a kRange frame would look as follows:
+
+.. code-block::
+
+    struct Frame {
+      WindowType type;
+      BoundType startType;
+      TypedExprPtr startValue;
+      BoundType endType;
+      TypedExprPtr endValue;
+    };
+    Frame kRange = { kRange, kPreceding, start_value_col, kFollowing, end_value_col};
+
+
+The following validations are performed for k Range frames:
+
+- There is a single ORDER BY column to compare the k Range value.
+
+- start(or end)Value in WindowNode::Frame cannot be a constant if the bound type is kPreceding
+  or kFollowing respectively.
+
+- The type of the start(end)Value TypedExprPtr must be the same type as the ORDER BY column.
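+
+As an illustration, for the ascending frame *RANGE BETWEEN 5 PRECEDING AND 2 FOLLOWING*
+over an ORDER BY key ``order_by_col``, the prior project node would compute the bound
+columns roughly as follows (the column names here are illustrative):
+
+.. code-block::
+
+    -- Computed by a project node ahead of the WindowNode.
+    start_value_col := order_by_col - 5
+    end_value_col   := order_by_col + 2
+
+These two columns are then referenced by startValue and endValue in the kRange frame above.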
+
+The Velox Window operator computes the frame indices buffer by searching for the start(end)Value in the
+ORDER BY column and passes the buffer in the WindowFunction::apply() call.
+
+**Null values in RANGE frames**
+
+The ORDER BY column could have NULL values. NULL values match only other NULLs for Range frames.
+
+NULL values are placed at the beginning or end of the ORDER BY column based on the
+*NULLS FIRST/LAST* mode used. So for a row with a NULL value, the frame_start index is the first
+peer row with a NULL and the frame_end index is the last peer row with a NULL value.
+
+Rows with NULL values do not participate in the frames of the other rows.
+
+
+Empty frames
+------------
+
+Window frames can be valid, partial or empty during window function processing.
+
+Valid frames are the default case when all the rows in the window frame in order
+from frame_start to frame_end are within the partition boundaries. However, it is
+possible that window frames of certain rows are only partially filled or are empty.
+While partial frames don't need any special treatment from the function author,
+empty frames need some consideration.
+
+Empty frames occur when:
+
+* Both frame_start and frame_end fall before the first partition row.
+
+  E.g. in frame *ROWS BETWEEN 5 PRECEDING AND 2 PRECEDING* the first 2 rows
+  have both frame bounds before the first partition row.
+
+* Both frame_start and frame_end fall after the partition end row.
+
+  E.g. in frame *ROWS BETWEEN 2 FOLLOWING AND 5 FOLLOWING* the last 2 rows
+  have both frame bounds beyond the last partition row.
+
+* frame_start > frame_end (as the frame range is defined from frame_start to
+  frame_end).
+
+  E.g. in frame *ROWS BETWEEN UNBOUNDED PRECEDING AND 2 PRECEDING* the intent
+  is to compute the aggregation from the partition start row to 2 rows prior to
+  the current one. However, for the first 2 rows the frame_start
+  (frame index 0 for unbounded preceding) is ahead of 2 preceding
+  (indices -2 and -1).
+
+* For frames like *ROWS BETWEEN 2 PRECEDING AND 5 PRECEDING* or
+  *ROWS BETWEEN 5 FOLLOWING AND 2 FOLLOWING*, frame_start > frame_end for
+  all rows. So all frames are empty.
+
+**Partial frames**
+
+As illustrated in the examples above, rows can have partial window frames.
+
+A partial frame occurs when:
+
+- frame_start < frame_end (so it's not an empty frame)
+- One frame end is within partition bounds and the other end outside of it.
+
+  This means either:
+
+  - frame_start is before the first partition row while frame_end is within
+    the partition. In this case, frame_start is clamped to the first partition
+    row.
+  - frame_start is within the partition while frame_end is beyond the partition.
+    In this case, frame_end is clamped to the last partition row.
+
+Partial frames usually follow empty frames in a sliding window.
+
+E.g. in frame *ROWS BETWEEN 5 PRECEDING AND 2 PRECEDING*, the first 2 rows have
+frame_start and frame_end before the first partition row, so they are empty.
+But for the 3rd to 5th rows, the 5 preceding frame_start bound is outside the
+partition while the 2 preceding frame_end is within the partition. So for these
+3 rows frame_start is clamped to the first partition row.
+
+Similarly, for frame *ROWS BETWEEN 2 FOLLOWING AND 5 FOLLOWING*, the 3rd- to 5th-to-last
+rows have frame_start within the partition but frame_end beyond it. So they are partial
+frames. The last 2 rows have both bounds outside the partition and are empty frames.
+
+Empty, partial and valid window frames can be visualized as below:
+
+.. image:: images/empty_frames.png
+   :width: 600
+   :align: center
+
+Frames with constant frame bounds (like 2 preceding) have strict sliding behavior.
+So the empty frames, partial frames and valid frames cluster together and follow
+(or precede) each other.
+
+Ad hoc frames that use column values for bounds (like c1 preceding) can have
+empty, partial or valid frames at any point in the partition rows.
+
+**Handling empty frames in window functions**
+
+As mentioned before, only value and aggregate window functions use frames in
+their evaluation. Value functions return null values for empty frames.
+Aggregate functions return the default aggregate value for empty frames.
+Rank functions are not affected by empty frames.
+
+The most naive approach to handling empty frames is to check in the window
+function logic whether the frame indices form an empty frame (based on the
+conditions previously described) and return the null output. However,
+this would be repetitive to implement in all functions.
+
+To aid the calculations, the Window operator computes a
+*SelectivityVector* for the rows with valid frames in each
+WindowFunction::apply(..) call. The function logic can
+iterate over the rows with set bits in this SelectivityVector
+for evaluations.
+
+This SelectivityVector is passed in the validFrames argument of
+the WindowFunction::apply() signature:
+
+.. code-block::
+
+    virtual void apply(
+        const BufferPtr& peerGroupStarts,
+        const BufferPtr& peerGroupEnds,
+        const BufferPtr& frameStarts,
+        const BufferPtr& frameEnds,
+        const SelectivityVector& validFrames,
+        vector_size_t resultOffset,
+        const VectorPtr& result) = 0;
+
+The Window operator also clamps *partial* window frame indices to
+the first or final partition row before passing them to the function.
+So the Window function doesn't need any special logic for partial frames.
diff --git a/velox/docs/functions.rst b/velox/docs/functions.rst
index f11bf3a627e72..57ea32d5691d2 100644
--- a/velox/docs/functions.rst
+++ b/velox/docs/functions.rst
@@ -6,6 +6,7 @@ Presto Functions
   :maxdepth: 1

   functions/presto/math
+  functions/presto/decimal
   functions/presto/bitwise
   functions/presto/comparison
   functions/presto/string
@@ -20,6 +21,8 @@ Presto Functions
   functions/presto/aggregate
   functions/presto/window
   functions/presto/hyperloglog
+  functions/presto/uuid
+  functions/presto/misc

Here is a list of all scalar and aggregate Presto functions available in Velox.
Function names link to function descriptions.
Check out coverage maps @@ -62,77 +65,88 @@ for :doc:`all ` and :doc:`most used ====================================== ====================================== ====================================== == ====================================== == ====================================== Scalar Functions Aggregate Functions Window Functions ====================================================================================================================== == ====================================== == ====================================== - :func:`abs` :func:`eq` :func:`power` :func:`approx_distinct` :func:`cume_dist` - :func:`acos` :func:`exp` :func:`quarter` :func:`approx_most_frequent` :func:`dense_rank` - :func:`all_match` :func:`filter` :func:`radians` :func:`approx_percentile` :func:`first_value` - :func:`any_match` :func:`flatten` :func:`rand` :func:`approx_set` :func:`lag` - :func:`array_average` :func:`floor` :func:`random` :func:`arbitrary` :func:`last_value` - :func:`array_constructor` :func:`format_datetime` :func:`reduce` :func:`array_agg` :func:`lead` - :func:`array_distinct` :func:`from_base` :func:`regexp_extract` :func:`avg` :func:`nth_value` - :func:`array_duplicates` :func:`from_base64` :func:`regexp_extract_all` :func:`bitwise_and_agg` :func:`ntile` - :func:`array_except` :func:`from_base64url` :func:`regexp_like` :func:`bitwise_or_agg` :func:`percent_rank` - :func:`array_frequency` :func:`from_big_endian_32` :func:`regexp_replace` :func:`bool_and` :func:`rank` - :func:`array_has_duplicates` :func:`from_big_endian_64` :func:`repeat` :func:`bool_or` :func:`row_number` - :func:`array_intersect` :func:`from_hex` :func:`replace` :func:`checksum` - :func:`array_join` :func:`from_unixtime` :func:`reverse` :func:`corr` - :func:`array_max` :func:`from_utf8` :func:`round` :func:`count` - :func:`array_min` :func:`greatest` :func:`rpad` :func:`count_if` - :func:`array_normalize` :func:`gt` :func:`rtrim` :func:`covar_pop` - :func:`array_position` :func:`gte` :func:`second` :func:`covar_samp` - :func:`array_sort` :func:`hmac_md5` :func:`sequence` :func:`entropy` - :func:`array_sort_desc` :func:`hmac_sha1` :func:`sha1` :func:`every` - :func:`array_sum` :func:`hmac_sha256` :func:`sha256` :func:`histogram` - :func:`array_union` :func:`hmac_sha512` :func:`sha512` :func:`kurtosis` - :func:`arrays_overlap` :func:`hour` :func:`shuffle` :func:`map_agg` - :func:`asin` in :func:`sign` :func:`map_union` - :func:`atan` :func:`infinity` :func:`sin` :func:`map_union_sum` - :func:`atan2` :func:`inverse_beta_cdf` :func:`slice` :func:`max` - :func:`beta_cdf` :func:`is_finite` :func:`split` :func:`max_by` - :func:`between` :func:`is_infinite` :func:`split_part` :func:`max_data_size_for_stats` - :func:`binomial_cdf` :func:`is_json_scalar` :func:`spooky_hash_v2_32` :func:`merge` - :func:`bit_count` :func:`is_nan` :func:`spooky_hash_v2_64` :func:`min` - :func:`bitwise_and` :func:`is_null` :func:`sqrt` :func:`min_by` - :func:`bitwise_arithmetic_shift_right` :func:`json_array_contains` :func:`strpos` :func:`regr_intercept` - :func:`bitwise_left_shift` :func:`json_array_length` :func:`strrpos` :func:`regr_slope` - :func:`bitwise_logical_shift_right` :func:`json_extract` :func:`subscript` :func:`set_agg` - :func:`bitwise_not` :func:`json_extract_scalar` :func:`substr` :func:`set_union` - :func:`bitwise_or` :func:`json_format` :func:`tan` :func:`skewness` - :func:`bitwise_right_shift` :func:`json_parse` :func:`tanh` :func:`stddev` - :func:`bitwise_right_shift_arithmetic` :func:`json_size` :func:`timezone_hour` 
:func:`stddev_pop` - :func:`bitwise_shift_left` :func:`least` :func:`timezone_minute` :func:`stddev_samp` - :func:`bitwise_xor` :func:`length` :func:`to_base` :func:`sum` - :func:`cardinality` :func:`like` :func:`to_base64` :func:`sum_data_size_for_stats` - :func:`cauchy_cdf` :func:`ln` :func:`to_base64url` :func:`var_pop` - :func:`cbrt` :func:`log10` :func:`to_big_endian_32` :func:`var_samp` - :func:`ceil` :func:`log2` :func:`to_big_endian_64` :func:`variance` - :func:`ceiling` :func:`lower` :func:`to_hex` - :func:`chi_squared_cdf` :func:`lpad` :func:`to_ieee754_64` - :func:`chr` :func:`lt` :func:`to_unixtime` - :func:`clamp` :func:`lte` :func:`to_utf8` - :func:`codepoint` :func:`ltrim` :func:`transform` - :func:`combinations` :func:`map` :func:`transform_keys` - :func:`concat` :func:`map_concat` :func:`transform_values` - :func:`contains` :func:`map_entries` :func:`trim` - :func:`cos` :func:`map_filter` :func:`trim_array` - :func:`cosh` :func:`map_from_entries` :func:`truncate` - :func:`crc32` :func:`map_keys` :func:`upper` - :func:`current_date` :func:`map_values` :func:`url_decode` - :func:`date` :func:`map_zip_with` :func:`url_encode` - :func:`date_add` :func:`md5` :func:`url_extract_fragment` - :func:`date_diff` :func:`millisecond` :func:`url_extract_host` - :func:`date_format` :func:`minus` :func:`url_extract_parameter` - :func:`date_parse` :func:`minute` :func:`url_extract_path` - :func:`date_trunc` :func:`mod` :func:`url_extract_port` - :func:`day` :func:`month` :func:`url_extract_protocol` - :func:`day_of_month` :func:`multiply` :func:`url_extract_query` - :func:`day_of_week` :func:`nan` :func:`week` - :func:`day_of_year` :func:`negate` :func:`week_of_year` - :func:`degrees` :func:`neq` :func:`width_bucket` - :func:`distinct_from` :func:`none_match` :func:`xxhash64` - :func:`divide` :func:`normal_cdf` :func:`year` - :func:`dow` not :func:`year_of_week` - :func:`doy` :func:`parse_datetime` :func:`yow` - :func:`e` :func:`pi` :func:`zip` - :func:`element_at` :func:`plus` :func:`zip_with` - :func:`empty_approx_set` :func:`pow` + :func:`abs` :func:`find_first_index` :func:`plus` :func:`any_value` :func:`cume_dist` + :func:`acos` :func:`flatten` :func:`poisson_cdf` :func:`approx_distinct` :func:`dense_rank` + :func:`all_keys_match` :func:`floor` :func:`pow` :func:`approx_most_frequent` :func:`first_value` + :func:`all_match` :func:`format_datetime` :func:`power` :func:`approx_percentile` :func:`lag` + :func:`any_keys_match` :func:`from_base` :func:`quarter` :func:`approx_set` :func:`last_value` + :func:`any_match` :func:`from_base64` :func:`radians` :func:`arbitrary` :func:`lead` + :func:`any_values_match` :func:`from_base64url` :func:`rand` :func:`array_agg` :func:`nth_value` + :func:`array_average` :func:`from_big_endian_32` :func:`random` :func:`avg` :func:`ntile` + :func:`array_constructor` :func:`from_big_endian_64` :func:`reduce` :func:`bitwise_and_agg` :func:`percent_rank` + :func:`array_distinct` :func:`from_hex` :func:`regexp_extract` :func:`bitwise_or_agg` :func:`rank` + :func:`array_duplicates` :func:`from_ieee754_32` :func:`regexp_extract_all` :func:`bitwise_xor_agg` :func:`row_number` + :func:`array_except` :func:`from_ieee754_64` :func:`regexp_like` :func:`bool_and` + :func:`array_frequency` :func:`from_iso8601_date` :func:`regexp_replace` :func:`bool_or` + :func:`array_has_duplicates` :func:`from_unixtime` :func:`remove_nulls` :func:`checksum` + :func:`array_intersect` :func:`from_utf8` :func:`repeat` :func:`corr` + :func:`array_join` :func:`gamma_cdf` :func:`replace` 
:func:`count` + :func:`array_max` :func:`greatest` :func:`reverse` :func:`count_if` + :func:`array_min` :func:`gt` :func:`round` :func:`covar_pop` + :func:`array_normalize` :func:`gte` :func:`rpad` :func:`covar_samp` + :func:`array_position` :func:`hamming_distance` :func:`rtrim` :func:`entropy` + :func:`array_remove` :func:`hmac_md5` :func:`second` :func:`every` + :func:`array_sort` :func:`hmac_sha1` :func:`sequence` :func:`geometric_mean` + :func:`array_sort_desc` :func:`hmac_sha256` :func:`sha1` :func:`histogram` + :func:`array_sum` :func:`hmac_sha512` :func:`sha256` :func:`kurtosis` + :func:`array_union` :func:`hour` :func:`sha512` :func:`map_agg` + :func:`arrays_overlap` in :func:`shuffle` :func:`map_union` + :func:`asin` :func:`infinity` :func:`sign` :func:`map_union_sum` + :func:`atan` :func:`inverse_beta_cdf` :func:`sin` :func:`max` + :func:`atan2` :func:`is_finite` :func:`slice` :func:`max_by` + :func:`beta_cdf` :func:`is_infinite` :func:`split` :func:`max_data_size_for_stats` + :func:`between` :func:`is_json_scalar` :func:`split_part` :func:`merge` + :func:`binomial_cdf` :func:`is_nan` :func:`split_to_map` :func:`min` + :func:`bit_count` :func:`is_null` :func:`spooky_hash_v2_32` :func:`min_by` + :func:`bitwise_and` :func:`json_array_contains` :func:`spooky_hash_v2_64` :func:`multimap_agg` + :func:`bitwise_arithmetic_shift_right` :func:`json_array_length` :func:`sqrt` :func:`reduce_agg` + :func:`bitwise_left_shift` :func:`json_extract` :func:`starts_with` :func:`regr_avgx` + :func:`bitwise_logical_shift_right` :func:`json_extract_scalar` :func:`strpos` :func:`regr_avgy` + :func:`bitwise_not` :func:`json_format` :func:`strrpos` :func:`regr_count` + :func:`bitwise_or` :func:`json_parse` :func:`subscript` :func:`regr_intercept` + :func:`bitwise_right_shift` :func:`json_size` :func:`substr` :func:`regr_r2` + :func:`bitwise_right_shift_arithmetic` :func:`laplace_cdf` :func:`tan` :func:`regr_slope` + :func:`bitwise_shift_left` :func:`last_day_of_month` :func:`tanh` :func:`regr_sxx` + :func:`bitwise_xor` :func:`least` :func:`timezone_hour` :func:`regr_sxy` + :func:`cardinality` :func:`length` :func:`timezone_minute` :func:`regr_syy` + :func:`cauchy_cdf` :func:`levenshtein_distance` :func:`to_base` :func:`set_agg` + :func:`cbrt` :func:`like` :func:`to_base64` :func:`set_union` + :func:`ceil` :func:`ln` :func:`to_base64url` :func:`skewness` + :func:`ceiling` :func:`log10` :func:`to_big_endian_32` :func:`stddev` + :func:`chi_squared_cdf` :func:`log2` :func:`to_big_endian_64` :func:`stddev_pop` + :func:`chr` :func:`lower` :func:`to_hex` :func:`stddev_samp` + :func:`clamp` :func:`lpad` :func:`to_ieee754_32` :func:`sum` + :func:`codepoint` :func:`lt` :func:`to_ieee754_64` :func:`sum_data_size_for_stats` + :func:`combinations` :func:`lte` :func:`to_unixtime` :func:`var_pop` + :func:`concat` :func:`ltrim` :func:`to_utf8` :func:`var_samp` + :func:`contains` :func:`map` :func:`transform` :func:`variance` + :func:`cos` :func:`map_concat` :func:`transform_keys` + :func:`cosh` :func:`map_entries` :func:`transform_values` + :func:`cosine_similarity` :func:`map_filter` :func:`trim` + :func:`crc32` :func:`map_from_entries` :func:`trim_array` + :func:`current_date` :func:`map_keys` :func:`truncate` + :func:`date` :func:`map_normalize` :func:`typeof` + :func:`date_add` :func:`map_subset` :func:`upper` + :func:`date_diff` :func:`map_top_n` :func:`url_decode` + :func:`date_format` :func:`map_values` :func:`url_encode` + :func:`date_parse` :func:`map_zip_with` :func:`url_extract_fragment` + 
:func:`date_trunc` :func:`md5` :func:`url_extract_host` + :func:`day` :func:`millisecond` :func:`url_extract_parameter` + :func:`day_of_month` :func:`minus` :func:`url_extract_path` + :func:`day_of_week` :func:`minute` :func:`url_extract_port` + :func:`day_of_year` :func:`mod` :func:`url_extract_protocol` + :func:`degrees` :func:`month` :func:`url_extract_query` + :func:`distinct_from` :func:`multimap_from_entries` :func:`week` + :func:`divide` :func:`multiply` :func:`week_of_year` + :func:`dow` :func:`nan` :func:`weibull_cdf` + :func:`doy` :func:`negate` :func:`width_bucket` + :func:`e` :func:`neq` :func:`wilson_interval_lower` + :func:`element_at` :func:`ngrams` :func:`wilson_interval_upper` + :func:`empty_approx_set` :func:`no_keys_match` :func:`xxhash64` + :func:`ends_with` :func:`no_values_match` :func:`year` + :func:`eq` :func:`none_match` :func:`year_of_week` + :func:`exp` :func:`normal_cdf` :func:`yow` + :func:`f_cdf` not :func:`zip` + :func:`filter` :func:`parse_datetime` :func:`zip_with` + :func:`find_first` :func:`pi` ====================================== ====================================== ====================================== == ====================================== == ====================================== diff --git a/velox/docs/functions/presto/aggregate.rst b/velox/docs/functions/presto/aggregate.rst index 4fdbfd0aca615..c8443653857f7 100644 --- a/velox/docs/functions/presto/aggregate.rst +++ b/velox/docs/functions/presto/aggregate.rst @@ -21,6 +21,10 @@ General Aggregate Functions Returns an arbitrary non-null value of ``x``, if one exists. +.. function:: any_value(x) -> [same as x] + + This is an alias for :func:`arbitrary`. + .. function:: array_agg(x) -> array<[same as x]> Returns an array created from the input ``x`` elements. Ignores null @@ -82,15 +86,16 @@ General Aggregate Functions each input value occurs. Supports integral, floating-point, boolean, timestamp, and date input types. -.. function:: geometric_mean(x) -> double +.. function:: geometric_mean(bigint) -> double + geometric_mean(double) -> double + geometric_mean(real) -> real Returns the `geometric mean `_ of all input values. - Supported types are BIGINT and DOUBLE. - .. function:: max_by(x, y) -> [same as x] Returns the value of ``x`` associated with the maximum value of ``y`` over all input values. + ``y`` must be an orderable type. .. function:: max_by(x, y, n) -> array([same as x]) :noindex: @@ -100,6 +105,7 @@ General Aggregate Functions .. function:: min_by(x, y) -> [same as x] Returns the value of ``x`` associated with the minimum value of ``y`` over all input values. + ``y`` must be an orderable type. .. function:: min_by(x, y, n) -> array([same as x]) :noindex: @@ -110,23 +116,35 @@ General Aggregate Functions Returns the maximum value of all input values. ``x`` must not contain nulls when it is complex type. + ``x`` must be an orderable type. + Nulls are ignored if there are any non-null inputs. + For REAL and DOUBLE types, NaN is considered greater than Infinity. .. function:: max(x, n) -> array<[same as x]> :noindex: Returns ``n`` largest values of all input values of ``x``. ``n`` must be a positive integer and not exceed 10'000. + Currently not supported for ARRAY, MAP, and ROW input types. + Nulls are not included in the output array. + For REAL and DOUBLE types, NaN is considered greater than Infinity. .. function:: min(x) -> [same as x] Returns the minimum value of all input values. ``x`` must not contain nulls when it is complex type. + ``x`` must be an orderable type. 
+   Nulls are ignored if there are any non-null inputs.
+   For REAL and DOUBLE types, NaN is considered greater than Infinity.

 .. function:: min(x, n) -> array<[same as x]>
    :noindex:

    Returns ``n`` smallest values of all input values of ``x``.
    ``n`` must be a positive integer and not exceed 10'000.
+   Currently not supported for ARRAY, MAP, and ROW input types.
+   Nulls are not included in the output array.
+   For REAL and DOUBLE types, NaN is considered greater than Infinity.

 .. function:: multimap_agg(K key, V value) -> map(K,array(V))

@@ -143,6 +161,27 @@ General Aggregate Functions
    The final state is returned. Throws an error if ``initialState`` is NULL or
    ``inputFunction`` or ``combineFunction`` returns a NULL.

+   Take care when designing ``initialState``, ``inputFunction`` and ``combineFunction``.
+   These need to support evaluating the aggregation in a distributed manner using partial
+   aggregation on many nodes, followed by a shuffle over the group-by keys, followed by
+   final aggregation. Given the set of all possible values of state, make sure that
+   combineFunction is a `commutative `_
+   and `associative `_
+   operation with initialState as the
+   `identity `_ value.
+
+   combineFunction(s, initialState) = s for any s
+
+   combineFunction(s1, s2) = combineFunction(s2, s1) for any s1 and s2
+
+   combineFunction(s1, combineFunction(s2, s3)) = combineFunction(combineFunction(s1, s2), s3) for any s1, s2, s3
+
+   In addition, make sure that the following holds for the inputFunction:
+
+   inputFunction(inputFunction(initialState, x), y) = combineFunction(inputFunction(initialState, x), inputFunction(initialState, y)) for any x and y
+
+   Check out the `blog post about reduce_agg `_ for more context.
+
   Note that reduce_agg doesn't support evaluation over sorted inputs.::

   -- Compute sum (for illustration purposes only; use SUM aggregate function in production queries).

@@ -295,6 +334,9 @@ Approximate Aggregate Functions
    value is a map containing the top elements with corresponding estimated
    frequency.

+   For BOOLEAN 'value', this function always returns a 'perfect' result.
+   The 'bucket' and 'capacity' arguments are ignored in this case.
+
    The error of the function depends on the permutation of the values and its
    cardinality. We can set the capacity same as the cardinality of the
    underlying data to achieve the least error.

@@ -395,23 +437,59 @@ Statistical Aggregate Functions
    where :math:`\mu` is the mean, and :math:`\sigma` is the standard deviation.

+.. function:: regr_avgx(y, x) -> double
+
+   Returns the average of the independent value in a group. ``y`` is the dependent
+   value. ``x`` is the independent value.
+
+.. function:: regr_avgy(y, x) -> double
+
+   Returns the average of the dependent value in a group. ``y`` is the dependent
+   value. ``x`` is the independent value.
+
+.. function:: regr_count(y, x) -> double
+
+   Returns the number of non-null pairs of input values. ``y`` is the dependent
+   value. ``x`` is the independent value.
+
 .. function:: regr_intercept(y, x) -> double

    Returns linear regression intercept of input values. ``y`` is the dependent
    value. ``x`` is the independent value.

+.. function:: regr_r2(y, x) -> double
+
+   Returns the coefficient of determination of the linear regression. ``y`` is the dependent
+   value. ``x`` is the independent value. If regr_sxx(y, x) is 0, the result is null. If regr_syy(y, x) is 0
+   and regr_sxx(y, x) isn't 0, the result is 1.
+
 .. function:: regr_slope(y, x) -> double

    Returns linear regression slope of input values. ``y`` is the dependent
    value. ``x`` is the independent value.
+.. function:: regr_sxx(y, x) -> double
+
+   Returns the sum of the squares of the independent values in a group. ``y`` is the dependent
+   value. ``x`` is the independent value.
+
+.. function:: regr_sxy(y, x) -> double
+
+   Returns the sum of the product of the dependent and independent values in a group. ``y`` is the dependent
+   value. ``x`` is the independent value.
+
+.. function:: regr_syy(y, x) -> double
+
+   Returns the sum of the squares of the dependent values in a group. ``y`` is the dependent
+   value. ``x`` is the independent value.
+
 .. function:: skewness(x) -> double

    Returns the skewness of all input values.

 .. function:: stddev(x) -> double

-   This is an alias for stddev_samp().
+   This is an alias for :func:`stddev_samp`.

 .. function:: stddev_pop(x) -> double

@@ -423,7 +501,7 @@ Statistical Aggregate Functions

 .. function:: variance(x) -> double

-   This is an alias for var_samp().
+   This is an alias for :func:`var_samp`.

 .. function:: var_pop(x) -> double

diff --git a/velox/docs/functions/presto/array.rst b/velox/docs/functions/presto/array.rst
index ce92411930a66..3edd19c7731e4 100644
--- a/velox/docs/functions/presto/array.rst
+++ b/velox/docs/functions/presto/array.rst
@@ -33,14 +33,26 @@ Array Functions
    Returns the average of all non-null elements of the array. If there are no
    non-null elements, returns null.

+.. function:: array_cum_sum(array(T)) -> array(T)
+
+   Returns an array whose elements are the cumulative sum of the input array, i.e.
+   result[i] = input[1] + input[2] + … + input[i]. If there are null elements in the
+   array, the cumulative sum at and after those elements is null. ::
+
+      SELECT array_cum_sum(ARRAY [1, 2, 3]) -- array[1, 3, 6]
+      SELECT array_cum_sum(ARRAY [1, 2, null, 3]) -- array[1, 3, null, null]
+
 .. function:: array_distinct(array(E)) -> array(E)

-   Remove duplicate values from the input array. ::
+   Remove duplicate values from the input array.
+   For REAL and DOUBLE, NANs (Not-a-Number) are considered equal. ::

       SELECT array_distinct(ARRAY [1, 2, 3]); -- [1, 2, 3]
       SELECT array_distinct(ARRAY [1, 2, 1]); -- [1, 2]
       SELECT array_distinct(ARRAY [1, NULL, NULL]); -- [1, NULL]

+.. function:: array_dupes(array(E)) -> array(E)
+
+   This is an alias for :func:`array_duplicates`.
+
 .. function:: array_duplicates(array(E)) -> array(E)

    Returns a set of elements that occur more than once in array.

@@ -50,7 +62,8 @@ Array Functions

 .. function:: array_except(array(E) x, array(E) y) -> array(E)

-   Returns an array of the elements in array ``x`` but not in array ``y``, without duplicates. ::
+   Returns an array of the elements in array ``x`` but not in array ``y``, without duplicates.
+   For REAL and DOUBLE, NANs (Not-a-Number) are considered equal. ::

       SELECT array_except(ARRAY [1, 2, 3], ARRAY [4, 5, 6]); -- [1, 2, 3]
       SELECT array_except(ARRAY [1, 2, 3], ARRAY [1, 2]); -- [3]

@@ -68,6 +81,10 @@ Array Functions
       SELECT array_frequency(ARRAY ["knock", "knock", "who", "?"]); -- {"knock" -> 2, "who" -> 1, "?" -> 1}
       SELECT array_frequency(ARRAY []); -- {}

+.. function:: array_has_dupes(array(E)) -> boolean
+
+   This is an alias for :func:`array_has_duplicates`.
+
 .. function:: array_has_duplicates(array(E)) -> boolean

    Returns a boolean: whether array has any elements that occur more than once.

@@ -77,7 +94,8 @@ Array Functions

 .. function:: array_intersect(array(E) x, array(E) y) -> array(E)

-   Returns an array of the elements in the intersection of array ``x`` and array ``y``, without duplicates. ::
+   Returns an array of the elements in the intersection of array ``x`` and array ``y``, without duplicates.
+   For REAL and DOUBLE, NANs (Not-a-Number) are considered equal. ::

       SELECT array_intersect(ARRAY [1, 2, 3], ARRAY[4, 5, 6]); -- []
       SELECT array_intersect(ARRAY [1, 2, 2], ARRAY[1, 1, 2]); -- [1, 2]

@@ -93,21 +111,31 @@ Array Functions

 .. function:: array_max(array(E)) -> E

-   Returns the maximum value of input array. ::
+   Returns the maximum value of the input array.
+   NaN is considered to be greater than Infinity.
+   Returns NULL if the array contains a NULL value. ::

       SELECT array_max(ARRAY [1, 2, 3]); -- 3
       SELECT array_max(ARRAY [-1, -2, -2]); -- -1
       SELECT array_max(ARRAY [-1, -2, NULL]); -- NULL
       SELECT array_max(ARRAY []); -- NULL
+      SELECT array_max(ARRAY [-1, nan(), NULL]); -- NULL
+      SELECT array_max(ARRAY [-1, -2, -3, nan()]); -- NaN
+      SELECT array_max(ARRAY [infinity(), nan()]); -- NaN

 .. function:: array_min(array(E)) -> E

-   Returns the minimum value of input array. ::
+   Returns the minimum value of the input array.
+   NaN is considered to be greater than Infinity.
+   Returns NULL if the array contains a NULL value. ::

       SELECT array_min(ARRAY [1, 2, 3]); -- 1
       SELECT array_min(ARRAY [-1, -2, -2]); -- -2
       SELECT array_min(ARRAY [-1, -2, NULL]); -- NULL
       SELECT array_min(ARRAY []); -- NULL
+      SELECT array_min(ARRAY [-1, nan(), NULL]); -- NULL
+      SELECT array_min(ARRAY [-1, -2, -3, nan()]); -- -1
+      SELECT array_min(ARRAY [infinity(), nan()]); -- Infinity

 .. function:: array_normalize(array(E), E) -> array(E)

@@ -117,56 +145,83 @@ Array Functions

    Tests if arrays ``x`` and ``y`` have any non-null elements in common.
    Returns null if there are no non-null elements in common but either array contains null.
+   For REAL and DOUBLE, NANs (Not-a-Number) are considered equal.
+
+.. function:: array_union(x, y) -> array
+
+   Returns an array of the elements in the union of x and y, without duplicates.
+   For REAL and DOUBLE, NANs (Not-a-Number) are considered equal.

 .. function:: array_position(x, element) -> bigint

    Returns the position of the first occurrence of the ``element`` in array ``x`` (or 0 if not found).
+   For REAL and DOUBLE, NANs (Not-a-Number) are considered equal.

 .. function:: array_position(x, element, instance) -> bigint
    :noindex:

    If ``instance > 0``, returns the position of the ``instance``-th occurrence of the ``element`` in array ``x``.
    If ``instance < 0``, returns the position of the ``instance``-to-last occurrence of the ``element`` in array ``x``.
    If no matching element instance is found, 0 is returned.
+   For REAL and DOUBLE, NANs (Not-a-Number) are considered equal.

 .. function:: array_remove(x, element) -> array

    Remove all elements that equal ``element`` from array ``x``.
+   For REAL and DOUBLE, NANs (Not-a-Number) are considered equal. ::

       SELECT array_remove(ARRAY [1, 2, 3], 3); -- [1, 2]
       SELECT array_remove(ARRAY [2, 1, NULL], 1); -- [2, NULL]
+      SELECT array_remove(ARRAY [2.1, 1.1, nan()], nan()); -- [2.1, 1.1]

 .. function:: array_sort(array(E)) -> array(E)

-   Returns an array which has the sorted order of the input array x. The elements of x must
-   be orderable. Null elements will be placed at the end of the returned array.::
+   Returns an array which has the sorted order of the input array x. E must be
+   an orderable type. Null elements will be placed at the end of the returned array.
+   May throw if E is an ARRAY or ROW type and input values contain nested nulls.
+   Throws if deciding the order of elements would require comparing nested null values. ::
       SELECT array_sort(ARRAY [1, 2, 3]); -- [1, 2, 3]
       SELECT array_sort(ARRAY [3, 2, 1]); -- [1, 2, 3]
+      SELECT array_sort(ARRAY [infinity(), -1.1, nan(), 1.1, -infinity(), 0]); -- [-Infinity, -1.1, 0, 1.1, Infinity, NaN]
       SELECT array_sort(ARRAY [2, 1, NULL]); -- [1, 2, NULL]
       SELECT array_sort(ARRAY [NULL, 1, NULL]); -- [1, NULL, NULL]
       SELECT array_sort(ARRAY [NULL, 2, 1]); -- [1, 2, NULL]
+      SELECT array_sort(ARRAY [ARRAY [1, 2], ARRAY [2, null]]); -- [[1, 2], [2, null]]
+      SELECT array_sort(ARRAY [ARRAY [1, 2], ARRAY [1, null]]); -- failed: Ordering nulls is not supported

 .. function:: array_sort(array(T), function(T,U)) -> array(T)
+   :noindex:

    Returns the array sorted by values computed using specified lambda in ascending
-   order. Null elements will be placed at the end of the returned array. ::
+   order. U must be an orderable type. Null elements will be placed at the end of
+   the returned array. May throw if T is an ARRAY or ROW type and input values contain
+   nested nulls. Throws if deciding the order of elements would require comparing nested
+   null values. ::

       SELECT array_sort(ARRAY ['cat', 'leopard', 'mouse'], x -> length(x)); -- ['cat', 'mouse', 'leopard']

 .. function:: array_sort_desc(array(E)) -> array(E)

-   Returns the array sorted in the descending order. The elements of the array must
-   be orderable. Null elements will be placed at the end of the returned array.::
+   Returns the array sorted in descending order. E must be an orderable type.
+   Null elements will be placed at the end of the returned array.
+   May throw if E is an ARRAY or ROW type and input values contain nested nulls.
+   Throws if deciding the order of elements would require comparing nested null values. ::

       SELECT array_sort_desc(ARRAY [1, 2, 3]); -- [3, 2, 1]
       SELECT array_sort_desc(ARRAY [3, 2, 1]); -- [3, 2, 1]
       SELECT array_sort_desc(ARRAY [2, 1, NULL]); -- [2, 1, NULL]
       SELECT array_sort_desc(ARRAY [NULL, 1, NULL]); -- [1, NULL, NULL]
       SELECT array_sort_desc(ARRAY [NULL, 2, 1]); -- [2, 1, NULL]
+      SELECT array_sort_desc(ARRAY [ARRAY [1, 2], ARRAY [2, null]]); -- [[2, null], [1, 2]]
+      SELECT array_sort_desc(ARRAY [ARRAY [1, 2], ARRAY [1, null]]); -- failed: Ordering nulls is not supported

 .. function:: array_sort_desc(array(T), function(T,U)) -> array(T)
+   :noindex:

    Returns the array sorted by values computed using specified lambda in descending
-   order. Null elements will be placed at the end of the returned array. ::
+   order. U must be an orderable type. Null elements will be placed at the end of
+   the returned array. May throw if T is an ARRAY or ROW type and input values contain
+   nested nulls. Throws if deciding the order of elements would require comparing nested
+   null values. ::

       SELECT array_sort_desc(ARRAY ['cat', 'leopard', 'mouse'], x -> length(x)); -- ['leopard', 'mouse', 'cat']

@@ -194,6 +249,14 @@ Array Functions

 .. function:: contains(x, element) -> boolean

    Returns true if the array ``x`` contains the ``element``.
+   When ``element`` is of complex type, throws if ``x`` or ``element`` contains nested nulls
+   and these need to be compared to produce a result.
+   For REAL and DOUBLE, NANs (Not-a-Number) are considered equal. ::
+
+      SELECT contains(ARRAY [2.1, 1.1, nan()], nan()); -- true
+      SELECT contains(ARRAY[ARRAY[1, 3]], ARRAY[2, null]); -- false
+      SELECT contains(ARRAY[ARRAY[2, 3]], ARRAY[2, null]); -- failed: contains does not support arrays with elements that are null or contain null
+      SELECT contains(ARRAY[ARRAY[2, null]], ARRAY[2, 1]); -- failed: contains does not support arrays with elements that are null or contain null

 .. function:: element_at(array(E), index) -> E

@@ -217,6 +280,7 @@ Array Functions
    for no-match and first-match-is-null cases.

 .. function:: find_first(array(T), index, function(T,boolean)) -> E
+   :noindex:

    Returns the first element of ``array`` that matches the predicate.
    Returns ``NULL`` if no element matches the predicate.

@@ -238,6 +302,7 @@ Array Functions
    Returns ``NULL`` if no such element exists.

 .. function:: find_first_index(array(T), index, function(T,boolean)) -> BIGINT
+   :noindex:

    Returns the 1-based index of the first element of ``array`` that matches the predicate.
    Returns ``NULL`` if no such element exists.

@@ -245,15 +310,28 @@ Array Functions
    If ``index`` < 0, the search for element starts at position ``abs(index)`` counting from
    the end of the array, until the start of the array. ::

-      SELECT find_first(ARRAY[3, 4, 5, 6], 2, x -> x > 0); -- 2
-      SELECT find_first(ARRAY[3, 4, 5, 6], -2, x -> x > 0); -- 3
-      SELECT find_first(ARRAY[3, 4, 5, 6], 2, x -> x < 4); -- NULL
-      SELECT find_first(ARRAY[3, 4, 5, 6], -2, x -> x > 5); -- NULL
+      SELECT find_first_index(ARRAY[3, 4, 5, 6], 2, x -> x > 0); -- 2
+      SELECT find_first_index(ARRAY[3, 4, 5, 6], -2, x -> x > 0); -- 3
+      SELECT find_first_index(ARRAY[3, 4, 5, 6], 2, x -> x < 4); -- NULL
+      SELECT find_first_index(ARRAY[3, 4, 5, 6], -2, x -> x > 5); -- NULL

 .. function:: flatten(array(array(T))) -> array(T)

    Flattens an ``array(array(T))`` to an ``array(T)`` by concatenating the contained arrays.

+.. function:: ngrams(array(T), n) -> array(array(T))
+
+   Returns `n-grams `_ for the array.
+   Throws if n is zero or negative. If n is greater than or equal to the input array
+   length, the result array contains the input array as the only item. ::
+
+      SELECT ngrams(ARRAY['foo', 'bar', 'baz', 'foo'], 2); -- [['foo', 'bar'], ['bar', 'baz'], ['baz', 'foo']]
+      SELECT ngrams(ARRAY['foo', 'bar', 'baz', 'foo'], 3); -- [['foo', 'bar', 'baz'], ['bar', 'baz', 'foo']]
+      SELECT ngrams(ARRAY['foo', 'bar', 'baz', 'foo'], 4); -- [['foo', 'bar', 'baz', 'foo']]
+      SELECT ngrams(ARRAY['foo', 'bar', 'baz', 'foo'], 5); -- [['foo', 'bar', 'baz', 'foo']]
+      SELECT ngrams(ARRAY[1, 2, 3, 4], 2); -- [[1, 2], [2, 3], [3, 4]]
+      SELECT ngrams(ARRAY['foo', NULL, 'bar'], 2); -- [['foo', NULL], [NULL, 'bar']]
+
 .. function:: reduce(array(T), initialState S, inputFunction(S,T,S), outputFunction(S,R)) -> R

    Returns a single value reduced from ``array``. ``inputFunction`` will
@@ -261,7 +339,9 @@ Array Functions
    the element, ``inputFunction`` takes the current state, initially
    ``initialState``, and returns the new state. ``outputFunction`` will be
    invoked to turn the final state into the result value. It may be the
-   identity function (``i -> i``). ::
+   identity function (``i -> i``).
+
+   Throws if the array has more than 10,000 elements. ::

       SELECT reduce(ARRAY [], 0, (s, x) -> s + x, s -> s); -- 0
       SELECT reduce(ARRAY [5, 20, 50], 0, (s, x) -> s + x, s -> s); -- 75

@@ -284,7 +364,7 @@ Array Functions
 .. function:: shuffle(array(E)) -> array(E)

-   Generate a random permutation of the given ``array``::
+   Generate a random permutation of the given ``array`` ::

       SELECT shuffle(ARRAY [1, 2, 3]); -- [3, 1, 2] or any other random permutation
       SELECT shuffle(ARRAY [0, 0, 0]); -- [0, 0, 0]

@@ -330,6 +410,14 @@ Array Functions
       SELECT trim_array(ARRAY[1, 2, 3, 4], 2); -- [1, 2]
       SELECT trim_array(ARRAY[1, 2, 3, 4], 4); -- []

+.. function:: remove_nulls(x) -> array
+
+   Remove null values from the input array ``x`` ::
+
+      SELECT remove_nulls(ARRAY[1, NULL, 3, NULL]); -- [1, 3]
+      SELECT remove_nulls(ARRAY[true, false, NULL]); -- [true, false]
+      SELECT remove_nulls(ARRAY[ARRAY[1, 2], NULL, ARRAY[1, NULL, 3]]); -- [[1, 2], [1, null, 3]]
+
 .. function:: zip(array(T), array(U),..) -> array(row(T,U, ...))

    Returns the merge of the given arrays, element-wise into a single array of rows.

@@ -341,7 +429,8 @@ Array Functions
 .. function:: zip_with(array(T), array(U), function(T,U,R)) -> array(R)

    Merges the two given arrays, element-wise, into a single array using ``function``.
-   If one array is shorter, nulls are appended at the end to match the length of the longer array, before applying ``function``::
+   If one array is shorter, nulls are appended at the end to match the length of the
+   longer array, before applying ``function`` ::

       SELECT zip_with(ARRAY[1, 3, 5], ARRAY['a', 'b', 'c'], (x, y) -> (y, x)); -- [ROW('a', 1), ROW('b', 3), ROW('c', 5)]
       SELECT zip_with(ARRAY[1, 2], ARRAY[3, 4], (x, y) -> x + y); -- [4, 6]

diff --git a/velox/docs/functions/presto/binary.rst b/velox/docs/functions/presto/binary.rst
index 47c7ea7205b7b..8b4ddc26832ea 100644
--- a/velox/docs/functions/presto/binary.rst
+++ b/velox/docs/functions/presto/binary.rst
@@ -8,7 +8,26 @@ Binary Functions

 .. function:: from_base64(string) -> varbinary

-   Decodes binary data from the base64 encoded ``string``.
+   Decodes a Base64-encoded ``string`` back into its original binary form.
+   This function is capable of handling both fully padded and non-padded Base64 encoded strings.
+   Partially padded Base64 strings are not supported and result in an error.
+
+   Examples
+   --------
+   Query with a padded Base64 string:
+   ::
+      SELECT from_base64('SGVsbG8gV29ybGQ='); -- [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]
+
+   Query with a non-padded Base64 string:
+   ::
+      SELECT from_base64('SGVsbG8gV29ybGQ'); -- [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]
+
+   Query with a partially padded Base64 string:
+   ::
+      SELECT from_base64('SGVsbG8gV29ybGQgZm9yIHZlbG94IQ='); -- Error : Base64::decode() - invalid input string: string length is not a multiple of 4.
+
+   In the above examples, both the fully padded and non-padded Base64 strings ('SGVsbG8gV29ybGQ=' and 'SGVsbG8gV29ybGQ') decode to the binary representation of the text 'Hello World'.
+   The partially padded Base64 string 'SGVsbG8gV29ybGQgZm9yIHZlbG94IQ=' leads to a Velox error.

 .. function:: from_base64url(string) -> varbinary

@@ -26,6 +45,16 @@ Binary Functions

    Decodes binary data from the hex encoded ``string``.

+.. function:: from_ieee754_32(binary) -> real
+
+   Decodes the 32-bit big-endian ``binary`` in IEEE 754 single-precision floating-point format.
+   Throws a user error if the input is shorter or longer than 32 bits.
+
+.. function:: from_ieee754_64(binary) -> double
+
+   Decodes the 64-bit big-endian ``binary`` in IEEE 754 double-precision floating-point format.
+   Throws a user error if the input is shorter or longer than 64 bits.
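+
+   A hedged round-trip sketch (``to_ieee754_32`` and ``to_ieee754_64`` are
+   documented further below; the outputs shown are illustrative):
+
+   ::
+
+      SELECT from_ieee754_64(to_ieee754_64(3.14)); -- 3.14
+      SELECT from_ieee754_32(to_ieee754_32(real '1.0')); -- 1.0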
 .. function:: hmac_md5(binary, key) -> varbinary

    Computes the HMAC with md5 of ``binary`` with the given ``key``.

@@ -46,10 +75,30 @@ Binary Functions

    Returns the length of ``binary`` in bytes.

+.. function:: lpad(binary, size, padbinary) -> varbinary
+   :noindex:
+
+   Left pads ``binary`` to ``size`` bytes with ``padbinary``.
+   If ``size`` is less than the length of ``binary``, ``binary`` is
+   truncated from the right to ``size`` bytes. ``size`` must not be negative
+   and ``padbinary`` must be non-empty. ``size`` has a maximum value of 1 MiB.
+
 .. function:: md5(binary) -> varbinary

    Computes the md5 hash of ``binary``.

+.. function:: rpad(binary, size, padbinary) -> varbinary
+   :noindex:
+
+   Right pads ``binary`` to ``size`` bytes with ``padbinary``.
+   If ``size`` is less than the length of ``binary``, ``binary`` is
+   truncated from the right to ``size`` bytes. ``size`` must not be negative
+   and ``padbinary`` must be non-empty. ``size`` has a maximum value of 1 MiB.
+
 .. function:: sha1(binary) -> varbinary

    Computes the SHA-1 hash of ``binary``.

@@ -90,6 +139,10 @@ Binary Functions

    Encodes ``binary`` into a hex string representation.

+.. function:: to_ieee754_32(real) -> varbinary
+
+   Encodes ``real`` in a 32-bit big-endian binary according to IEEE 754 single-precision floating-point format.
+
 .. function:: to_ieee754_64(double) -> varbinary

    Encodes ``double`` in a 64-bit big-endian binary according to IEEE 754 double-precision floating-point format.
diff --git a/velox/docs/functions/presto/bitwise.rst b/velox/docs/functions/presto/bitwise.rst
index e2657f07dcb1f..c22445bdec3ea 100644
--- a/velox/docs/functions/presto/bitwise.rst
+++ b/velox/docs/functions/presto/bitwise.rst
@@ -12,42 +12,51 @@ Bitwise Functions

    SELECT bit_count(-7, 64); -- 62
    SELECT bit_count(-7, 8); -- 6

-.. function:: bitwise_and(x, y) -> [bigint]
+.. function:: bitwise_and(x, y) -> bigint

    Returns the bitwise AND of ``x`` and ``y`` in 2's complement representation.

-.. function:: bitwise_arithmetic_shift_right(x, shift) -> [bigint]``
+.. function:: bitwise_arithmetic_shift_right(x, shift) -> bigint

    Returns the arithmetic right shift operation on ``x`` shifted by ``shift`` in 2’s complement representation.
+   ``shift`` must not be negative.

-.. function:: bitwise_left_shift(x, shift) -> [bigint]``
+.. function:: bitwise_left_shift(x, shift) -> [same as x]

-   Returns the left shifted value of ``x``. Here x can be of type ``TINYINT`` , ``SMALLINT``, ``INTEGER`` and ``BIGINT``.
+   Returns the left shifted value of ``x``.
+   Supported types of ``x`` are: ``TINYINT``, ``SMALLINT``, ``INTEGER`` and ``BIGINT``.
+   ``shift`` is an ``INTEGER``.

-.. function:: bitwise_logical_shift_right(x, shift, bits) -> [bigint]``
+.. function:: bitwise_logical_shift_right(x, shift, bits) -> bigint

    Returns the logical right shift operation on ``x`` (treated as ``bits``-bit integer) shifted by ``shift``.
+   ``shift`` must not be negative.

-.. function:: bitwise_not(x) -> [bigint]
+.. function:: bitwise_not(x) -> bigint

    Returns the bitwise NOT of ``x`` in 2's complement representation.

-.. function:: bitwise_or(x, y) -> [bigint]
+.. function:: bitwise_or(x, y) -> bigint

    Returns the bitwise OR of ``x`` and ``y`` in 2's complement representation.

-.. function:: bitwise_right_shift(x, shift) -> [bigint]``
+.. 
function:: bitwise_right_shift(x, shift) -> [same as x] - Returns the logical right shifted value of ``x``. Here x can be of type ``TINYINT``, ``SMALLINT``, ``INTEGER`` and ``BIGINT``. + Returns the logical right shifted value of ``x``. + Supported types of x are: ``TINYINT``, ``SMALLINT``, ``INTEGER`` and ``BIGINT``. + ``shift`` is an ``INTEGER``. -.. function:: bitwise_right_shift_arithmetic(x, shift) -> [bigint]`` +.. function:: bitwise_right_shift_arithmetic(x, shift) -> [same as x] Returns the arithmetic right shift value of ``x``. + Supported types of x are: ``TINYINT``, ``SMALLINT``, ``INTEGER`` and ``BIGINT``. + ``shift`` is an ``INTEGER``. -.. function:: bitwise_shift_left(x, shift, bits) -> [bigint]`` +.. function:: bitwise_shift_left(x, shift, bits) -> bigint Returns the left shift operation on ``x`` (treated as ``bits``-bit integer) shifted by ``shift``. + ``shift`` must not be negative. -.. function:: bitwise_xor(x, y) -> [bigint]`` +.. function:: bitwise_xor(x, y) -> bigint Returns the bitwise XOR of ``x`` and ``y`` in 2's complement representation. diff --git a/velox/docs/functions/presto/conversion.rst b/velox/docs/functions/presto/conversion.rst index 099af8c2f9d21..b24117ae3068a 100644 --- a/velox/docs/functions/presto/conversion.rst +++ b/velox/docs/functions/presto/conversion.rst @@ -30,7 +30,7 @@ are supported if the conversion of their element types are supported. In additio supported conversions to/from JSON are listed in :doc:`json`. .. list-table:: - :widths: 25 25 25 25 25 25 25 25 25 25 25 25 25 + :widths: 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 :header-rows: 1 * - @@ -42,10 +42,13 @@ supported conversions to/from JSON are listed in :doc:`json`. - real - double - varchar + - varbinary - timestamp - timestamp with time zone - date + - interval day to second - decimal + - ipaddress * - tinyint - Y - Y @@ -55,10 +58,13 @@ supported conversions to/from JSON are listed in :doc:`json`. - Y - Y - Y + - + - - - - - Y + - * - smallint - Y - Y @@ -68,10 +74,13 @@ supported conversions to/from JSON are listed in :doc:`json`. - Y - Y - Y + - + - - - - - Y + - * - integer - Y - Y @@ -81,10 +90,13 @@ supported conversions to/from JSON are listed in :doc:`json`. - Y - Y - Y + - + - - - - - Y + - * - bigint - Y - Y @@ -94,10 +106,13 @@ supported conversions to/from JSON are listed in :doc:`json`. - Y - Y - Y + - + - - - - - Y + - * - boolean - Y - Y @@ -107,10 +122,13 @@ supported conversions to/from JSON are listed in :doc:`json`. - Y - Y - Y + - - - - - + - Y + - * - real - Y - Y @@ -120,10 +138,13 @@ supported conversions to/from JSON are listed in :doc:`json`. - Y - Y - Y + - - - - - + - Y + - * - double - Y - Y @@ -133,10 +154,13 @@ supported conversions to/from JSON are listed in :doc:`json`. - Y - Y - Y + - - - - - + - Y + - * - varchar - Y - Y @@ -146,10 +170,29 @@ supported conversions to/from JSON are listed in :doc:`json`. - Y - Y - Y + - + - Y + - Y - Y - - Y + - Y + * - varbinary + - + - + - + - + - + - + - + - + - + - + - + - - + - + - Y * - timestamp - - @@ -159,12 +202,14 @@ supported conversions to/from JSON are listed in :doc:`json`. - - - Y + - - Y - Y - Y - - * - timestamp with time zone - + - + * - timestamp with time zone - - - @@ -173,11 +218,30 @@ supported conversions to/from JSON are listed in :doc:`json`. - - - Y + - + - Y + - - Y - - + - * - date - + - + - + - + - + - + - + - Y + - + - Y + - Y + - + - + - + - + * - interval day to second - - - @@ -186,6 +250,10 @@ supported conversions to/from JSON are listed in :doc:`json`. 
- - - Y + - + - + - + - - - - @@ -198,10 +266,29 @@ supported conversions to/from JSON are listed in :doc:`json`. - Y - Y - Y + - + - + - - - + - Y - + * - ipaddress + - + - + - + - + - + - + - + - Y - Y + - + - + - + - + - + - Cast to Integral Types ---------------------- @@ -263,16 +350,12 @@ supported cases. SELECT cast(nan() as bigint); -- 0 -From strings +From VARCHAR ^^^^^^^^^^^^ Casting a string to an integral type is allowed if the string represents an integral number within the range of the result type. By default, casting from strings that represent floating-point numbers is not allowed. - -If cast_to_int_by_truncate is set to true, and the string represents a floating-point number, -the decimal part will be truncated for casting to an integer. - Casting from invalid input values throws. Valid examples @@ -283,36 +366,12 @@ Valid examples SELECT cast('+1' as tinyint); -- 1 SELECT cast('-1' as tinyint); -- -1 -Valid examples if cast_to_int_by_truncate=true - -:: - - SELECT cast('12345.67' as tinyint); -- 12345 - SELECT cast('1.2' as tinyint); -- 1 - SELECT cast('-1.8' as tinyint); -- -1 - SELECT cast('1.' as tinyint); -- 1 - SELECT cast('-1.' as tinyint); -- -1 - SELECT cast('0.' as tinyint); -- 0 - SELECT cast('.' as tinyint); -- 0 - SELECT cast('-.' as tinyint); -- 0 - Invalid examples -:: - - SELECT cast('1234567' as tinyint); -- Out of range - SELECT cast('1a' as tinyint); -- Invalid argument - SELECT cast('' as tinyint); -- Invalid argument - SELECT cast('1,234,567' as bigint); -- Invalid argument - SELECT cast('1'234'567' as bigint); -- Invalid argument - SELECT cast('nan' as bigint); -- Invalid argument - SELECT cast('infinity' as bigint); -- Invalid argument - -Invalid examples if cast_to_int_by_truncate=false - :: SELECT cast('12345.67' as tinyint); -- Invalid argument + SELECT cast('12345.67' as bigint); -- Invalid argument SELECT cast('1.2' as tinyint); -- Invalid argument SELECT cast('-1.8' as tinyint); -- Invalid argument SELECT cast('1.' as tinyint); -- Invalid argument @@ -324,20 +383,19 @@ Invalid examples if cast_to_int_by_truncate=false From decimal ^^^^^^^^^^^^ -By default, the decimal part is rounded. If cast_to_int_by_truncate is enabled, the decimal part will be truncated for casting to an integer. +The decimal part is rounded. Valid examples :: - SELECT cast(2.56 decimal(6, 2) as integer); -- 2 /* cast_to_int_by_truncate enabled */ - SELECT cast(2.56 decimal(6, 2) as integer); -- 3 /* cast_to_int_by_truncate disabled */ + SELECT cast(2.56 decimal(6, 2) as integer); -- 3 SELECT cast(3.46 decimal(6, 2) as integer); -- 3 Invalid examples :: - + SELECT cast(214748364890 decimal(12, 2) as integer); -- Out of range Cast to Boolean @@ -363,11 +421,14 @@ Valid examples SELECT cast(nan() as boolean); -- true SELECT cast(infinity() as boolean); -- true SELECT cast(0.0000000000001 as boolean); -- true + SELECT cast(0.5 as boolean); -- true + SELECT cast(-0.5 as boolean); -- true -From strings +From VARCHAR ^^^^^^^^^^^^ -There is a set of strings allowed to be casted to boolean. Casting from other strings to boolean throws. +The strings `t, f, 1, 0, true, false` and their upper case equivalents are allowed to be casted to boolean. +Casting from other strings to boolean throws. 
Valid examples

@@ -379,6 +440,8 @@ Valid examples

    SELECT cast('t' as boolean); -- true (case insensitive)
    SELECT cast('true' as boolean); -- true (case insensitive)
    SELECT cast('f' as boolean); -- false (case insensitive)
    SELECT cast('false' as boolean); -- false (case insensitive)
+   SELECT cast('F' as boolean); -- false (case insensitive)
+   SELECT cast('T' as boolean); -- true (case insensitive)

 Invalid examples

@@ -391,6 +454,7 @@ Invalid examples
    SELECT cast('-1' as boolean); -- Invalid argument
    SELECT cast('tr' as boolean); -- Invalid argument
    SELECT cast('tru' as boolean); -- Invalid argument
+   SELECT cast('No' as boolean); -- Invalid argument

 Cast to Floating-Point Types
 ----------------------------

@@ -416,7 +480,7 @@ behavior.

    SELECT cast(1.7E308 as real); -- Presto returns Infinity but Velox throws
    SELECT cast(-1.7E308 as real); -- Presto returns -Infinity but Velox throws

-From strings
+From VARCHAR
 ^^^^^^^^^^^^

 Casting a string to real is allowed if the string represents an integral or
@@ -429,27 +493,32 @@ Valid examples

    SELECT cast('1.' as real); -- 1.0
    SELECT cast('1' as real); -- 1.0
    SELECT cast('1.7E308' as real); -- Infinity
-   SELECT cast('infinity' as real); -- Infinity (case insensitive)
-   SELECT cast('-infinity' as real); -- -Infinity (case insensitive)
-   SELECT cast('nan' as real); -- NaN (case insensitive)
+   SELECT cast('Infinity' as real); -- Infinity (case sensitive)
+   SELECT cast('-Infinity' as real); -- -Infinity (case sensitive)
+   SELECT cast('NaN' as real); -- NaN (case sensitive)

 Invalid examples

 ::

-   SELECT cast('1.7E308' as real); -- Out of range
    SELECT cast('1.2a' as real); -- Invalid argument
    SELECT cast('1.2.3' as real); -- Invalid argument
+   SELECT cast('infinity' as real); -- Invalid argument
+   SELECT cast('-infinity' as real); -- Invalid argument
+   SELECT cast('inf' as real); -- Invalid argument
+   SELECT cast('InfiNiTy' as real); -- Invalid argument
+   SELECT cast('INFINITY' as real); -- Invalid argument
+   SELECT cast('nAn' as real); -- Invalid argument
+   SELECT cast('nan' as real); -- Invalid argument

-There are a few corner cases where Velox behaves differently from Presto.
-Presto throws INVALID_CAST_ARGUMENT on these queries, while Velox allows these
-conversions. We keep the Velox behaivor by intention because it is more
-consistent with other supported cases of cast.
+The cases below are supported in Presto, but throw in Velox.

 ::

-   SELECT cast('InfiNiTy' as real); -- Infinity
-   SELECT cast('nAn' as real); -- NaN
+   SELECT cast('1.2f' as real); -- 1.2
+   SELECT cast('1.2f' as double); -- 1.2
+   SELECT cast('1.2d' as real); -- 1.2
+   SELECT cast('1.2d' as double); -- 1.2

 From decimal
 ^^^^^^^^^^^^

@@ -468,9 +537,11 @@ Invalid example

    SELECT cast(decimal '300.001' as tinyint); -- Out of range

-Cast to String
+Cast to VARCHAR
 ---------------

+Casting from scalar types to string is allowed.
+ Valid examples :: @@ -481,38 +552,225 @@ Valid examples SELECT cast(nan() as varchar); -- 'NaN' SELECT cast(infinity() as varchar); -- 'Infinity' SELECT cast(true as varchar); -- 'true' - SELECT cast(timestamp '1970-01-01 00:00:00' as varchar); -- '1970-01-01T00:00:00.000' + SELECT cast(timestamp '1970-01-01 00:00:00' as varchar); -- '1970-01-01 00:00:00.000' + SELECT cast(timestamp '2024-06-01 11:37:15.123 America/New_York' as varchar); -- '2024-06-01 11:37:15.123 America/New_York' SELECT cast(cast(22.51 as DECIMAL(5, 3)) as varchar); -- '22.510' SELECT cast(cast(-22.51 as DECIMAL(4, 2)) as varchar); -- '-22.51' SELECT cast(cast(0.123 as DECIMAL(3, 3)) as varchar); -- '0.123' SELECT cast(cast(1 as DECIMAL(6, 2)) as varchar); -- '1.00' + SELECT cast(cast(0 as DECIMAL(6, 2)) as varchar); -- '0.00' + +From Floating-Point Types +^^^^^^^^^^^^^^^^^^^^^^^^^ + +By default, casting a real or double to string returns standard notation if the magnitude of input value is greater than +or equal to 10 :superscript:`-3` but less than 10 :superscript:`7`, and returns scientific notation otherwise. + +Positive zero returns '0.0' and negative zero returns '-0.0'. Positive infinity returns 'Infinity' and negative infinity +returns '-Infinity'. Positive and negative NaN returns 'NaN'. + +If legacy_cast configuration property is true, the result is standard notation for all input value. + +Valid examples if legacy_cast = false, + +:: + + SELECT cast(double '123456789.01234567' as varchar); -- '1.2345678901234567E8' + SELECT cast(double '10000000.0' as varchar); -- '1.0E7' + SELECT cast(double '12345.0' as varchar); -- '12345.0' + SELECT cast(double '-0.001' as varchar); -- '-0.001' + SELECT cast(double '-0.00012' as varchar); -- '-1.2E-4' + SELECT cast(double '0.0' as varchar); -- '0.0' + SELECT cast(double '-0.0' as varchar); -- '-0.0' + SELECT cast(infinity() as varchar); -- 'Infinity' + SELECT cast(-infinity() as varchar); -- '-Infinity' + SELECT cast(nan() as varchar); -- 'NaN' + SELECT cast(-nan() as varchar); -- 'NaN' + + SELECT cast(real '123456780.0' as varchar); -- '1.2345678E8' + SELECT cast(real '10000000.0' as varchar); -- '1.0E7' + SELECT cast(real '12345.0' as varchar); -- '12345.0' + SELECT cast(real '-0.001' as varchar); -- '-0.001' + SELECT cast(real '-0.00012' as varchar); -- '-1.2E-4' + SELECT cast(real '0.0' as varchar); -- '0.0' + SELECT cast(real '-0.0' as varchar); -- '-0.0' + +Valid examples if legacy_cast = true, + +:: + + SELECT cast(double '123456789.01234567' as varchar); -- '123456789.01234567' + SELECT cast(double '10000000.0' as varchar); -- '10000000.0' + SELECT cast(double '-0.001' as varchar); -- '-0.001' + SELECT cast(double '-0.00012' as varchar); -- '-0.00012' + + SELECT cast(real '123456780.0' as varchar); -- '123456784.0' + SELECT cast(real '10000000.0' as varchar); -- '10000000.0' + SELECT cast(real '12345.0' as varchar); -- '12345.0' + SELECT cast(real '-0.00012' as varchar); -- '-0.00011999999696854502' + + +From DATE +^^^^^^^^^ + +Casting DATE to VARCHAR returns an ISO-8601 formatted string: YYYY-MM-DD. + +:: + + SELECT cast(date('2024-03-14') as varchar); -- '2024-03-14' + + +From TIMESTAMP +^^^^^^^^^^^^^^ + +By default, casting a timestamp to a string returns ISO 8601 format with space as separator +between date and time, and the year part is padded with zeros to 4 characters. + +If legacy_cast configuration property is true, the result string uses character 'T' +as separator between date and time and the year part is not padded. 
+ +Valid examples if legacy_cast = false, + +:: + + SELECT cast(timestamp '1970-01-01 00:00:00' as varchar); -- '1970-01-01 00:00:00.000' + SELECT cast(timestamp '2000-01-01 12:21:56.129' as varchar); -- '2000-01-01 12:21:56.129' + SELECT cast(timestamp '384-01-01 08:00:00.000' as varchar); -- '0384-01-01 08:00:00.000' + SELECT cast(timestamp '10000-02-01 16:00:00.000' as varchar); -- '10000-02-01 16:00:00.000' + SELECT cast(timestamp '-10-02-01 10:00:00.000' as varchar); -- '-0010-02-01 10:00:00.000' + +Valid examples if legacy_cast = true, + +:: + + SELECT cast(timestamp '1970-01-01 00:00:00' as varchar); -- '1970-01-01T00:00:00.000' + SELECT cast(timestamp '2000-01-01 12:21:56.129' as varchar); -- '2000-01-01T12:21:56.129' + SELECT cast(timestamp '384-01-01 08:00:00.000' as varchar); -- '384-01-01T08:00:00.000' + SELECT cast(timestamp '-10-02-01 10:00:00.000' as varchar); -- '-10-02-01T10:00:00.000' + +From INTERVAL DAY TO SECOND +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Casting INTERVAL DAY TO SECOND to VARCHAR returns a string formatted as +'[sign]D HH:MM:SS.ZZZ', where 'sign' is an optional '-' sign if interval is negative, D +is the number of whole days in the interval, HH is then number of hours between 00 and +24, MM is the number of minutes between 00 and 59, SS is the number of seconds between +00 and 59, and zzz is the number of milliseconds between 000 and 999. + +:: + + SELECT cast(interval '1' day as varchar); -- '1 00:00:00.000' + SELECT cast(interval '123456' second as varchar); -- '1 10:17:36.000' + SELECT cast(now() - date('2024-03-01') as varchar); -- '35 09:15:54.092' + SELECT cast(date('2024-03-01') - now() as varchar); -- '-35 09:16:20.598' + +From IPADDRESS +^^^^^^^^^^^^^^ + +Casting from IPADDRESS to VARCHAR returns a string formatted as x.x.x.x for IPV4 formatted IPV6 addresses. +For all other IPV6 addresses it will be formatted in compressed alternate form IPV6 defined in `RFC 4291#section-2.2 `_ + +IPV4: + +:: + + SELECT cast(ipaddress '1.2.3.4' as varchar); -- '1.2.3.4' + +IPV6: + +:: + + SELECT cast(ipaddress '2001:0db8:0000:0000:0000:ff00:0042:8329' as varchar); -- '2001:db8::ff00:42:8329' + SELECT cast(ipaddress '0:0:0:0:0:0:13.1.68.3' as varchar); -- '::13.1.68.3' + +IPV4 mapped IPV6: + +:: + + SELECT cast(ipaddress '::ffff:ffff:ffff' as varchar); -- '255.255.255.255' + +Cast to VARBINARY +----------------- + +From IPADDRESS +^^^^^^^^^^^^^^ + +Returns the IPV6 address as a 16 byte varbinary string in network byte order. + +Internally, the type is a pure IPv6 address. Support for IPv4 is handled using the IPv4-mapped IPv6 address range `(RFC 4291#section-2.5.5.2) `_. +When creating an IPADDRESS, IPv4 addresses will be mapped into that range. + +IPV6: + +:: + + SELECT cast(ipaddress '2001:0db8:0000:0000:0000:ff00:0042:8329' as varbinary); -- 0x20010db8000000000000ff0000428329 + +IPV4: + +:: + + SELECT cast('1.2.3.4' as ipaddress); -- 0x00000000000000000000ffff01020304 + +IPV4 mapped IPV6: + +:: + + SELECT cast('::ffff:ffff:ffff' as ipaddress); -- 0x00000000000000000000ffffffffffff Cast to TIMESTAMP ----------------- -From strings +From VARCHAR ^^^^^^^^^^^^ Casting from a string to timestamp is allowed if the string represents a -timestamp in the format `YYYY-MM-DD` followed by an optional `hh:mm:ssZZ`. -Casting from invalid input values throws. +timestamp in the format `YYYY-MM-DD` followed by an optional `hh:mm:ss.MS`. +Seconds and milliseconds are optional. Casting from invalid input values throws. 
-Valid examples +Valid examples: :: SELECT cast('1970-01-01' as timestamp); -- 1970-01-01 00:00:00 - SELECT cast('1970-01-01 00:00:00' as timestamp); -- 1970-01-01 00:00:00 + SELECT cast('1970-01-01 00:00:00.123' as timestamp); -- 1970-01-01 00:00:00.123 SELECT cast('1970-01-01 02:01' as timestamp); -- 1970-01-01 02:01:00 SELECT cast('1970-01-01 00:00:00-02:00' as timestamp); -- 1970-01-01 02:00:00 -Invalid example +Invalid example: :: SELECT cast('2012-Oct-23' as timestamp); -- Invalid argument -From date +Optionally, strings may also contain timezone information at the end. Timezone +information may be offsets in the format `+01:00` or `-02:00`, for example, or +timezone names, like `UTC`, `Z`, `America/Los_Angeles` and others, +`as defined here `_. + +For example, these strings contain valid timezone information: + +:: + + SELECT cast('1970-01-01 00:00:00 +09:00' as timestamp); + SELECT cast('1970-01-01 00:00:00 UTC' as timestamp); + SELECT cast('1970-01-01 00:00:00 America/Sao_Paulo' as timestamp); + +If timezone information is specified in the string, the returned timestamp +is adjusted to the corresponding timezone. Otherwise, the timestamp is +assumed to be in the client session timezone, and adjusted accordingly +based on the value of `adjust_timestamp_to_session_timezone`, as described below. + +The space between the hour and timezone definition is optional. + +:: + + SELECT cast('1970-01-01 00:00 Z' as timestamp); + SELECT cast('1970-01-01 00:00Z' as timestamp); + +Are both valid. + +From DATE ^^^^^^^^^ Casting from date to timestamp is allowed. @@ -530,26 +788,26 @@ From TIMESTAMP WITH TIME ZONE The results depend on whether configuration property `adjust_timestamp_to_session_timezone` is set or not. If set to true, input timezone is ignored and timestamp is returned as is. For example, -"1970-01-01 00:00:00.000 America/Los_Angeles" becomes "1970-01-01 00:00:00.000". +"1970-01-01 00:00:00.000 America/Los_Angeles" becomes "1970-01-01 08:00:00.000". Otherwise, timestamp is shifted by the offset of the timezone. For example, -"1970-01-01 00:00:00.000 America/Los_Angeles" becomes "1969-12-31 16:00:00.000". +"1970-01-01 00:00:00.000 America/Los_Angeles" becomes "1970-01-01 00:00:00.000". 
 Valid examples
 
 ::
 
     -- `adjust_timestamp_to_session_timezone` is true
-    SELECT cast(timestamp '1970-01-01 00:00:00 America/Los_Angeles' as timestamp); -- 1970-01-01 00:00:00.000
-    SELECT cast(timestamp '2012-03-09 10:00:00 Asia/Chongqing' as timestamp); -- 2012-03-09 10:00:00.000
-    SELECT cast(from_unixtime(0, '+06:00') as timestamp); -- 1970-01-01 00:00:00.000
-    SELECT cast(from_unixtime(0, '-02:00') as timestamp); -- 1970-01-01 00:00:00.000
+    SELECT to_unixtime(cast(timestamp '1970-01-01 00:00:00 America/Los_Angeles' as timestamp)); -- 28800.0 (1970-01-01 08:00:00.000)
+    SELECT to_unixtime(cast(timestamp '2012-03-09 10:00:00 Asia/Chongqing' as timestamp)); -- 1.3312584E9 (2012-03-09 02:00:00.000)
+    SELECT to_unixtime(cast(from_unixtime(0, '+06:00') as timestamp)); -- 0.0 (1970-01-01 00:00:00.000)
+    SELECT to_unixtime(cast(from_unixtime(0, '-02:00') as timestamp)); -- 0.0 (1970-01-01 00:00:00.000)
 
     -- `adjust_timestamp_to_session_timezone` is false
-    SELECT cast(timestamp '1970-01-01 00:00:00 America/Los_Angeles' as timestamp); -- 1969-12-31 16:00:00.000
-    SELECT cast(timestamp '2012-03-09 10:00:00 Asia/Chongqing' as timestamp); -- 2012-03-09 18:00:00.000
-    SELECT cast(from_unixtime(0, '+06:00') as timestamp); -- 1970-01-01 06:00:00.000
-    SELECT cast(from_unixtime(0, '-02:00') as timestamp); -- 1969-12-31 22:00:00.000
+    SELECT to_unixtime(cast(timestamp '1970-01-01 00:00:00 America/Los_Angeles' as timestamp)); -- 0.0 (1970-01-01 00:00:00.000)
+    SELECT to_unixtime(cast(timestamp '2012-03-09 10:00:00 Asia/Chongqing' as timestamp)); -- 1.3312872E9 (2012-03-09 10:00:00.000)
+    SELECT to_unixtime(cast(from_unixtime(0, '+06:00') as timestamp)); -- 21600.0 (1970-01-01 06:00:00.000)
+    SELECT to_unixtime(cast(from_unixtime(0, '-02:00') as timestamp)); -- -7200.0 (1969-12-31 22:00:00.000)
 
 Cast to TIMESTAMP WITH TIME ZONE
 --------------------------------
@@ -580,43 +838,37 @@ Valid examples
 
     SELECT cast(timestamp '2012-03-09 10:00:00' as timestamp with time zone); -- 2012-03-09 10:00:00.000 America/Los_Angeles
     SELECT cast(from_unixtime(0) as timestamp with time zone); -- 1970-01-01 00:00:00.000 America/Los_Angeles
 
-Cast to Date
-------------
+From DATE
+^^^^^^^^^
 
-From strings
-^^^^^^^^^^^^
+The results depend on `session_timezone`.
 
-By default, only ISO 8601 strings are supported: `[+-]YYYY-MM-DD`.
+Valid examples
 
-If cast_string_to_date_is_iso_8601 is set to false, all Spark supported patterns are allowed.
-See the documentation for cast_string_to_date_is_iso_8601 in :ref:`Expression Evaluation Configuration`
-for the full list of supported patterns.
+::
 
-Casting from invalid input values throws.
+    -- session_timezone = America/Los_Angeles
+    SELECT cast(date '2024-06-01' as timestamp with time zone); -- 2024-06-01 00:00:00.000 America/Los_Angeles
 
-Valid examples
+    -- session_timezone = Asia/Shanghai
+    SELECT cast(date '2024-06-01' as timestamp with time zone); -- 2024-06-01 00:00:00.000 Asia/Shanghai
 
-**cast_string_to_date_is_iso_8601=true**
+Cast to Date
+------------
 
-::
+From VARCHAR
+^^^^^^^^^^^^
 
-    SELECT cast('1970-01-01' as date); -- 1970-01-01
+Only ISO 8601 strings are supported: `[+-]YYYY-MM-DD`. Casting from invalid input values throws.
-**cast_string_to_date_is_iso_8601=false**
+Valid examples
 
 ::
 
-    SELECT cast('1970' as date); -- 1970-01-01
-    SELECT cast('1970-01' as date); -- 1970-01-01
     SELECT cast('1970-01-01' as date); -- 1970-01-01
-    SELECT cast('1970-01-01T123' as date); -- 1970-01-01
-    SELECT cast('1970-01-01 ' as date); -- 1970-01-01
-    SELECT cast('1970-01-01 (BC)' as date); -- 1970-01-01
 
 Invalid examples
 
-**cast_string_to_date_is_iso_8601=true**
-
 ::
 
     SELECT cast('2012' as date); -- Invalid argument
@@ -628,14 +880,6 @@ Invalid examples
     SELECT cast('2012.10.23' as date); -- Invalid argument
     SELECT cast('2012-10-23 ' as date); -- Invalid argument
 
-**cast_string_to_date_is_iso_8601=false**
-
-::
-
-    SELECT cast('2012-Oct-23' as date); -- Invalid argument
-    SELECT cast('2012/10/23' as date); -- Invalid argument
-    SELECT cast('2012.10.23' as date); -- Invalid argument
-
 From TIMESTAMP
 ^^^^^^^^^^^^^^
 
@@ -649,13 +893,40 @@ Valid examples
 
     SELECT cast(timestamp '1970-01-01 00:00:00' as date); -- 1970-01-01
     SELECT cast(timestamp '1970-01-01 23:59:59' as date); -- 1970-01-01
 
+From TIMESTAMP WITH TIME ZONE
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Casting from TIMESTAMP WITH TIME ZONE to DATE is allowed. If present,
+the `hh:mm:ss` part of the input is ignored.
+
+Session time zone does not affect the result.
+
+Valid examples
+
+::
+
+    SELECT CAST(timestamp '2024-06-01 01:38:00 America/New_York' as DATE); -- 2024-06-01
+
 Cast to Decimal
 ---------------
 
+From boolean type
+^^^^^^^^^^^^^^^^^
+
+Casting a boolean value to a decimal of given precision and scale is allowed.
+True is converted to 1 and false to 0.
+
+Valid examples
+
+::
+
+    SELECT cast(true as decimal(4, 2)); -- decimal '1.00'
+    SELECT cast(false as decimal(8, 2)); -- decimal '0.00'
+
 From integral types
 ^^^^^^^^^^^^^^^^^^^
 
-Casting an integral numberto a decimal of given precision and scale is allowed
+Casting an integral number to a decimal of given precision and scale is allowed
 if the input value can be represented by the precision and scale. Casting from
 invalid input values throws.
@@ -674,6 +945,33 @@ Invalid examples
 
     SELECT cast(123 as decimal(6, 4)); -- Out of range
     SELECT cast(123 as decimal(4, 2)); -- Out of range
 
+From floating-point types
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Casting a floating-point number to a decimal of given precision and scale is allowed
+if the input value can be represented by the precision and scale. When the given
+scale is less than the number of decimal places, the floating-point value is rounded.
+The conversion is precise up to 15 significant decimal digits for double and 6 for
+real, matching the precision these types provide. Casting from NaN or infinite values
+throws.
+
+Valid examples
+
+::
+
+    SELECT cast(0.12 as decimal(4, 4)); -- decimal '0.1200'
+    SELECT cast(0.12 as decimal(4, 1)); -- decimal '0.1'
+    SELECT cast(0.19 as decimal(4, 1)); -- decimal '0.2'
+    SELECT cast(0.123456789123123 as decimal(38, 18)); -- decimal '0.123456789123123000'
+    SELECT cast(real '0.123456' as decimal(38, 18)); -- decimal '0.123456000000000000'
+
+Invalid examples
+
+::
+
+    SELECT cast(123.12 as decimal(6, 4)); -- Out of range
+    SELECT cast(99999.99 as decimal(6, 2)); -- Out of range
+
 From decimal
 ^^^^^^^^^^^^
 
@@ -694,3 +992,149 @@ Invalid example
 
     SELECT cast(decimal '-1000.000' as decimal(6, 4)); -- Out of range
     SELECT cast(decimal '123456789' as decimal(9, 1)); -- Out of range
+
+From varchar
+^^^^^^^^^^^^
+
+Casting varchar to a decimal of given precision and scale is allowed
+if the input value can be represented by the precision and scale. When casting from
+a larger scale to a smaller one, the fraction part is rounded. Casting from invalid input values throws.
+
+Valid examples
+
+::
+
+    SELECT cast('9999999999.99' as decimal(12, 2)); -- decimal '9999999999.99'
+    SELECT cast('1.556' as decimal(12, 2)); -- decimal '1.56'
+    SELECT cast('1.554' as decimal(12, 2)); -- decimal '1.55'
+    SELECT cast('-1.554' as decimal(12, 2)); -- decimal '-1.55'
+    SELECT cast('+09' as decimal(12, 2)); -- decimal '9.00'
+    SELECT cast('9.' as decimal(12, 2)); -- decimal '9.00'
+    SELECT cast('.9' as decimal(12, 2)); -- decimal '0.90'
+    SELECT cast('3E+2' as decimal(12, 2)); -- decimal '300.00'
+    SELECT cast('3E+00002' as decimal(12, 2)); -- decimal '300.00'
+    SELECT cast('3e+2' as decimal(12, 2)); -- decimal '300.00'
+    SELECT cast('31.423e+2' as decimal(12, 2)); -- decimal '3142.30'
+    SELECT cast('1.2e-2' as decimal(12, 2)); -- decimal '0.01'
+    SELECT cast('1.2e-5' as decimal(12, 2)); -- decimal '0.00'
+    SELECT cast('0000.123' as decimal(12, 2)); -- decimal '0.12'
+    SELECT cast('.123000000' as decimal(12, 2)); -- decimal '0.12'
+
+Invalid examples
+
+::
+
+    SELECT cast('1.23e67' as decimal(38, 0)); -- Value too large
+    SELECT cast('0.0446a' as decimal(9, 1)); -- Value is not a number
+    SELECT cast('' as decimal(9, 1)); -- Value is not a number
+    SELECT cast('23e-5d' as decimal(9, 1)); -- Value is not a number
+    SELECT cast('1.23 ' as decimal(38, 0)); -- Value is not a number
+    SELECT cast(' -3E+2' as decimal(12, 2)); -- Value is not a number
+    SELECT cast('-3E+2.1' as decimal(12, 2)); -- Value is not a number
+    SELECT cast('3E+' as decimal(12, 2)); -- Value is not a number
+
+Cast to IPADDRESS
+-----------------
+
+From VARCHAR
+^^^^^^^^^^^^
+
+To cast a varchar to IPADDRESS, the input string must be in either
+IPV4 or IPV6 form.
+
+For IPV4, it must be in the form x.x.x.x, where each x is an integer
+between 0 and 255.
+
+For IPV6, it must follow any of the forms defined in `RFC 4291#section-2.2 <https://datatracker.ietf.org/doc/html/rfc4291#section-2.2>`_.
+
+Full form:
+
+::
+
+    2001:0DB8:0000:0000:0008:0800:200C:417A
+    2001:DB8:0:0:8:800:200C:417A
+
+Compressed form:
+
+::
+
+    2001:DB8::8:800:200C:417A
+
+Alternate form:
+
+::
+
+    0:0:0:0:0:0:13.1.68.3
+    ::13.1.68.3
+
+Internally, the type is a pure IPv6 address. Support for IPv4 is handled using the IPv4-mapped IPv6 address range `(RFC 4291#section-2.5.5.2) <https://datatracker.ietf.org/doc/html/rfc4291#section-2.5.5.2>`_.
+When creating an IPADDRESS, IPv4 addresses will be mapped into that range.
+
+When formatting an IPADDRESS, any address within the mapped range will be formatted as an IPv4 address.
+Other addresses will be formatted as IPv6 using the canonical format defined in `RFC 5952 <https://datatracker.ietf.org/doc/html/rfc5952>`_.
+
+Valid examples:
+
+::
+
+    SELECT cast('2001:0db8:0000:0000:0000:ff00:0042:8329' as ipaddress); -- ipaddress '2001:db8::ff00:42:8329'
+    SELECT cast('1.2.3.4' as ipaddress); -- ipaddress '1.2.3.4'
+    SELECT cast('::ffff:ffff:ffff' as ipaddress); -- ipaddress '255.255.255.255'
+
+Invalid examples:
+
+::
+
+    SELECT cast('2001:db8::1::1' as ipaddress); -- Invalid IP address '2001:db8::1::1'
+    SELECT cast('789.1.1.1' as ipaddress); -- Invalid IP address '789.1.1.1'
+
+From VARBINARY
+^^^^^^^^^^^^^^
+
+To cast a varbinary to IPADDRESS, it must be either an IPV4 (4 bytes)
+or IPV6 (16 bytes) address in network byte order.
+
+IPV4:
+
+::
+
+    [01, 02, 03, 04] -> 1.2.3.4
+
+IPV6:
+
+::
+
+    [0x20, 0x01, 0x0d, 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x42, 0x83, 0x29] -> 2001:db8::ff00:42:8329
+
+Internally, the type is a pure IPv6 address. Support for IPv4 is handled using the IPv4-mapped IPv6 address range `(RFC 4291#section-2.5.5.2) <https://datatracker.ietf.org/doc/html/rfc4291#section-2.5.5.2>`_.
+When creating an IPADDRESS, IPv4 addresses will be mapped into that range.
+
+When formatting an IPADDRESS, any address within the mapped range will be formatted as an IPv4 address.
+Other addresses will be formatted as IPv6 using the canonical format defined in `RFC 5952 <https://datatracker.ietf.org/doc/html/rfc5952>`_.
+
+IPV4 mapped IPV6 address:
+
+::
+
+    [0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x01, 0x02, 0x03, 0x04] -> 1.2.3.4
+
+Valid examples:
+
+::
+
+    SELECT cast(from_hex('20010db8000000000000ff0000428329') as ipaddress); -- ipaddress '2001:db8::ff00:42:8329'
+    SELECT cast(from_hex('01020304') as ipaddress); -- ipaddress '1.2.3.4'
+    SELECT cast(from_hex('00000000000000000000ffff01020304') as ipaddress); -- ipaddress '1.2.3.4'
+
+Invalid examples:
+
+::
+
+    SELECT cast(from_hex('f000001100') as ipaddress); -- Invalid IP address binary length: 5
+
+Miscellaneous
+-------------
+
+.. function:: typeof(x) -> varchar
+
+    Returns the name of the type of x::
+
+        SELECT typeof(123); -- integer
+        SELECT typeof(1.5); -- double
+        SELECT typeof(array[1,2,3]); -- array(integer)
diff --git a/velox/docs/functions/presto/coverage.rst b/velox/docs/functions/presto/coverage.rst
index 91bb1c791a075..127297b7e197e 100644
--- a/velox/docs/functions/presto/coverage.rst
+++ b/velox/docs/functions/presto/coverage.rst
@@ -29,6 +29,7 @@ Here is a list of all scalar and aggregate Presto functions with functions that
     table.coverage tr:nth-child(3) td:nth-child(4) {background-color: #6BA81E;}
     table.coverage tr:nth-child(3) td:nth-child(7) {background-color: #6BA81E;}
     table.coverage tr:nth-child(3) td:nth-child(9) {background-color: #6BA81E;}
+    table.coverage tr:nth-child(4) td:nth-child(1) {background-color: #6BA81E;}
     table.coverage tr:nth-child(4) td:nth-child(2) {background-color: #6BA81E;}
     table.coverage tr:nth-child(4) td:nth-child(3) {background-color: #6BA81E;}
     table.coverage tr:nth-child(4) td:nth-child(4) {background-color: #6BA81E;}
@@ -38,6 +39,7 @@ Here is a list of all scalar and aggregate Presto functions with functions that
     table.coverage tr:nth-child(5) td:nth-child(2) {background-color: #6BA81E;}
     table.coverage tr:nth-child(5) td:nth-child(7) {background-color: #6BA81E;}
     table.coverage tr:nth-child(5) td:nth-child(9) {background-color: #6BA81E;}
+    table.coverage tr:nth-child(6) td:nth-child(1) {background-color: #6BA81E;}
     table.coverage tr:nth-child(6) td:nth-child(2) {background-color: #6BA81E;}
     table.coverage tr:nth-child(6) td:nth-child(7) {background-color: #6BA81E;}
     table.coverage tr:nth-child(6) td:nth-child(9) {background-color: #6BA81E;}
@@ -47,6 +49,7 @@ Here is a list of all scalar and aggregate Presto functions with functions that
     table.coverage tr:nth-child(7) td:nth-child(7) {background-color: #6BA81E;}
     table.coverage tr:nth-child(7) td:nth-child(9) {background-color: #6BA81E;}
     table.coverage tr:nth-child(8) td:nth-child(2) {background-color: #6BA81E;}
+    table.coverage tr:nth-child(8) td:nth-child(4) {background-color: #6BA81E;}
     table.coverage tr:nth-child(8) td:nth-child(7) {background-color: #6BA81E;}
     table.coverage tr:nth-child(8) td:nth-child(9) {background-color: #6BA81E;}
     table.coverage tr:nth-child(9) td:nth-child(1) {background-color: #6BA81E;}
@@ -69,107 +72,119 @@ Here is a list of all scalar and aggregate Presto functions with functions that
     table.coverage tr:nth-child(12) td:nth-child(1) {background-color: #6BA81E;}
     table.coverage tr:nth-child(12) td:nth-child(2) {background-color: #6BA81E;}
     table.coverage tr:nth-child(12) td:nth-child(3) {background-color: #6BA81E;}
-   
table.coverage tr:nth-child(12) td:nth-child(4) {background-color: #6BA81E;} table.coverage tr:nth-child(12) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(13) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(13) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(13) td:nth-child(3) {background-color: #6BA81E;} + table.coverage tr:nth-child(13) td:nth-child(4) {background-color: #6BA81E;} table.coverage tr:nth-child(14) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(14) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(14) td:nth-child(3) {background-color: #6BA81E;} - table.coverage tr:nth-child(14) td:nth-child(4) {background-color: #6BA81E;} table.coverage tr:nth-child(15) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(15) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(15) td:nth-child(4) {background-color: #6BA81E;} - table.coverage tr:nth-child(16) td:nth-child(1) {background-color: #6BA81E;} - table.coverage tr:nth-child(16) td:nth-child(2) {background-color: #6BA81E;} + table.coverage tr:nth-child(16) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(16) td:nth-child(4) {background-color: #6BA81E;} - table.coverage tr:nth-child(18) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(17) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(17) td:nth-child(2) {background-color: #6BA81E;} + table.coverage tr:nth-child(17) td:nth-child(3) {background-color: #6BA81E;} + table.coverage tr:nth-child(17) td:nth-child(4) {background-color: #6BA81E;} table.coverage tr:nth-child(18) td:nth-child(3) {background-color: #6BA81E;} - table.coverage tr:nth-child(18) td:nth-child(4) {background-color: #6BA81E;} table.coverage tr:nth-child(18) td:nth-child(5) {background-color: #6BA81E;} + table.coverage tr:nth-child(19) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(19) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(19) td:nth-child(3) {background-color: #6BA81E;} + table.coverage tr:nth-child(19) td:nth-child(4) {background-color: #6BA81E;} table.coverage tr:nth-child(19) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(19) td:nth-child(7) {background-color: #6BA81E;} - table.coverage tr:nth-child(20) td:nth-child(1) {background-color: #6BA81E;} - table.coverage tr:nth-child(20) td:nth-child(2) {background-color: #6BA81E;} - table.coverage tr:nth-child(20) td:nth-child(4) {background-color: #6BA81E;} + table.coverage tr:nth-child(20) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(20) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(20) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(21) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(21) td:nth-child(2) {background-color: #6BA81E;} - table.coverage tr:nth-child(21) td:nth-child(4) {background-color: #6BA81E;} table.coverage tr:nth-child(21) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(21) td:nth-child(7) {background-color: #6BA81E;} + table.coverage tr:nth-child(22) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(22) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(22) td:nth-child(4) {background-color: #6BA81E;} table.coverage tr:nth-child(22) 
td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(22) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(23) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(23) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(23) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(23) td:nth-child(4) {background-color: #6BA81E;} + table.coverage tr:nth-child(23) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(23) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(24) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(24) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(24) td:nth-child(4) {background-color: #6BA81E;} - table.coverage tr:nth-child(24) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(25) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(25) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(25) td:nth-child(4) {background-color: #6BA81E;} table.coverage tr:nth-child(25) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(25) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(26) td:nth-child(1) {background-color: #6BA81E;} - table.coverage tr:nth-child(26) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(26) td:nth-child(3) {background-color: #6BA81E;} + table.coverage tr:nth-child(26) td:nth-child(4) {background-color: #6BA81E;} table.coverage tr:nth-child(26) td:nth-child(5) {background-color: #6BA81E;} - table.coverage tr:nth-child(27) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(27) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(27) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(27) td:nth-child(4) {background-color: #6BA81E;} table.coverage tr:nth-child(27) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(27) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(28) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(28) td:nth-child(3) {background-color: #6BA81E;} - table.coverage tr:nth-child(28) td:nth-child(4) {background-color: #6BA81E;} - table.coverage tr:nth-child(28) td:nth-child(5) {background-color: #6BA81E;} + table.coverage tr:nth-child(28) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(29) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(29) td:nth-child(3) {background-color: #6BA81E;} + table.coverage tr:nth-child(29) td:nth-child(4) {background-color: #6BA81E;} table.coverage tr:nth-child(29) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(30) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(30) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(30) td:nth-child(4) {background-color: #6BA81E;} table.coverage tr:nth-child(30) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(30) td:nth-child(7) {background-color: #6BA81E;} - table.coverage tr:nth-child(31) td:nth-child(2) {background-color: #6BA81E;} + table.coverage tr:nth-child(31) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(31) td:nth-child(3) {background-color: #6BA81E;} - table.coverage tr:nth-child(31) td:nth-child(4) {background-color: #6BA81E;} + 
table.coverage tr:nth-child(31) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(32) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(32) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(32) td:nth-child(3) {background-color: #6BA81E;} + table.coverage tr:nth-child(32) td:nth-child(4) {background-color: #6BA81E;} table.coverage tr:nth-child(32) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(32) td:nth-child(7) {background-color: #6BA81E;} + table.coverage tr:nth-child(33) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(33) td:nth-child(3) {background-color: #6BA81E;} - table.coverage tr:nth-child(34) td:nth-child(2) {background-color: #6BA81E;} + table.coverage tr:nth-child(33) td:nth-child(4) {background-color: #6BA81E;} + table.coverage tr:nth-child(34) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(34) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(34) td:nth-child(4) {background-color: #6BA81E;} table.coverage tr:nth-child(34) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(35) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(35) td:nth-child(3) {background-color: #6BA81E;} - table.coverage tr:nth-child(35) td:nth-child(4) {background-color: #6BA81E;} + table.coverage tr:nth-child(35) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(36) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(36) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(36) td:nth-child(4) {background-color: #6BA81E;} + table.coverage tr:nth-child(36) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(37) td:nth-child(2) {background-color: #6BA81E;} + table.coverage tr:nth-child(37) td:nth-child(4) {background-color: #6BA81E;} table.coverage tr:nth-child(38) td:nth-child(2) {background-color: #6BA81E;} - table.coverage tr:nth-child(38) td:nth-child(5) {background-color: #6BA81E;} + table.coverage tr:nth-child(38) td:nth-child(3) {background-color: #6BA81E;} + table.coverage tr:nth-child(38) td:nth-child(4) {background-color: #6BA81E;} table.coverage tr:nth-child(38) td:nth-child(7) {background-color: #6BA81E;} - table.coverage tr:nth-child(39) td:nth-child(5) {background-color: #6BA81E;} + table.coverage tr:nth-child(39) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(39) td:nth-child(7) {background-color: #6BA81E;} + table.coverage tr:nth-child(40) td:nth-child(2) {background-color: #6BA81E;} + table.coverage tr:nth-child(40) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(40) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(40) td:nth-child(7) {background-color: #6BA81E;} + table.coverage tr:nth-child(41) td:nth-child(2) {background-color: #6BA81E;} + table.coverage tr:nth-child(41) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(41) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(41) td:nth-child(7) {background-color: #6BA81E;} - table.coverage tr:nth-child(42) td:nth-child(1) {background-color: #6BA81E;} - table.coverage tr:nth-child(42) td:nth-child(5) {background-color: #6BA81E;} + table.coverage tr:nth-child(42) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(42) td:nth-child(7) {background-color: #6BA81E;} - table.coverage tr:nth-child(43) 
td:nth-child(1) {background-color: #6BA81E;} - table.coverage tr:nth-child(43) td:nth-child(2) {background-color: #6BA81E;} - table.coverage tr:nth-child(43) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(43) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(43) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(44) td:nth-child(1) {background-color: #6BA81E;} @@ -177,91 +192,118 @@ Here is a list of all scalar and aggregate Presto functions with functions that table.coverage tr:nth-child(44) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(44) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(45) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(45) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(45) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(45) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(45) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(46) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(46) td:nth-child(2) {background-color: #6BA81E;} + table.coverage tr:nth-child(46) td:nth-child(3) {background-color: #6BA81E;} + table.coverage tr:nth-child(46) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(46) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(47) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(47) td:nth-child(5) {background-color: #6BA81E;} + table.coverage tr:nth-child(47) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(48) td:nth-child(1) {background-color: #6BA81E;} - table.coverage tr:nth-child(48) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(48) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(49) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(49) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(49) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(50) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(50) td:nth-child(3) {background-color: #6BA81E;} - table.coverage tr:nth-child(50) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(51) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(51) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(51) td:nth-child(5) {background-color: #6BA81E;} - table.coverage tr:nth-child(51) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(52) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(52) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(52) td:nth-child(5) {background-color: #6BA81E;} - table.coverage tr:nth-child(52) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(53) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(53) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(53) td:nth-child(5) {background-color: #6BA81E;} - table.coverage tr:nth-child(53) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(54) td:nth-child(1) {background-color: #6BA81E;} - table.coverage tr:nth-child(54) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(54) td:nth-child(5) 
{background-color: #6BA81E;} table.coverage tr:nth-child(54) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(55) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(55) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(55) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(55) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(56) td:nth-child(1) {background-color: #6BA81E;} - table.coverage tr:nth-child(56) td:nth-child(3) {background-color: #6BA81E;} + table.coverage tr:nth-child(56) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(56) td:nth-child(5) {background-color: #6BA81E;} + table.coverage tr:nth-child(56) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(57) td:nth-child(1) {background-color: #6BA81E;} - table.coverage tr:nth-child(57) td:nth-child(2) {background-color: #6BA81E;} + table.coverage tr:nth-child(57) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(57) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(57) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(58) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(58) td:nth-child(2) {background-color: #6BA81E;} + table.coverage tr:nth-child(58) td:nth-child(3) {background-color: #6BA81E;} + table.coverage tr:nth-child(58) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(58) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(59) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(59) td:nth-child(2) {background-color: #6BA81E;} + table.coverage tr:nth-child(59) td:nth-child(3) {background-color: #6BA81E;} + table.coverage tr:nth-child(59) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(59) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(60) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(60) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(60) td:nth-child(3) {background-color: #6BA81E;} + table.coverage tr:nth-child(60) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(60) td:nth-child(7) {background-color: #6BA81E;} + table.coverage tr:nth-child(61) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(61) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(61) td:nth-child(3) {background-color: #6BA81E;} - table.coverage tr:nth-child(61) td:nth-child(5) {background-color: #6BA81E;} + table.coverage tr:nth-child(61) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(62) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(62) td:nth-child(2) {background-color: #6BA81E;} - table.coverage tr:nth-child(62) td:nth-child(5) {background-color: #6BA81E;} + table.coverage tr:nth-child(62) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(62) td:nth-child(7) {background-color: #6BA81E;} + table.coverage tr:nth-child(63) td:nth-child(2) {background-color: #6BA81E;} table.coverage tr:nth-child(63) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(64) td:nth-child(1) {background-color: #6BA81E;} - table.coverage tr:nth-child(64) td:nth-child(2) {background-color: #6BA81E;} - table.coverage tr:nth-child(64) td:nth-child(3) {background-color: #6BA81E;} table.coverage 
tr:nth-child(64) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(64) td:nth-child(7) {background-color: #6BA81E;} - table.coverage tr:nth-child(65) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(65) td:nth-child(2) {background-color: #6BA81E;} + table.coverage tr:nth-child(65) td:nth-child(3) {background-color: #6BA81E;} + table.coverage tr:nth-child(65) td:nth-child(5) {background-color: #6BA81E;} + table.coverage tr:nth-child(65) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(66) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(66) td:nth-child(5) {background-color: #6BA81E;} + table.coverage tr:nth-child(66) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(67) td:nth-child(1) {background-color: #6BA81E;} - table.coverage tr:nth-child(67) td:nth-child(3) {background-color: #6BA81E;} + table.coverage tr:nth-child(67) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(68) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(68) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(68) td:nth-child(5) {background-color: #6BA81E;} + table.coverage tr:nth-child(68) td:nth-child(7) {background-color: #6BA81E;} + table.coverage tr:nth-child(69) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(69) td:nth-child(5) {background-color: #6BA81E;} + table.coverage tr:nth-child(69) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(70) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(70) td:nth-child(3) {background-color: #6BA81E;} - table.coverage tr:nth-child(70) td:nth-child(5) {background-color: #6BA81E;} + table.coverage tr:nth-child(70) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(71) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(71) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(71) td:nth-child(5) {background-color: #6BA81E;} + table.coverage tr:nth-child(71) td:nth-child(7) {background-color: #6BA81E;} + table.coverage tr:nth-child(72) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(72) td:nth-child(3) {background-color: #6BA81E;} table.coverage tr:nth-child(72) td:nth-child(5) {background-color: #6BA81E;} - table.coverage tr:nth-child(73) td:nth-child(3) {background-color: #6BA81E;} + table.coverage tr:nth-child(73) td:nth-child(1) {background-color: #6BA81E;} table.coverage tr:nth-child(73) td:nth-child(5) {background-color: #6BA81E;} + table.coverage tr:nth-child(73) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(74) td:nth-child(3) {background-color: #6BA81E;} - table.coverage tr:nth-child(75) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(74) td:nth-child(5) {background-color: #6BA81E;} + table.coverage tr:nth-child(74) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(75) td:nth-child(3) {background-color: #6BA81E;} - table.coverage tr:nth-child(76) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(75) td:nth-child(5) {background-color: #6BA81E;} + table.coverage tr:nth-child(75) td:nth-child(7) {background-color: #6BA81E;} table.coverage tr:nth-child(76) td:nth-child(3) {background-color: #6BA81E;} + table.coverage tr:nth-child(76) td:nth-child(5) {background-color: #6BA81E;} table.coverage tr:nth-child(77) td:nth-child(1) 
{background-color: #6BA81E;} table.coverage tr:nth-child(77) td:nth-child(3) {background-color: #6BA81E;} + table.coverage tr:nth-child(78) td:nth-child(1) {background-color: #6BA81E;} + table.coverage tr:nth-child(78) td:nth-child(3) {background-color: #6BA81E;} .. table:: @@ -271,81 +313,82 @@ Here is a list of all scalar and aggregate Presto functions with functions that ======================================== ======================================== ======================================== ======================================== ======================================== == ======================================== == ======================================== Scalar Functions Aggregate Functions Window Functions ================================================================================================================================================================================================================ == ======================================== == ======================================== - :func:`abs` :func:`date_format` :func:`is_finite` :func:`regexp_extract` st_point :func:`approx_distinct` :func:`cume_dist` - :func:`acos` :func:`date_parse` :func:`is_infinite` :func:`regexp_extract_all` st_pointn :func:`approx_most_frequent` :func:`dense_rank` - :func:`all_match` :func:`date_trunc` :func:`is_json_scalar` :func:`regexp_like` st_points :func:`approx_percentile` :func:`first_value` - any_keys_match :func:`day` :func:`is_nan` :func:`regexp_replace` st_polygon :func:`approx_set` :func:`lag` - :func:`any_match` :func:`day_of_month` is_subnet_of regexp_split st_relate :func:`arbitrary` :func:`last_value` - any_values_match :func:`day_of_week` jaccard_index regress st_startpoint :func:`array_agg` :func:`lead` - :func:`array_average` :func:`day_of_year` :func:`json_array_contains` reidentification_potential st_symdifference :func:`avg` :func:`nth_value` - array_cum_sum :func:`degrees` json_array_get remove_nulls st_touches :func:`bitwise_and_agg` :func:`ntile` - :func:`array_distinct` :func:`dow` :func:`json_array_length` render st_union :func:`bitwise_or_agg` :func:`percent_rank` - :func:`array_duplicates` :func:`doy` :func:`json_extract` :func:`repeat` st_within :func:`bool_and` :func:`rank` - :func:`array_except` :func:`e` :func:`json_extract_scalar` :func:`replace` st_x :func:`bool_or` :func:`row_number` - :func:`array_frequency` :func:`element_at` :func:`json_format` :func:`reverse` st_xmax :func:`checksum` - :func:`array_has_duplicates` :func:`empty_approx_set` :func:`json_parse` rgb st_xmin classification_fall_out - :func:`array_intersect` ends_with :func:`json_size` :func:`round` st_y classification_miss_rate - :func:`array_join` enum_key key_sampling_percent :func:`rpad` st_ymax classification_precision - :func:`array_max` :func:`exp` laplace_cdf :func:`rtrim` st_ymin classification_recall - array_max_by expand_envelope last_day_of_month scale_qdigest starts_with classification_thresholds - :func:`array_min` f_cdf :func:`least` :func:`second` :func:`strpos` convex_hull_agg - array_min_by features :func:`length` secure_random :func:`strrpos` :func:`corr` - :func:`array_normalize` :func:`filter` levenshtein_distance :func:`sequence` :func:`substr` :func:`count` - :func:`array_position` :func:`filter` line_interpolate_point :func:`sha1` :func:`tan` :func:`count_if` - array_remove find_first line_locate_point :func:`sha256` :func:`tanh` :func:`covar_pop` - :func:`array_sort` find_first_index :func:`ln` :func:`sha512` tdigest_agg :func:`covar_samp` - 
:func:`array_sort_desc` :func:`flatten` localtime :func:`shuffle` :func:`timezone_hour` differential_entropy - :func:`array_sum` flatten_geometry_collections localtimestamp :func:`sign` :func:`timezone_minute` :func:`entropy` - :func:`array_union` :func:`floor` :func:`log10` simplify_geometry :func:`to_base` evaluate_classifier_predictions - :func:`arrays_overlap` fnv1_32 :func:`log2` :func:`sin` :func:`to_base64` :func:`every` - :func:`asin` fnv1_64 :func:`lower` :func:`slice` :func:`to_base64url` geometric_mean - :func:`atan` fnv1a_32 :func:`lpad` spatial_partitions :func:`to_big_endian_32` geometry_union_agg - :func:`atan2` fnv1a_64 :func:`ltrim` :func:`split` :func:`to_big_endian_64` :func:`histogram` - bar :func:`format_datetime` :func:`map` :func:`split_part` to_geometry khyperloglog_agg - :func:`beta_cdf` :func:`from_base` :func:`map_concat` split_to_map :func:`to_hex` :func:`kurtosis` - bing_tile from_base32 :func:`map_entries` split_to_multimap to_ieee754_32 learn_classifier - bing_tile_at :func:`from_base64` :func:`map_filter` :func:`spooky_hash_v2_32` :func:`to_ieee754_64` learn_libsvm_classifier - bing_tile_children :func:`from_base64url` :func:`map_from_entries` :func:`spooky_hash_v2_64` to_iso8601 learn_libsvm_regressor - bing_tile_coordinates :func:`from_big_endian_32` :func:`map_keys` :func:`sqrt` to_milliseconds learn_regressor - bing_tile_parent :func:`from_big_endian_64` map_normalize st_area to_spherical_geography make_set_digest - bing_tile_polygon :func:`from_hex` map_remove_null_values st_asbinary :func:`to_unixtime` :func:`map_agg` - bing_tile_quadkey from_ieee754_32 map_subset st_astext :func:`to_utf8` :func:`map_union` - bing_tile_zoom_level from_ieee754_64 map_top_n st_boundary :func:`transform` :func:`map_union_sum` - bing_tiles_around from_iso8601_date map_top_n_keys st_buffer :func:`transform_keys` :func:`max` - :func:`binomial_cdf` from_iso8601_timestamp map_top_n_values st_centroid :func:`transform_values` :func:`max_by` - :func:`bit_count` :func:`from_unixtime` :func:`map_values` st_contains :func:`trim` :func:`merge` - :func:`bitwise_and` :func:`from_utf8` :func:`map_zip_with` st_convexhull :func:`trim_array` merge_set_digest - :func:`bitwise_arithmetic_shift_right` gamma_cdf :func:`md5` st_coorddim :func:`truncate` :func:`min` - :func:`bitwise_left_shift` geometry_as_geojson merge_hll st_crosses typeof :func:`min_by` - :func:`bitwise_logical_shift_right` geometry_from_geojson merge_khll st_difference uniqueness_distribution multimap_agg - :func:`bitwise_not` geometry_invalid_reason :func:`millisecond` st_dimension :func:`upper` numeric_histogram - :func:`bitwise_or` geometry_nearest_points :func:`minute` st_disjoint :func:`url_decode` qdigest_agg - :func:`bitwise_right_shift` geometry_to_bing_tiles :func:`mod` st_distance :func:`url_encode` reduce_agg - :func:`bitwise_right_shift_arithmetic` geometry_to_dissolved_bing_tiles :func:`month` st_endpoint :func:`url_extract_fragment` :func:`regr_intercept` - :func:`bitwise_shift_left` geometry_union multimap_from_entries st_envelope :func:`url_extract_host` :func:`regr_slope` - :func:`bitwise_xor` great_circle_distance murmur3_x64_128 st_envelopeaspts :func:`url_extract_parameter` :func:`set_agg` - :func:`cardinality` :func:`greatest` myanmar_font_encoding st_equals :func:`url_extract_path` :func:`set_union` - :func:`cauchy_cdf` hamming_distance myanmar_normalize_unicode st_exteriorring :func:`url_extract_port` :func:`skewness` - :func:`cbrt` hash_counts :func:`nan` st_geometries :func:`url_extract_protocol` 
spatial_partitioning - :func:`ceil` :func:`hmac_md5` ngrams st_geometryfromtext :func:`url_extract_query` :func:`stddev` - :func:`ceiling` :func:`hmac_sha1` no_keys_match st_geometryn uuid :func:`stddev_pop` - :func:`chi_squared_cdf` :func:`hmac_sha256` no_values_match st_geometrytype value_at_quantile :func:`stddev_samp` - :func:`chr` :func:`hmac_sha512` :func:`none_match` st_geomfrombinary values_at_quantiles :func:`sum` - classify :func:`hour` :func:`normal_cdf` st_interiorringn :func:`week` tdigest_agg - :func:`codepoint` :func:`infinity` normalize st_interiorrings :func:`week_of_year` :func:`var_pop` - color intersection_cardinality now st_intersection weibull_cdf :func:`var_samp` - :func:`combinations` :func:`inverse_beta_cdf` :func:`parse_datetime` st_intersects :func:`width_bucket` :func:`variance` - :func:`concat` inverse_binomial_cdf parse_duration st_isclosed wilson_interval_lower - :func:`contains` inverse_cauchy_cdf parse_presto_data_size st_isempty wilson_interval_upper - :func:`cos` inverse_chi_squared_cdf :func:`pi` st_isring word_stem - :func:`cosh` inverse_f_cdf pinot_binary_decimal_to_double st_issimple :func:`xxhash64` - cosine_similarity inverse_gamma_cdf poisson_cdf st_isvalid :func:`year` - :func:`crc32` inverse_laplace_cdf :func:`pow` st_length :func:`year_of_week` - :func:`current_date` inverse_normal_cdf :func:`power` st_linefromtext :func:`yow` - current_time inverse_poisson_cdf quantile_at_value st_linestring :func:`zip` - current_timestamp inverse_weibull_cdf :func:`quarter` st_multipoint :func:`zip_with` - current_timezone ip_prefix :func:`radians` st_numgeometries - :func:`date` ip_subnet_max :func:`rand` st_numinteriorring - :func:`date_add` ip_subnet_min :func:`random` st_numpoints - :func:`date_diff` ip_subnet_range :func:`reduce` st_overlaps + :func:`abs` :func:`date_diff` :func:`is_finite` :func:`regexp_extract` st_overlaps :func:`approx_distinct` :func:`cume_dist` + :func:`acos` :func:`date_format` :func:`is_infinite` :func:`regexp_extract_all` st_point :func:`approx_most_frequent` :func:`dense_rank` + :func:`all_match` :func:`date_parse` :func:`is_json_scalar` :func:`regexp_like` st_pointn :func:`approx_percentile` :func:`first_value` + :func:`any_keys_match` :func:`date_trunc` :func:`is_nan` :func:`regexp_replace` st_points :func:`approx_set` :func:`lag` + :func:`any_match` :func:`day` is_subnet_of regexp_split st_polygon :func:`arbitrary` :func:`last_value` + :func:`any_values_match` :func:`day_of_month` jaccard_index regress st_relate :func:`array_agg` :func:`lead` + :func:`array_average` :func:`day_of_week` :func:`json_array_contains` reidentification_potential st_startpoint :func:`avg` :func:`nth_value` + array_cum_sum :func:`day_of_year` json_array_get :func:`remove_nulls` st_symdifference :func:`bitwise_and_agg` :func:`ntile` + :func:`array_distinct` :func:`degrees` :func:`json_array_length` render st_touches :func:`bitwise_or_agg` :func:`percent_rank` + :func:`array_duplicates` :func:`dow` :func:`json_extract` :func:`repeat` st_union :func:`bool_and` :func:`rank` + :func:`array_except` :func:`doy` :func:`json_extract_scalar` :func:`replace` st_within :func:`bool_or` :func:`row_number` + :func:`array_frequency` :func:`e` :func:`json_format` replace_first st_x :func:`checksum` + :func:`array_has_duplicates` :func:`element_at` :func:`json_parse` :func:`reverse` st_xmax classification_fall_out + :func:`array_intersect` :func:`empty_approx_set` :func:`json_size` rgb st_xmin classification_miss_rate + :func:`array_join` :func:`ends_with` 
key_sampling_percent :func:`round` st_y classification_precision + array_least_frequent enum_key :func:`laplace_cdf` :func:`rpad` st_ymax classification_recall + :func:`array_max` :func:`exp` :func:`last_day_of_month` :func:`rtrim` st_ymin classification_thresholds + array_max_by expand_envelope :func:`least` scale_qdigest :func:`starts_with` convex_hull_agg + :func:`array_min` :func:`f_cdf` :func:`length` :func:`second` :func:`strpos` :func:`corr` + array_min_by features :func:`levenshtein_distance` secure_rand :func:`strrpos` :func:`count` + :func:`array_normalize` :func:`filter` line_interpolate_point secure_random :func:`substr` :func:`count_if` + :func:`array_position` :func:`filter` line_locate_point :func:`sequence` :func:`tan` :func:`covar_pop` + :func:`array_remove` :func:`find_first` :func:`ln` :func:`sha1` :func:`tanh` :func:`covar_samp` + :func:`array_sort` :func:`find_first_index` localtime :func:`sha256` tdigest_agg differential_entropy + :func:`array_sort_desc` :func:`flatten` localtimestamp :func:`sha512` :func:`timezone_hour` :func:`entropy` + :func:`array_sum` flatten_geometry_collections :func:`log10` :func:`shuffle` :func:`timezone_minute` evaluate_classifier_predictions + array_top_n :func:`floor` :func:`log2` :func:`sign` :func:`to_base` :func:`every` + :func:`array_union` fnv1_32 :func:`lower` simplify_geometry to_base32 :func:`geometric_mean` + :func:`arrays_overlap` fnv1_64 :func:`lpad` :func:`sin` :func:`to_base64` geometry_union_agg + :func:`asin` fnv1a_32 :func:`ltrim` :func:`slice` :func:`to_base64url` :func:`histogram` + :func:`atan` fnv1a_64 :func:`map` spatial_partitions :func:`to_big_endian_32` khyperloglog_agg + :func:`atan2` :func:`format_datetime` :func:`map_concat` :func:`split` :func:`to_big_endian_64` :func:`kurtosis` + bar :func:`from_base` :func:`map_entries` :func:`split_part` to_geometry learn_classifier + :func:`beta_cdf` from_base32 :func:`map_filter` :func:`split_to_map` :func:`to_hex` learn_libsvm_classifier + bing_tile :func:`from_base64` :func:`map_from_entries` split_to_multimap :func:`to_ieee754_32` learn_libsvm_regressor + bing_tile_at :func:`from_base64url` :func:`map_keys` :func:`spooky_hash_v2_32` :func:`to_ieee754_64` learn_regressor + bing_tile_children :func:`from_big_endian_32` map_keys_by_top_n_values :func:`spooky_hash_v2_64` to_iso8601 make_set_digest + bing_tile_coordinates :func:`from_big_endian_64` :func:`map_normalize` :func:`sqrt` to_milliseconds :func:`map_agg` + bing_tile_parent :func:`from_hex` map_remove_null_values st_area to_spherical_geography :func:`map_union` + bing_tile_polygon :func:`from_ieee754_32` :func:`map_subset` st_asbinary :func:`to_unixtime` :func:`map_union_sum` + bing_tile_quadkey :func:`from_ieee754_64` :func:`map_top_n` st_astext :func:`to_utf8` :func:`max` + bing_tile_zoom_level :func:`from_iso8601_date` map_top_n_keys st_boundary trail :func:`max_by` + bing_tiles_around from_iso8601_timestamp map_top_n_values st_buffer :func:`transform` :func:`merge` + :func:`binomial_cdf` :func:`from_unixtime` :func:`map_values` st_centroid :func:`transform_keys` merge_set_digest + :func:`bit_count` :func:`from_utf8` :func:`map_zip_with` st_contains :func:`transform_values` :func:`min` + :func:`bitwise_and` :func:`gamma_cdf` :func:`md5` st_convexhull :func:`trim` :func:`min_by` + :func:`bitwise_arithmetic_shift_right` geometry_as_geojson merge_hll st_coorddim :func:`trim_array` :func:`multimap_agg` + :func:`bitwise_left_shift` geometry_from_geojson merge_khll st_crosses :func:`truncate` noisy_avg_gaussian + 
:func:`bitwise_logical_shift_right` geometry_invalid_reason :func:`millisecond` st_difference :func:`typeof` noisy_count_gaussian + :func:`bitwise_not` geometry_nearest_points :func:`minute` st_dimension uniqueness_distribution noisy_count_if_gaussian + :func:`bitwise_or` geometry_to_bing_tiles :func:`mod` st_disjoint :func:`upper` noisy_sum_gaussian + :func:`bitwise_right_shift` geometry_to_dissolved_bing_tiles :func:`month` st_distance :func:`url_decode` numeric_histogram + :func:`bitwise_right_shift_arithmetic` geometry_union :func:`multimap_from_entries` st_endpoint :func:`url_encode` qdigest_agg + :func:`bitwise_shift_left` great_circle_distance murmur3_x64_128 st_envelope :func:`url_extract_fragment` :func:`reduce_agg` + :func:`bitwise_xor` :func:`greatest` myanmar_font_encoding st_envelopeaspts :func:`url_extract_host` :func:`regr_avgx` + :func:`cardinality` :func:`hamming_distance` myanmar_normalize_unicode st_equals :func:`url_extract_parameter` :func:`regr_avgy` + :func:`cauchy_cdf` hash_counts :func:`nan` st_exteriorring :func:`url_extract_path` :func:`regr_count` + :func:`cbrt` :func:`hmac_md5` :func:`ngrams` st_geometries :func:`url_extract_port` :func:`regr_intercept` + :func:`ceil` :func:`hmac_sha1` :func:`no_keys_match` st_geometryfromtext :func:`url_extract_protocol` :func:`regr_r2` + :func:`ceiling` :func:`hmac_sha256` :func:`no_values_match` st_geometryn :func:`url_extract_query` :func:`regr_slope` + :func:`chi_squared_cdf` :func:`hmac_sha512` :func:`none_match` st_geometrytype uuid :func:`regr_sxx` + :func:`chr` :func:`hour` :func:`normal_cdf` st_geomfrombinary value_at_quantile :func:`regr_sxy` + classify :func:`infinity` normalize st_interiorringn values_at_quantiles :func:`regr_syy` + :func:`codepoint` intersection_cardinality now st_interiorrings :func:`week` :func:`set_agg` + color :func:`inverse_beta_cdf` :func:`parse_datetime` st_intersection :func:`week_of_year` :func:`set_union` + :func:`combinations` inverse_binomial_cdf parse_duration st_intersects :func:`weibull_cdf` :func:`skewness` + :func:`concat` inverse_cauchy_cdf parse_presto_data_size st_isclosed :func:`width_bucket` spatial_partitioning + :func:`contains` inverse_chi_squared_cdf :func:`pi` st_isempty :func:`wilson_interval_lower` :func:`stddev` + :func:`cos` inverse_f_cdf pinot_binary_decimal_to_double st_isring :func:`wilson_interval_upper` :func:`stddev_pop` + :func:`cosh` inverse_gamma_cdf :func:`poisson_cdf` st_issimple word_stem :func:`stddev_samp` + :func:`cosine_similarity` inverse_laplace_cdf :func:`pow` st_isvalid :func:`xxhash64` :func:`sum` + :func:`crc32` inverse_normal_cdf :func:`power` st_length :func:`year` tdigest_agg + :func:`current_date` inverse_poisson_cdf quantile_at_value st_linefromtext :func:`year_of_week` :func:`var_pop` + current_time inverse_weibull_cdf :func:`quarter` st_linestring :func:`yow` :func:`var_samp` + current_timestamp ip_prefix :func:`radians` st_multipoint :func:`zip` :func:`variance` + current_timezone ip_subnet_max :func:`rand` st_numgeometries :func:`zip_with` + :func:`date` ip_subnet_min :func:`random` st_numinteriorring + :func:`date_add` ip_subnet_range :func:`reduce` st_numpoints ======================================== ======================================== ======================================== ======================================== ======================================== == ======================================== == ======================================== diff --git a/velox/docs/functions/presto/datetime.rst 
b/velox/docs/functions/presto/datetime.rst
index 4d4e459875eb2..516da0d778a25 100644
--- a/velox/docs/functions/presto/datetime.rst
+++ b/velox/docs/functions/presto/datetime.rst
@@ -1,7 +1,89 @@
 =====================================
-Date and Time Functions
+Date and Time Functions and Operators
 =====================================
 
+Date and Time Operators
+-----------------------
+
+.. list-table::
+   :widths: 15 60 25
+   :header-rows: 1
+
+   * - Operator
+     - Example
+     - Result
+   * - ``+``
+     - ``interval '1' second + interval '1' hour``
+     - ``0 01:00:01.000``
+   * - ``+``
+     - ``timestamp '1970-01-01 00:00:00.000' + interval '1' second``
+     - ``1970-01-01 00:00:01.000``
+   * - ``-``
+     - ``interval '1' hour - interval '1' second``
+     - ``0 00:59:59.000``
+   * - ``-``
+     - ``timestamp '1970-01-01 00:00:00.000' - interval '1' second``
+     - ``1969-12-31 23:59:59.000``
+   * - ``*``
+     - ``interval '1' second * 2``
+     - ``0 00:00:02.000``
+   * - ``*``
+     - ``2 * interval '1' second``
+     - ``0 00:00:02.000``
+   * - ``*``
+     - ``interval '1' second * 0.001``
+     - ``0 00:00:00.001``
+   * - ``*``
+     - ``0.001 * interval '1' second``
+     - ``0 00:00:00.001``
+   * - ``/``
+     - ``interval '15' second / 1.5``
+     - ``0 00:00:10.000``
+
+.. function:: plus(x, y) -> [same as x]
+
+    Returns the sum of ``x`` and ``y``. Both ``x`` and ``y`` are intervals day
+    to second, or one of them can be a timestamp. For addition of two intervals
+    day to second, returns ``-106751991167 07:12:55.808`` when the addition
+    overflows in the positive direction and ``106751991167 07:12:55.807`` when
+    it overflows in the negative direction. When adding an interval day to
+    second to a timestamp, overflowed results are wrapped around.
+
+.. function:: minus(x, y) -> [same as x]
+
+    Returns the result of subtracting ``y`` from ``x``. Both ``x`` and ``y``
+    are intervals day to second, or ``x`` can be a timestamp. For subtraction
+    of two intervals day to second, returns ``-106751991167 07:12:55.808`` when
+    the subtraction overflows in the positive direction and
+    ``106751991167 07:12:55.807`` when it overflows in the negative direction.
+    For subtraction of an interval day to second from a timestamp, overflowed
+    results are wrapped around.
+
+.. function:: multiply(interval day to second, x) -> interval day to second
+
+    Returns the result of multiplying ``interval day to second`` by ``x``.
+    ``x`` can be a bigint or double. Returns ``0`` when ``x`` is NaN. Returns
+    ``106751991167 07:12:55.807`` when ``x`` is infinity or when the
+    multiplication overflows in the positive direction. Returns
+    ``-106751991167 07:12:55.808`` when ``x`` is -infinity or when the
+    multiplication overflows in the negative direction.
+
+.. function:: multiply(x, interval day to second) -> interval day to second
+
+    Returns the result of multiplying ``x`` by ``interval day to second``.
+    Same as ``multiply(interval day to second, x)``.
+
+.. function:: divide(interval day to second, x) -> interval day to second
+
+    Returns the result of ``interval day to second`` divided by ``x``. ``x`` is
+    a double. Returns ``0`` when ``x`` is NaN or is infinity. Returns
+    ``106751991167 07:12:55.807`` when ``x`` is ``0.0`` and
+    ``interval day to second`` is not ``0``, or when the division overflows in
+    the positive direction. Returns ``-106751991167 07:12:55.808`` when ``x``
+    is ``-0.0`` and ``interval day to second`` is not ``0``, or when the
+    division overflows in the negative direction. A few edge-case examples are
+    shown below.
+
+Date and Time Functions
+-----------------------
+
 .. function:: current_date() -> date
 
     Returns the current date.
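+
+The special-value and saturation rules for the interval operators above can be
+seen directly in a few queries. This is an illustrative sketch derived from the
+semantics stated for ``multiply`` and ``divide``, using :func:`nan` and
+:func:`infinity` to produce the special double values; exact output formatting
+may vary::
+
+    SELECT interval '1' second * nan(); -- 0 00:00:00.000
+    SELECT interval '1' second * infinity(); -- 106751991167 07:12:55.807 (saturates at the maximum interval)
+    SELECT interval '15' second / 0.0; -- 106751991167 07:12:55.807 (non-zero interval divided by 0.0)
+    SELECT interval '15' second / nan(); -- 0 00:00:00.000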
@@ -10,6 +92,56 @@ Date and Time Functions
 
     This is an alias for ``CAST(x AS date)``.
 
+.. function:: from_iso8601_date(string) -> date
+
+    Parses the ISO 8601 formatted ``string`` into a ``date``.
+
+    Accepts formats described by the following syntax::
+
+        date = yyyy ['-' MM ['-' dd]]
+
+    Examples of valid input strings:
+
+    * '2012'
+    * '2012-4'
+    * '2012-04'
+    * '2012-4-7'
+    * '2012-04-07'
+    * '2012-04-07 '
+
+.. function:: from_iso8601_timestamp(string) -> timestamp with time zone
+
+    Parses the ISO 8601 formatted string into a timestamp with time zone.
+
+    Accepts formats described by the following syntax::
+
+        datetime       = time | date-opt-time
+        time           = 'T' time-element [offset]
+        date-opt-time  = date-element ['T' [time-element] [offset]]
+        date-element   = yyyy ['-' MM ['-' dd]]
+        time-element   = HH [minute-element] | [fraction]
+        minute-element = ':' mm [second-element] | [fraction]
+        second-element = ':' ss [fraction]
+        fraction       = ('.' | ',') digit+
+        offset         = 'Z' | (('+' | '-') HH [':' mm [':' ss [('.' | ',') SSS]]])
+
+    Examples of valid input strings:
+
+    * '2012'
+    * '2012-4'
+    * '2012-04'
+    * '2012-4-7'
+    * '2012-04-07'
+    * '2012-04-07 '
+    * '2012-04T01:02'
+    * 'T01:02:34'
+    * 'T01:02:34,123'
+    * '2012-04-07T01:02:34'
+    * '2012-04-07T01:02:34.123'
+    * '2012-04-07T01:02:34,123'
+    * '2012-04-07T01:02:34.123Z'
+    * '2012-04-07T01:02:34.123-05:00'
+
 .. function:: from_unixtime(unixtime) -> timestamp
 
     Returns the UNIX timestamp ``unixtime`` as a timestamp.
@@ -20,6 +152,27 @@ Date and Time Functions
     Returns the UNIX timestamp ``unixtime`` as a timestamp with time zone
     using ``string`` for the time zone.
 
+.. function:: from_unixtime(unixtime, hours, minutes) -> timestamp with time zone
+
+    Returns the UNIX timestamp ``unixtime`` as a timestamp with time zone
+    using ``hours`` and ``minutes`` for the time zone offset.
+    The offset must be in the [-14:00, 14:00] range.
+
+.. function:: to_iso8601(x) -> varchar
+
+    Formats ``x`` as an ISO 8601 string. Supported types for ``x`` are:
+    DATE, TIMESTAMP, TIMESTAMP WITH TIME ZONE.
+
+    Example results::
+
+        SELECT to_iso8601(current_date); -- 2024-06-06
+        SELECT to_iso8601(now()); -- 2024-06-06T20:25:46.726-07:00
+        SELECT to_iso8601(now() + interval '6' month); -- 2024-12-06T20:27:11.992-08:00
+
+.. function:: to_milliseconds(interval) -> bigint
+
+    Returns the day-to-second ``interval`` as milliseconds.
+
 .. function:: to_unixtime(timestamp) -> double
 
     Returns ``timestamp`` as a UNIX timestamp.
@@ -61,6 +214,7 @@ Unit Description
 ``minute``  ``Minutes``
 ``hour``    ``Hours``
 ``day``     ``Days``
+``week``    ``Weeks``
 ``month``   ``Months``
 ``quarter`` ``Quarters of a year``
 ``year``    ``Years``
@@ -141,7 +295,16 @@ The functions in this section leverage a native cpp implementation that follows
 a format string compatible with JodaTime’s `DateTimeFormat `_ pattern format.
 The symbols currently supported are ``y``, ``Y``, ``M`` , ``d``,
-``H``, ``m``, ``s``, ``S``, and ``Z``.
+``H``, ``m``, ``s``, ``S``, ``z``, and ``Z``.
+
+``z`` represents a timezone name (3-letter format), and ``Z`` a timezone offset
+specified using the format ``+00``, ``+00:00`` or ``+0000`` (or ``-``). ``Z``
+also accepts ``UTC``, ``UCT``, ``GMT``, and ``GMT0`` as valid representations
+of GMT.
+
+.. function:: format_datetime(timestamp, format) -> varchar
+
+    Formats ``timestamp`` as a string using ``format``.
 
 .. function:: parse_datetime(string, format) -> timestamp with time zone
 
@@ -161,6 +324,8 @@ arbitrary large timestamps.
 
     Returns the day of the month from ``x``.
+    The supported types for ``x`` are DATE, TIMESTAMP, TIMESTAMP WITH TIME
+    ZONE, and INTERVAL DAY TO SECOND.
+
 .. function:: day_of_month(x) -> bigint
 
     This is an alias for :func:`day`.
@@ -186,6 +351,8 @@ arbitrary large timestamps.
 .. function:: hour(x) -> bigint
 
     Returns the hour of the day from ``x``. The value ranges from 0 to 23.
+    Supported types for ``x`` are: DATE, TIMESTAMP, TIMESTAMP WITH TIME ZONE,
+    INTERVAL DAY TO SECOND.
 
 .. function:: last_day_of_month(x) -> date
 
@@ -193,15 +360,18 @@ arbitrary large timestamps.
 .. function:: millisecond(x) -> int64
 
-    Returns the millisecond of the second from ``x``.
+    Returns the millisecond of the second from ``x``. Supported types for ``x`` are:
+    DATE, TIMESTAMP, TIMESTAMP WITH TIME ZONE, INTERVAL DAY TO SECOND.
 
 .. function:: minute(x) -> bigint
 
-    Returns the minute of the hour from ``x``.
+    Returns the minute of the hour from ``x``. Supported types for ``x`` are:
+    DATE, TIMESTAMP, TIMESTAMP WITH TIME ZONE, INTERVAL DAY TO SECOND.
 
 .. function:: month(x) -> bigint
 
-    Returns the month of the year from ``x``.
+    Returns the month of the year from ``x``. Supported types for ``x`` are:
+    DATE, TIMESTAMP, TIMESTAMP WITH TIME ZONE, INTERVAL YEAR TO MONTH.
 
 .. function:: quarter(x) -> bigint
 
@@ -209,7 +379,8 @@ arbitrary large timestamps.
 .. function:: second(x) -> bigint
 
-    Returns the second of the minute from ``x``.
+    Returns the second of the minute from ``x``. Supported types for ``x`` are:
+    DATE, TIMESTAMP, TIMESTAMP WITH TIME ZONE, INTERVAL DAY TO SECOND.
 
 .. function:: timezone_hour(timestamp) -> bigint
 
@@ -231,7 +402,8 @@ arbitrary large timestamps.
 .. function:: year(x) -> bigint
 
-    Returns the year from ``x``.
+    Returns the year from ``x``. Supported types for ``x`` are:
+    DATE, TIMESTAMP, TIMESTAMP WITH TIME ZONE, INTERVAL YEAR TO MONTH.
 
 .. function:: year_of_week(x) -> bigint
 
@@ -240,3 +412,60 @@ arbitrary large timestamps.
 .. function:: yow(x) -> bigint
 
     This is an alias for :func:`year_of_week`.
+
+.. _presto-time-zones:
+
+Time Zones
+----------
+
+Velox has full support for time zone rules, which are needed to perform date/time
+calculations correctly. Typically, the session time zone is used for temporal
+calculations. This is the time zone of the client computer that submits the query, if
+available. Otherwise, it is the time zone of the server running the Presto coordinator.
+
+Queries that operate with time zones that follow daylight saving can produce unexpected
+results. For example, if we run the following query in the `America/Los_Angeles` time
+zone: ::
+
+    SELECT date_add('hour', 24, cast('2014-03-08 09:00:00' as timestamp));
+    -- 2014-03-09 10:00:00.000
+
+The timestamp appears to only advance 23 hours. This is because on March 9th clocks in
+`America/Los_Angeles` are turned forward 1 hour, so March 9th only has 23 hours. To
+advance the day part of the timestamp, use the `day` unit instead: ::
+
+    SELECT date_add('day', 1, cast('2014-03-08 09:00:00' as timestamp));
+    -- 2014-03-09 09:00:00.000
+
+This works because the :func:`date_add` function treats the timestamp as a list of
+fields, adds the value to the specified field and then rolls any overflow into the
+next higher field.
+
+Time zones are also necessary for parsing and printing timestamps. Queries that use this
+functionality can also produce unexpected results.
For example, on the same machine: ::
+
+    SELECT cast('2014-03-09 02:30:00' as timestamp);
+
+The above query causes an error because there was no 2:30 AM on March 9th in
+`America/Los_Angeles` due to a daylight saving time transition.
+
+Similarly, the following query has two possible outcomes due to a daylight saving time
+transition: ::
+
+    SELECT cast('2014-11-02 01:30:00' as timestamp);
+    -- 2014-11-02 08:30:00.000
+
+It can be interpreted as `2014-11-02 01:30:00 PDT`, or `2014-11-02 01:30:00 PST`, which are
+`2014-11-02 08:30:00 UTC` or `2014-11-02 09:30:00 UTC` respectively. The former is
+chosen for consistency with Presto.
+
+**Timezone Name Parsing**: When parsing strings that contain timezone names, the
+list of supported timezones follows the definition `here `_.
+
+**Timezone Conversion**: The ``AT TIME ZONE`` operator sets the time zone of a timestamp: ::
+
+    SELECT timestamp '2012-10-31 01:00 UTC';
+    -- 2012-10-31 01:00:00.000 UTC
+
+    SELECT timestamp '2012-10-31 01:00 UTC' AT TIME ZONE 'America/Los_Angeles';
+    -- 2012-10-30 18:00:00.000 America/Los_Angeles
diff --git a/velox/docs/functions/presto/decimal.rst b/velox/docs/functions/presto/decimal.rst
new file mode 100644
index 0000000000000..539398e71242f
--- /dev/null
+++ b/velox/docs/functions/presto/decimal.rst
@@ -0,0 +1,370 @@
+=================
+Decimal Operators
+=================
+
+DECIMAL type is designed to represent floating point numbers precisely.
+Mathematical operations on decimal values are exact, except for division. On
+the other hand, DOUBLE and REAL types are designed to represent floating point
+numbers approximately. Mathematical operations on double and real values are
+approximate.
+
+For example, the number 5,000,000,000,000,000 can be represented using DOUBLE.
+However, the number 5,000,000,000,000,000.15 cannot be represented using
+DOUBLE, but it can be represented using DECIMAL. See
+https://en.wikipedia.org/wiki/Double-precision_floating-point_format for more
+details.
+
+DECIMAL type has 2 parameters: precision and scale. Precision is the total
+number of digits used to represent the number. Scale is the number of digits
+after the decimal point. Naturally, scale must not exceed precision. In
+addition, precision cannot exceed 38.
+
+::
+
+    decimal(p, s)
+
+    p >= 1 && p <= 38
+    s >= 0 && s <= p
+
+The number 5,000,000,000,000,000.15 can be represented using DECIMAL(18, 2).
+This number needs at least 18 digits (precision) of which at least 2 digits
+must appear after the decimal point (scale). This number can be represented
+using any DECIMAL type where scale >= 2 and precision is >= scale + 16.
+
+Note: This definition of precision and scale may appear counterintuitive. It is
+not uncommon to think about the number of digits after the decimal point as
+precision and the number of digits before the decimal point as scale.
+
+Addition and Subtraction
+------------------------
+
+To represent the results of adding two decimal numbers we need to use max
+(s1, s2) digits after the decimal point and max(p1 - s1, p2 - s2) + 1 digits
+before the decimal point.
+
+::
+
+    p = max(p1 - s1, p2 - s2) + 1 + max(s1, s2)
+    s = max(s1, s2)
+
+It is easiest to understand this formula by thinking about column addition where
+we place two numbers one under the other and line up decimal points.
+
+::
+
+        1.001
+     9999.5
+    -----------
+    10000.501
+
+We can see that the result needs max(s1, s2) digits after the decimal point and
+max(p1 - s1, p2 - s2) + 1 digits before the decimal point.
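+
+For instance, plugging the example above into the formula: ``1.001`` is a
+decimal(4, 3) and ``9999.5`` is a decimal(5, 1), so p = max(4 - 3, 5 - 1) + 1 +
+max(3, 1) = 8 and s = 3. A sketch (literal decimals get their precision and
+scale from their digits)::
+
+    SELECT 1.001 + 9999.5; -- 10000.501 (decimal(8, 3))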
+
+The precision of the result may exceed 38. There are two options. One option is
+to say that addition and subtraction are supported as long as p <= 38 and reject
+operations that produce p > 38. Another option is to cap p at 38 and allow the
+operation to succeed as long as the actual result can be represented using 38
+digits. In this case, users experience runtime errors when the actual result
+cannot be represented using 38 digits. Presto implements the second option. The
+Velox implementation matches Presto.
+
+Multiplication
+--------------
+
+To represent the results of multiplying two decimal numbers we need s1 + s2
+digits after the decimal point and p1 + p2 digits overall.
+
+::
+
+    p = p1 + p2
+    s = s1 + s2
+
+To multiply two numbers we can multiply them as integers ignoring the decimal
+points, then add up the number of digits after the decimal point in the
+original numbers and place the decimal point that many digits away in the
+result.
+
+To multiply 0.01 with 0.001, we can multiply 1 with 1, then place the decimal
+point 5 digits to the left: 0.00001. Hence, the scale of the result is the sum
+of the scales of the inputs.
+
+When multiplying two integers with p1 and p2 digits respectively we get a result
+that is strictly less than 10^p1 * 10^p2 = 10^(p1 + p2). Hence, we need at most
+p1 + p2 digits to represent the result.
+
+Both scale and precision of the result may exceed 38. There are two options
+again. One option is to say that multiplication is supported as long as p <=
+38 (by definition, s does not exceed p and therefore does not exceed 38 if p <=
+38). Another option is to cap p and s at 38 and allow the operation to succeed
+as long as the actual result can be represented as a decimal(38, s). In this
+case, users experience runtime errors when the actual result cannot be
+represented as a decimal(38, s). Presto implements a third option: reject the
+operation if s exceeds 38, and cap p at 38 when s <= 38. In this case some
+operations are rejected outright while others are allowed to proceed, but may
+produce runtime errors. The Velox implementation matches Presto.
+
+Division
+--------
+
+Perfect division is not possible. For example, 1 / 3 cannot be represented as a
+decimal value.
+
+When dividing a number with p1 digits by a number with scale s2, the largest
+result requires s2 extra digits before the decimal point. To get the largest
+result we must divide by the smallest value with scale s2, namely 10^-s2, which
+is effectively a multiplication by 10^s2. Hence, the precision of the result
+needs to be at least p1 + s2.
+
+Presto also chooses to extend the scale of the result to the maximum of the
+scales of the inputs.
+
+::
+
+    s = max(s1, s2)
+
+To support the increased scale, the result precision needs to be extended by
+the difference between s1 and s.
+
+::
+
+    p = p1 + s2 + max(0, s2 - s1)
+
+As in addition, the precision of the result may exceed 38. The choices are the
+same. Presto chooses to cap p at 38 and allow runtime errors.
+
+Let's say `a` is of type decimal(p1, s1) with unscaled value `A` and `b` is of
+type decimal(p2, s2) with unscaled value `B`.
+
+::
+
+    a = A / 10^s1
+    b = B / 10^s2
+
+The result type precision and scale are:
+
+::
+
+    s = max(s1, s2)
+    p = p1 + s2 + max(0, s2 - s1)
+
+The result `r` has `s` digits after the decimal point and unscaled value `R`.
+We derive the value of `R` as follows:
+
+::
+
+    r = a / b = (A / 10^s1) / (B / 10^s2) = A * 10^(s2 - s1) / B
+    r = R / 10^s
+    R = r * 10^s = A * 10^(s + s2 - s1) / B
+
+To compute R, first rescale A using the rescale factor :code:`(s + s2 - s1)`,
+then divide by B and round to the nearest whole number. This method works as
+long as the rescale factor does not exceed 38. If :code:`s + s2 - s1` exceeds
+38, an error is raised.
+
+The formula for the scale of the result is a choice. Presto chose max(s1, s2).
+Other systems made different choices.
+
+It is not clear why Presto chose max(s1, s2). Perhaps the thinking was to
+assume that the user's desired accuracy is the max of the input scales.
+However, one could also say that the desired accuracy is the scale of the
+dividend. In SQL, literal values get their types assigned by the actual number
+of digits after the decimal point. Hence, in the following SQL 1.2 has scale 1
+and 0.01 has scale 2. One may argue that the user's intention is to work with
+an accuracy of 2 digits after the decimal point, hence, max(s1, s2).
+
+::
+
+    SELECT 1.2 / 0.01
+
+Modulus
+-------
+
+For the modulus operation :code:`a % b`, when a and b are integers, the result
+`r` is less than `b` and less than or equal to `a`. Hence the number of digits
+needed to represent `r` is no more than the minimum of the number of digits
+needed to represent `a` or `b`. We can extend this to decimal inputs `a` and
+`b` by computing the modulus of their unscaled values. However, we should
+first make sure that `a` and `b` have the same scale. This can be achieved by
+scaling up the input with the lesser scale by the difference in the inputs'
+scales, so both `a` and `b` have scale s. Once `a` and `b` have the same scale,
+we compute the modulus of their unscaled values, A and B. `r` has s digits
+after the decimal point, and since `r` does not need any more digits than the
+minimum number of digits needed to represent `a` or `b`, the precision of the
+result is the smaller of the numbers of digits before the decimal point in the
+inputs (p1 - s1 and p2 - s2) plus the result scale. Hence the result type
+precision and scale are:
+
+::
+
+    s = max(s1, s2)
+    p = min(p2 - s2, p1 - s1) + max(s1, s2)
+
+To compute R, we first rescale A and B to `s`:
+
+::
+
+    A = a * 10^s1
+    B = b * 10^s2
+
+    A' = a * 10^s
+    B' = b * 10^s
+
+Then we compute the modulus of the rescaled values:
+
+::
+
+    R = A' % B' = r * 10^s
+
+For example, say `a` = 12.3 and `b` = 1.21, `r` = :code:`a % b` is calculated
+as follows:
+
+::
+
+    s = max(1, 2) = 2
+    p = min(2, 1) + s = 3
+
+    A = 12.3 * 10^1 = 123
+    B = 1.21 * 10^2 = 121
+
+    A' = 12.3 * 10^2 = 1230
+    B' = 1.21 * 10^2 = 121
+
+    R = 1230 % 121 = 20 = 0.20 * 100
+
+Decimal Functions
+-----------------
+
+.. function:: abs(x: decimal(p, s)) -> r: decimal(p, s)
+
+    Returns the absolute value of x (r = `|x|`).
+
+.. function:: divide(x: decimal(p1, s1), y: decimal(p2, s2)) -> r: decimal(p, s)
+
+    Returns the result of dividing x by y (r = x / y).
+
+    x and y are decimal values with possibly different precisions and scales. The
+    precision and scale of the result are calculated as follows:
+    ::
+
+        p = min(38, p1 + s2 + max(0, s2 - s1))
+        s = max(s1, s2)
+
+    Throws if y is zero, or the result cannot be represented using the precision
+    calculated above, or the rescale factor `max(s1, s2) - s1 + s2` exceeds 38.
+
+.. function:: floor(x: decimal(p, s)) -> r: decimal(pr, 0)
+
+    Returns 'x' rounded down to the nearest integer. The scale of the result is 0.
+    The precision is calculated as:
+    ::
+
+        pr = min(38, p - s + min(s, 1))
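+
+    For illustration, a sketch with an explicit cast (the input values are
+    made up)::
+
+        SELECT floor(CAST(123.45 AS DECIMAL(5, 2)));  -- 123 (decimal(4, 0))
+        SELECT floor(CAST(-123.45 AS DECIMAL(5, 2))); -- -124 (decimal(4, 0))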
+.. function:: minus(x: decimal(p1, s1), y: decimal(p2, s2)) -> r: decimal(p, s)
+
+    Returns the result of subtracting y from x (r = x - y).
+
+    x and y are decimal values with possibly different precisions and scales. The
+    precision and scale of the result are calculated as follows:
+    ::
+
+        p = min(38, max(p1 - s1, p2 - s2) + 1 + max(s1, s2))
+        s = max(s1, s2)
+
+    Throws if the result cannot be represented using the precision calculated above.
+
+.. function:: modulus(x: decimal(p1, s1), y: decimal(p2, s2)) -> r: decimal(p, s)
+
+    Returns the remainder from division of x by y (r = x % y).
+
+    x and y are decimal values with possibly different precisions and scales. The
+    precision and scale of the result are calculated as follows:
+    ::
+
+        p = min(p2 - s2, p1 - s1) + max(s1, s2)
+        s = max(s1, s2)
+
+    Throws if y is zero.
+
+.. function:: multiply(x: decimal(p1, s1), y: decimal(p2, s2)) -> r: decimal(p, s)
+
+    Returns the result of multiplying x by y (r = x * y).
+
+    x and y are decimal values with possibly different precisions and scales. The
+    precision and scale of the result are calculated as follows:
+    ::
+
+        p = min(38, p1 + p2)
+        s = s1 + s2
+
+    The operation is not supported if s1 + s2 exceeds 38.
+
+    Throws if the result cannot be represented using the precision calculated above.
+
+.. function:: negate(x: decimal(p, s)) -> r: decimal(p, s)
+
+    Returns the negated value of x (r = -x).
+
+.. function:: plus(x: decimal(p1, s1), y: decimal(p2, s2)) -> r: decimal(p, s)
+
+    Returns the result of adding x to y (r = x + y).
+
+    x and y are decimal values with possibly different precisions and scales. The
+    precision and scale of the result are calculated as follows:
+    ::
+
+        p = min(38, max(p1 - s1, p2 - s2) + 1 + max(s1, s2))
+        s = max(s1, s2)
+
+    Throws if the result cannot be represented using the precision calculated above.
+
+.. function:: round(x: decimal(p, s)) -> r: decimal(pr, 0)
+
+    Returns 'x' rounded to the nearest integer. The scale of the result is 0.
+    The precision is calculated as:
+    ::
+
+        pr = min(38, p - s + min(s, 1))
+
+.. function:: round(x: decimal(p, s), d: integer) -> r: decimal(pr, s)
+
+    Returns 'x' rounded to 'd' decimal places. The scale of the result is
+    the same as the scale of the input. The precision is calculated as:
+    ::
+
+        pr = min(38, p + 1)
+
+    'd' can be positive, zero or negative. Returns 'x' unmodified if 'd' exceeds
+    the scale of the input.
+
+    ::
+
+        SELECT round(123.45, 0); -- 123.00
+        SELECT round(123.45, 1); -- 123.50
+        SELECT round(123.45, 2); -- 123.45
+        SELECT round(123.45, 3); -- 123.45
+        SELECT round(123.45, -1); -- 120.00
+        SELECT round(123.45, -2); -- 100.00
+        SELECT round(123.45, -10); -- 0.00
+
+.. function:: truncate(x: decimal(p, s)) -> r: decimal(pr, 0)
+
+    Returns 'x' rounded to an integer by dropping digits after the decimal point.
+    The scale of the result is 0. The precision is calculated as:
+    ::
+
+        pr = max(p - s, 1)
+
+.. function:: truncate(x: decimal(p, s), d: integer) -> r: decimal(p, s)
+
+    Returns ``x`` truncated to ``d`` decimal places.
+    The precision and scale of the result are the same as the precision and scale of the input.
+    ``d`` can be positive, zero or negative.
+    When ``d`` is negative, truncates ``-d`` digits left of the decimal point.
+    Returns ``x`` unmodified if ``d`` exceeds the scale of the input.
+    ::
+
+        SELECT truncate(999.45, 0); -- 999.00
+        SELECT truncate(999.45, 1); -- 999.40
+        SELECT truncate(999.45, 2); -- 999.45
+        SELECT truncate(999.45, 3); -- 999.45
+        SELECT truncate(999.45, -1); -- 990.00
+        SELECT truncate(999.45, -2); -- 900.00
+        SELECT truncate(999.45, -10); -- 0.00
diff --git a/velox/docs/functions/presto/json.rst b/velox/docs/functions/presto/json.rst
index 18a8a47ec31ec..36cc09ce8255e 100644
--- a/velox/docs/functions/presto/json.rst
+++ b/velox/docs/functions/presto/json.rst
@@ -37,6 +37,8 @@ be JSON. Behaviors of the casts are shown with the examples below:
     SELECT CAST('abc' AS JSON); -- JSON '"abc"'
     SELECT CAST(true AS JSON); -- JSON 'true'
     SELECT CAST(1.234 AS JSON); -- JSON '1.234'
+    SELECT CAST(-0.00012 AS JSON); -- JSON '-1.2E-4'
+    SELECT CAST(10000000.0 AS JSON); -- JSON '1.0E7'
     SELECT CAST(ARRAY[1, 23, 456] AS JSON); -- JSON '[1,23,456]'
     SELECT CAST(ARRAY[1, NULL, 456] AS JSON); -- JSON '[1,null,456]'
     SELECT CAST(ARRAY[ARRAY[1, 23], ARRAY[456]] AS JSON); -- JSON '[[1,23],[456]]'
@@ -52,6 +54,12 @@ Another thing to be aware of is that when casting from ROW to JSON, the result
 is a JSON array rather than a JSON object. This is because positions are more
 important than names for rows in SQL.
 
+Also note that casting from REAL or DOUBLE returns the JSON text represented
+in standard notation if the magnitude of the input value is greater than or
+equal to 10 :superscript:`-3` but less than 10 :superscript:`7`, and returns
+the JSON text in scientific notation otherwise. Both the standard and
+scientific notations always include the fractional part, such as ``10.0``.
+
 Finally, keep in mind that casting a VARCHAR string to JSON does not directly
 turn the original string into JSON type. Instead, it creates a JSON text
 representing the original string. This JSON text is different from the original
@@ -110,6 +118,29 @@ JSON Functions
 
     SELECT json_array_contains('[1, 2, 3]', 2);
 
+.. function:: json_array_get(json_array, index) -> json
+
+    Returns the element at the specified index into the ``json_array``.
+    The index is zero-based::
+
+        SELECT json_array_get('[1, 2, 3]', 0); -- JSON '1'
+        SELECT json_array_get('[1, 2, 3]', 1); -- JSON '2'
+
+    This function also supports negative indexes for fetching elements indexed
+    from the end of an array::
+
+        SELECT json_array_get('[1, 2, 3]', -1); -- JSON '3'
+        SELECT json_array_get('[1, 2, 3]', -2); -- JSON '2'
+
+    If the element at the specified index doesn't exist, the function returns null::
+
+        SELECT json_array_get('[1, 2, 3]', 10); -- NULL
+        SELECT json_array_get('[1, 2, 3]', -10); -- NULL
+
+    If ``json_array`` is not an array, the function returns null::
+
+        SELECT json_array_get('{"a": 10, "b": 11}', 1); -- NULL
+
 .. function:: json_array_length(json) -> bigint
 
     Returns the array length of ``json`` (a string containing a JSON
diff --git a/velox/docs/functions/presto/map.rst b/velox/docs/functions/presto/map.rst
index f6a0c81d6f724..f0cade6e121af 100644
--- a/velox/docs/functions/presto/map.rst
+++ b/velox/docs/functions/presto/map.rst
@@ -39,7 +39,8 @@ Map Functions
 .. function:: map(array(K), array(V)) -> map(K,V)
    :noindex:
 
-    Returns a map created using the given key/value arrays. Keys are not allowed to be null or to contain nulls. ::
+    Returns a map created using the given key/value arrays. Keys are not allowed to be null or to contain nulls.
+    For REAL and DOUBLE, NaNs (Not-a-Number) are considered equal. ::
 
     SELECT map(ARRAY[1,3], ARRAY[2,4]); -- {1 -> 2, 3 -> 4}
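+
+    Since NaNs compare equal as keys, a hypothetical sketch of the consequence
+    (assuming the Presto ``nan()`` function; keys must be unique)::
+
+        SELECT map(ARRAY[nan(), nan()], ARRAY[1, 2]); -- error: duplicate map keys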
@@ -70,6 +71,58 @@ Map Functions
 
     SELECT map_from_entries(ARRAY[(1, 'x'), (2, 'y')]); -- {1 -> 'x', 2 -> 'y'}
 
+.. function:: map_normalize(map(varchar,double)) -> map(varchar,double)
+
+    Returns the map with the same keys but all non-null values scaled proportionally
+    so that the sum of the values becomes 1. Map entries with null values remain unchanged.
+
+    When the total sum of the non-null values is zero, null values remain null,
+    zero, NaN, Infinity and -Infinity values become NaN,
+    positive values become Infinity, negative values become -Infinity. ::
+
+        SELECT map_normalize(map(array['a', 'b', 'c'], array[1, 4, 5])); -- {a=0.1, b=0.4, c=0.5}
+        SELECT map_normalize(map(array['a', 'b', 'c', 'd'], array[1, null, 4, 5])); -- {a=0.1, b=null, c=0.4, d=0.5}
+        SELECT map_normalize(map(array['a', 'b', 'c'], array[1, 0, -1])); -- {a=Infinity, b=NaN, c=-Infinity}
+
+.. function:: map_remove_null_values(map(K,V)) -> map(K,V)
+
+    Returns a map created by removing all keys with null values from the input map.
+    If the input is null, the output is null. If the input map is empty, the output
+    map is empty. ::
+
+        SELECT map_remove_null_values(MAP(ARRAY['ab', 'bc', 'cd'], ARRAY[null, null, null])); -- {}
+        SELECT map_remove_null_values(MAP(ARRAY[], ARRAY[])); -- {}
+        SELECT map_remove_null_values(MAP(ARRAY[1, 2, 3], ARRAY[3, 4, NULL])); -- {1=3, 2=4}
+        SELECT map_remove_null_values(NULL); -- NULL
+
+.. function:: map_subset(map(K,V), array(K)) -> map(K,V)
+
+    Constructs a map from those entries of ``map`` for which the key is in the given array.
+    For keys of type REAL or DOUBLE, NaNs (Not-a-Number) are considered equal. ::
+
+        SELECT map_subset(MAP(ARRAY[1,2], ARRAY['a','b']), ARRAY[10]); -- {}
+        SELECT map_subset(MAP(ARRAY[1,2], ARRAY['a','b']), ARRAY[1]); -- {1->'a'}
+        SELECT map_subset(MAP(ARRAY[1,2], ARRAY['a','b']), ARRAY[1,3]); -- {1->'a'}
+        SELECT map_subset(MAP(ARRAY[1,2], ARRAY['a','b']), ARRAY[]); -- {}
+        SELECT map_subset(MAP(ARRAY[], ARRAY[]), ARRAY[1,2]); -- {}
+
+.. function:: map_top_n(map(K,V), n) -> map(K, V)
+
+    Truncates the map to keep only the top N entries by value. Ties are broken by
+    key, with the maximum key chosen. Both keys and values should be orderable.
+
+    ``n`` must be a non-negative BIGINT value. ::
+
+        SELECT map_top_n(map(ARRAY['a', 'b', 'c'], ARRAY[2, 3, 1]), 2); -- {'b' -> 3, 'a' -> 2}
+        SELECT map_top_n(map(ARRAY['a', 'b', 'c'], ARRAY[NULL, 3, NULL]), 2); -- {'b' -> 3, 'a' -> NULL}
+
+.. function:: map_top_n_keys(map(K,V), n) -> array(K)
+
+    Constructs an array of the top N keys. Keys should be orderable.
+
+    ``n`` must be a non-negative BIGINT value. ::
+
+        SELECT map_top_n_keys(map(ARRAY['a', 'b', 'c'], ARRAY[1, 2, 3]), 2); -- ['c', 'b']
+        SELECT map_top_n_keys(map(ARRAY['a', 'b', 'c'], ARRAY[1, 2, 3]), 0); -- []
+
 .. function:: map_keys(x(K,V)) -> array(K)
 
     Returns all the keys in the map ``x``.
@@ -93,6 +146,12 @@ Map Functions
         MAP(ARRAY['a', 'b', 'c'], ARRAY[1, 2, 3]),
         (k, v1, v2) -> k || CAST(v1/v2 AS VARCHAR));
 
+.. function:: multimap_from_entries(array(row(K,V))) -> map(K,array(V))
+
+    Returns a multimap created from the given array of entries. Each key can be associated with multiple values. ::
+
+        SELECT multimap_from_entries(ARRAY[(1, 'x'), (2, 'y'), (1, 'z')]); -- {1 -> ['x', 'z'], 2 -> ['y']}
+
 .. function:: no_keys_match(x(K,V), function(K, boolean)) -> boolean
 
     Returns whether no keys of a map match the given predicate.
Returns true if none of the keys match the predicate (a special case is when the
    map is empty); false if one or more keys match; NULL if the predicate function
    returns NULL for one or more keys and false for all other keys. ::
@@ -108,7 +167,8 @@ Map Functions
 .. function:: subscript(map(K, V), key) -> V
    :noindex:
 
-    Returns value for given ``key``. Throws if the key is not contained in the map.
+    Returns value for given ``key``. Returns null if the key is not contained in the map.
+    For REAL and DOUBLE, NaNs (Not-a-Number) are considered equal and can be used as keys.
     Corresponds to SQL subscript operator [].
 
     SELECT name_to_age_map['Bob'] AS bob_age;
diff --git a/velox/docs/functions/presto/math.rst b/velox/docs/functions/presto/math.rst
index 9077da198b566..c394182c66151 100644
--- a/velox/docs/functions/presto/math.rst
+++ b/velox/docs/functions/presto/math.rst
@@ -27,6 +27,17 @@ Mathematical Functions
     verified for performance reasons. Returns ``high`` for all values of ``x``
     when ``low`` is greater than ``high``.
 
+.. function:: cosine_similarity(map(varchar, double), map(varchar, double)) -> double
+
+    Returns the `cosine similarity `_ between the vectors represented as map(varchar, double).
+    If any input map is empty, the function returns NaN. ::
+
+        SELECT cosine_similarity(MAP(ARRAY['a'], ARRAY[1.0]), MAP(ARRAY['a'], ARRAY[2.0])); -- 1.0
+
+        SELECT cosine_similarity(MAP(ARRAY['a', 'b'], ARRAY[1.0, 2.0]), MAP(ARRAY['a', 'b'], ARRAY[NULL, 3.0])); -- NULL
+
+        SELECT cosine_similarity(MAP(ARRAY[], ARRAY[]), MAP(ARRAY['a', 'b'], ARRAY[2, 3])); -- NaN
+
 .. function:: degrees(x) -> double
 
     Converts angle x in radians to degrees.
@@ -128,6 +139,18 @@ Mathematical Functions
 
     Returns ``x`` rounded to ``d`` decimal places.
 
+.. function:: secure_rand() -> double
+
+    This is an alias for :func:`secure_random`.
+
+.. function:: secure_random() -> double
+
+    Returns a cryptographically secure random value in the range 0.0 <= x < 1.0.
+
+.. function:: secure_random(lower, upper) -> [same as input]
+
+    Returns a cryptographically secure random value in the range lower <= x < upper, where lower < upper.
+
 .. function:: sign(x) -> [same as x]
 
     Returns the signum function of ``x``. For both integer and floating point arguments, it returns:
@@ -148,14 +171,17 @@ Mathematical Functions
 
     Returns the base-``radix`` representation of ``x``. ``radix`` must be between 2 and 36.
 
-.. function:: truncate(x) -> double
+.. function:: truncate(x) -> [same as x]
 
     Returns x rounded to an integer by dropping digits after the decimal point.
+    Supported types of ``x`` are: REAL and DOUBLE.
 
-.. function:: truncate(x, n) -> double
+.. function:: truncate(x, n) -> [same as x]
    :noindex:
 
     Returns x truncated to n decimal places. n can be negative to truncate n digits left of the decimal point.
+    Supported types of ``x`` are: REAL and DOUBLE.
+    ``n`` is an INTEGER.
 
 .. function:: width_bucket(x, bound1, bound2, n) -> bigint
 
@@ -275,6 +301,13 @@ Probability Functions: cdf
     Compute the Gamma cdf with given shape and scale parameters: P(N < value; shape, scale).
     The shape and scale parameters must be positive real numbers. The value must be a non-negative real number.
 
+.. function:: inverse_normal_cdf(mean, sd, p) -> double
+
+    Compute the inverse of the Normal cdf with given mean and standard
+    deviation (sd) for the cumulative probability (p): P(N < n). The mean must be
+    a real value and the standard deviation must be a real and positive value (both of type DOUBLE).
+    The probability p must lie on the interval (0, 1).
.. function:: laplace_cdf(mean, scale, value) -> double
 
     Compute the Laplace cdf with given mean and scale parameters: P(N < value; mean, scale).
@@ -293,6 +326,11 @@ Probability Functions: cdf
     The lambda parameter must be a positive real number (of type DOUBLE) and value must be a
     non-negative integer.
 
+.. function:: weibull_cdf(a, b, value) -> double
+
+    Compute the Weibull cdf with given parameters a, b: P(N <= value). The ``a``
+    and ``b`` parameters must be positive doubles and ``value`` must also be a double.
+
 ====================================
 Probability Functions: inverse_cdf
 ====================================
@@ -303,6 +341,22 @@ Probability Functions: inverse_cdf
     probability (p): P(N < n). The a, b parameters must be positive real values (all of type DOUBLE).
     The probability p must lie on the interval [0, 1].
 
+.. function:: inverse_weibull_cdf(a, b, p) -> double
+
+    Compute the inverse of the Weibull cdf with given parameters ``a``, ``b`` for the probability ``p``.
+    The ``a``, ``b`` parameters must be positive double values. The probability ``p`` must be a double
+    on the interval [0, 1].
+
+.. function:: inverse_cauchy_cdf(median, scale, p) -> double
+
+    Compute the inverse of the Cauchy cdf with given parameters ``median`` and ``scale`` (gamma) for the probability p.
+    The scale parameter must be a positive double. The probability ``p`` must be a double on the interval [0, 1].
+
+.. function:: inverse_laplace_cdf(mean, scale, p) -> double
+
+    Compute the inverse of the Laplace cdf with given ``mean`` and ``scale`` parameters for the cumulative probability (p): P(N < n).
+    The mean must be a real value and the scale must be a positive real value (both of type DOUBLE).
+    The probability ``p`` must lie on the interval [0, 1].
 
 ====================================
 Statistical Functions
diff --git a/velox/docs/functions/presto/misc.rst b/velox/docs/functions/presto/misc.rst
new file mode 100644
index 0000000000000..f9de2a4f0df27
--- /dev/null
+++ b/velox/docs/functions/presto/misc.rst
@@ -0,0 +1,20 @@
+=======================
+Miscellaneous Functions
+=======================
+
+.. function:: fail(message)
+
+    Throws a user error with the specified message.
+
+.. function:: fail(code, message)
+
+    Throws a user error with the specified message. Ignores the 'code' argument.
+
+.. function:: fail(json)
+
+    Throws a user error with the message specified in the 'message' field of the JSON.
+
+.. function:: fail(code, json)
+
+    Throws a user error with the message specified in the 'message' field of the JSON.
+    Ignores the 'code' argument.
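+
+    For illustration, a hypothetical guard expression (the column and table
+    names are made up)::
+
+        SELECT IF(age >= 0, age, fail('age must be non-negative')) FROM users;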
diff --git a/velox/docs/functions/presto/most_used_coverage.rst b/velox/docs/functions/presto/most_used_coverage.rst
index 7b5bbc65e3380..9a9e42a40a924 100644
--- a/velox/docs/functions/presto/most_used_coverage.rst
+++ b/velox/docs/functions/presto/most_used_coverage.rst
@@ -83,6 +83,7 @@ Here is a list of most used scalar and aggregate Presto functions with functions
     table.coverage tr:nth-child(12) td:nth-child(7) {background-color: #6BA81E;}
     table.coverage tr:nth-child(13) td:nth-child(1) {background-color: #6BA81E;}
     table.coverage tr:nth-child(13) td:nth-child(2) {background-color: #6BA81E;}
+    table.coverage tr:nth-child(13) td:nth-child(3) {background-color: #6BA81E;}
     table.coverage tr:nth-child(13) td:nth-child(4) {background-color: #6BA81E;}
     table.coverage tr:nth-child(13) td:nth-child(5) {background-color: #6BA81E;}
     table.coverage tr:nth-child(13) td:nth-child(7) {background-color: #6BA81E;}
@@ -131,7 +132,7 @@ Here is a list of most used scalar and aggregate Presto functions with functions
     :func:`length`         :func:`date`       :func:`date_trunc`         :func:`date_parse`       st_y             :func:`approx_percentile`
     :func:`from_unixtime`  :func:`is_nan`     :func:`date_diff`          bing_tile_at             st_x             :func:`avg`
     :func:`transform`      :func:`rand`       :func:`array_max`          :func:`array_union`      now              :func:`map_agg`
-    :func:`to_unixtime`    :func:`filter`     from_iso8601_date          :func:`reverse`          :func:`truncate` :func:`min_by`
+    :func:`to_unixtime`    :func:`filter`     :func:`from_iso8601_date`  :func:`reverse`          :func:`truncate` :func:`min_by`
     :func:`regexp_like`    :func:`sqrt`       :func:`json_extract`       :func:`array_intersect`  :func:`stddev`   :func:`array_join`
     :func:`least`          :func:`mod`        :func:`repeat`             :func:`set_agg`          :func:`replace`  :func:`json_parse`
     :func:`array_distinct` st_geometryfromtext :func:`histogram`
diff --git a/velox/docs/functions/presto/regexp.rst b/velox/docs/functions/presto/regexp.rst
index c032413fb3e5c..d7d41eb8361d3 100644
--- a/velox/docs/functions/presto/regexp.rst
+++ b/velox/docs/functions/presto/regexp.rst
@@ -7,21 +7,27 @@ supports only a subset of PCRE syntax and in particular does not support
 backtracking and associated features (e.g. back references).
 See https://github.com/google/re2/wiki/Syntax for more information.
 
+Compiling regular expressions is CPU intensive. Hence, each function is
+limited to 20 different expressions per instance and thread of execution.
+
 .. function:: like(string, pattern) -> boolean
               like(string, pattern, escape) -> boolean
 
     Evaluates if the ``string`` matches the ``pattern``. Patterns can contain
     regular characters as well as wildcards. Wildcard characters can be escaped
-    using the single character specified for the ``escape`` parameter.
-    Matching is case sensitive.
+    using the single character specified for the ``escape`` parameter. Only ASCII
+    characters are supported for the ``escape`` parameter. Matching is case sensitive.
 
     Note: The wildcard '%' represents 0, 1 or multiple characters and the
    wildcard '_' represents exactly one character.
 
-    Note: Each function instance allow for a maximum of 20 regular expressions to
-    be compiled throughout the lifetime of the query. Not all Patterns requires
-    compilation of regular expressions; for example a pattern 'aa' does not.
-    Only those that require the compilation of regular expressions are counted.
+    Note: Each function instance allows for a maximum of 20 regular expressions to
+    be compiled per thread of execution. Not all patterns require
+    compilation of regular expressions.
+    Patterns such as 'hello', 'hello%', '_hello__%', '%hello', '%__hello_',
+    and '%hello%', which contain only regular characters and '_' or '%'
+    wildcards, are evaluated without using regular expressions. Only those
+    patterns that require the compilation of regular expressions are counted
+    towards the limit.
 
     SELECT like('abc', '%b%'); -- true
     SELECT like('a_c', '%#_%', '#'); -- true
@@ -34,14 +40,22 @@ See https://github.com/google/re2/wiki/Syntax for more information.
 
     SELECT regexp_extract('1a 2b 14m', '\d+'); -- 1
 
 .. function:: regexp_extract(string, pattern, group) -> varchar
-   :noindex:
+    :noindex:
 
     Finds the first occurrence of the regular expression ``pattern`` in
     ``string`` and returns the capturing group number ``group``::
 
        SELECT regexp_extract('1a 2b 14m', '(\d+)([a-z]+)', 2); -- 'a'
 
-.. function:: regexp_extract_all(string, pattern, group) -> array(varchar)
+.. function:: regexp_extract_all(string, pattern) -> array(varchar)
+
+    Returns the substring(s) matched by the regular expression ``pattern``
+    in ``string``::
+
+        SELECT regexp_extract_all('1a 2b 14m', '\d+'); -- [1, 2, 14]
+
+.. function:: regexp_extract_all(string, pattern, group) -> array(varchar)
+    :noindex:
 
     Finds all occurrences of the regular expression ``pattern`` in ``string``
     and returns the capturing group number ``group``::
@@ -69,7 +83,7 @@ See https://github.com/google/re2/wiki/Syntax for more information.
 
     SELECT regexp_replace('1a 2b 14m', '\d+[ab] '); -- '14m'
 
 .. function:: regexp_replace(string, pattern, replacement) -> varchar
-   :noindex:
+    :noindex:
 
     Replaces every instance of the substring matched by the regular expression
     ``pattern`` in ``string`` with ``replacement``. Capturing groups can be referenced in
@@ -78,3 +92,20 @@ See https://github.com/google/re2/wiki/Syntax for more information.
     backslash (``\$``)::
 
         SELECT regexp_replace('1a 2b 14m', '(\d+)([ab]) ', '3c$2 '); -- '3ca 3cb 14m'
+
+.. function:: regexp_replace(string, pattern, function) -> varchar
+
+    Replaces every instance of the substring matched by the regular expression
+    ``pattern`` in ``string`` using ``function``. The lambda expression
+    ``function`` is invoked for each match with the capturing groups passed as an
+    array. Capturing group numbers start at 1; there is no group for the entire match
+    (if you need this, surround the entire expression with parentheses). ::
+
+        SELECT regexp_replace('new york', '(\w)(\w*)', x -> upper(x[1]) || lower(x[2])); -- 'New York'
+
+.. function:: regexp_split(string, pattern) -> array(varchar)
+
+    Splits ``string`` using the regular expression ``pattern`` and returns an
+    array. Trailing empty strings are preserved::
+
+        SELECT regexp_split('1a 2b 14m', '\s*[a-z]+\s*'); -- [1, 2, 14, ]
diff --git a/velox/docs/functions/presto/string.rst b/velox/docs/functions/presto/string.rst
index cf2e19c972f71..9040b382f31e9 100644
--- a/velox/docs/functions/presto/string.rst
+++ b/velox/docs/functions/presto/string.rst
@@ -40,15 +40,25 @@ String Functions
 .. function:: from_utf8(binary) -> varchar
 
     Decodes a UTF-8 encoded string from ``binary``. Invalid UTF-8 sequences
-    are replaced with the Unicode replacement character ``U+FFFD``.
+    are replaced with the Unicode replacement character ``U+FFFD``. Each
+    invalid UTF-8 codepoint, including multi-byte ones, is replaced
+    with one replacement character.
 
 .. function:: from_utf8(binary, replace) -> varchar
    :noindex:
 
    Decodes a UTF-8 encoded string from ``binary``. Invalid UTF-8 sequences are
-    replaced with `replace`. The `replace` argument can be either Unicode code
-    point (bigint), a single character or empty string. When `replace` is an
-    empty string invalid characters are removed.
+    replaced with `replace`. Each invalid UTF-8 codepoint, including
+    multi-byte ones, is replaced with one replacement character. The `replace`
+    argument can be either a Unicode code point (bigint), a single character or
+    an empty string. When `replace` is an empty string invalid characters are
+    removed.
+
+.. function:: hamming_distance(string1, string2) -> bigint
+
+    Returns the Hamming distance of ``string1`` and ``string2``,
+    i.e. the number of positions at which the corresponding characters are different.
+    Note that the two strings must have the same length.
 
 .. function:: length(string) -> bigint
 
@@ -98,7 +108,12 @@ String Functions
 .. function:: reverse(string) -> varchar
    :noindex:
 
-    Reverses ``string``.
+    Returns the input string with characters in reverse order.
+
+.. function:: reverse(varbinary) -> varbinary
+   :noindex:
+
+    Returns the input binary with bytes in reversed order.
 
 .. function:: rpad(string, size, padstring) -> varchar
 
@@ -146,10 +161,25 @@ String Functions
     each pair into key and value. Note that ``entryDelimiter`` and
     ``keyValueDelimiter`` are interpreted literally, i.e., as full string matches.
 
-    entryDelimiter and keyValueDelimiter must not be empty and must not be the same.
+    ``entryDelimiter`` and ``keyValueDelimiter`` must not be empty and must not be the same.
+    ``entryDelimiter`` is allowed to be the trailing character.
 
     Raises an error if there are duplicate keys.
 
+.. function:: split_to_map(string, entryDelimiter, keyValueDelimiter, function(K,V1,V2,R)) -> map
+
+    Splits ``string`` by ``entryDelimiter`` and ``keyValueDelimiter`` and returns a map.
+    ``entryDelimiter`` splits ``string`` into key-value pairs. ``keyValueDelimiter`` splits
+    each pair into key and value. Note that ``entryDelimiter`` and ``keyValueDelimiter`` are
+    interpreted literally, i.e., as full string matches.
+
+    ``function(K,V1,V2,R)`` is used to decide whether to keep the first or the last value for
+    duplicate keys. (k, v1, v2) -> v1 keeps the first value. (k, v1, v2) -> v2 keeps the last
+    value. Arbitrary functions are not supported. ::
+
+        SELECT split_to_map('a:1;b:2;a:3', ';', ':', (k, v1, v2) -> v1); -- {"a": "1", "b": "2"}
+        SELECT split_to_map('a:1;b:2;a:3', ';', ':', (k, v1, v2) -> v2); -- {"a": "3", "b": "2"}
+
 .. function:: starts_with(string, substring) -> boolean
 
     Returns whether ``string`` starts with ``substring``.
@@ -181,7 +211,7 @@ String Functions
     ``instance`` must be a positive number. Positions start with ``1``. If not
     found, ``0`` is returned. It takes into account overlapping strings when
     counting occurrences. ::
-
+
         SELECT strrpos('aaa', 'aa', 2); -- 1
 
 .. function:: substr(string, start) -> varchar
@@ -239,9 +269,70 @@ String Functions
 
     Converts ``string`` to uppercase.
 
+.. function:: word_stem(word) -> varchar
+
+    Returns the stem of ``word`` in the English language. If the ``word`` is not an English word,
+    the ``word`` in lowercase is returned.
+
+.. function:: word_stem(word, lang) -> varchar
+
+    Returns the stem of ``word`` in the ``lang`` language.
This function supports the following languages:
+
+    =========== ================
+    lang        Language
+    =========== ================
+    ``ca``      ``Catalan``
+    ``da``      ``Danish``
+    ``de``      ``German``
+    ``en``      ``English``
+    ``es``      ``Spanish``
+    ``eu``      ``Basque``
+    ``fi``      ``Finnish``
+    ``fr``      ``French``
+    ``hu``      ``Hungarian``
+    ``hy``      ``Armenian``
+    ``ir``      ``Irish``
+    ``it``      ``Italian``
+    ``lt``      ``Lithuanian``
+    ``nl``      ``Dutch``
+    ``no``      ``Norwegian``
+    ``pt``      ``Portuguese``
+    ``ro``      ``Romanian``
+    ``ru``      ``Russian``
+    ``sv``      ``Swedish``
+    ``tr``      ``Turkish``
+    =========== ================
+
+    If the specified ``lang`` is not supported, this function throws a user error.
+
 Unicode Functions
 -----------------
 
+.. function:: normalize(string) -> varchar
+
+    Transforms ``string`` with the NFC normalization form.
+
+.. function:: normalize(string, form) -> varchar
+
+    Transforms ``string`` with the specified normalization form
+    (reference: https://unicode.org/reports/tr15/#Norm_Forms).
+    ``form`` must be one of the following keywords:
+
+    ======== ===========
+    Form     Description
+    ======== ===========
+    ``NFD``  Canonical Decomposition
+    ``NFC``  Canonical Decomposition, followed by Canonical Composition
+    ``NFKD`` Compatibility Decomposition
+    ``NFKC`` Compatibility Decomposition, followed by Canonical Composition
+    ======== ===========
+
+    .. note::
+
+        This SQL-standard function has special syntax and requires
+        specifying ``form`` as a keyword, not as a string.
+
 .. function:: to_utf8(string) -> varbinary
 
     Encodes ``string`` into a UTF-8 varbinary representation.
diff --git a/velox/docs/functions/presto/url.rst b/velox/docs/functions/presto/url.rst
index 183b81503d328..01b590dbe6cb1 100644
--- a/velox/docs/functions/presto/url.rst
+++ b/velox/docs/functions/presto/url.rst
@@ -2,10 +2,10 @@
 URL Functions
 =============
 
-Extraction Functions
---------------------
+Introduction
+------------
 
-The URL extraction functions extract components from HTTP URLs (or any valid URIs conforming to `RFC 2396 `_). The following syntax is supported:
+The URL extraction functions extract components from HTTP URLs (or any valid URIs conforming to `RFC 3986 `_). The following syntax is supported:
 
 .. code-block:: bash
 
@@ -14,6 +14,40 @@ The URL extraction functions extract components from HTTP URLs (or any valid URI
 The extracted components do not contain URI syntax separators such as ``:`` , ``?`` and ``#``.
 
+For example, consider the URI below:
+
+.. code-block::
+
+    http://www.ics.uci.edu/pub/ietf/uri/?k1=v1#Related
+
+    scheme    = http
+    authority = www.ics.uci.edu
+    path      = /pub/ietf/uri/
+    query     = k1=v1
+    fragment  = Related
+
+
+Invalid URIs
+------------
+
+Well-formed URIs should not contain ASCII whitespace. In `percent-encoded URIs `_, the percent
+character "%" must be followed by two hexadecimal digits. All the URL extraction functions
+return null when passed an invalid URI.
+
+.. code-block::
+
+    # Examples of URL functions with invalid URIs.
+
+    # Invalid URI due to whitespace
+    SELECT url_extract_path('foo '); -- NULL (1 row)
+    SELECT url_extract_host('http://www.foo.com '); -- NULL (1 row)
+
+    # Invalid URI due to improper escaping of '%'
+    SELECT url_extract_path('https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%'); -- NULL (1 row)
+    SELECT url_extract_host('https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%'); -- NULL (1 row)
+
+Extraction Functions
+--------------------
+
 .. function:: url_extract_fragment(url) -> varchar
 
     Returns the fragment identifier from ``url``.
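+
+    For illustration, applied to the example URI from the introduction::
+
+        SELECT url_extract_fragment('http://www.ics.uci.edu/pub/ietf/uri/?k1=v1#Related'); -- Related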
@@ -32,7 +66,7 @@ The extracted components do not contain URI syntax separators such as ``:`` , ``?`` and ``#``.
 
 .. function:: url_extract_port(url) -> bigint
 
-    Returns the port number from ``url``.
+    Returns the port number from ``url``. Returns NULL if the port is missing.
 
 .. function:: url_extract_protocol(url) -> varchar
 
diff --git a/velox/docs/functions/presto/uuid.rst b/velox/docs/functions/presto/uuid.rst
new file mode 100644
index 0000000000000..a8cbdc5be511a
--- /dev/null
+++ b/velox/docs/functions/presto/uuid.rst
@@ -0,0 +1,7 @@
+==============
+UUID Functions
+==============
+
+.. function:: uuid() -> uuid
+
+    Returns a pseudo-randomly generated UUID (type 4).
diff --git a/velox/docs/functions/presto/window.rst b/velox/docs/functions/presto/window.rst
index e0549ec942302..6d25073cb6788 100644
--- a/velox/docs/functions/presto/window.rst
+++ b/velox/docs/functions/presto/window.rst
@@ -62,6 +62,8 @@ More details:
 
     If no frame is specified, a default frame of RANGE UNBOUNDED PRECEDING is used.
 
+    More details about window frames can be found at :doc:`/develop/window`.
+
 SQL Example
 ___________
 
@@ -116,6 +118,12 @@ within the window partition.
 Value functions
 _______________
 
+Value functions provide an option to specify how null values should be treated when evaluating the
+function. Nulls can either be ignored (``IGNORE NULLS``) or respected (``RESPECT NULLS``). By default,
+null values are respected. If ``IGNORE NULLS`` is specified, all rows where the value expression is
+null are excluded from the calculation. If ``IGNORE NULLS`` is specified and the value expression is
+null for all rows, the ``default_value`` is returned, or if it is not specified, ``null`` is returned.
+
 .. function:: first_value(x) -> [same as input]
 
 Returns the first value of the window.
@@ -130,19 +138,31 @@ Returns the value at the specified offset from the beginning of the window. Offsets
 can be any scalar expression. If the offset is null or greater than the number of
 values in the window, null is returned. It is an error for the offset to be zero or negative.
 
-.. function:: lag(x[, offset [, default_value]]) -> [same as input]
+.. function:: lag(x[, offset[, default_value]]) -> [same as input]
+
+Returns the value at ``offset`` rows before the current row in the window partition.
+Offsets start at ``0``, which is the current row. The default ``offset`` is ``1``.
+The offset can be a constant value or a column reference. If the offset is ``null``, ``null`` is
+returned. If the offset refers to a row that is not within the partition, the
+``default_value`` is returned, or if ``default_value`` is not specified ``null``
+is returned.
+
+If ``IGNORE NULLS`` is specified, ``null`` values are ignored during offset counting.
+If not enough non-null values are found during offset counting, ``default_value``
+is returned.
 
-Returns the value at ``offset`` rows before the current row in the partition. If
-there is no such row, the ``default_value`` is returned, or if it is not
-specified ``null`` is returned. Offsets start at ``0``, which is the current
-row. The default ``offset`` is ``1``.
 
-.. function:: lead(x[, offset [, default_value]]) -> [same as input]
+.. function:: lead(x[, offset[, default_value]]) -> [same as input]
+
+Returns the value at ``offset`` rows after the current row in the window partition.
+Offsets start at ``0``, which is the current row. The default ``offset`` is ``1``.
+The offset can be a constant value or a column reference. If the offset is ``null``, ``null`` is
+returned.
+If the offset refers to a row that is not within the partition, the
+``default_value`` is returned, or if ``default_value`` is not specified ``null``
+is returned.
 
-Returns the value at ``offset`` rows after the current row in the partition. If
-there is no such row, the ``default_value`` is returned, or if it is not
-specified ``null`` is returned. Offsets start at ``0``, which is the current
-row. The default ``offset`` is ``1``.
+If ``IGNORE NULLS`` is specified, ``null`` values are ignored during offset counting.
+If not enough non-null values are found during offset counting, ``default_value``
+is returned.
 
 Aggregate functions
 ___________________
diff --git a/velox/docs/functions/spark/aggregate.rst b/velox/docs/functions/spark/aggregate.rst
index 6a2ed2427f706..b81c05460081c 100644
--- a/velox/docs/functions/spark/aggregate.rst
+++ b/velox/docs/functions/spark/aggregate.rst
@@ -53,6 +53,34 @@ General Aggregate Functions
 
     ``hash`` cannot be null.
 
+.. spark:function:: collect_list(x) -> array<[same as x]>
+
+    Returns an array created from the input ``x`` elements. Ignores null
+    inputs, and returns an empty array when all inputs are null.
+
+.. spark:function:: collect_set(x) -> array<[same as x]>
+
+    Returns an array consisting of all unique values from the input ``x`` elements excluding NULLs.
+    Returns an empty array if the input is empty or all NULL.
+
+    Example::
+
+        SELECT collect_set(i)
+        FROM (
+            VALUES
+                (1),
+                (null)
+        ) AS t(i);
+        -- ARRAY[1]
+
+        SELECT collect_set(elements)
+        FROM (
+            VALUES
+                ARRAY[1, 2],
+                ARRAY[1, null]
+        ) AS t(elements);
+        -- ARRAY[ARRAY[1, 2], ARRAY[1, null]]
+
 .. spark:function:: first(x) -> x
 
     Returns the first value of `x`.
@@ -61,6 +89,12 @@ General Aggregate Functions
 
     Returns the first non-null value of `x`.
 
+.. spark:function:: kurtosis(x) -> double
+
+    Returns the Pearson's kurtosis of all input values. When the input is not empty,
+    a non-null output will be generated. When the value of `m2` in the accumulator is 0, a null
+    output will be generated.
+
 .. spark:function:: last(x) -> x
 
    Returns the last value of `x`.
@@ -69,16 +103,104 @@ General Aggregate Functions
 
    Returns the last non-null value of `x`.
 
-.. spark:function:: max_by(x, y) -> x
+.. spark:function:: max(x) -> [same as x]
+
+    Returns the maximum value of ``x``.
+    ``x`` must be an orderable type.
+
+.. spark:function:: max_by(x, y) -> [same as x]
 
    Returns the value of `x` associated with the maximum value of `y`.
    Note: Spark provides a non-strict comparator, i.e. greater than or equal to.
-    Eg. SELECT max_by(x, y) FROM VALUES (('a', 10)), (('b', 50)), (('c', 50)) AS tab(x, y);
-    > c
+
+    Example::
+
+        SELECT max_by(x, y)
+        FROM (
+            VALUES
+                ('a', 10),
+                ('b', 50),
+                ('c', 50)
+        ) AS t(x, y);
+
+    Returns c
+
+.. spark:function:: min(x) -> [same as x]
+
+    Returns the minimum value of ``x``.
+    ``x`` must be an orderable type.
+
+.. spark:function:: min_by(x, y) -> [same as x]
 
    Returns the value of `x` associated with the minimum value of `y`.
    Note: Spark provides a non-strict comparator, i.e. less than or equal to.
-    Eg. SELECT min_by(x, y) FROM VALUES (('a', 10)), (('b', 10)), (('c', 50)) AS tab(x, y);
-    > b
\ No newline at end of file
+
+    Example::
+
+        SELECT min_by(x, y)
+        FROM (
+            VALUES
+                ('a', 10),
+                ('b', 10),
+                ('c', 50)
+        ) AS t(x, y);
+
+    Returns b
+
+.. spark:function:: mode(x) -> [same as x]
+
+    Returns the most frequent value for the values within ``x``.
+    NULL values are ignored.
If all the values are NULL, or
+    there are 0 rows, returns NULL.
+    If multiple values have the same greatest frequency, the
+    return value could be any one of them.
+
+    Example::
+
+        SELECT mode(x)
+        FROM (
+            VALUES
+                (0), (10), (10), (null), (null), (null)
+        ) AS t(x);
+
+    Returns 10
+
+.. spark:function:: regr_replacement(x) -> double
+
+    Returns the `m2` (the sum of the second central moment) of the input values.
+
+.. spark:function:: skewness(x) -> double
+
+    Returns the skewness of all input values. When the count of `x` is greater than or equal to 1,
+    a non-null output will be generated. When the value of `m2` in the accumulator is 0, a null
+    output will be generated.
+
+.. spark:function:: sum(x) -> bigint|double|real
+
+    Returns the sum of `x`.
+
+    Supported types are TINYINT, SMALLINT, INTEGER, BIGINT, REAL, DOUBLE and DECIMAL.
+
+    When x is of type DOUBLE, the result type is DOUBLE.
+    When x is of type REAL, the result type is REAL.
+    When x is of type DECIMAL(p, s), the result type is DECIMAL(p + 10, s), where (p + 10) is capped at 38.
+
+    For all other input types, the result type is BIGINT.
+
+    Note:
+    When all input values are NULL, for all input types, the result is NULL.
+
+    For the DECIMAL type, when an overflow occurs in the accumulation, it returns NULL. For the REAL and
+    DOUBLE types, it returns Infinity. For all other input types, when the sum of the input values
+    exceeds its limit, it wraps around to the overflowed value rather than raising an error.
+
+    Example::
+
+        SELECT SUM(x)
+        FROM (
+            VALUES
+                (9223372036854775807L),
+                (1L)
+        ) AS t(x);
+
+    Returns -9223372036854775808
diff --git a/velox/docs/functions/spark/array.rst b/velox/docs/functions/spark/array.rst
index 2183f4f301c24..356232e96639d 100644
--- a/velox/docs/functions/spark/array.rst
+++ b/velox/docs/functions/spark/array.rst
@@ -21,46 +21,125 @@ Array Functions
 
     SELECT array_contains(array(1, 2, 3), 2); -- true
 
+.. spark:function:: array_distinct(array(E)) -> array(E)
+
+    Removes duplicate values from the input array. ::
+
+        SELECT array_distinct(ARRAY [1, 2, 3]); -- [1, 2, 3]
+        SELECT array_distinct(ARRAY [1, 2, 1]); -- [1, 2]
+        SELECT array_distinct(ARRAY [1, NULL, NULL]); -- [1, NULL]
+
+.. spark:function:: array_except(array(E) x, array(E) y) -> array(E)
+
+    Returns an array of the elements in array ``x`` but not in array ``y``, without duplicates. ::
+
+        SELECT array_except(ARRAY [1, 2, 3], ARRAY [4, 5, 6]); -- [1, 2, 3]
+        SELECT array_except(ARRAY [1, 2, 3], ARRAY [1, 2]); -- [3]
+        SELECT array_except(ARRAY [1, 2, 2], ARRAY [1, 1, 2]); -- []
+        SELECT array_except(ARRAY [1, 2, 2], ARRAY [1, 3, 4]); -- [2]
+        SELECT array_except(ARRAY [1, NULL, NULL], ARRAY [1, 1, NULL]); -- []
+
+.. spark:function:: array_insert(array(E), pos, E, legacyNegativeIndex) -> array(E)
+
+    Places a new element at index ``pos`` of the input ``array``. Returns NULL if the input ``array`` or
+    ``pos`` is NULL. Array indices are 1-based and an exception is thrown when ``pos`` is 0. The maximum
+    negative index is -1. When ``legacyNegativeIndex`` is true, -1 points to the last but one position.
+    Otherwise, -1 points to the last position. An index beyond the array size appends to the array, or
+    prepends to the array if the index is negative, padding with NULL elements. ::
+
+        SELECT array_insert(NULL, 1, 0, false); -- NULL
+        SELECT array_insert(NULL, 1, 0, true); -- NULL
+        SELECT array_insert(array(1, 2), NULL, 0, false); -- NULL
+        SELECT array_insert(array(1, 2), NULL, 0, true); -- NULL
+        SELECT array_insert(array(1, 2), 1, 0, false); -- [0, 1, 2]
+        SELECT array_insert(array(1, 2), 1, 0, true); -- [0, 1, 2]
+        SELECT array_insert(array(1, 2), 4, 0, false); -- [1, 2, NULL, 0]
+        SELECT array_insert(array(1, 2), 4, 0, true); -- [1, 2, NULL, 0]
+        SELECT array_insert(array(1, 2), -1, 0, false); -- [1, 2, 0]
+        SELECT array_insert(array(1, 2), -1, 0, true); -- [1, 0, 2]
+        SELECT array_insert(array(1, 2), -4, 0, false); -- [0, NULL, 1, 2]
+        SELECT array_insert(array(1, 2), -4, 0, true); -- [0, NULL, NULL, 1, 2]
+
 .. spark:function:: array_intersect(array(E), array(E1)) -> array(E2)
 
     Returns an array of the elements in the intersection of array1 and array2, without duplicates. ::
 
        SELECT array_intersect(array(1, 2, 3), array(1, 3, 5)); -- [1,3]
 
-.. function:: array_max(array(E)) -> E
+.. spark:function:: array_max(array(E)) -> E
 
     Returns maximum non-NULL element of the array. Returns NULL if array is empty or all elements are NULL.
     When E is DOUBLE or REAL, returns NaN if any element is NaN. ::
 
-        SELECT array_max(ARRAY [1, 2, 3]); -- 3
-        SELECT array_max(ARRAY [-1, -2, -2]); -- -1
-        SELECT array_max(ARRAY [-1, -2, NULL]); -- -1
-        SELECT array_max(ARRAY []); -- NULL
-        SELECT array_max(ARRAY [-0.0001, -0.0002, -0.0003, float('nan')]); -- NaN
+        SELECT array_max(array(1, 2, 3)); -- 3
+        SELECT array_max(array(-1, -2, -2)); -- -1
+        SELECT array_max(array(-1, -2, NULL)); -- -1
+        SELECT array_max(array()); -- NULL
+        SELECT array_max(array(-0.0001, -0.0002, -0.0003, float('nan'))); -- NaN
 
-.. function:: array_min(array(E)) -> E
+.. spark:function:: array_min(array(E)) -> E
 
     Returns minimum non-NULL element of the array. Returns NULL if array is empty or all elements are NULL.
     When E is DOUBLE or REAL, NaN value is considered greater than any non-NaN value. ::
 
-        SELECT array_min(ARRAY [1, 2, 3]); -- 1
-        SELECT array_min(ARRAY [-1, -2, -2]); -- -2
-        SELECT array_min(ARRAY [-1, -2, NULL]); -- -2
-        SELECT array_min(ARRAY [NULL, NULL]); -- NULL
-        SELECT array_min(ARRAY []); -- NULL
-        SELECT array_min(ARRAY [4.0, float('nan')]); -- 4.0
-        SELECT array_min(ARRAY [NULL, float('nan')]); -- NaN
+        SELECT array_min(array(1, 2, 3)); -- 1
+        SELECT array_min(array(-1, -2, -2)); -- -2
+        SELECT array_min(array(-1, -2, NULL)); -- -2
+        SELECT array_min(array(NULL, NULL)); -- NULL
+        SELECT array_min(array()); -- NULL
+        SELECT array_min(array(4.0, float('nan'))); -- 4.0
+        SELECT array_min(array(NULL, float('nan'))); -- NaN
+
+.. spark:function:: array_position(x, element) -> bigint
+
+    Returns the position (1-based) of the first occurrence of the ``element`` in array ``x`` (or 0 if not found). ::
+
+        SELECT array_position(array(1, 2, 3), 2); -- 2
+        SELECT array_position(array(1, 2, 3), 4); -- 0
+        SELECT array_position(array(1, 2, 3, 2), 2); -- 2
+
+.. spark:function:: array_remove(x, element) -> array
+
+    Removes all elements that equal ``element`` from array ``x``. Returns NULL as the result if ``element`` is NULL.
+    If array ``x`` is an empty array, returns an empty array. If all elements in array ``x`` are NULL but ``element``
+    is not NULL, returns array ``x``.
:: + + SELECT array_remove(array(1, 2, 3), 3); -- [1, 2] + SELECT array_remove(array(2, 1, NULL), 1); -- [2, NULL] + SELECT array_remove(array(1, 2, NULL), NULL); -- NULL + SELECT array_remove(array(), 1); -- [] + SELECT array_remove(array(NULL, NULL), -1); -- [NULL, NULL] + +.. spark:function:: array_repeat(element, count) -> array(E) + + Returns an array containing ``element`` ``count`` times. If ``count`` is negative or zero, + returns empty array. If ``element`` is NULL, returns an array containing ``count`` NULLs. + If ``count`` is NULL, returns NULL as result. Throws an exception if ``count`` exceeds 10'000. :: + + SELECT array_repeat(100, 3); -- [100, 100, 100] + SELECT array_repeat(NULL, 3); -- [NULL, NULL, NULL] + SELECT array_repeat(100, NULL); -- NULL + SELECT array_repeat(100, 0); -- [] + SELECT array_repeat(100, -1); -- [] .. spark:function:: array_sort(array(E)) -> array(E) Returns an array which has the sorted order of the input array(E). The elements of array(E) must be orderable. Null elements will be placed at the end of the returned array. :: - SELECT array_sort(ARRAY [1, 2, 3]); -- [1, 2, 3] - SELECT array_sort(ARRAY [3, 2, 1]); -- [1, 2, 3] - SELECT array_sort(ARRAY [2, 1, NULL]; -- [1, 2, NULL] - SELECT array_sort(ARRAY [NULL, 1, NULL]); -- [1, NULL, NULL] - SELECT array_sort(ARRAY [NULL, 2, 1]); -- [1, 2, NULL] + SELECT array_sort(array(1, 2, 3)); -- [1, 2, 3] + SELECT array_sort(array(3, 2, 1)); -- [1, 2, 3] + SELECT array_sort(array(2, 1, NULL); -- [1, 2, NULL] + SELECT array_sort(array(NULL, 1, NULL)); -- [1, NULL, NULL] + SELECT array_sort(array(NULL, 2, 1)); -- [1, 2, NULL] + +.. spark::function:: arrays_zip(array(T), array(U),..) -> array(row(T,U, ...)) + + Returns the merge of the given arrays, element-wise into a single array of rows. + The M-th element of the N-th argument will be the N-th field of the M-th output element. + If the arguments have an uneven length, missing values are filled with ``NULL`` :: + + SELECT arrays_zip(ARRAY[1, 2], ARRAY['1b', null, '3b']); -- [ROW(1, '1b'), ROW(2, null), ROW(null, '3b')] .. spark:function:: concat(array(E), array(E1), ..., array(En)) -> array(E, E1, ..., En) @@ -68,6 +147,15 @@ Array Functions SELECT concat(array(1, 2, 3), array(4, 5), array(6)); -- [1, 2, 3, 4, 5, 6] +.. spark:function:: exists(array(T), function(T, boolean)) → boolean + + Returns whether at least one element of an array matches the given predicate. + + Returns true if one or more elements match the predicate; + Returns false if none of the elements matches (a special case is when the array is empty); + Returns NULL if the predicate function returns NULL for one or more elements and false for all other elements. + Throws an exception if the predicate fails for one or more elements and returns false or NULL for the rest. + .. spark:function:: filter(array(E), func) -> array(E) Filters the input array using the given predicate. :: @@ -76,24 +164,81 @@ Array Functions SELECT filter(array(0, 2, 3), (x, i) -> x > i); -- [2, 3] SELECT filter(array(0, null, 2, 3, null), x -> x IS NOT NULL); -- [0, 2, 3] +.. spark:function:: flatten(array(array(E))) -> array(E) + + Transforms an array of arrays into a single array. + Returns NULL if the input is NULL or any of the nested arrays is NULL. :: + + SELECT flatten(array(array(1, 2), array(3, 4))); -- [1, 2, 3, 4] + SELECT flatten(array(array(1, 2), array(3, NULL))); -- [1, 2, 3, NULL] + SELECT flatten(array(array(1, 2), NULL, array(3, 4))); -- NULL + +.. 
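+
+For reference, the examples below illustrate :spark:func:`exists` described
+above. They are added here for clarity; the results follow the rules listed
+in that description. ::
+
+    SELECT exists(array(1, 2, 3), x -> x % 2 = 0); -- true
+    SELECT exists(array(1, 3, 5), x -> x % 2 = 0); -- false
+    SELECT exists(array(), x -> x % 2 = 0); -- false
+    SELECT exists(array(1, null, 3), x -> x % 2 = 0); -- NULL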
+
+.. spark:function:: forall(array(T), function(T, boolean)) → boolean
+
+    Returns whether all elements of an array match the given predicate.
+
+    Returns true if all the elements match the predicate (a special case is when the array is empty);
+    Returns false if one or more elements don't match;
+    Returns NULL if the predicate function returns NULL for one or more elements and true for all other elements.
+    Throws an exception if the predicate fails for one or more elements and returns true or NULL for the rest.
+
+.. spark:function:: get(array(E), index) -> E
+
+    Returns an element of the array at the specified 0-based index.
+    Returns NULL if index points outside of the array boundaries. ::
+
+        SELECT get(array(1, 2, 3), 0); -- 1
+        SELECT get(array(1, 2, 3), 3); -- NULL
+        SELECT get(array(1, 2, 3), -1); -- NULL
+        SELECT get(array(1, 2, 3), NULL); -- NULL
+        SELECT get(array(1, 2, NULL), 2); -- NULL
+
 .. spark:function:: in(value, array(E)) -> boolean
 
     Returns true if value matches at least one of the elements of the array.
     Supports BOOLEAN, REAL, DOUBLE, BIGINT, VARCHAR, TIMESTAMP, DATE input types.
 
-.. spark:function:: size(array(E)) -> bigint
+.. spark:function:: shuffle(array(E), seed) -> array(E)
 
-    Returns the size of the array. Returns null for null input
-    if :doc:`spark.legacy_size_of_null <../../configs>` is set to false.
-    Otherwise, returns -1 for null input.
+    Generates a random permutation of the given ``array`` using a seed derived
+    from the parameter ``seed`` and the configuration `spark.partition_id`.
+    ``seed`` must be constant. ::
+
+        SELECT shuffle(array(1, 2, 3), 0); -- [3, 1, 2]
+        SELECT shuffle(array(0, 0, 0), 0); -- [0, 0, 0]
+        SELECT shuffle(array(1, NULL, 1, NULL, 2), 0); -- [2, 1, NULL, NULL, 1]
+
+.. spark:function:: size(array(E), legacySizeOfNull) -> integer
+
+    Returns the size of the array. Returns null for null input if ``legacySizeOfNull``
+    is set to false. Otherwise, returns -1 for null input. ::
+
+        SELECT size(array(1, 2, 3), true); -- 3
+        SELECT size(NULL, true); -- -1
+        SELECT size(NULL, false); -- NULL
+
+.. spark:function:: slice(array(E), start, length) -> array(E)
+
+    Returns a subarray starting at 1-based index ``start``, or from the end if ``start`` is negative, with ``length`` elements.
+    Returns elements between ``start`` and the end of the array if ``start + length`` is outside of the array.
+    Returns empty array if ``start`` points outside of the array or ``length`` is 0.
+    Throws an exception if ``start`` is 0 or ``length`` is negative. ::
+
+        SELECT slice(array(1, 2, 3, 4), 2, 2); -- [2, 3]
+        SELECT slice(array(1, 2, 3, 4), -2, 2); -- [3, 4]
+        SELECT slice(array(1, 2, 3, 4), 5, 1); -- []
+        SELECT slice(array(1, 2, 3, 4), 2, 5); -- [2, 3, 4]
+        SELECT slice(array(1, 2, 3, 4), 2, 0); -- []
+        SELECT slice(array(1, 2, 3, 4), 1, -1); -- error: The value of length argument of slice() function should not be negative
+        SELECT slice(array(1, 2, 3, 4), 0, 1); -- error: SQL array indices start at 1
 
 .. spark:function:: sort_array(array(E)) -> array(E)
 
     Returns an array which has the sorted order of the input array. The elements of array must
     be orderable. Null elements will be placed at the beginning of the returned array. ::
 
-        SELECT sort_array(ARRAY [1, 2, 3]); -- [1, 2, 3]
-        SELECT sort_array(ARRAY [NULL, 2, 1]); -- [NULL, 1, 2]
+        SELECT sort_array(array(1, 2, 3)); -- [1, 2, 3]
+        SELECT sort_array(array(NULL, 2, 1)); -- [NULL, 1, 2]
 
 .. spark:function:: sort_array(array(E), ascendingOrder) -> array(E)
    :noindex:
 
@@ -102,9 +247,9 @@ Array Functions
     be orderable. Null elements will be placed at the beginning of the returned array in ascending
     order or at the end of the returned array in descending order. ::
 
-        SELECT sort_array(ARRAY [3, 2, 1], true); -- [1, 2, 3]
-        SELECT sort_array(ARRAY [2, 1, NULL, true]; -- [NULL, 1, 2]
-        SELECT sort_array(ARRAY [NULL, 1, NULL], false); -- [1, NULL, NULL]
+        SELECT sort_array(array(3, 2, 1), true); -- [1, 2, 3]
+        SELECT sort_array(array(2, 1, NULL), true); -- [NULL, 1, 2]
+        SELECT sort_array(array(NULL, 1, NULL), false); -- [1, NULL, NULL]
 
 .. spark:function:: transform(array(E), function) -> array(E)
 
@@ -112,3 +257,14 @@ Array Functions
 
     SELECT transform(array(1, 2, 3), x -> x + 1); -- [2,3,4]
     SELECT transform(array(1, 2, 3), (x, i) -> x + i); -- [1,3,5]
+
+.. spark:function:: zip_with(array(T), array(U), function(T,U,R)) -> array(R)
+
+    Merges the two given arrays, element-wise, into a single array using ``function``.
+    If one array is shorter, nulls are appended at the end to match the length of the
+    longer array, before applying ``function``. ::
+
+        SELECT zip_with(ARRAY[1, 3, 5], ARRAY['a', 'b', 'c'], (x, y) -> (y, x)); -- [ROW('a', 1), ROW('b', 3), ROW('c', 5)]
+        SELECT zip_with(ARRAY[1, 2], ARRAY[3, 4], (x, y) -> x + y); -- [4, 6]
+        SELECT zip_with(ARRAY['a', 'b', 'c'], ARRAY['d', 'e', 'f'], (x, y) -> concat(x, y)); -- ['ad', 'be', 'cf']
+        SELECT zip_with(ARRAY['a'], ARRAY['d', null, 'f'], (x, y) -> coalesce(x, y)); -- ['a', null, 'f']
diff --git a/velox/docs/functions/spark/binary.rst b/velox/docs/functions/spark/binary.rst
index 7d6b30c2dad89..48ce669467f5d 100644
--- a/velox/docs/functions/spark/binary.rst
+++ b/velox/docs/functions/spark/binary.rst
@@ -2,21 +2,29 @@
 Binary Functions
 ================
 
+.. spark:function:: crc32(binary) -> bigint
+
+    Computes the crc32 checksum of ``binary``.
+
 .. spark:function:: hash(x, ...) -> integer
 
-    Computes the hash of one or more input values using seed value of 42.
+    Computes the hash of one or more input values using seed value of 42. For
+    multiple arguments, their types can be different.
 
 .. spark:function:: hash_with_seed(seed, x, ...) -> integer
 
-    Computes the hash of one or more input values using specified seed.
+    Computes the hash of one or more input values using specified seed. For
+    multiple arguments, their types can be different.
 
 .. spark:function:: xxhash64(x, ...) -> bigint
 
     Computes the xxhash64 of one or more input values using seed value of 42.
+    For multiple arguments, their types can be different.
 
 .. spark:function:: xxhash64_with_seed(seed, x, ...) -> bigint
 
-    Computes the xxhash64 of one or more input values using specified seed.
+    Computes the xxhash64 of one or more input values using specified seed. For
+    multiple arguments, their types can be different.
 
 .. spark:function:: md5(x) -> varbinary
 
@@ -26,7 +34,7 @@ Binary Functions
 
     Returns TRUE if ``bloomFilter`` might contain ``value``.
 
-    ``bloomFilter`` is a VARBINARY computed using ::spark::function::`bloom_filter_agg` aggregate function.
+    ``bloomFilter`` is a VARBINARY computed using the :spark:func:`bloom_filter_agg` aggregate function.
     ``value`` is a BIGINT.
 
 .. spark:function:: sha1(x) -> varchar
diff --git a/velox/docs/functions/spark/bitwise.rst b/velox/docs/functions/spark/bitwise.rst
index 2dac8234fe9bb..5924b36aa428c 100644
--- a/velox/docs/functions/spark/bitwise.rst
+++ b/velox/docs/functions/spark/bitwise.rst
@@ -6,11 +6,25 @@ Bitwise Functions
 
     Returns the bitwise AND of ``x`` and ``y`` in 2's complement representation.
     Corresponds to Spark's operator ``&``.
+    Supported types are: TINYINT, SMALLINT, INTEGER and BIGINT.
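+
+    For illustration, a hedged example using Spark's ``&`` operator; 3 & 5
+    (0b011 & 0b101) keeps only the shared bit::
+
+        SELECT 3 & 5; -- 1
+        SELECT 11 & 6; -- 2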
+
+.. spark:function:: bitwise_not(x) -> [same as input]
+
+    Returns the bitwise NOT of ``x`` in 2's complement representation.
+    Corresponds to Spark's operator ``~``.
+    Supported types are: TINYINT, SMALLINT, INTEGER and BIGINT.
 
 .. spark:function:: bitwise_or(x, y) -> [same as input]
 
     Returns the bitwise OR of ``x`` and ``y`` in 2's complement representation.
+    Corresponds to Spark's operator ``|``.
+    Supported types are: TINYINT, SMALLINT, INTEGER and BIGINT.
+
+.. spark:function:: bitwise_xor(x, y) -> [same as input]
+
+    Returns the bitwise exclusive OR of ``x`` and ``y`` in 2's complement representation.
     Corresponds to Spark's operator ``^``.
+    Supported types are: TINYINT, SMALLINT, INTEGER and BIGINT.
 
 .. spark:function:: bit_count(x) -> integer
diff --git a/velox/docs/functions/spark/comparison.rst b/velox/docs/functions/spark/comparison.rst
index 0985e4dab7db2..9b62e68c918b0 100644
--- a/velox/docs/functions/spark/comparison.rst
+++ b/velox/docs/functions/spark/comparison.rst
@@ -10,18 +10,27 @@ Comparison Functions
 
 .. spark:function:: equalnullsafe(x, y) -> boolean
 
-    Returns true if x is equal to y. Supports all scalar types. The
-    types of x and y must be the same. Unlike :spark:func:`equalto` returns true if both inputs
-    are NULL and false if one of the inputs is NULL.
+    Returns true if ``x`` is equal to ``y``. Supports all scalar and complex types. The
+    types of ``x`` and ``y`` must be the same. Unlike :spark:func:`equalto`, returns true if both inputs
+    are NULL and false if one of the inputs is NULL. Nested nulls are compared as values.
     Corresponds to Spark's operator ``<=>``.
 
     Note that NaN in Spark is handled differently from standard floating point semantics.
     It is considered larger than any other numeric values. This rule is applied for functions
-    "equalnullsafe", "equalto", "greaterthan", "greaterthanorequal", "lessthan", "lessthanorequal".
+    "equalnullsafe", "equalto", "greaterthan", "greaterthanorequal", "lessthan", "lessthanorequal". ::
+
+        SELECT equalnullsafe(null, null); -- true
+        SELECT equalnullsafe(null, ARRAY[1]); -- false
+        SELECT equalnullsafe(ARRAY[1, null], ARRAY[1, null]); -- true
 
 .. spark:function:: equalto(x, y) -> boolean
 
-    Returns true if x is equal to y. Supports all scalar types. The
+    Returns true if x is equal to y. Supports all scalar and complex types. The
     types of x and y must be the same. Corresponds to Spark's operators ``=`` and ``==``.
+    Returns NULL for any NULL input, but nested nulls are compared as values. ::
+
+        SELECT equalto(null, null); -- null
+        SELECT equalto(null, ARRAY[1]); -- null
+        SELECT equalto(ARRAY[1, null], ARRAY[1, null]); -- true
 
 .. spark:function:: greaterthan(x, y) -> boolean
 
@@ -78,5 +87,32 @@ Comparison Functions
     Returns true if x is not equal to y. Supports all scalar types. The
     types of x and y must be the same. Corresponds to Spark's operator ``!=``.
 
+.. spark:function:: decimal_lessthan(x, y) -> boolean
+
+    Returns true if x is less than y. Supports decimal types with different precisions and scales.
+    Corresponds to Spark's operator ``<``.
+
+.. spark:function:: decimal_lessthanorequal(x, y) -> boolean
+
+    Returns true if x is less than y or x is equal to y. Supports decimal types with different precisions and scales.
+    Corresponds to Spark's operator ``<=``.
+
+.. spark:function:: decimal_equalto(x, y) -> boolean
+
+    Returns true if x is equal to y. Supports decimal types with different precisions and scales.
+    Corresponds to Spark's operator ``==``.
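+
+    For illustration, hedged examples using the corresponding Spark operators;
+    values with different scales compare numerically::
+
+        SELECT CAST(1.0 as DECIMAL(10, 1)) = CAST(1.00 as DECIMAL(10, 2)); -- true
+        SELECT CAST(1.1 as DECIMAL(10, 1)) < CAST(1.20 as DECIMAL(10, 2)); -- true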
+
+.. spark:function:: decimal_notequalto(x, y) -> boolean
+
+    Returns true if x is not equal to y. Supports decimal types with different precisions and scales.
+    Corresponds to Spark's operator ``!=``.
+
+.. spark:function:: decimal_greaterthan(x, y) -> boolean
+
+    Returns true if x is greater than y. Supports decimal types with different precisions and scales.
+    Corresponds to Spark's operator ``>``.
+
+.. spark:function:: decimal_greaterthanorequal(x, y) -> boolean
+
+    Returns true if x is greater than y or x is equal to y. Supports decimal types with different precisions and scales.
+    Corresponds to Spark's operator ``>=``.
diff --git a/velox/docs/functions/spark/conversion.rst b/velox/docs/functions/spark/conversion.rst
new file mode 100644
index 0000000000000..037e3baebf5a8
--- /dev/null
+++ b/velox/docs/functions/spark/conversion.rst
@@ -0,0 +1,241 @@
+====================
+Conversion Functions
+====================
+
+Casting from UNKNOWN type to all other scalar types is supported, e.g., cast(NULL as int).
+
+Cast to Integral Types
+----------------------
+
+Integral types include bigint, integer, smallint, and tinyint.
+
+From integral types
+^^^^^^^^^^^^^^^^^^^
+
+Casting one integral type to another is allowed. When the input value exceeds the range of
+the result type, the result wraps around: the low-order bits of the input value are
+reinterpreted as the result type.
+
+Valid examples:
+
+::
+
+    SELECT cast(1234567 as bigint); -- 1234567
+    SELECT cast(12 as tinyint); -- 12
+    SELECT cast(1234 as tinyint); -- -46
+    SELECT cast(1234567 as smallint); -- -10617
+
+From floating-point types
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Casting from floating-point input to an integral type truncates the input value.
+The cast is allowed even when the truncated result exceeds the range of the result type;
+as the examples below show, out-of-range values wrap around, Infinity is cast to the
+maximum value of the result type, and NaN is cast to 0.
+
+Valid examples
+
+::
+
+    SELECT cast(12345.12 as bigint); -- 12345
+    SELECT cast(12345.67 as bigint); -- 12345
+    SELECT cast(127.1 as tinyint); -- 127
+    SELECT cast(127.8 as tinyint); -- 127
+    SELECT cast(1234567.89 as smallint); -- -10617
+    SELECT cast(cast('inf' as double) as bigint); -- 9223372036854775807
+    SELECT cast(cast('nan' as double) as integer); -- 0
+    SELECT cast(cast('nan' as double) as smallint); -- 0
+    SELECT cast(cast('nan' as double) as tinyint); -- 0
+    SELECT cast(cast('nan' as double) as bigint); -- 0
+
+From strings
+^^^^^^^^^^^^
+
+Casting a string to an integral type is allowed if the string represents a number within the range of result type.
+Casting from strings that represent floating-point numbers truncates the decimal part of the input value.
+Casting from invalid input values throws.
+
+Valid examples
+
+::
+
+    SELECT cast('12345' as bigint); -- 12345
+    SELECT cast('+1' as tinyint); -- 1
+    SELECT cast('-1' as tinyint); -- -1
+    SELECT cast('12345.67' as bigint); -- 12345
+    SELECT cast('1.2' as tinyint); -- 1
+    SELECT cast('-1.8' as tinyint); -- -1
+    SELECT cast('+1' as tinyint); -- 1
+    SELECT cast('1.' as tinyint); -- 1
+    SELECT cast('-1' as tinyint); -- -1
+    SELECT cast('-1.' as tinyint); -- -1
+    SELECT cast('0.' as tinyint); -- 0
+    SELECT cast('.' as tinyint); -- 0
+    SELECT cast('-.' as tinyint); -- 0
+
+Invalid examples
+
+::
+
+    SELECT cast('1234567' as tinyint); -- Out of range
+    SELECT cast('1a' as tinyint); -- Invalid argument
+    SELECT cast('' as tinyint); -- Invalid argument
+    SELECT cast('1,234,567' as bigint); -- Invalid argument
+    SELECT cast('1'234'567' as bigint); -- Invalid argument
+    SELECT cast('nan' as bigint); -- Invalid argument
+    SELECT cast('infinity' as bigint); -- Invalid argument
+
+From decimal
+^^^^^^^^^^^^
+
+The decimal part is truncated when casting to an integer.
+The cast is allowed even when the truncated result exceeds the range of the result type;
+out-of-range values wrap around.
+
+Valid examples
+
+::
+
+    SELECT cast(cast(2.56 as DECIMAL(6, 2)) as bigint); -- 2
+    SELECT cast(cast(3.46 as DECIMAL(6, 2)) as bigint); -- 3
+    SELECT cast(cast(5500.0 as DECIMAL(5, 1)) as tinyint); -- 124
+    SELECT cast(cast(2147483648.90 as DECIMAL(12, 2)) as tinyint); -- 0
+    SELECT cast(cast(2147483648.90 as DECIMAL(12, 2)) as integer); -- -2147483648
+    SELECT cast(cast(2147483648.90 as DECIMAL(12, 2)) as bigint); -- 2147483648
+
+Cast to Boolean
+---------------
+
+From VARCHAR
+^^^^^^^^^^^^
+
+The strings `t, f, y, n, 1, 0, yes, no, true, false` and their upper case equivalents can be
+cast to boolean. Casting from other strings to boolean throws.
+
+Valid examples
+
+::
+
+    SELECT cast('1' as boolean); -- true
+    SELECT cast('0' as boolean); -- false
+    SELECT cast('t' as boolean); -- true (case insensitive)
+    SELECT cast('true' as boolean); -- true (case insensitive)
+    SELECT cast('f' as boolean); -- false (case insensitive)
+    SELECT cast('false' as boolean); -- false (case insensitive)
+    SELECT cast('y' as boolean); -- true (case insensitive)
+    SELECT cast('yes' as boolean); -- true (case insensitive)
+    SELECT cast('n' as boolean); -- false (case insensitive)
+    SELECT cast('no' as boolean); -- false (case insensitive)
+
+Invalid examples
+
+::
+
+    SELECT cast('1.7E308' as boolean); -- Invalid argument
+    SELECT cast('nan' as boolean); -- Invalid argument
+    SELECT cast('infinity' as boolean); -- Invalid argument
+    SELECT cast('12' as boolean); -- Invalid argument
+    SELECT cast('-1' as boolean); -- Invalid argument
+    SELECT cast('tr' as boolean); -- Invalid argument
+    SELECT cast('tru' as boolean); -- Invalid argument
+
+Cast to String
+--------------
+
+From TIMESTAMP
+^^^^^^^^^^^^^^
+
+Casting a timestamp to a string returns ISO 8601 format with space as separator between date and time,
+and the year part is padded with zeros to 4 characters.
+The conversion precision is microsecond, and trailing zeros are not appended.
+When the year exceeds 9999, a positive sign is added.
+
+Valid examples
+
+::
+
+    SELECT cast(cast('1970-01-01 00:00:00' as timestamp) as string); -- '1970-01-01 00:00:00'
+    SELECT cast(cast('2000-01-01 12:21:56.129' as timestamp) as string); -- '2000-01-01 12:21:56.129'
+    SELECT cast(cast('2000-01-01 12:21:56.100000' as timestamp) as string); -- '2000-01-01 12:21:56.1'
+    SELECT cast(cast('2000-01-01 12:21:56.129900' as timestamp) as string); -- '2000-01-01 12:21:56.1299'
+    SELECT cast(cast('10000-02-01 16:00:00.000' as timestamp) as string); -- '+10000-02-01 16:00:00'
+    SELECT cast(cast('0384-01-01 08:00:00.000' as timestamp) as string); -- '0384-01-01 08:00:00'
+    SELECT cast(cast('-0010-02-01 10:00:00.000' as timestamp) as string); -- '-0010-02-01 10:00:00'
+
+Cast to Date
+------------
+
+From strings
+^^^^^^^^^^^^
+
+All Spark supported patterns are allowed:
+
+ * ``[+-](YYYY-MM-DD)``
+ * ``[+-]yyyy*``
+ * ``[+-]yyyy*-[m]m``
+ * ``[+-]yyyy*-[m]m-[d]d``
+ * ``[+-]yyyy*-[m]m-[d]d *``
+ * ``[+-]yyyy*-[m]m-[d]dT*``
+
+The asterisk ``*`` in ``yyyy*`` stands for any number of additional digits.
+For the last two patterns, the trailing ``*`` can represent any sequence of characters, including none, e.g:
+
+ * "1970-01-01 123"
+ * "1970-01-01 (BC)"
+
+All leading and trailing UTF8 white-spaces will be trimmed before cast.
+Casting from invalid input values throws.
+
+Valid examples
+
+::
+
+    SELECT cast('1970' as date); -- 1970-01-01
+    SELECT cast('1970-01' as date); -- 1970-01-01
+    SELECT cast('1970-01-01' as date); -- 1970-01-01
+    SELECT cast('1970-01-01T123' as date); -- 1970-01-01
+    SELECT cast('1970-01-01 ' as date); -- 1970-01-01
+    SELECT cast('1970-01-01 (BC)' as date); -- 1970-01-01
+
+Invalid examples
+
+::
+
+    SELECT cast('2012-Oct-23' as date); -- Invalid argument
+    SELECT cast('2012/10/23' as date); -- Invalid argument
+    SELECT cast('2012.10.23' as date); -- Invalid argument
+
+Cast to Decimal
+---------------
+
+From varchar
+^^^^^^^^^^^^
+
+Casting varchar to a decimal of given precision and scale is allowed.
+The behavior is similar to Presto's, except that Spark allows leading and trailing white-spaces in input varchars.
+
+Valid example
+
+::
+
+    SELECT cast(' 1.23' as decimal(38, 0)); -- 1
+    SELECT cast('1.23 ' as decimal(38, 0)); -- 1
+    SELECT cast(' 1.23 ' as decimal(38, 0)); -- 1
+    SELECT cast(' -3E+2' as decimal(12, 2)); -- -300.00
+    SELECT cast('-3E+2 ' as decimal(12, 2)); -- -300.00
+    SELECT cast(' -3E+2 ' as decimal(12, 2)); -- -300.00
+
+Cast to Varbinary
+-----------------
+
+From integral types
+^^^^^^^^^^^^^^^^^^^
+
+Casting integral value to varbinary type is allowed.
+Bytes of the input value are converted into an array of bytes in big-endian order, as the
+examples below show.
+Supported types are tinyint, smallint, integer and bigint.
+
+Valid example
+
+::
+
+    SELECT cast(cast(18 as tinyint) as binary); -- [12]
+    SELECT cast(cast(180 as smallint) as binary); -- [00 B4]
+    SELECT cast(cast(180000 as integer) as binary); -- [00 02 BF 20]
+    SELECT cast(cast(180000 as bigint) as binary); -- [00 00 00 00 00 02 BF 20]
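+
+A hedged additional example, not from the original list: a negative value keeps
+the same big-endian, two's complement representation, so -1 as bigint yields all
+0xFF bytes.
+
+::
+
+    SELECT cast(cast(-1 as bigint) as binary); -- [FF FF FF FF FF FF FF FF]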
diff --git a/velox/docs/functions/spark/datetime.rst b/velox/docs/functions/spark/datetime.rst
index fca04f0a62720..85ca72628933b 100644
--- a/velox/docs/functions/spark/datetime.rst
+++ b/velox/docs/functions/spark/datetime.rst
@@ -7,11 +7,49 @@ Convenience Extraction Functions
 
 These functions support TIMESTAMP and DATE input types.
 
+.. spark:function:: add_months(startDate, numMonths) -> date
+
+    Returns the date that is ``numMonths`` after ``startDate``.
+    Adjusts the result to a valid date when needed, since months have different total days,
+    and especially February has 28 days in a common year but 29 days in a leap year.
+    For example, add_months('2015-01-30', 1) returns '2015-02-28', because the 28th is the last day
+    in February of 2015.
+    ``numMonths`` can be zero or negative. Throws an error when inputs lead to int overflow,
+    e.g., add_months('2023-07-10', -2147483648). ::
+
+        SELECT add_months('2015-01-01', 10); -- '2015-11-01'
+        SELECT add_months('2015-01-30', 1); -- '2015-02-28'
+        SELECT add_months('2015-01-30', 0); -- '2015-01-30'
+        SELECT add_months('2015-01-30', -2); -- '2014-11-30'
+        SELECT add_months('2015-03-31', -1); -- '2015-02-28'
+
 .. spark:function:: date_add(start_date, num_days) -> date
 
-    Returns the date that is num_days after start_date.
-    If num_days is a negative value then these amount of days will be
-    deducted from start_date.
+    Returns the date that is ``num_days`` after ``start_date``. Depending on the inputs,
+    the returned date wraps around between the minimum negative date and the
+    maximum positive date. For example, date_add('1969-12-31', 2147483647) returns 5881580-07-10,
+    and date_add('2024-01-22', 2147483647) returns -5877587-07-12.
+
+    If ``num_days`` is negative, that amount of days is subtracted from ``start_date``.
+    Supported types for ``num_days`` are: TINYINT, SMALLINT, INTEGER.
+
+.. spark:function:: date_from_unix_date(integer) -> date
+
+    Creates date from the number of days since 1970-01-01 in either direction. Returns null when input is null. ::
+
+        SELECT date_from_unix_date(1); -- '1970-01-02'
+        SELECT date_from_unix_date(-1); -- '1969-12-31'
+
+.. spark:function:: date_sub(start_date, num_days) -> date
+
+    Returns the date that is ``num_days`` before ``start_date``. Depending on the inputs,
+    the returned date wraps around between the minimum negative date and the
+    maximum positive date. For example, date_sub('1969-12-31', -2147483648) returns 5881580-07-11,
+    and date_sub('2023-07-10', -2147483648) returns -5877588-12-29.
+
+    ``num_days`` can be positive or negative.
+    Supported types for ``num_days`` are: TINYINT, SMALLINT, INTEGER.
 
 .. spark:function:: datediff(endDate, startDate) -> integer
 
@@ -21,38 +59,70 @@ These functions support TIMESTAMP and DATE input types.
     SELECT datediff('2009-07-31', '2009-07-30'); -- 1
     SELECT datediff('2009-07-30', '2009-07-31'); -- -1
 
-.. spark:function:: date_sub(start_date, num_days) -> date
+.. spark:function:: dayofmonth(date) -> integer
 
-    Returns the date that is num_days before start_date. According to the inputs,
-    the returned date will wrap around between the minimum negative date and
-    maximum positive date. date_sub('1969-12-31', -2147483648) get 5881580-07-11,
-    and date_sub('2023-07-10', -2147483648) get -5877588-12-29.
+    Returns the day of month of the date. ::
 
-    num_days can be positive or negative.
+        SELECT dayofmonth('2009-07-30'); -- 30
 
 .. spark:function:: dayofyear(date) -> integer
 
-    Returns Returns the day of year of the date/timestamp. ::
+    Returns the day of year of the date. ::
 
-       SELECT dayofyear('2016-04-09'); -- 100
+        SELECT dayofyear('2016-04-09'); -- 100
 
-.. spark:function:: dayofmonth(date) -> integer
+.. spark:function:: dayofweek(date) -> integer
 
-    Returns the day of month of the date/timestamp. ::
+    Returns the day of the week for date (1 = Sunday, 2 = Monday, ..., 7 = Saturday). ::
 
-       SELECT dayofmonth('2009-07-30'); -- 30
+        SELECT dayofweek('2009-07-30'); -- 5
+        SELECT dayofweek('2023-08-22'); -- 3
 
-.. spark:function:: dayofweek(date/timestamp) -> integer
+.. spark:function:: from_unixtime(unixTime, format) -> string
 
-    Returns the day of the week for date/timestamp (1 = Sunday, 2 = Monday, ..., 7 = Saturday).
-    We can use `dow` as alias for ::
+    Adjusts ``unixTime`` (elapsed seconds since UNIX epoch) to the configured session timezone, then
+    converts it to a formatted time string according to ``format``. Only supports BIGINT type for
+    ``unixTime``.
+    `Valid patterns for date format
+    <https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html>`_. Throws exception for
+    invalid ``format``. This function will convert input to milliseconds, and integer overflow is
+    allowed in the conversion, which aligns with Spark. See the third example below where INT64_MAX
+    is used; -1000 milliseconds are produced by INT64_MAX * 1000 due to integer overflow. ::
 
-       SELECT dayofweek('2009-07-30'); -- 5
-       SELECT dayofweek('2023-08-22 11:23:00.100'); -- 3
+        SELECT from_unixtime(100, 'yyyy-MM-dd HH:mm:ss'); -- '1970-01-01 00:01:40'
+        SELECT from_unixtime(3600, 'yyyy'); -- '1970'
+        SELECT from_unixtime(9223372036854775807, 'yyyy-MM-dd HH:mm:ss'); -- '1969-12-31 23:59:59'
 
-.. function:: dow(x) -> integer
+    If we run the following query in the `Asia/Shanghai` time zone: ::
 
-    This is an alias for :func:`day_of_week`.
+        SELECT from_unixtime(100, 'yyyy-MM-dd HH:mm:ss'); -- '1970-01-01 08:01:40'
+        SELECT from_unixtime(3600, 'yyyy'); -- '1970'
+        SELECT from_unixtime(9223372036854775807, 'yyyy-MM-dd HH:mm:ss'); -- '1970-01-01 07:59:59'
+
+.. spark:function:: from_utc_timestamp(timestamp, string) -> timestamp
+
+    Converts ``timestamp`` from the UTC time zone to the given time zone. ::
+
+        SELECT from_utc_timestamp('2015-07-24 07:00:00', 'America/Los_Angeles'); -- '2015-07-24 00:00:00'
+
+.. spark:function:: get_timestamp(string, dateFormat) -> timestamp
+
+    Returns timestamp by parsing ``string`` according to the specified ``dateFormat``.
+    The format follows Spark's
+    `Datetime patterns
+    <https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html>`_.
+    Returns NULL for parsing error or NULL input. Throws exception for invalid format. ::
+
+        SELECT get_timestamp('1970-01-01', 'yyyy-MM-dd'); -- timestamp `1970-01-01`
+        SELECT get_timestamp('1970-01-01', 'yyyy-MM'); -- NULL (parsing error)
+        SELECT get_timestamp('1970-01-01', null); -- NULL
+        SELECT get_timestamp('2020-06-10', 'A'); -- (throws exception)
+
+.. spark:function:: hour(timestamp) -> integer
+
+    Returns the hour of ``timestamp``. ::
+
+        SELECT hour('2009-07-30 12:58:59'); -- 12
 
 .. spark:function:: last_day(date) -> date
 
@@ -69,6 +139,108 @@ These functions support TIMESTAMP and DATE input types.
     ``day`` need to be from 1 to 31, and matches the number of days in each month.
     days of ``year-month-day - 1970-01-01`` need to be in the range of INTEGER type.
 
+.. spark:function:: make_ym_interval([years[, months]]) -> interval year to month
+
+    Make year-month interval from ``years`` and ``months`` fields.
+    Returns the actual year-month with month in the range of [0, 11].
+    Both ``years`` and ``months`` can be zero, positive or negative.
+    Throws an error when inputs lead to int overflow,
+    e.g., make_ym_interval(178956970, 8). ::
+
+        SELECT make_ym_interval(1, 2); -- 1-2
+        SELECT make_ym_interval(1, 0); -- 1-0
+        SELECT make_ym_interval(-1, 1); -- -0-11
+        SELECT make_ym_interval(1, 100); -- 9-4
+        SELECT make_ym_interval(1, 12); -- 2-0
+        SELECT make_ym_interval(1, -12); -- 0-0
+        SELECT make_ym_interval(2); -- 2-0
+        SELECT make_ym_interval(); -- 0-0
+
+.. spark:function:: minute(timestamp) -> integer
+
+    Returns the minutes of ``timestamp``. ::
+
+        SELECT minute('2009-07-30 12:58:59'); -- 58
+
+.. spark:function:: quarter(date) -> integer
+
+    Returns the quarter of ``date``. The value ranges from ``1`` to ``4``. ::
+
+        SELECT quarter('2009-07-30'); -- 3
+
+.. spark:function:: make_timestamp(year, month, day, hour, minute, second[, timezone]) -> timestamp
+
+    Create timestamp from ``year``, ``month``, ``day``, ``hour``, ``minute`` and ``second`` fields.
+    If the ``timezone`` parameter is provided,
+    the function interprets the input time components as being in the specified ``timezone``.
+    Otherwise the function assumes the inputs are in the session's configured time zone.
+    Requires ``session_timezone`` to be set, or an exception will be thrown.
+
+    Arguments:
+        * year - the year to represent, within the Joda datetime
+        * month - the month-of-year to represent, from 1 (January) to 12 (December)
+        * day - the day-of-month to represent, from 1 to 31
+        * hour - the hour-of-day to represent, from 0 to 23
+        * minute - the minute-of-hour to represent, from 0 to 59
+        * second - the second-of-minute and its micro-fraction to represent, from 0 to 60.
+          The value can be either an integer like 13, or a fraction like 13.123.
+          The fractional part can have up to 6 digits to represent microseconds.
+          If the sec argument equals to 60, the seconds field is set
+          to 0 and 1 minute is added to the final timestamp.
+        * timezone - the time zone identifier, e.g., CET, UTC.
+
+    Returns the timestamp adjusted to the GMT time zone.
+    Returns NULL for invalid or NULL input. ::
+
+        SELECT make_timestamp(2014, 12, 28, 6, 30, 45.887); -- 2014-12-28 06:30:45.887
+        SELECT make_timestamp(2014, 12, 28, 6, 30, 45.887, 'CET'); -- 2014-12-28 05:30:45.887
+        SELECT make_timestamp(2019, 6, 30, 23, 59, 60); -- 2019-07-01 00:00:00
+        SELECT make_timestamp(2019, 6, 30, 23, 59, 1); -- 2019-06-30 23:59:01
+        SELECT make_timestamp(null, 7, 22, 15, 30, 0); -- NULL
+        SELECT make_timestamp(2014, 12, 28, 6, 30, 60.000001); -- NULL
+        SELECT make_timestamp(2014, 13, 28, 6, 30, 45.887); -- NULL
+
+.. spark:function:: month(date) -> integer
+
+    Returns the month of ``date``. ::
+
+        SELECT month('2009-07-30'); -- 7
+
+.. spark:function:: next_day(startDate, dayOfWeek) -> date
+
+    Returns the first date which is later than ``startDate`` and named as ``dayOfWeek``.
+    Returns null if ``dayOfWeek`` is invalid.
+    ``dayOfWeek`` is case insensitive and must be one of the following:
+    ``SU``, ``SUN``, ``SUNDAY``, ``MO``, ``MON``, ``MONDAY``, ``TU``, ``TUE``, ``TUESDAY``,
+    ``WE``, ``WED``, ``WEDNESDAY``, ``TH``, ``THU``, ``THURSDAY``, ``FR``, ``FRI``, ``FRIDAY``,
+    ``SA``, ``SAT``, ``SATURDAY``. ::
+
+        SELECT next_day('2015-07-23', 'Mon'); -- '2015-07-27'
+        SELECT next_day('2015-07-23', 'mo'); -- '2015-07-27'
+        SELECT next_day('2015-07-23', 'Tue'); -- '2015-07-28'
+        SELECT next_day('2015-07-23', 'tu'); -- '2015-07-28'
+        SELECT next_day('2015-07-23', 'we'); -- '2015-07-29'
+
+.. spark:function:: second(timestamp) -> integer
+
+    Returns the seconds of ``timestamp``. ::
+
+        SELECT second('2009-07-30 12:58:59'); -- 59
+
+.. spark:function:: timestamp_micros(x) -> timestamp
+
+    Returns timestamp from the number of microseconds since UTC epoch.
+    Supported types are: TINYINT, SMALLINT, INTEGER and BIGINT. ::
+
+        SELECT timestamp_micros(1230219000123123); -- '2008-12-25 15:30:00.123123'
+
+.. spark:function:: timestamp_millis(x) -> timestamp
+
+    Returns timestamp from the number of milliseconds since UTC epoch.
+    Supported types are: TINYINT, SMALLINT, INTEGER and BIGINT. ::
+
+        SELECT timestamp_millis(1230219000123); -- '2008-12-25 15:30:00.123'
+
 .. spark:function:: to_unix_timestamp(string) -> integer
 
     Alias for ``unix_timestamp(string) -> integer``.
@@ -78,6 +250,39 @@ These functions support TIMESTAMP and DATE input types.
 
     Alias for ``unix_timestamp(string, format) -> integer``.
 
+.. spark:function:: to_utc_timestamp(timestamp, string) -> timestamp
+
+    Converts ``timestamp`` from the given time zone to UTC. ::
+
+        SELECT to_utc_timestamp('2015-07-24 00:00:00', 'America/Los_Angeles'); -- '2015-07-24 07:00:00'
+
+.. spark:function:: unix_date(date) -> integer
+
+    Returns the number of days since 1970-01-01. ::
+
+        SELECT unix_date('1970-01-01'); -- 0
+        SELECT unix_date('1970-01-02'); -- 1
+        SELECT unix_date('1969-12-31'); -- -1
+
+.. spark:function:: unix_micros(timestamp) -> bigint
+
+    Returns the number of microseconds since 1970-01-01 00:00:00 UTC. ::
+
+        SELECT unix_micros('1970-01-01 00:00:01'); -- 1000000
+
+.. spark:function:: unix_millis(timestamp) -> bigint
+
+    Returns the number of milliseconds since 1970-01-01 00:00:00 UTC. Truncates
+    higher levels of precision. ::
+
+        SELECT unix_millis('1970-01-01 00:00:01'); -- 1000
+
+.. spark:function:: unix_seconds(timestamp) -> bigint
+
+    Returns the number of seconds since 1970-01-01 00:00:00 UTC. ::
+
+        SELECT unix_seconds('1970-01-01 00:00:01'); -- 1
+
 .. spark:function:: unix_timestamp() -> integer
 
     Returns the current UNIX timestamp in seconds.
@@ -104,8 +309,22 @@ These functions support TIMESTAMP and DATE input types.
     Returns the `ISO-Week`_ of the year from x. The value ranges from ``1`` to ``53``.
     A week is considered to start on a Monday and week 1 is the first week with >3 days.
 
+.. spark:function:: weekday(date) -> integer
+
+    Returns the day of the week for date (0 = Monday, 1 = Tuesday, …, 6 = Sunday). ::
+
+        SELECT weekday('2015-04-08'); -- 2
+        SELECT weekday('2024-02-10'); -- 5
+
 .. _ISO-Week: https://en.wikipedia.org/wiki/ISO_week_date
 
 .. spark:function:: year(x) -> integer
 
     Returns the year from ``x``.
+
+.. spark:function:: year_of_week(x) -> integer
+
+    Returns the ISO week-numbering year that ``x`` falls in. For example, 2005-01-02 is
+    part of the 53rd week of year 2004, so the result is 2004. Only supports DATE type. ::
+
+        SELECT year_of_week('2005-01-02'); -- 2004
diff --git a/velox/docs/functions/spark/decimal.rst b/velox/docs/functions/spark/decimal.rst
new file mode 100644
index 0000000000000..19eee325f4b31
--- /dev/null
+++ b/velox/docs/functions/spark/decimal.rst
@@ -0,0 +1,103 @@
+=================
+Decimal Operators
+=================
+
+When calculating the result precision and scale of arithmetic operators,
+the formulas follow Hive, which is based on the SQL standard and MS SQL:
+
+https://cwiki.apache.org/confluence/download/attachments/27362075/Hive_Decimal_Precision_Scale_Support.pdf
+
+https://msdn.microsoft.com/en-us/library/ms190476.aspx
+
+Addition and Subtraction
+------------------------
+
+::
+
+    p = max(p1 - s1, p2 - s2) + max(s1, s2) + 1
+    s = max(s1, s2)
+
+Multiplication
+--------------
+
+::
+
+    p = p1 + p2 + 1
+    s = s1 + s2
+
+Division
+--------
+
+::
+
+    p = p1 - s1 + s2 + max(6, s1 + p2 + 1)
+    s = max(6, s1 + p2 + 1)
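+
+For example (an illustrative calculation added here, not from the references
+above), multiplying a DECIMAL(17, 3) by a DECIMAL(17, 3) gives:
+
+::
+
+    p = 17 + 17 + 1 = 35
+    s = 3 + 3 = 6
+
+so the result type is DECIMAL(35, 6).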
+
+For the above arithmetic operators, when the precision of the result exceeds 38,
+p is capped at 38 and the scale is reduced, in order to prevent the truncation of
+the integer part of the decimals. The formula below illustrates how the result
+precision and scale are adjusted.
+
+::
+
+    precision = 38
+    scale = max(38 - (p - s), min(s, 6))
+
+Users experience runtime errors when the actual result cannot be represented
+with the calculated decimal type.
+
+Decimal Functions
+-----------------
+
+.. spark:function:: unscaled_value(x) -> bigint
+
+    Returns the unscaled bigint value of a short decimal ``x``.
+    Supported type is: SHORT_DECIMAL.
+
+Decimal Special Forms
+---------------------
+
+.. spark:function:: make_decimal(x[, nullOnOverflow]) -> decimal
+
+    Creates ``decimal`` of requested precision and scale from an unscaled bigint value ``x``.
+    By default, the value of ``nullOnOverflow`` is true, and null will be returned when ``x`` is too large for the result precision.
+    Otherwise, an exception will be thrown when ``x`` overflows.
+
+.. spark:function:: decimal_round(decimal[, scale]) -> [decimal]
+
+    Returns ``decimal`` rounded to a new scale using HALF_UP rounding mode. In HALF_UP rounding, the digit 5 is rounded up.
+    ``scale`` is the new scale to be rounded to. It is 0 by default, and any integer in [INT_MIN, INT_MAX] is allowed as its value.
+    When the absolute value of scale exceeds the maximum precision of long decimal (38), the round logic is equivalent to the case where it is 38, as we cannot exceed the maximum precision.
+    The result precision and scale are derived from the precision and scale of the input ``decimal`` and from ``scale``.
+    After rounding we may need one more digit in the integral part. ::
+
+        SELECT (round(cast (9.9 as decimal(2, 1)), 0)); -- decimal 10
+        SELECT (round(cast (99 as decimal(2, 0)), -1)); -- decimal 100
+
+    When ``scale`` is negative, we need to adjust ``-scale`` number of digits before the decimal point,
+    which means we need at least ``-scale + 1`` digits after rounding, and the result scale is 0. ::
+
+        SELECT round(cast (0.856 as DECIMAL(3, 3)), -1); -- decimal 0
+        SELECT round(cast (85.6 as DECIMAL(3, 1)), -1); -- decimal 90
+        SELECT round(cast (85.6 as DECIMAL(3, 1)), -2); -- decimal 100
+        SELECT round(cast (85.6 as DECIMAL(3, 1)), -99); -- decimal 0
+        SELECT round(cast (12345678901234.56789 as DECIMAL(32, 5)), -9); -- decimal 12346000000000
+
+    When ``scale`` is 0, the result scale is 0. ::
+
+        SELECT round(cast (85.6 as DECIMAL(3, 1))); -- decimal 86
+        SELECT round(cast (0.856 as DECIMAL(3, 3)), 0); -- decimal 1
+
+    When ``scale`` is positive, the result scale is the smaller of the input scale and ``scale``.
+    The result precision is derived from the number of integral digits and the result scale, but cannot exceed the max precision of decimal. ::
+
+        SELECT round(cast (85.681 as DECIMAL(5, 3)), 1); -- decimal 85.7
+        SELECT round(cast (85.681 as DECIMAL(5, 3)), 999); -- decimal 85.681
+        SELECT round(cast (0.1234567890123456789 as DECIMAL(19, 19)), 14); -- decimal 0.12345678901235
diff --git a/velox/docs/functions/spark/json.rst b/velox/docs/functions/spark/json.rst
index 07f4f3a75ace2..8004873986880 100644
--- a/velox/docs/functions/spark/json.rst
+++ b/velox/docs/functions/spark/json.rst
@@ -2,26 +2,12 @@
 JSON Functions
 ==============
 
-JSON Format
------------
+.. spark:function:: json_object_keys(jsonString) -> array(string)
 
-JSON is a language-independent data format that represents data as
-human-readable text. A JSON text can represent a number, a boolean, a
-string, an array, an object, or a null, with slightly different grammar.
-For instance, a JSON text representing a string must escape all characters
-and enclose the string in double quotes, such as ``"123\n"``, whereas a JSON
-text representing a number does not need to, such as ``123``. A JSON text
-representing an array must enclose the array elements in square brackets,
-such as ``[1,2,3]``. More detailed grammar can be found in
-`this JSON introduction`_.
+    Returns all the keys of the outermost JSON object as an array if a valid JSON object is given.
+    If it is any other valid JSON string, an invalid JSON string or an empty string, the function returns null. ::
 
-.. _this JSON introduction: https://www.json.org
-
-JSON Functions
---------------
-
-.. spark:function:: get_json_object(json, path) -> varchar
-
-    Extracts a json object from path::
-
-        SELECT get_json_object('{"a":"b"}', '$.a'); -- b
\ No newline at end of file
+        SELECT json_object_keys('{}'); -- []
+        SELECT json_object_keys('{"name": "Alice", "age": 5, "id": "001"}'); -- ['name', 'age', 'id']
+        SELECT json_object_keys(''); -- NULL
+        SELECT json_object_keys(1); -- NULL
+        SELECT json_object_keys('"hello"'); -- NULL
diff --git a/velox/docs/functions/spark/map.rst b/velox/docs/functions/spark/map.rst
index 9234c377a7fdc..74735e9d5fe08 100644
--- a/velox/docs/functions/spark/map.rst
+++ b/velox/docs/functions/spark/map.rst
@@ -6,18 +6,19 @@ Map Functions
 
     Returns value for given ``key``, or ``NULL`` if the key is not contained in the map.
 
-.. spark:function:: map() -> map(unknown, unknown)
+.. spark:function:: map(K, V, K, V, ...) -> map(K,V)
 
-    Returns an empty map. ::
+    Returns a map created using the given key/value pairs. Keys are not allowed to be null. ::
 
-        SELECT map(); -- {}
+        SELECT map(1, 2, 3, 4); -- {1 -> 2, 3 -> 4}
+        SELECT map(array(1, 2), array(3, 4)); -- {[1, 2] -> [3, 4]}
 
-.. spark:function:: map(array(K), array(V)) -> map(K,V)
-   :noindex:
+.. spark:function:: map_entries(map(K,V)) -> array(row(K,V))
 
-    Returns a map created using the given key/value arrays. Duplicate map key will cause exception. ::
+    Returns an array of all entries in the given map. ::
 
-        SELECT map(ARRAY[1,3], ARRAY[2,4]); -- {1 -> 2, 3 -> 4}
+        SELECT map_entries(MAP(ARRAY[1, 2], ARRAY['x', 'y'])); -- [ROW(1, 'x'), ROW(2, 'y')]
 
 .. spark:function:: map_filter(map(K,V), func) -> map(K,V)
 
@@ -32,9 +33,35 @@ Map Functions
 
     SELECT map_from_arrays(array(1.0, 3.0), array('2', '4')); -- {1.0 -> 2, 3.0 -> 4}
 
-.. spark:function:: size(map(K,V)) -> bigint
+.. spark:function:: map_keys(x(K,V)) -> array(K)
+
+    Returns all the keys in the map ``x``.
+
+.. spark:function:: map_values(x(K,V)) -> array(V)
+
+    Returns all the values in the map ``x``.
+    Illustrative examples for ``map_keys`` and ``map_values`` are shown after
+    :spark:func:`map_zip_with` below.
+
+.. spark:function:: map_zip_with(map(K,V1), map(K,V2), function(K,V1,V2,V3)) -> map(K,V3)
+
+    Merges the two given maps into a single map by applying ``function`` to the pair of values with the same key.
+    For keys only presented in one map, NULL will be passed as the value for the missing key. ::
+
+        SELECT map_zip_with(MAP(ARRAY[1, 2, 3], ARRAY['a', 'b', 'c']), -- {1 -> ad, 2 -> be, 3 -> cf}
+                            MAP(ARRAY[1, 2, 3], ARRAY['d', 'e', 'f']),
+                            (k, v1, v2) -> concat(v1, v2));
+        SELECT map_zip_with(MAP(ARRAY['k1', 'k2'], ARRAY[1, 2]), -- {k1 -> ROW(1, null), k2 -> ROW(2, 4), k3 -> ROW(null, 9)}
+                            MAP(ARRAY['k2', 'k3'], ARRAY[4, 9]),
+                            (k, v1, v2) -> (v1, v2));
+        SELECT map_zip_with(MAP(ARRAY['a', 'b', 'c'], ARRAY[1, 8, 27]), -- {a -> a1, b -> b4, c -> c9}
+                            MAP(ARRAY['a', 'b', 'c'], ARRAY[1, 2, 3]),
+                            (k, v1, v2) -> k || CAST(v1/v2 AS VARCHAR));
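+
+The examples below illustrate :spark:func:`map_keys` and :spark:func:`map_values`
+described above (added for clarity; entries keep the order of the input map)::
+
+    SELECT map_keys(map(1, 'a', 2, 'b')); -- [1, 2]
+    SELECT map_values(map(1, 'a', 2, 'b')); -- ['a', 'b']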
+
+.. spark:function:: size(map(K,V), legacySizeOfNull) -> integer
    :noindex:
 
-    Returns the size of the input map. Returns null for null input
-    if :doc:`spark.legacy_size_of_null <../../configs>` is set to false.
-    Otherwise, returns -1 for null input.
+    Returns the size of the input map. Returns null for null input if ``legacySizeOfNull``
+    is set to false. Otherwise, returns -1 for null input. ::
+
+        SELECT size(map(array(1, 2), array(3, 4)), true); -- 2
+        SELECT size(NULL, true); -- -1
+        SELECT size(NULL, false); -- NULL
diff --git a/velox/docs/functions/spark/math.rst b/velox/docs/functions/spark/math.rst
index 06b671c27ee1c..99b23144d1d13 100644
--- a/velox/docs/functions/spark/math.rst
+++ b/velox/docs/functions/spark/math.rst
@@ -14,10 +14,26 @@ Mathematical Functions
 
     Returns inverse hyperbolic cosine of ``x``.
 
+.. spark:function:: asin(x) -> double
+
+    Returns the arc sine of ``x``.
+
 .. spark:function:: asinh(x) -> double
 
     Returns inverse hyperbolic sine of ``x``.
 
+.. spark:function:: atan(x) -> double
+
+    Returns the arc tangent of ``x``.
+
+.. spark:function:: atan2(y, x) -> double
+
+    Returns the arc tangent of ``y / x``. For compatibility with Spark, returns 0 for the following corner cases:
+
+    * atan2(0.0, 0.0)
+    * atan2(-0.0, -0.0)
+    * atan2(-0.0, 0.0)
+    * atan2(0.0, -0.0)
+
 .. spark:function:: atanh(x) -> double
 
     Returns inverse hyperbolic tangent of ``x``.
@@ -25,7 +41,21 @@ Mathematical Functions
 .. spark:function:: add(x, y) -> [same as x]
 
     Returns the result of adding x to y. The types of x and y must be the same.
-    For integral types, overflow results in an error. Corresponds to sparks's operator ``+``.
+    Corresponds to Spark's operator ``+``.
+
+.. spark:function:: add(x, y) -> decimal
+
+    Returns the result of adding ``x`` to ``y``. The argument types should be DECIMAL, and they can have different precisions and scales.
+    A fast path is implemented for cases that cannot overflow. For the others, the whole parts and fractional parts of the input decimals are added separately and finally combined.
+    The result type is calculated with the max precision of the input precisions, the max scale of the input scales, and one extra digit for a possible carry.
+    Overflow results in null output. Corresponds to Spark's operator ``+``. ::
+
+        SELECT CAST(1.1232100 as DECIMAL(38, 7)) + CAST(1 as DECIMAL(10, 0)); -- DECIMAL(38, 6) 2.123210
+        SELECT CAST(-999999999999999999999999999.999 as DECIMAL(30, 3)) + CAST(-999999999999999999999999999.999 as DECIMAL(30, 3)); -- DECIMAL(31, 3) -1999999999999999999999999999.998
+        SELECT CAST(99999999999999999999999999999999.99998 as DECIMAL(38, 6)) + CAST(-99999999999999999999999999999999.99999 as DECIMAL(38, 5)); -- DECIMAL(38, 6) -0.000010
+        SELECT CAST(-99999999999999999999999999999999990.0 as DECIMAL(38, 3)) + CAST(-0.00001 as DECIMAL(38, 7)); -- DECIMAL(38, 6) NULL
 
 .. spark:function:: bin(x) -> varchar
 
@@ -36,6 +66,30 @@ Mathematical Functions
     Returns ``x`` rounded up to the nearest integer.
     Supported types are: BIGINT and DOUBLE.
 
+.. spark:function:: checked_add(x, y) -> [same as x]
+
+    Returns the result of adding x to y. The types of x and y must be the same.
+    For integral types, overflow results in an error. Corresponds to Spark's operator ``+`` with ``failOnError`` as true.
+
+.. spark:function:: checked_divide(x, y) -> [same as x]
+
+    Returns the results of dividing x by y. The types of x and y must be the same.
+    Division by zero results in an error. Corresponds to Spark's operator ``/`` with ``failOnError`` as true.
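+
+    For illustration, hedged examples for the checked functions; the
+    non-overflowing cases behave like the unchecked operators, and the exact
+    error messages are implementation-defined::
+
+        SELECT checked_add(1, 2); -- 3
+        SELECT checked_add(9223372036854775807, 1); -- throws an overflow error
+        SELECT checked_divide(6, 3); -- 2
+        SELECT checked_divide(1, 0); -- throws a division-by-zero error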
+
+.. spark:function:: checked_multiply(x, y) -> [same as x]
+
+    Returns the result of multiplying x by y. The types of x and y must be the same.
+    For integral types, overflow results in an error. Corresponds to Spark's operator ``*`` with ``failOnError`` as true.
+
+.. spark:function:: checked_subtract(x, y) -> [same as x]
+
+    Returns the result of subtracting y from x. The types of x and y must be the same.
+    For integral types, overflow results in an error. Corresponds to Spark's operator ``-`` with ``failOnError`` as true.
+
+.. spark:function:: cos(x) -> double
+
+    Returns the cosine of ``x``.
+
 .. spark:function:: cosh(x) -> double
 
     Returns the hyperbolic cosine of ``x``.
@@ -48,33 +102,80 @@ Mathematical Functions
 
     Returns the cosecant of ``x``.
 
+.. spark:function:: degrees(x) -> double
+
+    Converts angle x in radians to degrees.
+
 .. spark:function:: divide(x, y) -> double
 
     Returns the results of dividing x by y. Performs floating point division.
+    Supported type is DOUBLE.
     Corresponds to Spark's operator ``/``. ::
 
        SELECT 3 / 2; -- 1.5
       SELECT 2L / 2L; -- 1.0
       SELECT 3 / 0; -- NULL
 
+.. spark:function:: divide(x, y) -> decimal
+
+    Returns the results of dividing x by y. Performs decimal division.
+    Supported type is DECIMAL; the inputs can have different precisions and scales.
+    The result type depends on the precision and scale of x and y.
+    Overflow results in null output. Corresponds to Spark's operator ``/``. ::
+
+        SELECT CAST(1 as DECIMAL(17, 3)) / CAST(2 as DECIMAL(17, 3)); -- decimal 0.500000000000000000000
+        SELECT CAST(1 as DECIMAL(20, 3)) / CAST(20 as DECIMAL(20, 2)); -- decimal 0.0500000000000000000
+        SELECT CAST(1 as DECIMAL(20, 3)) / CAST(0 as DECIMAL(20, 3)); -- NULL
+
 .. spark:function:: exp(x) -> double
 
     Returns Euler's number raised to the power of ``x``.
 
+.. spark:function:: expm1(x) -> double
+
+    Returns Euler's number raised to the power of ``x``, minus 1, i.e. ``exp(x) - 1`` in math.
+    expm1(x) is more accurate than the expression ``exp(x) - 1`` when ``x`` is close to zero.
+    If the argument is NaN, the result is NaN.
+    If the argument is positive infinity, then the result is positive infinity.
+    If the argument is negative infinity, then the result is -1.0.
+    If the argument is zero, then the result is a zero with the same sign as the argument.
+
 .. spark:function:: floor(x) -> [same as x]
 
     Returns ``x`` rounded down to the nearest integer.
     Supported types are: BIGINT and DOUBLE.
 
+.. spark:function:: hex(x) -> varchar
+
+    Converts ``x`` to hexadecimal.
+    Supported types are: BIGINT, VARBINARY and VARCHAR.
+    If the argument is a VARCHAR or VARBINARY, the result is a string where each input byte is represented using 2 hex characters.
+    If the argument is a positive BIGINT, the result is a hex representation of the number (up to 16 characters);
+    if the argument is a negative BIGINT, the result is a hex representation of its two's complement. ::
+
+        SELECT hex('Spark SQL'); -- 537061726B2053514C
+        SELECT hex(17); -- 11
+        SELECT hex(-1); -- FFFFFFFFFFFFFFFF
+
 .. spark:function:: hypot(a, b) -> double
 
     Returns the square root of `a` squared plus `b` squared.
 
+.. spark:function:: isnan(x) -> boolean
+
+    Returns true if x is NaN, or false otherwise. Returns false if x is NULL.
+    Supported types are: REAL, DOUBLE.
+
+.. spark:function:: log(base, expr) -> double
+
+    Returns the logarithm of ``expr`` with ``base``.
+    Returns NULL if either ``expr`` or ``base`` is less than or equal to 0.
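+
+    For illustration, hedged examples following the rules above; log(10, 100)
+    is the base-10 logarithm of 100::
+
+        SELECT log(10, 100); -- 2.0
+        SELECT log(10, -1); -- NULL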
-.. function:: log1p(x) -> double
+
+.. spark:function:: log1p(x) -> double
 
     Returns the natural logarithm of the “given value ``x`` plus one”.
-    Return NULL if x is less than or equal to -1.
+    Returns NULL if x is less than or equal to -1.
 
 .. spark:function:: log2(x) -> double
 
@@ -87,7 +188,18 @@ Mathematical Functions
 .. spark:function:: multiply(x, y) -> [same as x]
 
     Returns the result of multiplying x by y. The types of x and y must be the same.
-    For integral types, overflow results in an error. Corresponds to Spark's operator ``*``.
+    Corresponds to Spark's operator ``*``.
+
+.. spark:function:: multiply(x, y) -> [decimal]
+
+    Returns the result of multiplying x by y. The types of x and y must be decimal, and they can have different precisions and scales.
+    The result type depends on the precision and scale of x and y.
+    Overflow results in null output. Corresponds to Spark's operator ``*``. ::
+
+        SELECT CAST(1 as DECIMAL(17, 3)) * CAST(2 as DECIMAL(17, 3)); -- decimal 2.000000
+        SELECT CAST(1 as DECIMAL(20, 3)) * CAST(20 as DECIMAL(20, 2)); -- decimal 20.00000
+        SELECT CAST(1 as DECIMAL(20, 3)) * CAST(0 as DECIMAL(20, 3)); -- decimal 0.000000
+        SELECT CAST(201e-38 as DECIMAL(38, 38)) * CAST(301e-38 as DECIMAL(38, 38)); -- decimal 0.0000000000000000000000000000000000000
 
 .. spark:function:: not(x) -> boolean
 
@@ -100,7 +212,7 @@ Mathematical Functions
 .. spark:function:: pmod(n, m) -> [same as n]
 
     Returns the positive remainder of n divided by m.
-    Supported types are: TINYINT, SMALLINT, INTEGER, BIGINT, FLOAT and DOUBLE.
+    Supported types are: TINYINT, SMALLINT, INTEGER, BIGINT, REAL and DOUBLE.
 
 .. spark:function:: power(x, p) -> double
 
@@ -112,14 +224,13 @@ Mathematical Functions
 
        SELECT rand(); -- 0.9629742951434543
 
-.. spark:function:: rand(seed, partitionIndex) -> double
+.. spark:function:: rand(seed) -> double
 
     Returns a random value with uniformly distributed values in [0, 1) using a seed formed
-    by combining user-specified ``seed`` and framework provided ``partitionIndex``. The
+    by combining user-specified ``seed`` and the configuration `spark.partition_id`. The
     framework is responsible for deterministic partitioning of the data and assigning unique
-    ``partitionIndex`` to each thread (in a deterministic way).
-    ``seed`` must be constant. NULL ``seed`` is identical to zero ``seed``. ``partitionIndex``
-    cannot be NULL. ::
+    `spark.partition_id` to each thread (in a deterministic way).
+    ``seed`` must be constant. NULL ``seed`` is identical to zero ``seed``. ::
 
        SELECT rand(0); -- 0.5488135024422883
       SELECT rand(NULL); -- 0.5488135024422883
@@ -128,18 +239,28 @@ Mathematical Functions
 
     An alias for ``rand()``.
 
-.. spark:function:: random(seed, partitionIndex) -> double
+.. spark:function:: random(seed) -> double
 
-    An alias for ``rand(seed, partitionIndex)``.
+    An alias for ``rand(seed)``.
 
 .. spark:function:: remainder(n, m) -> [same as n]
 
     Returns the modulus (remainder) of ``n`` divided by ``m``. Corresponds to Spark's operator ``%``.
+    Supported types are: TINYINT, SMALLINT, INTEGER, BIGINT, REAL and DOUBLE.
+
+.. spark:function:: rint(x) -> double
+
+    Returns the double value that is closest in value to the argument and is
+    equal to a mathematical integer.
+    Returns ``x`` if ``x`` is a positive or negative infinity or a NaN. ::
+
+        SELECT rint(12.3456); -- 12.0
 
 .. spark:function:: round(x, d) -> [same as x]
 
     Returns ``x`` rounded to ``d`` decimal places using HALF_UP rounding mode.
     In HALF_UP rounding, the digit 5 is rounded up.
+    Supported types for ``x`` are integral and floating point types.
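+
+    For illustration, hedged examples; in HALF_UP rounding, a trailing 5 is
+    rounded away from zero::
+
+        SELECT round(2.5, 0); -- 3
+        SELECT round(2.45, 1); -- 2.5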
spark:function:: sec(x) -> double @@ -152,8 +273,55 @@ Mathematical Functions .. spark:function:: subtract(x, y) -> [same as x] Returns the result of subtracting y from x. The types of x and y must be the same. - For integral types, overflow results in an error. Corresponds to Spark's operator ``-``. + Corresponds to Spark's operator ``-``. + +.. spark:function:: subtract(x, y) -> decimal + + Returns the result of subtracting ``y`` from ``x``. Reuses the logic of add function for decimal type. + Corresponds to Spark's operator ``-``. + + :: + + SELECT CAST(1.1232100 as DECIMAL(38, 7)) - CAST(1 as DECIMAL(10, 0)); -- DECIMAL(38, 6) 0.123210 + SELECT CAST(-999999999999999999999999999.999 as DECIMAL(30, 3)) - CAST(-999999999999999999999999999.999 as DECIMAL(30, 3)); -- DECIMAL(31, 3) 0.000 + SELECT CAST(99999999999999999999999999999999.99998 as DECIMAL(38, 6)) - CAST(-0.00001 as DECIMAL(38, 5)); -- DECIMAL(38, 6) 99999999999999999999999999999999.999990 + SELECT CAST(-99999999999999999999999999999999990.0 as DECIMAL(38, 3)) - CAST(0.00001 as DECIMAL(38, 7)); -- DECIMAL(38, 6) NULL .. spark:function:: unaryminus(x) -> [same as x] Returns the negative of `x`. Corresponds to Spark's operator ``-``. + +.. spark:function:: unhex(x) -> varbinary + + Converts hexadecimal varchar ``x`` to varbinary. + ``x`` is considered case insensitive and expected to contain only hexadecimal characters 0-9 and A-F. + If ``x`` contains non-hexadecimal character, the function returns NULL. + When ``x`` contains an even number of characters, each pair is converted to a single byte. The number of bytes in the result is half the number of bytes in the input. + When ``x`` contains an odd number of characters, the first character is decoded into the first byte of the result and the remaining pairs of characters are decoded into subsequent bytes. This behavior matches Spark 3.3.2 and newer. :: + + SELECT unhex("23"); -- # + SELECT unhex("f"); -- \x0F + SELECT unhex("b2323"); -- \x0B## + SELECT unhex("G"); -- NULL + SELECT unhex("G23"); -- NULL + +.. spark:function:: width_bucket(x, bound1, bound2, n) -> bigint + + Returns the zero-based bucket number to which ``x`` would be assigned in an equiwidth histogram with ``n`` buckets, + in the range ``bound1`` to ``bound2``. + `bound1` can be greater than `bound2`. + If `bound1` less than `bound2`, if `x` less than `bound1` return 0, if `x` greater than or equal to `bound2` return n + 1. + If `bound1` greater than `bound2`, if `x` greater than `bound1` return 0, if `x` less than or equal to `bound2` return n + 1. + `n` must be a positive integral value. `x`, `bound1`, and `bound2` cannot be NaN. `bound1`, and `bound2` must be finite. + `bound1` cannot equal `bound2`; + Otherwise, the function will return NULL. + + :: + + SELECT width_bucket(-1.0, 0.0, 10.0, 5); -- 0 + SELECT width_bucket(0.1, 0.0, 10.0, 5); -- 1 + SELECT width_bucket(10.1, 0.0, 10.0, 5); -- 6 + SELECT width_bucket(-1.0, 10.0, 0.0, 5); -- 6 + SELECT width_bucket(0.1, 10.0, 0.0, 5); -- 5 + SELECT width_bucket(10.1, 10.0, 0.0, 5); -- 0 + SELECT width_bucket(10.1, 10.0, 10.0, 5); -- NULL diff --git a/velox/docs/functions/spark/misc.rst b/velox/docs/functions/spark/misc.rst new file mode 100644 index 0000000000000..0238d8018adc2 --- /dev/null +++ b/velox/docs/functions/spark/misc.rst @@ -0,0 +1,38 @@ +==================================== +Miscellaneous Functions +==================================== + +.. spark:function:: monotonically_increasing_id() -> bigint + + Returns monotonically increasing 64-bit integers. 
+
+.. spark:function:: raise_error(message)
+
+  Throws a user error with the specified ``message``.
+  If ``message`` is NULL, throws a user error with an empty message.
+
+.. spark:function:: spark_partition_id() -> integer
+
+  Returns the current partition ID.
+  The framework provides the partition ID through the configuration
+  'spark.partition_id'.
+  It ensures deterministic data partitioning and assigns a unique partition
+  ID to each task in a deterministic way. Consequently, this function is
+  marked as deterministic, enabling Velox to perform constant folding on it.
+
+.. spark:function:: uuid(seed) -> string
+
+  Returns a universally unique identifier (UUID) string. The value is
+  returned as a canonical UUID 36-character string. The UUID is generated
+  from pseudo-random numbers, seeded by combining the user-specified ``seed``
+  and the configuration `spark.partition_id`.
+  ``seed`` must be constant. ::
+
+    SELECT uuid(0); -- "8c7f0aac-97c4-4a2f-b716-a675d821ccc0"

diff --git a/velox/docs/functions/spark/regexp.rst b/velox/docs/functions/spark/regexp.rst
index 28b58cfa072a0..23778a5e2cd41 100644
--- a/velox/docs/functions/spark/regexp.rst
+++ b/velox/docs/functions/spark/regexp.rst
@@ -5,12 +5,45 @@ Regular Expression Functions

Regular expression functions use RE2 as the regex engine. RE2 is fast, but
supports only a subset of PCRE syntax and in particular does not support
backtracking and associated features (e.g. back references).
+Java and RE2 regex output can diverge, so users should verify that the
+patterns they use behave the same way under RE2 as they do under Java.
+For example, character class unions, intersections, and differences
+(``[a[b]]``, ``[a&&[b]]``, ``[a&&[^b]]``) are interpreted as a single
+character class that contains ``[``, ``&`` and ``^``, rather than as a
+union, intersection, or difference of the character classes.
+
 See https://github.com/google/re2/wiki/Syntax for more information.

+.. spark:function:: like(string, pattern) -> boolean
+                    like(string, pattern, escape) -> boolean
+
+  Evaluates if the ``string`` matches the ``pattern``. Patterns can contain
+  regular characters as well as wildcards. Wildcard characters can be escaped
+  using the single character specified for the ``escape`` parameter. Only ASCII
+  characters are supported for the ``escape`` parameter. Matching is case sensitive.
+
+  Note: The wildcard '%' represents 0, 1 or multiple characters and the
+  wildcard '_' represents exactly one character.
+
+  Note: Each function instance allows for a maximum of 20 regular expressions to
+  be compiled per thread of execution. Not all patterns require compilation of
+  regular expressions. Patterns such as 'hello', 'hello%', '_hello__%',
+  '%hello', '%__hello_' and '%hello%', where the fixed text contains only
+  regular characters and the wildcards are '_' or '%', are evaluated without
+  using regular expressions. Only those patterns that require the compilation
+  of regular expressions are counted towards the limit. ::
+
+    SELECT like('abc', '%b%'); -- true
+    SELECT like('a_c', '%#_%', '#'); -- true
+
.. spark:function:: regexp_extract(string, pattern) -> varchar

  Returns the first substring matched by the regular expression ``pattern``
-  in ``string``. ::
+  in ``string``.
+
+  regexp_extract does not support column references for the ``pattern`` argument.
+  Patterns must be constant values. ::

    SELECT regexp_extract('1a 2b 14m', '\d+'); -- 1

@@ -18,10 +51,28 @@ See https://github.com/google/re2/wiki/Syntax for more information.
  :noindex:

  Finds the first occurrence of the regular expression ``pattern`` in
-  ``string`` and returns the capturing group number ``group``. ::
+  ``string`` and returns the capturing group number ``group``.
+
+  regexp_extract does not support column references for the ``pattern`` argument.
+  Patterns must be constant values. ::

    SELECT regexp_extract('1a 2b 14m', '(\d+)([a-z]+)', 2); -- 'a'

+.. spark:function:: regexp_extract_all(string, pattern) -> array(varchar)
+
+  Returns the substring(s) matched by the regular expression ``pattern``
+  in ``string``. ::
+
+    SELECT regexp_extract_all('1a 2b 14m', '\d+'); -- [1, 2, 14]
+
+.. spark:function:: regexp_extract_all(string, pattern, group) -> array(varchar)
+  :noindex:
+
+  Finds all occurrences of the regular expression ``pattern`` in
+  ``string`` and returns the capturing group number ``group``. ::
+
+    SELECT regexp_extract_all('1a 2b 14m', '(\d+)([a-z]+)', 2); -- ['a', 'b', 'm']
+
.. spark:function:: rlike(string, pattern) -> boolean

  Evaluates the regular expression ``pattern`` and determines if it is
@@ -31,6 +82,52 @@ See https://github.com/google/re2/wiki/Syntax for more information.
  pattern only needs to be contained within ``string``, rather than
  needing to match all of ``string``. In other words, this performs a
  *contains* operation rather than a *match* operation. You can match
-  the entire string by anchoring the pattern using ``^`` and ``$``. ::
+  the entire string by anchoring the pattern using ``^`` and ``$``.
+
+  rlike does not support column references for the ``pattern`` argument.
+  Patterns must be constant values. ::

    SELECT rlike('1a 2b 14m', '\d+b'); -- true
+
+.. spark:function:: regexp_replace(string, pattern, overwrite) -> varchar
+
+  Replaces all substrings in ``string`` that match the regular expression ``pattern`` with the string ``overwrite``. If no match is found, the original string is returned as is.
+  There is a limit to the number of unique regexes to be compiled per function call, which is 20. If this limit is exceeded, the function throws an exception.
+
+  Parameters:
+
+  - **string**: The string to be searched.
+  - **pattern**: The regular expression pattern that is searched for in the string.
+  - **overwrite**: The string that replaces the substrings in ``string`` that match the ``pattern``.
+
+  Examples:
+
+  ::
+
+    SELECT regexp_replace('Hello, World!', 'l', 'L'); -- 'HeLLo, WorLd!'
+    SELECT regexp_replace('300-300', '(\\d+)-(\\d+)', '400'); -- '400'
+    SELECT regexp_replace('300-300', '(\\d+)', '400'); -- '400-400'
+
+.. spark:function:: regexp_replace(string, pattern, overwrite, position) -> varchar
+  :noindex:
+
+  Replaces all substrings in ``string`` that match the regular expression ``pattern`` with the string ``overwrite``, starting from the specified ``position``. If no match is found, the original string is returned as is. If ``position`` is less than one, the function throws an exception. If ``position`` is greater than the length of ``string``, the function returns the original ``string`` without any modifications.
+  There is a limit to the number of unique regexes to be compiled per function call, which is 20. If this limit is exceeded, the function throws an exception.
+
+  This function is 1-indexed; the position of the first character is 1.
+
+  Parameters:
+
+  - **string**: The string to be searched.
+  - **pattern**: The regular expression pattern that is searched for in the string.
+  - **overwrite**: The string that replaces the substrings in ``string`` that match the ``pattern``.
+  - **position**: The position to start from, in terms of number of characters. 1 means to start from the beginning of the string. 3 means to start from the 3rd character. If ``position`` is less than one, the function throws an error. If ``position`` is greater than the length of ``string``, the function returns the original ``string`` without any modifications.
+
+  Examples:
+
+  ::
+
+    SELECT regexp_replace('Hello, World!', 'l', 'L', 6); -- 'Hello, WorLd!'
+
+    SELECT regexp_replace('Hello, World!', 'l', 'L', 5); -- 'Hello, World!'
+
+    SELECT regexp_replace('Hello, World!', 'l', 'L', 100); -- 'Hello, World!'

diff --git a/velox/docs/functions/spark/string.rst b/velox/docs/functions/spark/string.rst
index ed8c49b38de52..d241714601e6b 100644
--- a/velox/docs/functions/spark/string.rst
+++ b/velox/docs/functions/spark/string.rst
@@ -2,12 +2,23 @@
String Functions
====================================

-Unless specified otherwise, all functions return NULL if at least one of the arguments is NULL.
+.. note::
+
+  Unless specified otherwise, all functions return NULL if at least one of the arguments is NULL.
+
+  These functions assume that input strings contain valid UTF-8 encoded Unicode code points.
+  The behavior is undefined if they are not.

.. spark:function:: ascii(string) -> integer

  Returns the Unicode code point of the first character of ``string``. Returns 0 if ``string`` is empty.

+.. spark:function:: bit_length(string/binary) -> integer
+
+  Returns the bit length of the specified string or binary input. ::
+
+    SELECT bit_length('123'); -- 24
+
.. spark:function:: chr(n) -> varchar

  Returns the Unicode code point ``n`` as a single character string.
@@ -23,6 +34,39 @@ Unless specified otherwise, all functions return NULL if at least one of the arg

    SELECT contains('Spark SQL', null); -- NULL
    SELECT contains(x'537061726b2053514c', x'537061726b'); -- true

+.. spark:function:: conv(number, fromBase, toBase) -> varchar
+
+  Converts ``number`` represented as a string from ``fromBase`` to ``toBase``.
+  ``fromBase`` must be an INTEGER value between 2 and 36 inclusive. ``toBase`` must
+  be an INTEGER value between 2 and 36 inclusive or between -36 and -2 inclusive.
+  Otherwise, returns NULL.
+  Returns a signed number if ``toBase`` is negative. Otherwise, returns an unsigned one.
+  Returns NULL if ``number`` is empty.
+  Skips leading spaces. ``number`` may contain other characters not valid for ``fromBase``.
+  All characters from the first invalid character to the end of the string are
+  ignored. Invalid characters are ignored even when ``fromBase`` equals ``toBase``.
+  Returns '0' if no valid character is found. ::
+
+    SELECT conv('100', 2, 10); -- '4'
+    SELECT conv('-10', 16, -10); -- '-16'
+    SELECT conv('-1', 10, 16); -- 'FFFFFFFFFFFFFFFF'
+    SELECT conv('123', 10, 39); -- NULL
+    SELECT conv('', 16, 10); -- NULL
+    SELECT conv(' ', 2, 10); -- NULL
+    SELECT conv('11', 10, 16); -- 'B'
+    SELECT conv('11ABC', 10, 16); -- 'B'
+    SELECT conv('11abc', 10, 10); -- '11'
+    SELECT conv('H016F', 16, 10); -- '0'
+
+.. spark:function:: empty2null(input) -> varchar
+
+  Returns NULL if ``input`` is empty. Otherwise, returns ``input``.
+  Note: this is an internal Spark function used to convert an empty value of a
+  partition column, which is then converted to the Hive default partition value
+  ``__HIVE_DEFAULT_PARTITION__``. ::
+
+    SELECT empty2null(''); -- NULL
+    SELECT empty2null('abc'); -- 'abc'
+
.. spark:function:: endswith(left, right) -> boolean

  Returns true if 'left' ends with 'right'. Otherwise, returns false. ::

@@ -31,6 +75,20 @@ Unless specified otherwise, all functions return NULL if at least one of the arg
    SELECT endswith('js SQL', 'js'); -- false
    SELECT endswith('js SQL', NULL); -- NULL

+.. spark:function:: find_in_set(str, strArray) -> integer
+
+  Returns the 1-based index of the given string ``str`` in the comma-delimited list ``strArray``.
+  Returns 0 if the string is not found or if ``str`` contains a comma. ::
+
+    SELECT find_in_set('ab', 'abc,b,ab,c,def'); -- 3
+    SELECT find_in_set('ab,', 'abc,b,ab,c,def'); -- 0
+    SELECT find_in_set('dfg', 'abc,b,ab,c,def'); -- 0
+    SELECT find_in_set('', ''); -- 1
+    SELECT find_in_set('', '123,'); -- 2
+    SELECT find_in_set('', ',123'); -- 1
+    SELECT find_in_set(NULL, ',123'); -- NULL
+    SELECT find_in_set('abc', NULL); -- NULL
+
.. spark:function:: instr(string, substring) -> integer

  Returns the starting position of the first instance of ``substring`` in
@@ -45,6 +103,15 @@ Unless specified otherwise, all functions return NULL if at least one of the arg

  Returns the length of ``string`` in characters.

+.. spark:function:: levenshtein(string1, string2[, threshold]) -> integer
+
+  Returns the `Levenshtein distance <https://en.wikipedia.org/wiki/Levenshtein_distance>`_ between the two given strings.
+  If the provided ``threshold`` is negative or the Levenshtein distance exceeds ``threshold``, returns -1. ::
+
+    SELECT levenshtein('kitten', 'sitting'); -- 3
+    SELECT levenshtein('kitten', 'sitting', 10); -- 3
+    SELECT levenshtein('kitten', 'sitting', 2); -- -1
+
.. spark:function:: lower(string) -> string

  Returns string with all characters changed to lowercase. ::
@@ -77,6 +144,30 @@ Unless specified otherwise, all functions return NULL if at least one of the arg

    SELECT ltrim('ps', 'spark'); -- "ark"

+.. spark:function:: mask(string[, upperChar, lowerChar, digitChar, otherChar]) -> string
+
+  Returns a masked version of the input ``string``.
+  ``string``: string value to mask.
+  ``upperChar``: A single character string used to substitute upper case characters. The default is 'X'. If NULL, upper case characters remain unmasked.
+  ``lowerChar``: A single character string used to substitute lower case characters. The default is 'x'. If NULL, lower case characters remain unmasked.
+  ``digitChar``: A single character string used to substitute digits. The default is 'n'. If NULL, digits remain unmasked.
+  ``otherChar``: A single character string used to substitute any other character. The default is NULL, which leaves these characters unmasked.
+  Any invalid UTF-8 characters present in the input string will be treated as a single other character. ::
+
+    SELECT mask('abcd-EFGH-8765-4321'); -- "xxxx-XXXX-nnnn-nnnn"
+    SELECT mask('abcd-EFGH-8765-4321', 'Q'); -- "xxxx-QQQQ-nnnn-nnnn"
+    SELECT mask('AbCD123-@$#'); -- "XxXXnnn-@$#"
+    SELECT mask('AbCD123-@$#', 'Q'); -- "QxQQnnn-@$#"
+    SELECT mask('AbCD123-@$#', 'Q', 'q'); -- "QqQQnnn-@$#"
+    SELECT mask('AbCD123-@$#', 'Q', 'q', 'd'); -- "QqQQddd-@$#"
+    SELECT mask('AbCD123-@$#', 'Q', 'q', 'd', 'o'); -- "QqQQdddoooo"
+    SELECT mask('AbCD123-@$#', NULL, 'q', 'd', 'o'); -- "AqCDdddoooo"
+    SELECT mask('AbCD123-@$#', NULL, NULL, 'd', 'o'); -- "AbCDdddoooo"
+    SELECT mask('AbCD123-@$#', NULL, NULL, NULL, 'o'); -- "AbCD123oooo"
+    SELECT mask(NULL, NULL, NULL, NULL, 'o'); -- NULL
+    SELECT mask(NULL); -- NULL
+    SELECT mask('AbCD123-@$#', NULL, NULL, NULL, NULL); -- "AbCD123-@$#"
+
.. spark:function:: overlay(input, replace, pos, len) -> same as input

  Replaces a substring of ``input`` starting at ``pos`` character with ``replace`` and
@@ -97,12 +188,35 @@ Unless specified otherwise, all functions return NULL if at least one of the arg
    SELECT overlay('Spark SQL', 'tructured', 2, 4); -- "Structured SQL"
    SELECT overlay('Spark SQL', '_', -6, 3); -- "_Sql"

-.. spark:function:: replace(string, search, replace) -> string
+.. spark:function:: repeat(input, n) -> varchar
+
+  Returns a string that repeats ``input`` ``n`` times.
+  The result size must be less than or equal to 1MB.
+  If ``n`` is less than or equal to 0, an empty string is returned. ::
+
+    SELECT repeat('123', 2); -- 123123

-  Replaces all occurrences of `search` with `replace`. ::
+.. spark:function:: replace(input, replaced) -> varchar
+
+  Removes all instances of ``replaced`` from ``input``.
+  If ``replaced`` is an empty string, returns the original ``input`` string. ::
+
+    SELECT replace('ABCabc', ''); -- ABCabc
+    SELECT replace('ABCabc', 'bc'); -- ABCc
+
+.. spark:function:: replace(input, replaced, replacement) -> varchar
+
+  Replaces all instances of ``replaced`` with ``replacement`` in ``input``.
+  If ``replaced`` is an empty string, returns the original ``input`` string. ::
+
+    SELECT replace('ABCabc', '', 'DEF'); -- ABCabc
+    SELECT replace('ABCabc', 'abc', ''); -- ABC
    SELECT replace('ABCabc', 'abc', 'DEF'); -- ABCDEF

+.. spark:function:: reverse(string) -> varchar
+
+  Returns input string with characters in reverse order.
+
.. spark:function:: rpad(string, len, pad) -> string

  Returns ``string``, right-padded with ``pad`` to a length of ``len``.
@@ -129,22 +243,33 @@ Unless specified otherwise, all functions return NULL if at least one of the arg

    SELECT rtrim('kr', 'spark'); -- "spa"

-.. spark:function:: split(string, delimiter) -> array(string)
+.. spark:function:: soundex(string) -> string

-  Splits ``string`` on ``delimiter`` and returns an array. ::
+  Returns the `Soundex code <https://en.wikipedia.org/wiki/Soundex>`_ of the string. If the first character of ``string`` is not
+  a letter, ``string`` is returned. ::

-    SELECT split('oneAtwoBthreeC', '[ABC]'); -- ["one","two","three",""]
-    SELECT split('one', ''); -- ["o", "n", "e", ""]
-    SELECT split('one', '1'); -- ["one"]
+    SELECT soundex('Miller'); -- "M460"

-.. spark:function:: split(string, delimiter, limit) -> array(string)
-  :noindex:
+.. spark:function:: split(string, delimiter[, limit]) -> array(string)

-  Splits ``string`` on ``delimiter`` and returns an array of size at most ``limit``. ::
+  Splits ``string`` around occurrences that match ``delimiter`` and returns an array with a length of
+  at most ``limit``. ``delimiter`` is a string representing a regular expression.
+  ``limit`` is an integer
+  which controls the number of times the regex is applied. By default, ``limit`` is -1. When ``limit`` > 0,
+  the resulting array's length will not be more than ``limit``, and the resulting array's last entry will
+  contain all input beyond the last matched regex. When ``limit`` <= 0, the regex will be applied as many
+  times as possible, and the resulting array can be of any size. When ``delimiter`` is empty and ``limit``
+  is smaller than the size of ``string``, the resulting array only contains ``limit`` single characters
+  split from ``string``; if ``limit`` is not provided or is larger than the size of ``string``, the resulting
+  array contains all the single characters of ``string`` and does not include an empty tail character.
+  This function aligns with the vanilla Spark 3.4+ split function. ::

-    SELECT split('oneAtwoBthreeC', '[ABC]', -1); -- ["one","two","three",""]
-    SELECT split('oneAtwoBthreeC', '[ABC]', 0); -- ["one", "two", "three", ""]
+    SELECT split('oneAtwoBthreeC', '[ABC]'); -- ["one","two","three",""]
    SELECT split('oneAtwoBthreeC', '[ABC]', 2); -- ["one","twoBthreeC"]
+    SELECT split('oneAtwoBthreeC', '[ABC]', 5); -- ["one","two","three",""]
+    SELECT split('one', '1'); -- ["one"]
+    SELECT split('abcd', ''); -- ["a","b","c","d"]
+    SELECT split('abcd', '', 3); -- ["a","b","c"]
+    SELECT split('abcd', '', 5); -- ["a","b","c","d"]

.. spark:function:: startswith(left, right) -> boolean

@@ -154,6 +279,20 @@ Unless specified otherwise, all functions return NULL if at least one of the arg
    SELECT startswith('js SQL', 'SQL'); -- false
    SELECT startswith('js SQL', null); -- NULL

+.. spark:function:: str_to_map(string, entryDelimiter, keyValueDelimiter) -> map(string, string)
+
+  Returns a map by splitting ``string`` into entries with ``entryDelimiter`` and splitting
+  each entry into key/value with ``keyValueDelimiter``.
+  ``entryDelimiter`` and ``keyValueDelimiter`` must be constant strings, each a single ASCII
+  character. An entry that does not contain ``keyValueDelimiter`` is allowed; its value is
+  NULL. Throws an exception when duplicate map keys are found in a single row's result,
+  consistent with Spark's default behavior. ::
+
+    SELECT str_to_map('a:1,b:2,c:3', ',', ':'); -- {"a":"1","b":"2","c":"3"}
+    SELECT str_to_map('a', ',', ':'); -- {"a":NULL}
+    SELECT str_to_map('', ',', ':'); -- {"":NULL}
+    SELECT str_to_map('a:1,b:2,c:3', ',', ','); -- {"a:1":NULL,"b:2":NULL,"c:3":NULL}
+
.. spark:function:: substring(string, start) -> varchar

  Returns the rest of ``string`` from the starting position ``start``.
@@ -179,6 +318,30 @@ Unless specified otherwise, all functions return NULL if at least one of the arg
    SELECT substring('Spark SQL', -10, 3); -- "Sp"
    SELECT substring('Spark SQL', -20, 3); -- ""

+.. spark:function:: substring_index(string, delim, count) -> [same as string]
+
+  Returns the substring from ``string`` before ``count`` occurrences of the delimiter ``delim``.
+  ``string`` can be VARCHAR or VARBINARY, and the return type matches the type of ``string``.
+  If ``count`` is positive, returns everything to the left of the final delimiter
+  (counting from the left). If ``count`` is negative, returns everything to the right
+  of the final delimiter (counting from the right). If ``count`` is 0, returns an empty string.
+  If ``delim`` is not found or found fewer times than ``count``, returns the original input string.
+  ``delim`` is case-sensitive. Overlapping occurrences of ``delim`` are counted. ::
+
+    SELECT substring_index('Spark.SQL', '.', 1); -- "Spark"
+    SELECT substring_index('Spark.SQL', '.', 0); -- ""
+    SELECT substring_index('Spark.SQL', '.', -1); -- "SQL"
+    SELECT substring_index('TEST.Spark.SQL', '.', 2); -- "TEST.Spark"
+    SELECT substring_index('TEST.Spark.SQL', '', 0); -- ""
+    SELECT substring_index('TEST.Spark.SQL', '.', -2); -- "Spark.SQL"
+    SELECT substring_index('TEST.Spark.SQL', '.', 10); -- "TEST.Spark.SQL"
+    SELECT substring_index('TEST.Spark.SQL', '.', -12); -- "TEST.Spark.SQL"
+    SELECT substring_index('aaaaa', 'aa', 2); -- "a"
+    SELECT substring_index('aaaaa', 'aa', -4); -- "aaa"
+    SELECT substring_index('aaaaa', 'aa', 0); -- ""
+    SELECT substring_index('aaaaa', 'aa', 5); -- "aaaaa"
+    SELECT substring_index('aaaaa', 'aa', -5); -- "aaaaa"
+
.. spark:function:: translate(string, match, replace) -> varchar

  Returns a new translated string. It translates the character in ``string`` by a
@@ -188,7 +351,9 @@ Unless specified otherwise, all functions return NULL if at least one of the arg
  size is larger than ``replace's``, the extra characters in ``match`` will be
  removed from ``string``. In addition, this function only considers the first
  occurrence of a character in ``match`` and uses its corresponding character in
-  ``replace`` for translation. ::
+  ``replace`` for translation.
+  Any invalid UTF-8 characters present in the input string will be treated as a
+  single character. ::

    SELECT translate('spark', 'sa', '12'); -- "1p2rk"
    SELECT translate('spark', 'sa', '1'); -- "1prk"
@@ -213,4 +378,4 @@ Unless specified otherwise, all functions return NULL if at least one of the arg

  Returns string with all characters changed to uppercase. ::

-    SELECT upper('SparkSql'); -- SPARKSQL
\ No newline at end of file
+    SELECT upper('SparkSql'); -- SPARKSQL

diff --git a/velox/docs/functions/spark/url.rst b/velox/docs/functions/spark/url.rst
new file mode 100644
index 0000000000000..a6eaf70edba9a
--- /dev/null
+++ b/velox/docs/functions/spark/url.rst
@@ -0,0 +1,66 @@
+=============
+URL Functions
+=============
+
+Introduction
+------------
+
+The URL extraction functions extract components from HTTP URLs (or any valid URIs conforming to `RFC 3986 <https://www.rfc-editor.org/rfc/rfc3986>`_). The following syntax is supported:
+
+.. code-block:: bash
+
+    [protocol:][//host[:port]][path][?query][#fragment]
+
+
+The extracted components do not contain URI syntax separators such as ``:``, ``?`` and ``#``.
+
+Consider, for example, the URI below:
+
+.. code-block::
+
+    http://www.ics.uci.edu/pub/ietf/uri/?k1=v1#Related
+
+    scheme    = http
+    authority = www.ics.uci.edu
+    path      = /pub/ietf/uri/
+    query     = k1=v1
+    fragment  = Related
+
+
+Invalid URIs
+------------
+
+Well-formed URIs should not contain ASCII whitespace. In `percent-encoded <https://en.wikipedia.org/wiki/Percent-encoding>`_ URIs, the percent
+character "%" must be followed by two hexadecimal digits. All URL extraction
+functions return NULL when passed an invalid URI.
+
+.. code-block::
+
+    # Examples of URL functions with invalid URIs.
+
+    # Invalid URI due to whitespace
+    SELECT url_extract_path('foo '); -- NULL (1 row)
+    SELECT url_extract_host('http://www.foo.com '); -- NULL (1 row)
+
+    # Invalid URI due to improper escaping of '%'
+    SELECT url_extract_path('https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%'); -- NULL (1 row)
+    SELECT url_extract_host('https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%'); -- NULL (1 row)
+
+Encoding Functions
+------------------
+
+.. spark:function:: url_encode(value) -> varchar
+
+  Escapes ``value`` by encoding it so that it can be safely included in
+  URL query parameter names and values:
+
+  * Alphanumeric characters are not encoded.
+  * The characters ``.``, ``-``, ``*`` and ``_`` are not encoded.
+  * The ASCII space character is encoded as ``+``.
+  * All other characters are converted to UTF-8 and the bytes are encoded
+    as the string ``%XX`` where ``XX`` is the uppercase hexadecimal
+    value of the UTF-8 byte.
+
+.. spark:function:: url_decode(value) -> varchar
+
+  Unescapes the URL encoded ``value``.
+  This function is the inverse of :spark:func:`url_encode`.
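+
+  For example, applying the encoding rules above (outputs are an illustrative
+  sketch of those rules)::
+
+    SELECT url_encode('https://spark.apache.org'); -- 'https%3A%2F%2Fspark.apache.org'
+    SELECT url_encode('a b'); -- 'a+b'
+    SELECT url_decode('https%3A%2F%2Fspark.apache.org'); -- 'https://spark.apache.org'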

diff --git a/velox/docs/functions/spark/window.rst b/velox/docs/functions/spark/window.rst
index 50804d9ff5c18..2a4d7921c95a5 100644
--- a/velox/docs/functions/spark/window.rst
+++ b/velox/docs/functions/spark/window.rst
@@ -3,6 +3,7 @@ Window functions
================

Spark window functions can be used to compute SQL window functions.
+More details about window functions can be found at :doc:`/develop/window`.

Value functions
---------------
@@ -29,3 +30,8 @@ Returns the rank of a value in a group of values. The rank is one plus the numbe

Returns the rank of a value in a group of values. This is similar to rank(), except that tie values do not produce gaps in the sequence.

+.. spark:function:: ntile(n) -> integer
+
+Divides the rows for each window partition into ``n`` buckets, ranging from 1 to at most ``n``. Bucket values will differ by at most 1. If the number of rows in the partition does not divide evenly into the number of buckets, then the remainder values are distributed one per bucket, starting with the first bucket.
+
+For example, with 6 rows and 4 buckets, the bucket values would be as follows: ``1 1 2 2 3 4``

diff --git a/velox/docs/index.rst b/velox/docs/index.rst
index 9bec57eb9c250..17d5ce21173d7 100644
--- a/velox/docs/index.rst
+++ b/velox/docs/index.rst
@@ -10,6 +10,7 @@ Velox Documentation
   functions
   spark_functions
   configs
+   monitoring
   bindings/python/README_generated_pyvelox
   develop
   programming-guide

diff --git a/velox/docs/monitoring.rst b/velox/docs/monitoring.rst
new file mode 100644
index 0000000000000..b92026f9b6373
--- /dev/null
+++ b/velox/docs/monitoring.rst
@@ -0,0 +1,9 @@
+**********
+Monitoring
+**********
+
+.. toctree::
+  :maxdepth: 1
+
+  monitoring/metrics.rst
+  monitoring/stats.rst
\ No newline at end of file

diff --git a/velox/docs/monitoring/metrics.rst b/velox/docs/monitoring/metrics.rst
new file mode 100644
index 0000000000000..53f256c6beb11
--- /dev/null
+++ b/velox/docs/monitoring/metrics.rst
@@ -0,0 +1,531 @@
+===============
+Runtime Metrics
+===============
+
+Runtime metrics are collected for important Velox runtime events for
+monitoring purposes. The collected metrics provide insights into the
+availability and performance of a Velox runtime system. For instance, the
+collected data can help automatically generate alerts during an outage.
+Velox provides a metrics-collection framework that consists of three steps:
+
+**Define**: define the name and type for the metric through the DEFINE_METRIC
+and DEFINE_HISTOGRAM_METRIC macros. DEFINE_HISTOGRAM_METRIC is used for the
+histogram metric type and DEFINE_METRIC for the other types (see the metric
+type definitions below). BaseStatsReporter provides methods for metric
+definition. Register metrics during startup using the registerVeloxMetrics() API.
+
+**Record**: record the metric data point using the RECORD_METRIC_VALUE and
+RECORD_HISTOGRAM_METRIC_VALUE macros when the corresponding event happens.
+BaseStatsReporter provides methods for metric recording.
+
+**Export**: aggregate the collected data points based on the defined metrics,
+and periodically export them to a backend monitoring service, such as ODS used
+by Meta, or open-source systems such as `OpenCensus <https://opencensus.io/>`_
+and `Prometheus <https://prometheus.io/>`_. A derived implementation of
+BaseStatsReporter is required to integrate with a specific monitoring service.
+The metric aggregation granularity and export interval are configured based on
+the monitoring service in use.
+
+Velox supports five metric types:
+
+**Count**: tracks the count of events, such as the number of query failures.
+
+**Sum**: tracks the sum of event data point values, such as the sum of query
+scan read bytes.
+
+**Avg**: tracks the average of event data point values, such as the average
+query execution time.
+
+**Rate**: tracks the sum of event data point values per second, such as the
+number of shuffle requests per second.
+
+**Histogram**: tracks the distribution of event data point values, such as the
+query execution time distribution. The histogram metric divides the entire data
+range into a series of adjacent equal-sized intervals or buckets, and then
+counts how many data values fall into each bucket. DEFINE_HISTOGRAM_METRIC
+specifies the data range by min/max values and the number of buckets. Any
+collected data value less than min is counted in the min bucket, and any value
+larger than max is counted in the max bucket. It also allows specifying the
+value percentiles to report for monitoring. This allows BaseStatsReporter and
+the backend monitoring service to optimize the aggregated data storage.
+
+Task Execution
+--------------
+.. list-table::
+   :widths: 40 10 50
+   :header-rows: 1
+
+   * - Metric Name
+     - Type
+     - Description
+   * - driver_yield_count
+     - Count
+     - The number of times that a driver has yielded from the thread when it
+       hits the per-driver CPU time slice limit, if enforced.
+   * - driver_queue_time_ms
+     - Histogram
+     - The distribution of driver queue latency in range of [0, 10s] with
+       20 buckets. It is configured to report the latency at P50, P90, P99,
+       and P100 percentiles.
+   * - driver_exec_time_ms
+     - Histogram
+     - The distribution of driver execution time in range of [0, 30s] with
+       30 buckets. It is configured to report the latency at P50, P90, P99,
+       and P100 percentiles.
+
+Memory Management
+-----------------
+
+.. list-table::
+   :widths: 40 10 50
+   :header-rows: 1
+
+   * - Metric Name
+     - Type
+     - Description
+   * - cache_shrink_count
+     - Count
+     - The number of times that the in-memory data cache has been shrunk under
+       memory pressure.
+   * - cache_shrink_ms
+     - Histogram
+     - The distribution of cache shrink latency in range of [0, 100s] with 10
+       buckets. It is configured to report the latency at P50, P90, P99, and
+       P100 percentiles.
+   * - memory_reclaim_count
+     - Count
+     - The count of operator memory reclaims.
+   * - memory_reclaim_exec_ms
+     - Histogram
+     - The distribution of memory reclaim execution time in range of [0, 600s]
+       with 20 buckets. It is configured to report latency at P50, P90, P99, and
+       P100 percentiles.
+   * - memory_reclaim_bytes
+     - Histogram
+     - The distribution of reclaimed bytes in range of [0, 4GB] with 64 buckets
+       and reports P50, P90, P99, and P100.
+   * - task_memory_reclaim_count
+     - Count
+     - The count of task memory reclaims.
+   * - task_memory_reclaim_wait_ms
+     - Histogram
+     - The distribution of task memory reclaim wait time in range of [0, 60s]
+       with 60 buckets. It is configured to report latency at P50, P90, P99,
+       and P100 percentiles.
+   * - task_memory_reclaim_exec_ms
+     - Histogram
+     - The distribution of task memory reclaim execution time in range of
+       [0, 240s] with 60 buckets. It is configured to report latency at P50,
+       P90, P99, and P100 percentiles.
+   * - task_memory_reclaim_wait_timeout_count
+     - Count
+     - The number of times that the task memory reclaim wait times out.
+   * - memory_non_reclaimable_count
+     - Count
+     - The number of times that the memory reclaim fails because the operator is executing a
+       non-reclaimable section where it is expected to have reserved enough memory to execute
+       without asking for more. Therefore, it is an indicator that the memory reservation
+       is not sufficient. It excludes counting instances where the operator is in a
+       non-reclaimable state due to currently being on-thread and running or being already
+       cancelled.
+   * - arbitrator_requests_count
+     - Count
+     - The number of times a memory arbitration request was initiated by a
+       memory pool attempting to grow its capacity.
+   * - arbitrator_local_arbitration_count
+     - Count
+     - The number of arbitrations that reclaim used memory from the query which initiates
+       the memory arbitration request itself. This ensures the memory arbitration request
+       won't exceed its per-query memory capacity limit.
+   * - arbitrator_global_arbitration_count
+     - Count
+     - The number of arbitrations that ensure the total allocated query capacity won't exceed
+       the arbitrator capacity limit. They may or may not reclaim memory from the query which
+       initiated the memory arbitration request. This indicates the Velox runtime doesn't have
+       enough memory to run all the queries at their peak memory usage, and spilling has to be
+       triggered to let them run through completion.
+   * - arbitrator_slow_global_arbitration_count
+     - Count
+     - The number of global arbitrations that reclaim used memory by slow disk spilling.
+   * - arbitrator_aborted_count
+     - Count
+     - The number of times a query-level memory pool is aborted as a result of
+       a memory arbitration process. An aborted memory pool eventually
+       results in cancelling the original query.
+   * - arbitrator_failures_count
+     - Count
+     - The number of times a memory arbitration request failed. This may occur
+       either because the requester was terminated during the processing of
+       its request, the arbitration request would surpass the maximum allowed
+       capacity for the requester, or the arbitration process couldn't release
+       the requested amount of memory.
+   * - arbitrator_wait_time_ms
+     - Histogram
+     - The distribution of the amount of time an arbitration request stays in
+       the arbitration queues and waits for the arbitration r/w locks, in range
+       of [0, 600s] with 20 buckets. It is configured to report the latency at
+       P50, P90, P99, and P100 percentiles.
+   * - arbitrator_arbitration_time_ms
+     - Histogram
+     - The distribution of the amount of time it takes to complete a single
+       arbitration request, in range of [0, 600s] with 20 buckets. It is
+       configured to report the latency at P50, P90, P99, and P100 percentiles.
+   * - arbitrator_free_capacity_bytes
+     - Average
+     - The average total free memory capacity managed by the memory arbitrator.
+   * - arbitrator_free_reserved_capacity_bytes
+     - Average
+     - The average free memory capacity reserved to ensure each query has
+       the minimal required capacity to run.
+   * - memory_pool_initial_capacity_bytes
+     - Histogram
+     - The distribution of a root memory pool's initial capacity in range of [0, 256MB]
+       with 32 buckets. It is configured to report the capacity at P50, P90, P99,
+       and P100 percentiles.
+   * - memory_pool_capacity_growth_count
+     - Histogram
+     - The distribution of a root memory pool's capacity growth attempts through
+       memory arbitration in range of [0, 256] with 32 buckets. It is configured
+       to report the count at P50, P90, P99, and P100 percentiles.
+   * - memory_pool_usage_leak_bytes
+     - Sum
+     - The leaf memory pool usage leak in bytes.
+   * - memory_pool_reservation_leak_bytes
+     - Sum
+     - The leaf memory pool reservation leak in bytes.
+   * - memory_pool_capacity_leak_bytes
+     - Sum
+     - The root memory pool reservation leak in bytes.
+   * - memory_allocator_double_free_count
+     - Count
+     - Tracks the count of double frees in the memory allocator, indicating the
+       possibility of buffer ownership issues when a buffer is freed more
+       than once.
+   * - memory_allocator_mapped_bytes
+     - Avg
+     - Number of bytes currently mapped in MemoryAllocator. These bytes represent
+       the bytes that are either currently being allocated or were in the past
+       allocated and not yet returned to the operating system, in the
+       form of 'Allocation' or 'ContiguousAllocation'.
+   * - memory_allocator_alloc_bytes
+     - Avg
+     - Number of bytes currently allocated (used) from MemoryAllocator in the form
+       of 'Allocation' or 'ContiguousAllocation'.
+   * - mmap_allocator_external_mapped_bytes
+     - Avg
+     - Number of bytes currently mapped in MmapAllocator, in the form of
+       'ContiguousAllocation'.
+       NOTE: This applies only to MmapAllocator.
+   * - mmap_allocator_delegated_alloc_bytes
+     - Avg
+     - Number of bytes currently allocated from MmapAllocator directly via the raw
+       allocateBytes() interface and internally allocated by malloc. Only small
+       chunks of memory are delegated to malloc.
+       NOTE: This applies only to MmapAllocator.
+
+Cache
+--------------
+
+.. list-table::
+   :widths: 40 10 50
+   :header-rows: 1
+
+   * - Metric Name
+     - Type
+     - Description
+   * - cache_max_age_secs
+     - Avg
+     - Max possible age of AsyncDataCache and SsdCache entries since the raw file
+       was opened to load the cache.
+   * - memory_cache_num_entries
+     - Avg
+     - Total number of cache entries.
+   * - memory_cache_num_empty_entries
+     - Avg
+     - Total number of cache entries that do not cache anything.
+   * - memory_cache_num_shared_entries
+     - Avg
+     - Total number of cache entries that are pinned for shared access.
+   * - memory_cache_num_exclusive_entries
+     - Avg
+     - Total number of cache entries that are pinned for exclusive access.
+   * - memory_cache_num_prefetched_entries
+     - Avg
+     - Total number of cache entries that are being or have been prefetched but
+       have not been hit.
+   * - memory_cache_total_tiny_bytes
+     - Avg
+     - Total number of bytes of cached data that is much smaller than kTinyDataSize.
+   * - memory_cache_total_large_bytes
+     - Avg
+     - Total number of bytes of cached data, excluding 'memory_cache_total_tiny_bytes'.
+   * - memory_cache_total_tiny_padding_bytes
+     - Avg
+     - Total unused capacity bytes in 'memory_cache_total_tiny_bytes'.
+   * - memory_cache_total_large_padding_bytes
+     - Avg
+     - Total unused capacity bytes in 'memory_cache_total_large_bytes'.
+   * - memory_cache_total_prefetched_bytes
+     - Avg
+     - Total bytes of cache entries in prefetch state.
+   * - memory_cache_sum_evict_score
+     - Sum
+     - Sum of scores of evicted entries. This serves to infer an average lifetime
+       for entries in cache.
+   * - memory_cache_num_hits
+     - Sum
+     - Number of hits (saved IO) since last counter retrieval. The first hit to a
+       prefetched entry does not count.
+   * - memory_cache_hit_bytes
+     - Sum
+     - Amount of hit bytes (saved IO) since last counter retrieval. The first hit
+       to a prefetched entry does not count.
+   * - memory_cache_num_new
+     - Sum
+     - Number of new entries created since last counter retrieval.
+   * - memory_cache_num_evicts
+     - Sum
+     - Number of times a valid entry was removed in order to make space, since
+       last counter retrieval.
+   * - memory_cache_num_savable_evicts
+     - Sum
+     - Number of times a valid entry was removed in order to make space but has not
+       been saved to SSD yet, since last counter retrieval.
+   * - memory_cache_num_evict_checks
+     - Sum
+     - Number of entries considered for eviction, since last counter retrieval.
+   * - memory_cache_num_wait_exclusive
+     - Sum
+     - Number of times a user waited for an entry to transit from exclusive to
+       shared mode, since last counter retrieval.
+   * - memory_cache_num_alloc_clocks
+     - Sum
+     - Clocks spent in allocating or freeing memory for backing cache entries,
+       since last counter retrieval.
+   * - memory_cache_num_aged_out_entries
+     - Sum
+     - Number of AsyncDataCache entries that are aged out and evicted given the
+       configured TTL.
+   * - memory_cache_num_stale_entries
+     - Count
+     - Number of AsyncDataCache entries that are stale because of a cache request
+       size mismatch.
+   * - ssd_cache_cached_regions
+     - Avg
+     - Number of regions currently cached by SSD.
+   * - ssd_cache_cached_entries
+     - Avg
+     - Number of entries currently cached by SSD.
+   * - ssd_cache_cached_bytes
+     - Avg
+     - Total bytes currently cached by SSD.
+   * - ssd_cache_read_entries
+     - Sum
+     - Total number of entries read from SSD.
+   * - ssd_cache_read_bytes
+     - Sum
+     - Total number of bytes read from SSD.
+   * - ssd_cache_written_entries
+     - Sum
+     - Total number of entries written to SSD.
+   * - ssd_cache_written_bytes
+     - Sum
+     - Total number of bytes written to SSD.
+   * - ssd_cache_aged_out_entries
+     - Sum
+     - Total number of SsdCache entries that are aged out and evicted given the
+       configured TTL.
+   * - ssd_cache_aged_out_regions
+     - Sum
+     - Total number of SsdCache regions that are aged out and evicted given the
+       configured TTL.
+   * - ssd_cache_open_ssd_errors
+     - Sum
+     - Total number of SSD file open errors.
+   * - ssd_cache_open_checkpoint_errors
+     - Sum
+     - Total number of SSD checkpoint file open errors.
+   * - ssd_cache_open_log_errors
+     - Sum
+     - Total number of SSD evict log file open errors.
+   * - ssd_cache_delete_checkpoint_errors
+     - Sum
+     - Total number of errors while deleting SSD checkpoint files.
+   * - ssd_cache_read_without_checksum
+     - Sum
+     - Total number of SSD cache reads without checksum verification
+       due to an SSD cache request size mismatch.
+   * - ssd_cache_grow_file_errors
+     - Sum
+     - Total number of errors while growing SSD cache files.
+   * - ssd_cache_write_ssd_errors
+     - Sum
+     - Total number of errors while writing to SSD cache files.
+   * - ssd_cache_write_ssd_dropped
+     - Sum
+     - Total number of writes dropped due to no cache space.
+   * - ssd_cache_write_checkpoint_errors
+     - Sum
+     - Total number of errors while writing SSD checkpoint files.
+   * - ssd_cache_read_corruptions
+     - Sum
+     - Total number of corrupted SSD data reads detected by checksum.
+   * - ssd_cache_read_ssd_errors
+     - Sum
+     - Total number of errors while reading from SSD cache files.
+   * - ssd_cache_read_checkpoint_errors
+     - Sum
+     - Total number of errors while reading from SSD checkpoint files.
+   * - ssd_cache_checkpoints_read
+     - Sum
+     - Total number of checkpoints read.
+   * - ssd_cache_checkpoints_written
+     - Sum
+     - Total number of checkpoints written.
+   * - ssd_cache_regions_evicted
+     - Sum
+     - Total number of cache regions evicted.
+   * - ssd_cache_recovered_entries
+     - Sum
+     - Total number of cache entries recovered from checkpoint.
+
+Storage
+-------
+
+.. list-table::
+   :widths: 40 10 50
+   :header-rows: 1
+
+   * - Metric Name
+     - Type
+     - Description
+   * - storage_throttled_duration_ms
+     - Histogram
+     - The time distribution of storage IO throttled duration in range of [0, 30s]
+       with 30 buckets. It is configured to report the latency at P50, P90, P99,
+       and P100 percentiles.
+   * - storage_local_throttled_count
+     - Count
+     - The number of times that storage IOs get throttled in a storage directory.
+   * - storage_global_throttled_count
+     - Count
+     - The number of times that storage IOs get throttled in a storage cluster.
+
+Spilling
+--------
+
+.. list-table::
+   :widths: 40 10 50
+   :header-rows: 1
+
+   * - Metric Name
+     - Type
+     - Description
+   * - spill_max_level_exceeded_count
+     - Count
+     - The number of times that a spillable operator hits the max spill level
+       limit.
+   * - spill_input_bytes
+     - Sum
+     - The number of bytes in memory to spill.
+   * - spill_bytes
+     - Sum
+     - The number of bytes spilled to disk, which can be the number of compressed
+       bytes if compression is enabled.
+   * - spill_rows_count
+     - Count
+     - The number of spilled rows.
+   * - spill_files_count
+     - Count
+     - The number of spilled files.
+   * - spill_fill_time_ms
+     - Histogram
+     - The distribution of the amount of time spent on filling rows for spilling,
+       in range of [0, 600s] with 20 buckets. It is configured to report the
+       latency at P50, P90, P99, and P100 percentiles.
+   * - spill_sort_time_ms
+     - Histogram
+     - The distribution of the amount of time spent on sorting rows for spilling,
+       in range of [0, 600s] with 20 buckets. It is configured to report the
+       latency at P50, P90, P99, and P100 percentiles.
+   * - spill_serialization_time_ms
+     - Histogram
+     - The distribution of the amount of time spent on serializing rows for
+       spilling, in range of [0, 600s] with 20 buckets. It is configured to report
+       the latency at P50, P90, P99, and P100 percentiles.
+   * - spill_disk_writes_count
+     - Count
+     - The number of disk writes to spill rows.
+   * - spill_flush_time_ms
+     - Histogram
+     - The distribution of the amount of time spent on copying out serialized
+       rows for disk write, in range of [0, 600s] with 20 buckets. It is configured
+       to report the latency at P50, P90, P99, and P100 percentiles. Note: If
+       compression is enabled, this includes the compression time.
+   * - spill_write_time_ms
+     - Histogram
+     - The distribution of the amount of time spent on writing spilled rows to
+       disk, in range of [0, 600s] with 20 buckets. It is configured to report the
+       latency at P50, P90, P99, and P100 percentiles.
+   * - file_writer_early_flushed_raw_bytes
+     - Sum
+     - Number of bytes prematurely flushed from file writers because of memory reclaiming.
+   * - spill_memory_bytes
+     - Avg
+     - The current spilling memory usage in bytes.
+   * - spill_peak_memory_bytes
+     - Avg
+     - The peak spilling memory usage in bytes.
+
+Exchange
+--------
+
+.. list-table::
+   :widths: 40 10 50
+   :header-rows: 1
+
+   * - Metric Name
+     - Type
+     - Description
+   * - exchange_data_time_ms
+     - Histogram
+     - The distribution of data exchange latency in range of [0, 50s] with 50
+       buckets. It is configured to report latency at P50, P90, P99, and P100
+       percentiles.
+   * - exchange_data_bytes
+     - Sum
+     - The exchange data size in bytes.
+   * - exchange_data_size
+     - Histogram
+     - The distribution of exchange data size in range of [0, 128MB] with 128
+       buckets. It is configured to report the capacity at P50, P90, P99, and P100
+       percentiles.
+   * - exchange_data_count
+     - Count
+     - The number of data exchange requests.
+   * - exchange_data_size_time_ms
+     - Histogram
+     - The distribution of data exchange size latency in range of [0, 5s] with 50
+       buckets. It is configured to report latency at P50, P90, P99, and P100
+       percentiles.
+   * - exchange_data_size_count
+     - Count
+     - The number of data size exchange requests.
+
+Hive Connector
+--------------
+
+.. list-table::
+   :widths: 40 10 50
+   :header-rows: 1
+
+   * - Metric Name
+     - Type
+     - Description
+   * - hive_file_handle_generate_latency_ms
+     - Histogram
+     - The distribution of Hive file open latency in range of [0, 100s] with 10
+       buckets. It is configured to report latency at P50, P90, P99, and P100
+       percentiles.

diff --git a/velox/docs/monitoring/stats.rst b/velox/docs/monitoring/stats.rst
new file mode 100644
index 0000000000000..f8a75f295cf95
--- /dev/null
+++ b/velox/docs/monitoring/stats.rst
@@ -0,0 +1,156 @@
+=============
+Runtime Stats
+=============
+
+Runtime stats collect per-query Velox runtime events for offline query
+analysis purposes. The collected stats provide insights into operator-level
+query execution internals, such as how much time a query operator spent on
+disk spilling. The collected stats are organized as free-form key-value pairs
+for easy extension. The key is the event name and the value is defined as a
+RuntimeCounter, which stores and aggregates the occurrences of a particular
+event during operator execution. RuntimeCounter has three types: kNone, used
+to record event counts; kNanos, used to record event time in nanoseconds; and
+kBytes, used to record memory or storage size in bytes. It records the count
+of events and the min/max/sum of the event values. The stats are stored in the
+OperatorStats structure. The query system can aggregate the operator-level
+stats collected from each driver by pipeline and task for analysis.
+
+Memory Arbitration
+------------------
+These stats are reported by all operators.
+
+.. list-table::
+   :widths: 50 25 50
+   :header-rows: 1
+
+   * - Stats
+     - Unit
+     - Description
+   * - memoryReclaimCount
+     -
+     - The number of times that memory arbitration reclaimed memory from
+       a spillable operator.
+       This stat only applies to spillable operators.
+   * - memoryReclaimWallNanos
+     - nanos
+     - The memory reclaim execution time of an operator during memory
+       arbitration. It collects time spent on disk spilling or file writes.
+       This stat only applies to spillable operators.
+   * - reclaimedMemoryBytes
+     - bytes
+     - The reclaimed memory bytes of an operator during memory arbitration.
+       This stat only applies to spillable operators.
+   * - globalArbitrationCount
+     -
+     - The number of times a request for more memory hit the arbitrator's
+       capacity limit and initiated a global arbitration attempt, where
+       memory is reclaimed from viable candidates chosen among all running
+       queries based on a criterion.
+   * - localArbitrationCount
+     -
+     - The number of times a request for more memory hit the query memory
+       limit and initiated a local arbitration attempt, where memory is
+       reclaimed from the requestor itself.
+   * - localArbitrationQueueWallNanos
+     - nanos
+     - The time an operator spends waiting in the local arbitration queue.
+   * - localArbitrationLockWaitWallNanos
+     - nanos
+     - The time an operator spends waiting to acquire the local arbitration lock.
+   * - globalArbitrationLockWaitWallNanos
+     - nanos
+     - The time an operator spends waiting to acquire the global arbitration lock.
+
+HashBuild, HashAggregation
+--------------------------
+These stats are reported only by HashBuild and HashAggregation operators.
+
+.. list-table::
+   :widths: 50 25 50
+   :header-rows: 1
+
+   * - Stats
+     - Unit
+     - Description
+   * - hashtable.capacity
+     -
+     - Number of slots across all buckets in the hash table.
+   * - hashtable.numRehashes
+     -
+     - Number of rehash() calls.
+   * - hashtable.numDistinct
+     -
+     - Number of distinct keys in the hash table.
+   * - hashtable.numTombstones
+     -
+     - Number of tombstone slots in the hash table.
+   * - hashtable.buildWallNanos
+     - nanos
+     - Time spent on building the hash table from rows collected by all the
+       hash build operators. This stat is only reported by the HashBuild operator.
+
+TableWriter
+-----------
+These stats are reported only by the TableWriter operator.
+
+.. list-table::
+   :widths: 50 25 50
+   :header-rows: 1
+
+   * - Stats
+     - Unit
+     - Description
+   * - earlyFlushedRawBytes
+     - bytes
+     - Number of bytes prematurely flushed from file writers because of memory reclaiming.
+
+Spilling
+--------
+These stats are reported by operators that support spilling.
+
+.. list-table::
+   :widths: 50 25 50
+   :header-rows: 1
+
+   * - Stats
+     - Unit
+     - Description
+   * - spillFillWallNanos
+     - nanos
+     - The time spent on filling rows for spilling.
+   * - spillSortWallNanos
+     - nanos
+     - The time spent on sorting rows for spilling.
+   * - spillSerializationWallNanos
+     - nanos
+     - The time spent on serializing rows for spilling.
+   * - spillFlushWallNanos
+     - nanos
+     - The time spent on copying out serialized rows for disk write. If compression
+       is enabled, this includes the compression time.
+   * - spillWrites
+     -
+     - The number of spill writer flushes, equivalent to the number of write calls to
+       the underlying filesystem.
+   * - spillWriteWallNanos
+     - nanos
+     - The time spent on writing spilled rows to disk.
+   * - spillRuns
+     -
+     - The number of times that spilling runs on an operator.
+   * - exceededMaxSpillLevel
+     -
+     - The number of times that an operator exceeds the max spill level limit.
+   * - spillReadBytes
+     - bytes
+     - The number of bytes read from spilled files.
+   * - spillReads
+     -
+     - The number of spill reader reads, equivalent to the number of read calls to the underlying filesystem.
+   * - spillReadWallNanos
+     - nanos
+     - The time spent on reading data from spilled files.
+   * - spillDeserializationWallNanos
+     - nanos
+     - The time spent on deserializing rows read from spilled files.

diff --git a/velox/docs/monthly-updates.rst b/velox/docs/monthly-updates.rst
index bfbe28127b180..ed635b80c307d 100644
--- a/velox/docs/monthly-updates.rst
+++ b/velox/docs/monthly-updates.rst
@@ -5,14 +5,11 @@ Monthly Updates
.. toctree::
   :maxdepth: 1

-  monthly-updates/september-2023
-  monthly-updates/august-2023
-  monthly-updates/july-2023
-  monthly-updates/june-2023
-  monthly-updates/may-2023
-  monthly-updates/april-2023
-  monthly-updates/march-2023
-  monthly-updates/february-2023
-  monthly-updates/january-2023
+  monthly-updates/may-2024
+  monthly-updates/april-2024
+  monthly-updates/march-2024
+  monthly-updates/february-2024
+  monthly-updates/january-2024
+  monthly-updates/2023/index
   monthly-updates/2022/index
   monthly-updates/2021/index

diff --git a/velox/docs/monthly-updates/april-2023.rst b/velox/docs/monthly-updates/2023/april-2023.rst
similarity index 100%
rename from velox/docs/monthly-updates/april-2023.rst
rename to velox/docs/monthly-updates/2023/april-2023.rst
diff --git a/velox/docs/monthly-updates/august-2023.rst b/velox/docs/monthly-updates/2023/august-2023.rst
similarity index 100%
rename from velox/docs/monthly-updates/august-2023.rst
rename to velox/docs/monthly-updates/2023/august-2023.rst
diff --git a/velox/docs/monthly-updates/2023/december-2023.rst b/velox/docs/monthly-updates/2023/december-2023.rst
new file mode 100644
index 0000000000000..8f4df40a953b0
--- /dev/null
+++ b/velox/docs/monthly-updates/2023/december-2023.rst
@@ -0,0 +1,99 @@
+********************
+December 2023 Update
+********************
+
+Documentation
+=============
+
+* Add documentation for :doc:`Runtime Metrics </monitoring/metrics>`.
+
+Core Library
+============
+
+* Add support for ``k range frames`` in the ``Window`` operator.
+* Add support for aggregations over sorted inputs to ``StreamingAggregation``.
+* Add support for TTL in AsyncDataCache and SsdCache. :pr:`6412`
+* Add support for a TypeSignature parser using Flex and Bison. This is used to parse function signatures.
+* Add support for spilling during the output processing stage of the ``OrderBy`` operator.
+* Add support for metrics related to memory arbitration and spilling. :pr:`7940`, :pr:`8025`
+* Add config ``max_spill_bytes`` to bound the storage used for spilling. The default value is set to 100GB.
+  If it is set to zero, then there is no limit.
+* Add ``Status`` class that can be used to carry the success or error state of an operation.
+  This is similar to `arrow::Status `_.
+* Add :ref:`Expand ` operator.
+* Add config ``max_arbitrary_buffer_size`` to set the maximum size in bytes for a task's buffered
+  output when the output is distributed randomly among consumers. The producer drivers are blocked
+  when the buffer size exceeds this config.
+* Fix reclaiming memory from hash build operators in grouped execution mode. :pr:`8178`
+* Fix non-termination of hash join in certain conditions. :pr:`7925`, :pr:`8012`.
+* Fix non-termination of the distinct aggregation in certain conditions. :pr:`7968`.
+* Fix overflow of ``LimitNode`` offset and count values.
+
+Presto Functions
+================
+
+* Add support for TIMESTAMP WITH TIME ZONE input type to :func:`format_datetime` function.
+* Add support for UNKNOWN key type to :func:`map_keys` and :func:`map_values` functions.
+* Add support for DECIMAL types to :func:`approx_distinct` aggregate function.
+* Add support for ``cast(double|real as varchar)`` to return scientific notation when the magnitude
+  of the input value is greater than or equal to 10^7, or less than 10^-3.
+* Fix :func:`find_first` to return NULL when the input ArrayVector is NULL but
+  has non-zero offsets and sizes.
+* Fix :func:`find_first` to support input ArrayVectors that are only NULL or empty.
+* Fix :func:`find_first` to return NULL for a NULL array input with index 0.
+* Fix :func:`find_first` to throw an error for an empty array input with an invalid start index.
+* Fix :func:`array_sort` to fail gracefully if the specified comparator lambda
+  is not supported.
+* Fix :func:`transform_keys` to check new keys for NULLs.
+* Fix :func:`set_union`, :func:`set_agg` to preserve the order of inputs.
+* Fix :func:`map` to produce the correct output if input arrays have NULL rows but with
+  invalid offsets and sizes.
+* Fix accuracy of the DECIMAL type average computation. :pr:`7944`
+
+Spark Functions
+===============
+
+* Add :spark:func:`str_to_map`, :spark:func:`next_day`, :spark:func:`atan2` functions.
+* Add support for DECIMAL types to :spark:func:`add` and :spark:func:`subtract` functions.
+
+Hive Connector
+==============
+
+* Add support for multiple S3 FileSystems. :pr:`7388`
+* Add support to write dictionary and constant encoded vectors to Parquet by flattening them.
+* Add support to specify a schema when writing Parquet files. :pr:`6074`
+* Add config ``max_split_preload_per_driver`` and remove the ``split_preload_per_driver`` flag.
+* Fix memory leak in HdfsBuilder.
+
+Arrow
+=====
+
+* Fix exporting an REE array by setting the child name to the canonical name defined in the Arrow spec. :pr:`7802`
+
+Performance and Correctness
+===========================
+
+* Add support for lambda functions to ExpressionFuzzer.
+* Add ExchangeFuzzer.
+
+Build
+=====
+
+* Add support for a Docker image with Presto.
+* Add support for `azure-storage-files-datalake
+  `_ version 12.8.0.
+* Allow specifying a custom curl version for the cpr library. :pr:`7853`
+* Update aws-sdk-cpp version to 1.11.169 (from 1.10.57).
+
+Credits
+=======
+
+Aditi Pandit, Amit Dutta, Bikramjeet Vig, Chengcheng Jin, Christian Zentgraf, Daniel Munoz,
+Deepak Majeti, Ge Gao, Harvey Hunt, HolyLow, Hongze Zhang, Jacob Wujciak-Jens, Jia, Jia Ke,
+Jialiang Tan, Jimmy Lu, Jubin Chheda, Karteekmurthys, Ke, Kevin Wilfong, Krishna Pai,
+Krishna-Prasad-P-V, Laith Sakka, Ma-Jian1, Masha Basmanova, Orri Erling, PHILO-HE,
+Patrick Sullivan, Pedro Eugenio Rocha Pedreira, Pedro Pedreira, Pramod, Ravi Rahman,
+Richard Barnes, Sergey Pershin, Srikrishna Gopu, Wei He, Xiaoxuan Meng, Yangyang Gao,
+Yedidya Feldblum, Zac, aditi-pandit, binwei, duanmeng, hengjiang.ly, joey.ljy, rui-mo,
+shangjing.cxw, soumyaduriseti, xiaoxmeng, xiyu.zk, xumingming, yan ma, yangchuan, yingsu00,
+zhli, zhli1142015, 高阳阳
\ No newline at end of file

diff --git a/velox/docs/monthly-updates/february-2023.rst b/velox/docs/monthly-updates/2023/february-2023.rst
similarity index 100%
rename from velox/docs/monthly-updates/february-2023.rst
rename to velox/docs/monthly-updates/2023/february-2023.rst
diff --git a/velox/docs/monthly-updates/2023/index.rst b/velox/docs/monthly-updates/2023/index.rst
new file mode 100644
index 0000000000000..849058503e4a7
--- /dev/null
+++ b/velox/docs/monthly-updates/2023/index.rst
@@ -0,0 +1,19 @@
+***************
+2023
+***************
+
toctree:: + :maxdepth: 1 + + december-2023 + november-2023 + october-2023 + september-2023 + august-2023 + july-2023 + june-2023 + may-2023 + april-2023 + march-2023 + february-2023 + january-2023 diff --git a/velox/docs/monthly-updates/january-2023.rst b/velox/docs/monthly-updates/2023/january-2023.rst similarity index 100% rename from velox/docs/monthly-updates/january-2023.rst rename to velox/docs/monthly-updates/2023/january-2023.rst diff --git a/velox/docs/monthly-updates/july-2023.rst b/velox/docs/monthly-updates/2023/july-2023.rst similarity index 100% rename from velox/docs/monthly-updates/july-2023.rst rename to velox/docs/monthly-updates/2023/july-2023.rst diff --git a/velox/docs/monthly-updates/june-2023.rst b/velox/docs/monthly-updates/2023/june-2023.rst similarity index 100% rename from velox/docs/monthly-updates/june-2023.rst rename to velox/docs/monthly-updates/2023/june-2023.rst diff --git a/velox/docs/monthly-updates/march-2023.rst b/velox/docs/monthly-updates/2023/march-2023.rst similarity index 100% rename from velox/docs/monthly-updates/march-2023.rst rename to velox/docs/monthly-updates/2023/march-2023.rst diff --git a/velox/docs/monthly-updates/may-2023.rst b/velox/docs/monthly-updates/2023/may-2023.rst similarity index 100% rename from velox/docs/monthly-updates/may-2023.rst rename to velox/docs/monthly-updates/2023/may-2023.rst diff --git a/velox/docs/monthly-updates/2023/november-2023.rst b/velox/docs/monthly-updates/2023/november-2023.rst new file mode 100644 index 0000000000000..40bbf2525bc8a --- /dev/null +++ b/velox/docs/monthly-updates/2023/november-2023.rst @@ -0,0 +1,105 @@ +******************** +November 2023 Update +******************** + +Core Library +============ + +* Add spilling support for aggregations over distinct or sorted inputs. :pr:`7305`, :pr:`7526` +* Add support to lazily create the spill directory. :pr:`7660` +* Add config ``merge_exchange.max_buffer_size`` to limit the total memory used by exchange clients. :pr:`7410` +* Add configs ``sort_writer_max_output_rows`` and ``sort_writer_max_output_bytes`` to limit memory usage of sort writer. :pr:`7339` +* Add termination time to TaskStats. This is the time when the downstream workers finish consuming results. Clients such + as Prestissimo can use this metric to clean up tasks. :pr:`7479` +* Add Presto Type Parser based on Flex and Bison. This can be used by a Presto verifier to parse types in the response + from the Presto server :pr:`7568` +* Add support for named row fields in type signature and binding. Example: ``row(foo bigint)`` in a signature only + binds to inputs whose type is row with a single BIGINT field named `foo`. :pr:`7523` +* Add support to shrink cache if clients such as Prestissimo detect high memory usage on a worker. :pr:`7547`, :pr:`7645` +* Fix distinct aggregations with global grouping sets on empty input. Instead of empty results, for global grouping sets, + the expected result is a row per global grouping set with the groupId as the key value. :pr:`7353` +* Fix incorrect runtime stats reporting when memory arbitration is triggered. :pr:`7394` +* Fix ``Timestamp::toMillis()`` to overflow only if the final result overflows. :pr:`7506` + +Presto Functions +================ + +* Add :func:`cosine_similarity` scalar function. +* Add support for INTERVAL DAY TO SECOND type input to :func:`plus`, :func:`minus`, :func:`multiply` functions. +* Add support for combination of TIMESTAMP, INTERVAL DAY TO SECOND type inputs to :func:`plus`, :func:`minus` functions. 
+* Add support for INTERVAL DAY TO SECOND, DOUBLE input arguments to :func:`divide` function. +* Add support to allow non-constant IN list in IN Presto predicate. :pr:`7497` +* Register :func:`array_frequency` function for all primitive types. +* Fix :doc:`bitwise shift functions` to accept shift value `0`. +* Fix :doc:`url_extract_*` functions to return null on malformed inputs and support absolute URIs. +* Fix :func:`from_utf8` handling of invalid UTF-8 codepoint. :pr:`7442` +* Fix :func:`entropy` aggregate function to return `0.0` on null inputs. +* Fix :func:`array_sort` function from producing invalid dictionary vectors. :pr:`7800` +* Fix :func:`lead`, :func:`lag` window functions to return null when the offset is null. :pr:`7254` +* Fix DECIMAL to VARCHAR cast by adding trailing zeros when the value is `0`. :pr:`7588` + +Spark Functions +=============== + +* Add :spark:func:`month`, :spark:func:`quarter`, :spark:func:`unscaled_value`, :spark:func:`regex_replace` + scalar functions. +* Add :spark:func:`make_decimal`, :spark:func:`decimal_round` special form functions. +* Add support for DECIMAL compare with arguments of different precision and scale. :pr:`6207` +* Add support for complex type inputs to :spark:func:`map` function. +* Fix :spark:func:`dayofmonth` and :spark:func:`dayofyear` to allow only DATE type as input and return an INTEGER type. +* Fix :spark:func:`map` function from throwing an exception when used inside an if or switch statement. :pr:`7727` + +Hive Connector +============== + +* Add DirectBufferedInput: a selective BufferedInput without caching. :pr:`7217` +* Add support for reading UNSIGNED INTEGER types in Parquet format. :pr:`6728` +* Add spill support for DWRF sort writer. :pr:`7326` +* Add ``file_handle_cache_enabled`` :doc:`Hive Config` to enable or disable caching file handles. +* Add documentation for ``num_cached_file_handles`` :doc:`configuration property`. +* Add support for DECIMAL and VARCHAR types in BenchmarkParquetReader. :pr:`6275` + +Arrow +===== + +* Add support to export constant vector as `Arrow REE + array `_. :pr:`7327`, :pr:`7398` +* Add support for TIMESTAMP type in Arrow bridge. :pr:`7435` +* Fix Arrow bridge to ensure the null_count is always set and add support for null constants. :pr:`7411` + +Performance and Correctness +=========================== + +* Add PrestoQueryRunner that can be used to verify test results against Presto. :pr:`7628` +* Add support for plans with TableScan in Join Fuzzer. :pr:`7571` +* Add support for custom input generators in Aggregation Fuzzer. :pr:`7594` +* Add support for aggregations over sorted inputs in AggregationFuzzer :pr:`7620` +* Add support for custom result verifiers in AggregationFuzzer. :pr:`7674` +* Add custom verifiers for :func:`approx_percentile` and :func:`approx_distinct` in AggregationFuzzer. :pr:`7654` +* Optimize map subscript by caching input keys in a hash map. :pr:`7191` +* Optimize `FlatVector::copy()` slow path using a DecodedVector and pre-allocated the string buffer. :pr:`7357` +* Optimize `element_at` for maps with complex type keys by sorting the keys and using binary search. :pr:`7365` +* Optimize :func:`concat` by adding a fast path for primitive values. :pr:`7393` +* Optimize :func:`json_parse` function exception handling by switching to simdjson. :pr:`7658` +* Optimize :ref:`add_items` for VARCHAR type by avoiding a deep copy. :pr:`7395` +* Optimize remaining filter by lazily evaluating multi-referenced fields. 
:pr:`7433` +* Optimize ``TopN::addInput()`` by deferring copying of the non-key columns. :pr:`7172` +* Optimize by sorting the inputs once when multiple aggregations share sorting keys and orders. :pr:`7452` +* Optimize Exchange operator by allowing merging of small batches of data into larger vectors. :pr:`7404` + +Build +===== + +* Add DuckDB version 0.8.1 as an external dependency and remove DuckDB amalgamation. :pr:`6725` +* Add `libcpr `_ a lightweight http client. :pr:`7385` +* Upgrade Arrow dependency to 14.0.1 from 13.0.0. + +Credits +======= + +Alex Hornby, Amit Dutta, Andrii Rosa, Austin Dickey Bikramjeet Vig, Cheng Huang, Chengcheng Jin, Christopher Ponce de Leon, +Daniel Munoz, Deepak Majeti, Ge Gao, Genevieve (Genna) Helsel, Harvey Hunt, Jake Jung, Jia, Jia Ke, Jialiang Tan, +Jimmy Lu, John Elliott, Karteekmurthys, Ke, Kevin Wilfong, Krishna Pai, Laith Sakka, Masha Basmanova, Orri Erling, +PHILO-HE, Patrick Sullivan, Pedro Eugenio Rocha Pedreira, Pramod, Richard Barnes, Schierbeck, Cody, Sergey Pershin, +Wei He, Zhenyuan Zhao, aditi-pandit, curt, duanmeng, joey.ljy, lingbin, rui-mo, usurai, vibhatha, wypb, xiaoxmeng, +xumingming, yangchuan, yaqi-zhao, yingsu00, yiweiHeOSS, youxiduo, zhli, 高阳阳 \ No newline at end of file diff --git a/velox/docs/monthly-updates/2023/october-2023.rst b/velox/docs/monthly-updates/2023/october-2023.rst new file mode 100644 index 0000000000000..5fadd9b55e82c --- /dev/null +++ b/velox/docs/monthly-updates/2023/october-2023.rst @@ -0,0 +1,98 @@ +******************* +October 2023 Update +******************* + +Documentation +============= + +* Add documentation for simple UDAF interface. +* Add blog post about `reduce_agg `_ lambda aggregate function. +* Extend documentation for datetime Presto functions to explain handling of :ref:`time zones `. +* Extend documentation for :func:`reduce_agg` Presto lambda aggregate function. + +Core Library +============ + +* Add spill support to Window, RowNumber and TopNRowNumber operators. +* Add spill support after receiving all input to HashAggregation operator. :pr:`6903` +* Add spill stats to the output of printPlanWithStats. +* Add logic to adaptively abandon partial TopNRowNumber if cardinality reduction is not sufficient. :pr:`7195` +* Add optimized version of Window operator for the case when inputs are already partitioned and sorted. :pr:`5437` +* Add support for order-able and comparable arguments to function signatures. +* Add support for order-able and comparable arguments to the Simple Function interface. :pr:`7293` +* Fix Unnest operator to honor `preferred_output_batch_rows` configuration property and avoid producing huge vectors. :pr:`7051` + +Presto Functions +================ + +* Add :func:`find_first` and :func:`find_first_index` scalar lambda functions. +* Add :func:`any_match`, :func:`all_match`, :func:`none_match` scalar lambda functions. +* Add :func:`all_keys_match`, :func:`any_keys_match`, :func:`any_values_match`, + :func:`no_keys_match`, :func:`no_values_match` scalar lambda functions. +* Add :func:`remove_nulls` scalar function. +* Add :func:`ends_with` and :func:`starts_with` scalar functions. +* Add :func:`to_ieee754_32` scalar function. +* Add support for non-constant patterns and escape characters to :func:`like` function. :pr:`6917` +* Add support for BOOLEAN inputs to :func:`least` and :func:`greatest` scalar functions. +* Add support for INTEGER inputs to :func:`poisson_cdf` and :func:`binomial_cdf` scalar functions. 
+* Add support for maps with keys of UNKNOWN type in :func:`map_filter` scalar lambda function. +* Add support for REAL inputs to :func:`geometric_mean` aggregate function. +* Add support for floating point keys to :func:`map_union_sum` aggregate function. +* Add support for CAST to and from complex types with nested JSON values. :pr:`7256` +* Fix 1ms-off issue in :func:`from_unixtime` scalar function. :pr:`7047` +* Fix :func:`array_min` and :func:`array_max` for floating point numbers to match Presto. :pr:`7128` +* Fix :func:`checksum` aggregate function. :pr:`6910` +* Fix :func:`array_sort` and :func:`contains` scalar functions to reject inputs with nested nulls. +* Fix :func:`map_agg`, :func:`set_agg`, :func:`min_by` and :func:`max_by` aggregate functions to + reject inputs with nested nulls. +* Fix :func:`array_sort` and :func:`array_sort_desc` to restrict inputs to order-able types. :pr:`6928` +* Fix :func:`min`, :func:`min_by`, :func:`max`, :func:`max_by` aggregate functions to restrict inputs to order-able types. :pr:`7232` +* Fix CAST(VARCHAR as JSON) for Unicode characters. :pr:`7119` +* Fix CAST(JSON as ROW) to use case-insensitive match for keys. :pr:`7016` + +Spark Functions +=============== + +* Add :spark:func:`array_min`, :spark:func:`array_max`, :spark:func:`add_months`, + :spark:func:`conv`, :spark:func:`substring_index`, :spark:func:`datediff` scalar functions. +* Add support for DECIMAL inputs to :spark:func:`multiply` and :spark:func:`divide`. +* Fix :spark:func:`sum` aggregate function for BIGINT inputs to allow overflow. + +Hive Connector +============== + +* Add support for reading from Azure Storage. :pr:`6675` + +Performance and Correctness +=========================== + +* Optimize spilling by switching to `gfx::timsort `_ (from std::sort). :pr:`6745`. +* Add support for disabling caching in expression evaluation to reduce memory usage via `enable_expression_evaluation_cache` configuration property. :pr:`6898` +* Add support for validating output of every operator via `debug.validate_output_from_operators` configuration property. :pr:`6687` +* Add support for order-able function arguments to the Fuzzer. :pr:`6950` +* Fix edge cases in datetime processing during daylight saving transition. :pr:`7011` +* Fix comparisons of complex types values using floating point numbers in the RowContainer. :pr:`5833` +* Fix window aggregations for empty frames. :pr:`6872` +* Fix GroupID operator with duplicate grouping keys in the output. :pr:`6738` +* Fix global grouping set aggregations for empty inputs. :pr:`7112` +* Fix aggregation function framework to require raw input types for all aggregates to avoid confusion and incorrect results. :pr:`7037` + +Build Systems +============= + +* Add support for Conda Environments. 
:pr:`6282` + +Credits +======= + +Alex, Alex Hornby, Amit Dutta, Ann Rose Benny, Bikramjeet Vig, Chengcheng Jin, +Christian Zentgraf, Cody Ohlsen, Daniel Munoz, David Tolnay, Deepak Majeti, +Genevieve (Genna) Helsel, Huameng (Michael) Jiang, Jacob Wujciak-Jens, Jaihari +Loganathan, Jason Sylka, Jia Ke, Jialiang Tan, Jimmy Lu, John Elliott, Jubin +Chheda, Karteekmurthys, Ke, Kevin Wilfong, Krishna Pai, Krishna-Prasad-P-V, +Laith Sakka, Ma-Jian1, Mahadevuni Naveen Kumar, Mark Shroyer, Masha Basmanova, +Orri Erling, PHILO-HE, Patrick Sullivan, Pedro Eugenio Rocha Pedreira, Pramod, +Prasoon Telang, Pratik Joseph Dabre, Pratyush Verma, Rong Ma, Sergey Pershin, +Wei He, Zac, aditi-pandit, dependabot[bot], duanmeng, joey.ljy, lingbin, +rrando901, rui-mo, usurai, wypb, xiaoxmeng, xumingming, yan ma, yangchuan, +yingsu00, zhejiangxiaomai, 高阳阳 diff --git a/velox/docs/monthly-updates/september-2023.rst b/velox/docs/monthly-updates/2023/september-2023.rst similarity index 100% rename from velox/docs/monthly-updates/september-2023.rst rename to velox/docs/monthly-updates/2023/september-2023.rst diff --git a/velox/docs/monthly-updates/april-2024.rst b/velox/docs/monthly-updates/april-2024.rst new file mode 100644 index 0000000000000..919e8551bdbb1 --- /dev/null +++ b/velox/docs/monthly-updates/april-2024.rst @@ -0,0 +1,63 @@ +***************** +April 2024 Update +***************** + +Documentation +============= + +* Document operations on decimals for :doc:`Presto ` + and :doc:`Spark `. +* Document spill write stats. :pr:`9326` + +Core Library +============ + +* Fix bugs in Window operator. :pr:`9476`, :pr:`9271`, :pr:`9257` + +Presto Functions +================ + +* Add :func:`word_stem` and :func:`to_iso8601` scalar functions. +* Add support for DECIMAL inputs to :func:`arbitrary`, :func:`min` and :func:`max` aggregate functions. +* Fix :func:`json_extract` for paths with wildcards. + +Spark Functions +=============== + +* Add :spark:func:`array_size`, :spark:func:`flatten`, :spark:func:`year_of_week` scalar functions. +* Add :spark:func:`collect_list` and :spark:func:`regr_replacement` aggregate functions. + +Hive Connector +============== + +* Add support for storing decimal as integer in Parquet writer. +* Add hive.s3.connect-timeout, hive.s3.socket-timeout and hive.s3.max-connections configs. :pr:`9472` +* Fix complex type handling in Parquet reader. :pr:`9187` +* Fix DWRF reader to skip null map keys. + +Performance and Correctness +=========================== + +* Add aggregation and window fuzzer runs to every PR. +* Add nightly run of window fuzzer. +* Add check for aggregate function signature changes to every PR. +* Add biased aggregation fuzzer run for newly added aggregate functions to every PR. + +Build System +============ + +* Add nightly job to track build metrics.
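The ``hive.s3.*`` settings above are plain string-keyed connector properties. A minimal sketch of assembling them, assuming the usual map-of-strings configuration style; the keys come from the release note, while the helper name and values are illustrative only:

.. code-block:: c++

   #include <string>
   #include <unordered_map>

   // Hypothetical helper: the keys are the new April 2024 options; the
   // values here are example settings, not Velox defaults.
   std::unordered_map<std::string, std::string> makeS3ConnectorConfig() {
     return {
         {"hive.s3.connect-timeout", "10s"},
         {"hive.s3.socket-timeout", "30s"},
         {"hive.s3.max-connections", "64"}};
   }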
+ +Credits +======= + +Andres Suarez, Andrii Rosa, Ankita Victor, Ashwin Krishna Kumar, Bikramjeet Vig, +Christian Zentgraf, Daniel Munoz, David McKnight, Deepak Majeti, Hengzhi Chen, +Huameng (Michael) Jiang, Jacob Wujciak-Jens, Jeongseok Lee, Jialiang Tan, Jimmy +Lu, Karteekmurthys, Ke, Kevin Wilfong, Krishna Pai, Lu Niu, Ludovic Henry, Ma, +Rong, Mahadevuni Naveen Kumar, Masha Basmanova, Mike Lui, Minhan Cao, PHILO-HE, +Pedro Eugenio Rocha Pedreira, Pedro Pedreira, Pramod, Qian Sun, Richard Barnes, +Sergey Pershin, Shabab Ayub, Tengfei Huang, Terry Wang, Wei He, Weitao Wan, +Wills Feng, Yang Zhang, Yihong Wang, Yoav Helfman, Zac Wen, Zhenyuan Zhao, +aditi-pandit, chliang, cindyyyang, duanmeng, jay.narale, joey.ljy, mohsaka, +rui-mo, svm1, willsfeng, wutiangan, wypb, xiaoxmeng, yingsu00, zhli1142015 diff --git a/velox/docs/monthly-updates/february-2024.rst b/velox/docs/monthly-updates/february-2024.rst new file mode 100644 index 0000000000000..18704d2bb5ac4 --- /dev/null +++ b/velox/docs/monthly-updates/february-2024.rst @@ -0,0 +1,68 @@ +******************** +February 2024 Update +******************** + +Core Library +============ + +* Add support for aggregations over distinct inputs to StreamingAggregation. +* Add support for deserializing a single column in Presto page format. +* Add support for deserializing an all-null column serialized as UNKNOWN type in Presto page format. +* Add stats for null skew in join operator. +* Convert TIMESTAMP_WITH_TIME_ZONE type to a primitive type. +* Add background profiler that starts Linux perf on the Velox process. +* Fix ``out of range in dynamic array`` error in Task::toJson. +* Delete unused ``max_arbitrary_buffer_size`` config. + +Presto Functions +================ + +* Add :func:`typeof`, :func:`from_iso8601_date` scalar functions. +* Add support for DECIMAL input type to :func:`set_agg` and :func:`set_union` aggregate functions. +* Add support for UNKNOWN input type to :func:`checksum` aggregate function. +* Add support for DATE +/- INTERVAL YEAR MONTH functions. +* Add support for ``UCT|UCT|GMT|GMT0`` as ``Z`` to :func:`parse_datetime` scalar function. + +Spark Functions +=============== + +* Add :spark:func:`array_repeat`, :spark:func:`date_from_unix_date`, :spark:func:`weekday`, :spark:func:`minute`, :spark:func:`second` scalar functions. +* Add :spark:func:`ntile` window function. + +Hive Connector +============== + +* Add ``ignore_missing_files`` config. +* Add write support to ABFS file system. +* Add support for proxy to S3 file system. + +Arrow +===== + +* Add support to export UNKNOWN type to Arrow array. +* Add support to convert Arrow REE arrays to Velox Vectors. + +Performance and Correctness +=========================== + +* Add FieldReference benchmark. +* Add :ref:`Window fuzzer `. +* Fix ``Too many open files`` error in Join fuzzer. + +Build System +============ + +* Add ``VELOX_BUILD_MINIMAL_WITH_DWIO`` CMake option. +* Move documentation, header and format check to Github Action. 
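The Arrow items above extend Velox's Arrow C Data Interface bridge (exporting UNKNOWN, importing REE arrays). A minimal export sketch, assuming the ``exportToArrow`` entry points declared in ``velox/vector/arrow/Bridge.h``; treat the exact overload shapes as assumptions:

.. code-block:: c++

   #include "velox/vector/arrow/Bridge.h"

   // Hedged sketch: export a Velox vector's type and data through the Arrow
   // C Data Interface structs. The overload shapes are assumptions based on
   // the public Bridge.h header, not a verified reference.
   void exportVector(
       const facebook::velox::VectorPtr& vector,
       facebook::velox::memory::MemoryPool* pool) {
     ArrowSchema schema;
     ArrowArray array;
     facebook::velox::exportToArrow(vector, schema);
     facebook::velox::exportToArrow(vector, array, pool);
   }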
+ +Credits +======= + +Aaron Feldman, Ankita Victor, Bikramjeet Vig, Christian Zentgraf, Daniel Munoz, +David McKnight, Deepak Majeti, Ge Gao, Hongze Zhang, Jacob Wujciak-Jens, Jia Ke, +Jialiang Tan, Jimmy Lu, Kevin Wilfong, Krishna Pai, Lu Niu, Masha Basmanova, +Nick Terrell, Orri Erling, PHILO-HE, Pedro Pedreira, Pramod, Pranjal Shankhdhar, +Richard Barnes, Schierbeck, Cody, Sergey Pershin, Wei He, Yedidya Feldblum, +Zac Wen, Zhenyuan Zhao, aditi-pandit, duanmeng, gayangya, hengjiang.ly, hitarth, +lingbin, mwish, rrando901, rui-mo, xiaodou, xiaoxmeng, xumingming, yingsu00, +zhli1142015, 高阳阳 diff --git a/velox/docs/monthly-updates/january-2024.rst b/velox/docs/monthly-updates/january-2024.rst new file mode 100644 index 0000000000000..dc917581e8e67 --- /dev/null +++ b/velox/docs/monthly-updates/january-2024.rst @@ -0,0 +1,84 @@ +******************** +January 2024 Update +******************** + +Documentation +============= + +* Add documentation about :doc:`Hash Table `. +* Add documentation about :doc:`memory management `. + +Core Library +============ + +* Add metrics to track time spent in memory arbitration. :pr:`8497`, :pr:`8482` +* Add metric to track average buffer time for exchange. :pr:`8534` +* Optimize count(distinct x) when x is of complex type. :pr:`8560` +* Optimize latency for exchange that uses arbitrary buffer. :pr:`8532`, :pr:`8480` +* Optimize MallocAllocator to reduce lock contention. :pr:`8477` +* Fix aggregation over all-null keys with ignoreNullKeys = true. :pr:`8422` +* Fix race condition in task completion that caused `Output buffers for task not found` failures. :pr:`8357` +* Fix evaluation of CAST expression under TRY. :pr:`8365` +* Fix `FlatVector::copy` for vectors with more than 2GB of data. :pr:`8516` +* Fix crash in `FlatVector::ensureWritable`. :pr:`8450` +* Fix interaction of spilling and yielding in Hash Join operator. :pr:`8520` +* Fix rawInputPositions metrics in Exchange operator. :pr:`8370` + +Presto Functions +================ + +* Add :func:`from_ieee754_64`, :func:`multimap_from_entries`, :func:`ngrams` functions. +* Add support for VARBINARY inputs to :func:`reverse` function. +* Add support for arrays of complex types to :func:`array_min` and :func:`array_max` functions. +* Add support for casting DOUBLE and VARCHAR as DECIMAL. +* Add support for UNKNOWN key to :func:`map_agg` function. +* Add support for timezone offsets to :func:`timezone_hour` and :func:`timezone_minute` functions. :pr:`8269` +* Optimize cast from JSON by using simdjson. :pr:`8216` +* Fix handling of timestamps with timezone in :func:`date_diff` function. :pr:`8540` +* Fix :func:`json_parse` for inputs with very large numbers. :pr:`8455` +* Fix kAtLeastN and kExactlyN fast paths in LIKE for inputs with multi-byte characters. :pr:`8150` +* Fix :func:`approx_distinct` aggregate function for TIMESTAMP inputs. :pr:`8164` +* Fix :func:`min` and :func:`max` when used in Window operator. :pr:`8311` + +Spark Functions +=============== + +* Add :spark:func:`from_unixtime`, :spark:func:`find_in_set`, :spark:func:`get_timestamp`, + :spark:func:`hour`, :spark:func:`hex`, :spark:func:`isnan`, :spark:func:`replace` functions. +* Add support for TINYINT and SMALLINT inputs to :spark:func:`date_add` and :spark:func:`date_sub` functions. +* Add support for casting DOUBLE and VARCHAR as DECIMAL. +* Fix crash in :spark:func:`conv` function. :pr:`8046` + +Hive Connector +============== + +* Fix crash in Parquet reader when processing empty row groups. 
:pr:`8000` +* Fix data sink to avoid writing partition columns to files. :pr:`8089` + +Performance and Correctness +=========================== + +* Add support for aggregations over distinct inputs to AggregationFuzzer. +* Reduce memory usage of histogram metrics. :pr:`8458` +* Add Join Fuzzer run to CI that runs on each PR. +* Add Aggregation Fuzzer run using Presto as source of truth to experimental CI. + +Build System +============ + +* Upgrade folly to v2023.12.04.00 (from v2022.11.14.00). +* Upgrade fmt to 10.1.1 (from 8.0.1). + +Credits +======= + +Amit Dutta, Benwei Shi, Bikramjeet Vig, Chen Zhang, Chengcheng Jin, Christian +Zentgraf, Deepak Majeti, Ge Gao, Hongze Zhang, Jacob Wujciak-Jens, Jia Ke, +Jialiang Tan, Jimmy Lu, Ke, Kevin Wilfong, Krishna Pai, Laith Sakka, Lu Niu, +Ma, Rong, Masha Basmanova, Mike Lui, Orri Erling, PHILO-HE, Pedro Eugenio +Rocha Pedreira, Pratik Joseph Dabre, Ravi Rahman, Richard Barnes, Schierbeck, +Cody, Sergey Pershin, Sitao Lv, Taras Galkovskyi, Wei He, Yedidya Feldblum, +Yuan Zhou, Yuping Fan, Zac Wen, aditi-pandit, binwei, duanmeng, hengjiang.ly, +icejoywoo, lingbin, mwish, rui-mo, wypb, xiaoxmeng, xumingming, yangchuan, +yingsu00, youxiduo, yuling.sh, zhli1142015, zky.zhoukeyong, zwangsheng + diff --git a/velox/docs/monthly-updates/march-2024.rst b/velox/docs/monthly-updates/march-2024.rst new file mode 100644 index 0000000000000..636f52ed3a4f4 --- /dev/null +++ b/velox/docs/monthly-updates/march-2024.rst @@ -0,0 +1,79 @@ +***************** +March 2024 Update +***************** + +Documentation +============= + +* Document `design philosophy `_ +* Document custom input generators and verifiers supported in the Aggregation Fuzzer. +* Document runtime stats reported by the HashTable. :pr:`9255` +* Document usage of generic types in Simple Function API. :pr:`9084` + +Core Library +============ + +* Add prefix-sort for fixed width sorting keys. +* Add null behavior and determinism scalar function metadata to the registry. :pr:`9209` +* Add order-sensitive aggregate function metadata to the registry. :pr:`9050` +* Add support for DECIMAL type to Simple Function API. :pr:`9096` +* Add support for lambda functions (reduce_agg) to StreamingAggregation. +* Deprecate threshold based spilling in Aggregation and OrderBy. +* Optimize Exchange protocol used by Presto for latency. :pr:`8845` + +Presto Functions +================ + +* Add :func:`day`, :func:`from_ieee754_32`, :func:`hamming_distance`, :func:`map_normalize`, + :func:`map_top_n` scalar functions. +* Add support for DECIMAL input type to :func:`floor` function. +* Add support for timestamp +/- IntervalYearMonth. +* Add :func:`regr_avgx`, :func:`regr_avgy`, :func:`regr_count`, :func:`regr_r2`, + :func:`regr_sxx`, :func:`regr_sxy`, and :func:`regr_syy` aggregation functions. + +Spark Functions +=============== + +* Add :spark:func:`array_remove`, :spark:func:`bit_length`, :spark:func:`bitwise_xor`, + :spark:func:`bitwise_not`, :spark:func:`make_ym_interval`, :spark:func:`from_utc_timestamp`, + :spark:func:`to_utc_timestamp`, :spark:func:`make_timestamp`, :spark:func:`map_subset`, + :spark:func:`unhex`, :spark:func:`unix_date`, :spark:func:`uuid` functions. +* Add :spark:func:`regexp_replace` function. +* Add :spark:func:`monotonically_increasing_id`, :spark:func:`spark_partition_id` functions. +* Add :spark:func:`kurtosis` and :spark:func:`skewness` aggregation functions. +* Add support for DECIMAL inputs to :spark:func:`sum` aggregation function. +* Add CAST(real as decimal). 
+* Add configuration property 'spark.partition_id'. + +Hive Connector +============== + +* Add support for S3 client no_proxy CIDR expression. :pr:`9160` +* Add support for synthetic columns '$file_size' and '$file_modified_time'. +* Optimize reading a small sample of rows. :pr:`8920`. +* Fix Parquet reader for files with different encodings across row groups. :pr:`9129` + +Performance and Correctness +=========================== + +* Add nightly run of Aggregation fuzzer using Presto as source of truth. +* Add nightly run of Exchange fuzzer. +* Add utility to randomly trigger OOMs and integrate it into Aggregation and Join fuzzers. +* Add group execution mode to Join fuzzer. +* Add support for random frame clause generation to Window fuzzer. +* Add custom input generator for map_union_sum Presto aggregation function. +* Add custom result verifier for arbitrary Presto aggregation function. + +Credits +======= + +8dukongjian, Amit Dutta, Ankita Victor, Bikramjeet Vig, Christian Zentgraf, +Daniel Munoz, Deepak Majeti, Ge Gao, InitialZJ, Jacob Wujciak-Jens, Jake Jung, +Jialiang Tan, Jimmy Lu, Karteekmurthys, Kevin Wilfong, Krishna Pai, Ma, Rong, +Mahadevuni Naveen Kumar, Marcus D. Hanwell, Masha Basmanova, Nicholas Ormrod, +Nick Terrell, Orri Erling, PHILO-HE, Patrick Sullivan, Pedro Pedreira, Pramod, +Pratik Joseph Dabre, Qian Sun, Richard Barnes, Sandino Flores, Schierbeck, +Cody, Sergey Pershin, Ubuntu, Wei He, Yang Zhang, Zac Wen, aditi-pandit, +duanmeng, f0rest9999, hengjiang.ly, joey.ljy, lingbin, mwish, rexan, rui-mo, +willsfeng, wypb, xiaodai1002, xiaoxmeng, xumingming, youxiduo, yuling.sh, +zhli1142015, zky.zhoukeyong diff --git a/velox/docs/monthly-updates/may-2024.rst b/velox/docs/monthly-updates/may-2024.rst new file mode 100644 index 0000000000000..9768ce5840b72 --- /dev/null +++ b/velox/docs/monthly-updates/may-2024.rst @@ -0,0 +1,87 @@ +*************** +May 2024 Update +*************** + +Documentation +============= + +* Publish blog post about `optimizing TRY and TRY_CAST `_ +* Publish `Technical Governance `_. + +Core Library +============ + +* Optimize TRY and TRY_CAST for cases when many rows fail. +* Add support for LEFT SEMI FILTER and RIGHT SEMI FILTER merge joins. +* Add support for DECIMAL input to aggregations over distinct values. :pr:`9850` +* Add support for specifying overwrite behavior when registering simple scalar functions. :pr:`9158` +* Fix LEFT merge join with extra filter. :pr:`9862` +* Fix Nested Loop join with empty build and extra filter. :pr:`9892` +* Fix crash when copying complex vectors. :pr:`9725` +* Fix reuse of LazyVectors in TableScan to avoid crashes in downstream operators. :pr:`9811` +* Add support for including metadata identifying the operator that generated an error in error messages to ease troubleshooting. :pr:`9695` + +Presto Functions +================ + +* Add :func:`at_timezone` function. +* Add support for VARBINARY input to :func:`approx_distinct` aggregate function. +* Add support for CAST(varchar AS timestamp with time zone). +* Add support for DECIMAL inputs to :func:`min_by` and :func:`max_by` aggregate functions. +* Fix :func:`arrays_overlap` function for empty arrays. :pr:`9922` +* Fix :func:`map_top_n` function to break ties by comparing keys. +* Fix CAST(IntervalDayTime AS Varchar) for negative intervals. :pr:`9871` +* Fix :func:`from_base64` for inputs without padding. 
:pr:`8647` +* Fix handling of equality and total ordering of NaN (Not-a-Number) floating point + values in :func:`array_min`, :func:`array_sort`, :func:`array_distinct`, + :func:`array_except`, :func:`array_intersect`, :func:`array_union`, + :func:`array_position`, :func:`array_remove`, :func:`arrays_overlap`, :func:`contains`, + map subscript and :func:`multimap_agg`. + +Spark Functions +=============== + +* Add :spark:func:`expm1`, :spark:func:`get`, :spark:func:`rint`, :spark:func:`shuffle`, + :spark:func:`soundex`, :func:`unix_seconds`, :spark:func:`width_bucket` functions. +* Add support for complex type inputs to :spark:func:`hash` and :spark:func:`xxhash64` functions. +* Fix CAST(tinyint/smallint/integer/bigint as varbinary). :pr:`9819` +* Fix return type for :spark:func:`sum` aggregate function with REAL input. :pr:`9818` + +Hive Connector +============== + +* Add support for projecting synthesized row-number column from Table Scan. :pr:`9174` + +Performance and Correctness +=========================== + +* Optimize memory arbitration to avoid interference between queries to reduce overall query execution time. +* Add cache expiration function to simple LRU cache to support remote IO throttling. +* Add Fuzzers for TableWriter and RowNumber operators. +* Add support for Nested Loop joins to Join Fuzzer. +* Add support for testing different sorting flags to Window Fuzzer. +* Add custom argument type generators for Presto decimal functions. :pr:`9715` +* Add support for logical input types in the evaluateOnce() unit test helper method. :pr:`9708` +* Re-enable testing of merge joins in Join Fuzzer. + +Build System +============ + +* Upgrade aws-sdk-cpp to 1.11.321 (from 1.11.169). +* Upgrade cmake to 3.28.3 (from 3.14). +* Upgrade simdjson to 3.9.3 (from 3.8.0). +* Add support for docker image with Spark Connect server to use with Fuzzer. :pr:`9759` +* Add dashboard with `build time metrics `_. + +Credits +======= + +Ankita Victor, Ashwin Krishna Kumar, Bikramjeet Vig, Bradley Dice, Daniel Munoz, +Deepak Majeti, Giuseppe Ottaviano, Jacob Wujciak-Jens, Jia Ke, Jialiang Tan, +Jimmy Lu, Joe Abraham, Karteekmurthys, Ke, Kevin Wilfong, Kk Pulla, Krishna +Pai, Ma, Rong, Masha Basmanova, NEUpanning, PHILO-HE, Patrick Sullivan, Pedro +Eugenio Rocha Pedreira, Richard Barnes, Sandino Flores, Sergey Pershin, Ubuntu, +Wei He, Weihan Tang, Yang Zhang, Zac Wen, Zuyu ZHANG, aditi-pandit, chliang, +duanmeng, gaoyangxiaozhu, jay.narale, joey.ljy, kevin, kikimo, lingbin, rui-mo, +svm1, xiaoxmeng, xumingming, yan ma, yanngyoung, yingsu00, zhli1142015, +zhouyifan279, zjuwangg, zky.zhoukeyong, 高阳阳 diff --git a/velox/docs/programming-guide/chapter01.rst b/velox/docs/programming-guide/chapter01.rst index 3f4f9e56e0234..a7b6cb303629b 100644 --- a/velox/docs/programming-guide/chapter01.rst +++ b/velox/docs/programming-guide/chapter01.rst @@ -15,7 +15,7 @@ Let’s start by getting access to a MemoryPool: #include "velox/common/memory/Memory.h" - auto pool = memory::addDefaultLeafMemoryPool(); + auto pool = memory::memoryManager()->addLeafPool(); `pool` is a std::shared_ptr. We can use it to allocate buffers. diff --git a/velox/docs/spark_functions.rst b/velox/docs/spark_functions.rst index f8ba812160829..24c825ac1ef59 100644 --- a/velox/docs/spark_functions.rst +++ b/velox/docs/spark_functions.rst @@ -2,21 +2,27 @@ Spark Functions *********************** +The semantics of Spark functions match Spark 3.5 with ANSI OFF. + .. 
toctree:: :maxdepth: 1 functions/spark/math functions/spark/bitwise + functions/spark/decimal functions/spark/comparison functions/spark/string functions/spark/datetime functions/spark/array functions/spark/map + functions/spark/misc functions/spark/regexp functions/spark/binary - functions/spark/json functions/spark/aggregate functions/spark/window + functions/spark/conversion + functions/spark/url + functions/spark/json Here is a list of all scalar and aggregate Spark functions available in Velox. Function names link to function descriptions. Check out coverage maps @@ -58,32 +64,32 @@ for :doc:`all ` functions. ================================ ================================ ================================ == ================================ == ================================ Scalar Functions Aggregate Functions Window Functions ==================================================================================================== == ================================ == ================================ - :spark:func:`abs` :spark:func:`floor` :spark:func:`power` :spark:func:`bit_xor` :spark:func:`nth_value` - :spark:func:`acos` :spark:func:`get_json_object` :spark:func:`rand` :spark:func:`first` - :spark:func:`acosh` :spark:func:`greaterthan` :spark:func:`regexp_extract` :spark:func:`first_ignore_null` - :spark:func:`add` :spark:func:`greaterthanorequal` :spark:func:`remainder` :spark:func:`last` - :spark:func:`aggregate` :spark:func:`greatest` :spark:func:`replace` :spark:func:`last_ignore_null` - :spark:func:`array` :spark:func:`hash` :spark:func:`rlike` - :spark:func:`array_contains` :spark:func:`hypot` :spark:func:`round` - :spark:func:`array_intersect` :spark:func:`in` :spark:func:`rtrim` - :spark:func:`array_sort` :spark:func:`instr` :spark:func:`sec` - :spark:func:`ascii` :spark:func:`isnotnull` :spark:func:`sha1` - :spark:func:`asinh` :spark:func:`isnull` :spark:func:`sha2` - :spark:func:`atanh` :spark:func:`least` :spark:func:`shiftleft` - :spark:func:`between` :spark:func:`left` :spark:func:`shiftright` - :spark:func:`bin` :spark:func:`length` :spark:func:`sinh` - :spark:func:`bitwise_and` :spark:func:`lessthan` :spark:func:`size` - :spark:func:`bitwise_or` :spark:func:`lessthanorequal` :spark:func:`sort_array` - :spark:func:`ceil` :spark:func:`log1p` :spark:func:`split` - :spark:func:`chr` :spark:func:`lower` :spark:func:`startswith` - :spark:func:`concat` :spark:func:`ltrim` :spark:func:`substring` - :spark:func:`contains` :spark:func:`map` :spark:func:`subtract` - :spark:func:`csc` :spark:func:`map_filter` :spark:func:`to_unix_timestamp` - :spark:func:`divide` :spark:func:`map_from_arrays` :spark:func:`transform` - :spark:func:`element_at` :spark:func:`md5` :spark:func:`trim` - :spark:func:`endswith` :spark:func:`might_contain` :spark:func:`unaryminus` - :spark:func:`equalnullsafe` :spark:func:`multiply` :spark:func:`unix_timestamp` - :spark:func:`equalto` :spark:func:`not` :spark:func:`upper` - :spark:func:`exp` :spark:func:`notequalto` :spark:func:`xxhash64` - :spark:func:`filter` :spark:func:`pmod` :spark:func:`year` + :spark:func:`abs` :spark:func:`floor` :spark:func:`power` :spark:func:`bit_xor` :spark:func:`nth_value` + :spark:func:`acos` :spark:func:`get_json_object` :spark:func:`rand` :spark:func:`first` + :spark:func:`acosh` :spark:func:`greaterthan` :spark:func:`regexp_extract` :spark:func:`first_ignore_null` + :spark:func:`add` :spark:func:`greaterthanorequal` :spark:func:`remainder` :spark:func:`last` + :spark:func:`aggregate` :spark:func:`greatest` 
:spark:func:`replace` :spark:func:`last_ignore_null` + :spark:func:`array` :spark:func:`hash` :spark:func:`rlike` + :spark:func:`array_contains` :spark:func:`hypot` :spark:func:`round` + :spark:func:`array_intersect` :spark:func:`in` :spark:func:`rtrim` + :spark:func:`array_sort` :spark:func:`instr` :spark:func:`sec` + :spark:func:`ascii` :spark:func:`isnotnull` :spark:func:`sha1` + :spark:func:`asinh` :spark:func:`isnull` :spark:func:`sha2` + :spark:func:`atanh` :spark:func:`least` :spark:func:`shiftleft` + :spark:func:`between` :spark:func:`left` :spark:func:`shiftright` + :spark:func:`bin` :spark:func:`length` :spark:func:`sinh` + :spark:func:`bitwise_and` :spark:func:`lessthan` :spark:func:`size` + :spark:func:`bitwise_or` :spark:func:`lessthanorequal` :spark:func:`sort_array` + :spark:func:`ceil` :spark:func:`log1p` :spark:func:`split` + :spark:func:`chr` :spark:func:`lower` :spark:func:`startswith` + :spark:func:`concat` :spark:func:`ltrim` :spark:func:`substring` + :spark:func:`contains` :spark:func:`map` :spark:func:`subtract` + :spark:func:`csc` :spark:func:`map_filter` :spark:func:`to_unix_timestamp` + :spark:func:`divide` :spark:func:`map_from_arrays` :spark:func:`transform` + :spark:func:`element_at` :spark:func:`md5` :spark:func:`trim` + :spark:func:`endswith` :spark:func:`might_contain` :spark:func:`unaryminus` + :spark:func:`equalnullsafe` :spark:func:`multiply` :spark:func:`unix_timestamp` + :spark:func:`equalto` :spark:func:`not` :spark:func:`upper` + :spark:func:`exp` :spark:func:`notequalto` :spark:func:`xxhash64` + :spark:func:`filter` :spark:func:`pmod` :spark:func:`year` ================================ ================================ ================================ == ================================ == ================================ diff --git a/velox/docs/velox-in-10-min.rst b/velox/docs/velox-in-10-min.rst index fe769250eba1b..9d202c77951a8 100644 --- a/velox/docs/velox-in-10-min.rst +++ b/velox/docs/velox-in-10-min.rst @@ -324,7 +324,7 @@ provide a split. .. code-block:: c++ plan = PlanBuilder() - .tableScan( + .tpchTableScan( tpch::Table::TBL_NATION, {"n_nationkey", "n_name"}, 1 /*scaleFactor*/) @@ -367,14 +367,14 @@ IDs. core::PlanNodeId nationScanId; core::PlanNodeId regionScanId; plan = PlanBuilder(planNodeIdGenerator) - .tableScan( + .tpchTableScan( tpch::Table::TBL_NATION, {"n_regionkey"}, 1 /*scaleFactor*/) .capturePlanNodeId(nationScanId) .hashJoin( {"n_regionkey"}, {"r_regionkey"}, PlanBuilder(planNodeIdGenerator) - .tableScan( + .tpchTableScan( tpch::Table::TBL_REGION, {"r_regionkey", "r_name"}, 1 /*scaleFactor*/) diff --git a/velox/duckdb/README.md b/velox/duckdb/README.md index 0302b7bbf903a..ea1d19ddea2c2 100644 --- a/velox/duckdb/README.md +++ b/velox/duckdb/README.md @@ -15,10 +15,6 @@ Then copy the generated files to velox/external/duckdb: export VELOX_PATH="" rsync -vrh src/amalgamation/duckdb* ${VELOX_PATH}/velox/external/duckdb/ -We also maintain a copy of TPC-H dataset generators that need to be updated: - - rsync -vrh --exclude={'CMakeLists.txt','LICENSE','*.py','dbgen/queries','dbgen/answers'} extension/tpch/ ${VELOX_PATH}/velox/external/duckdb/tpch/ - After the new files are copied, ensure that the new code compiles and that it doesn't break any tests. 
Velox relies on many internal APIs, so there is a good chance that this will not work out-of-the-box and that you will have to dig in diff --git a/velox/duckdb/conversion/CMakeLists.txt b/velox/duckdb/conversion/CMakeLists.txt index ebf7f4fcb8bdb..8361cb68e20f1 100644 --- a/velox/duckdb/conversion/CMakeLists.txt +++ b/velox/duckdb/conversion/CMakeLists.txt @@ -11,15 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_duckdb_conversion DuckConversion.cpp DuckWrapper.cpp) +velox_add_library(velox_duckdb_conversion DuckConversion.cpp) -target_link_libraries(velox_duckdb_conversion velox_core velox_vector duckdb - tpch_extension) +velox_link_libraries(velox_duckdb_conversion velox_core velox_vector + duckdb_static) -add_library(velox_duckdb_parser DuckParser.cpp) +velox_add_library(velox_duckdb_parser DuckParser.cpp) -target_link_libraries(velox_duckdb_parser velox_duckdb_conversion duckdb - velox_parse_expression) +velox_link_libraries(velox_duckdb_parser velox_duckdb_conversion + velox_parse_expression duckdb_static) if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) diff --git a/velox/duckdb/conversion/DuckConversion.cpp b/velox/duckdb/conversion/DuckConversion.cpp index 7764b3b99b041..63c99b97031b1 100644 --- a/velox/duckdb/conversion/DuckConversion.cpp +++ b/velox/duckdb/conversion/DuckConversion.cpp @@ -33,7 +33,7 @@ using ::duckdb::string_t; using ::duckdb::timestamp_t; variant decimalVariant(const Value& val) { - VELOX_DCHECK(val.type().id() == LogicalTypeId::DECIMAL) + VELOX_DCHECK(val.type().id() == LogicalTypeId::DECIMAL); switch (val.type().InternalType()) { case ::duckdb::PhysicalType::INT128: { auto unscaledValue = val.GetValueUnsafe<::duckdb::hugeint_t>(); @@ -140,6 +140,12 @@ TypePtr toVeloxType(LogicalType type, bool fileColumnNamesReadAsLowerCase) { return DATE(); case LogicalTypeId::TIMESTAMP: return TIMESTAMP(); + case LogicalTypeId::TIMESTAMP_TZ: { + if (auto customType = getCustomType("TIMESTAMP WITH TIME ZONE")) { + return customType; + } + [[fallthrough]]; + } case LogicalTypeId::INTERVAL: return INTERVAL_DAY_TIME(); case LogicalTypeId::BLOB: @@ -175,6 +181,19 @@ TypePtr toVeloxType(LogicalType type, bool fileColumnNamesReadAsLowerCase) { } return ROW(std::move(names), std::move(types)); } + case LogicalTypeId::UUID: { + if (auto customType = getCustomType("UUID")) { + return customType; + } + [[fallthrough]]; + } + case LogicalTypeId::USER: { + const auto name = ::duckdb::UserType::GetTypeName(type); + if (auto customType = getCustomType(name)) { + return customType; + } + [[fallthrough]]; + } default: throw std::runtime_error( "unsupported type for duckdb -> velox conversion: " + diff --git a/velox/duckdb/conversion/DuckConversion.h b/velox/duckdb/conversion/DuckConversion.h index de5d2fc4fdf7c..9888a0cb66a87 100644 --- a/velox/duckdb/conversion/DuckConversion.h +++ b/velox/duckdb/conversion/DuckConversion.h @@ -15,9 +15,10 @@ */ #pragma once -#include "velox/external/duckdb/duckdb.hpp" #include "velox/type/Type.h" +#include // @manual + namespace facebook::velox { class variant; } diff --git a/velox/duckdb/conversion/DuckParser.cpp b/velox/duckdb/conversion/DuckParser.cpp index 3e57f1330c1da..a3ce0702b67df 100644 --- a/velox/duckdb/conversion/DuckParser.cpp +++ b/velox/duckdb/conversion/DuckParser.cpp @@ -17,10 +17,23 @@ #include "velox/common/base/Exceptions.h" #include "velox/core/PlanNode.h" #include 
"velox/duckdb/conversion/DuckConversion.h" -#include "velox/external/duckdb/duckdb.hpp" #include "velox/parse/Expressions.h" #include "velox/type/Variant.h" +#include // @manual +#include // @manual +#include // @manual +#include // @manual +#include // @manual +#include // @manual +#include // @manual +#include // @manual +#include // @manual +#include // @manual +#include // @manual +#include // @manual +#include // @manual + namespace facebook::velox::duckdb { using ::duckdb::BetweenExpression; @@ -387,8 +400,8 @@ std::shared_ptr parseOperatorExpr( ExpressionType::VALUE_CONSTANT) { auto constExpr = dynamic_cast(castExpr->child.get()); - auto value = - constExpr->value.CastAs(castExpr->cast_type, !castExpr->try_cast); + auto value = constExpr->value.DefaultCastAs( + castExpr->cast_type, !castExpr->try_cast); values.emplace_back(duckValueToVariant(value)); valueType = toVeloxType(castExpr->cast_type); continue; @@ -557,7 +570,7 @@ std::shared_ptr parseLambdaExpr( const ParseOptions& options) { const auto& lambdaExpr = dynamic_cast<::duckdb::LambdaExpression&>(expr); auto capture = parseExpr(*lambdaExpr.lhs, options); - auto body = parseExpr(*lambdaExpr.rhs, options); + auto body = parseExpr(*lambdaExpr.expr, options); // capture is either a core::FieldAccessExpr or a 'row' core::CallExpr with 2 // or more core::FieldAccessExpr inputs. @@ -581,7 +594,7 @@ std::shared_ptr parseLambdaExpr( } else { VELOX_FAIL( "Unexpected left-hand-side expression for the lambda expression: {}", - capture->toString()) + capture->toString()); } return std::make_shared( @@ -629,8 +642,8 @@ std::shared_ptr parseExpr( } } -std::vector> parseExpression( - const std::string& exprString) { +::duckdb::vector<::duckdb::unique_ptr<::duckdb::ParsedExpression>> +parseExpression(const std::string& exprString) { ParserOptions options; options.preserve_identifier_case = false; @@ -682,7 +695,7 @@ bool isAscending(::duckdb::OrderType orderType, const std::string& exprString) { return true; case ::duckdb::OrderType::INVALID: default: - VELOX_FAIL("Cannot parse ORDER BY clause: {}", exprString) + VELOX_FAIL("Cannot parse ORDER BY clause: {}", exprString); } } @@ -699,7 +712,7 @@ bool isNullsFirst( return false; case ::duckdb::OrderByNullType::INVALID: default: - VELOX_FAIL("Cannot parse ORDER BY clause: {}", exprString) + VELOX_FAIL("Cannot parse ORDER BY clause: {}", exprString); } VELOX_UNREACHABLE(); @@ -758,8 +771,7 @@ AggregateExpr parseAggregateExpr( namespace { WindowType parseWindowType(const WindowExpression& expr) { - auto windowType = - [&](const ::duckdb::WindowBoundary& boundary) -> WindowType { + auto windowType = [&](const WindowBoundary& boundary) -> WindowType { if (boundary == WindowBoundary::CURRENT_ROW_ROWS || boundary == WindowBoundary::EXPR_FOLLOWING_ROWS || boundary == WindowBoundary::EXPR_PRECEDING_ROWS) { diff --git a/velox/duckdb/conversion/DuckWrapper.cpp b/velox/duckdb/conversion/DuckWrapper.cpp deleted file mode 100644 index 5e6bd9bc59b84..0000000000000 --- a/velox/duckdb/conversion/DuckWrapper.cpp +++ /dev/null @@ -1,356 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "velox/duckdb/conversion/DuckWrapper.h" -#include "velox/common/base/BitUtil.h" -#include "velox/duckdb/conversion/DuckConversion.h" -#include "velox/external/duckdb/duckdb.hpp" -#include "velox/external/duckdb/tpch/include/tpch-extension.hpp" -#include "velox/vector/FlatVector.h" - -namespace facebook::velox::duckdb { -using ::duckdb::Connection; -using ::duckdb::DataChunk; -using ::duckdb::DuckDB; -using ::duckdb::Hugeint; -using ::duckdb::hugeint_t; -using ::duckdb::LogicalTypeId; -using ::duckdb::PhysicalType; -using ::duckdb::QueryResult; - -namespace { - -class DuckDBBufferReleaser { - public: - explicit DuckDBBufferReleaser( - ::duckdb::buffer_ptr<::duckdb::VectorBuffer> buffer) - : buffer_(std::move(buffer)) {} - - void addRef() const {} - void release() const {} - - private: - const ::duckdb::buffer_ptr<::duckdb::VectorBuffer> buffer_; -}; - -class DuckDBValidityReleaser { - public: - explicit DuckDBValidityReleaser(const ::duckdb::ValidityMask& validity) - : validity_(validity) {} - - void addRef() const {} - void release() const {} - - private: - const ::duckdb::ValidityMask validity_; -}; - -} // namespace - -DuckDBWrapper::DuckDBWrapper(core::ExecCtx* context, const char* path) - : context_(context) { - db_ = std::make_unique(path); - connection_ = std::make_unique(*db_); - db_->LoadExtension<::duckdb::TPCHExtension>(); -} - -DuckDBWrapper::~DuckDBWrapper() {} - -std::unique_ptr DuckDBWrapper::execute(const std::string& query) { - auto duckResult = connection_->Query(query); - return std::make_unique(context_, std::move(duckResult)); -} - -void DuckDBWrapper::print(const std::string& query) { - auto result = connection_->Query(query); - result->Print(); -} - -DuckResult::DuckResult( - core::ExecCtx* context, - std::unique_ptr queryResult) - : context_(context), queryResult_(std::move(queryResult)) { - auto columnCount = queryResult_->types.size(); - - std::vector names; - std::vector types; - names.reserve(columnCount); - types.reserve(columnCount); - for (auto i = 0; i < columnCount; i++) { - types.push_back(getType(i)); - names.push_back(getName(i)); - } - type_ = std::make_shared(std::move(names), std::move(types)); -} - -DuckResult::~DuckResult() {} - -bool DuckResult::success() { - return queryResult_->success; -} - -std::string DuckResult::errorMessage() { - return queryResult_->error; -} - -RowVectorPtr DuckResult::getVector() { - auto rowType = getType(); - std::vector outputColumns; - outputColumns.reserve(columnCount()); - for (auto i = 0; i < columnCount(); i++) { - outputColumns.push_back(getVector(i)); - } - - return std::make_shared( - context_->pool(), - rowType, - BufferPtr(nullptr), - currentChunk_->size(), - outputColumns); -} - -TypePtr DuckResult::getType(size_t columnIdx) { - assert(columnIdx < queryResult_->types.size()); - return toVeloxType(queryResult_->types[columnIdx]); -} - -std::string DuckResult::getName(size_t columnIdx) { - assert(columnIdx < queryResult_->names.size()); - return queryResult_->names[columnIdx]; -} - -inline bool isZeroCopyEligible(const ::duckdb::LogicalType& duckType) { - if (duckType.id() == 
LogicalTypeId::DECIMAL) { - if (duckType.InternalType() == PhysicalType::INT64 || - duckType.InternalType() == PhysicalType::INT128) { - return true; - } - return false; - } - - if (duckType.id() == LogicalTypeId::HUGEINT || - duckType.id() == LogicalTypeId::TIMESTAMP || - duckType.id() == LogicalTypeId::BOOLEAN || - duckType.id() == LogicalTypeId::BLOB || - duckType.id() == LogicalTypeId::VARCHAR) { - return false; - } - return true; -} - -template -VectorPtr convert( - ::duckdb::Vector& duckVector, - const TypePtr& veloxType, - size_t size, - memory::MemoryPool* pool, - uint8_t* validity = nullptr) { - auto vectorType = duckVector.GetVectorType(); - switch (vectorType) { - case ::duckdb::VectorType::FLAT_VECTOR: { - VectorPtr result; - auto& duckValidity = ::duckdb::FlatVector::Validity(duckVector); - auto* duckData = - ::duckdb::FlatVector::GetData(duckVector); - - // Some DuckDB vectors have different internal layout and cannot be - // trivially copied. - if (!isZeroCopyEligible(duckVector.GetType())) { - // TODO Figure out how to perform a zero-copy conversion. - result = BaseVector::create(veloxType, size, pool); - auto flatResult = result->as>(); - - for (auto i = 0; i < size; i++) { - if (duckValidity.RowIsValid(i) && - (!validity || bits::isBitSet(validity, i))) { - flatResult->set(i, OP::toVelox(duckData[i])); - } - } - - if (!duckValidity.AllValid()) { - auto rawNulls = flatResult->mutableRawNulls(); - memcpy(rawNulls, duckValidity.GetData(), bits::nbytes(size)); - } - } else { - auto valuesView = BufferView::create( - reinterpret_cast(duckData), - size * sizeof(typename OP::VELOX_TYPE), - DuckDBBufferReleaser(duckVector.GetBuffer())); - - BufferPtr nullsView(nullptr); - if (!duckValidity.AllValid()) { - nullsView = BufferView::create( - reinterpret_cast(duckValidity.GetData()), - bits::nbytes(size), - DuckDBValidityReleaser(duckValidity)); - } - - result = std::make_shared>( - pool, - veloxType, - nullsView, - size, - valuesView, - std::vector()); - } - - return result; - } - case ::duckdb::VectorType::DICTIONARY_VECTOR: { - auto& child = ::duckdb::DictionaryVector::Child(duckVector); - auto& selection = ::duckdb::DictionaryVector::SelVector(duckVector); - - // DuckDB vectors doesn't tell what their size is. We are going to use max - // index + 1 instead as the vector is guaranteed to be at least that - // large. - vector_size_t maxIndex = 0; - for (auto i = 0; i < size; i++) { - maxIndex = std::max(maxIndex, (vector_size_t)selection.get_index(i)); - } - VectorPtr base; - // Unused dictionary elements can be uninitialized. That can cause - // errors if we try to decode them. Here we create a bitmap of - // used values to avoid that. 
- if (child.GetType() == LogicalTypeId::HUGEINT || - child.GetType() == LogicalTypeId::TIMESTAMP || - child.GetType() == LogicalTypeId::VARCHAR) { - std::vector validityVector(bits::nbytes(maxIndex + 1), 0); - auto validity_ptr = validityVector.data(); - for (auto i = 0; i < size; i++) { - bits::setBit(validity_ptr, selection.get_index(i)); - } - base = convert(child, veloxType, maxIndex + 1, pool, validity_ptr); - } else { - base = convert(child, veloxType, maxIndex + 1, pool); - } - - auto indices = AlignedBuffer::allocate(size, pool); - memcpy( - indices->asMutable(), - selection.data(), - size * sizeof(vector_size_t)); - - return BaseVector::wrapInDictionary( - BufferPtr(nullptr), indices, size, base); - } - default: - VELOX_UNSUPPORTED( - "Unsupported DuckDB vector encoding: {}", - ::duckdb::VectorTypeToString(vectorType)); - } -} - -struct NumericCastToDouble { - template - static double operation(T input) { - return double(input); - } -}; - -template <> -double NumericCastToDouble::operation(hugeint_t input) { - return Hugeint::Cast(input); -} - -VectorPtr toVeloxVector( - int32_t size, - ::duckdb::Vector& duckVector, - const TypePtr& veloxType, - memory::MemoryPool* pool) { - VectorPtr veloxFlatVector; - - auto type = duckVector.GetType(); - switch (type.id()) { - case LogicalTypeId::BOOLEAN: - return convert>( - duckVector, veloxType, size, pool); - case LogicalTypeId::TINYINT: - return convert>( - duckVector, veloxType, size, pool); - case LogicalTypeId::SMALLINT: - return convert>( - duckVector, veloxType, size, pool); - case LogicalTypeId::INTEGER: - return convert>( - duckVector, veloxType, size, pool); - case LogicalTypeId::BIGINT: - return convert>( - duckVector, veloxType, size, pool); - case LogicalTypeId::HUGEINT: - return convert(duckVector, veloxType, size, pool); - case LogicalTypeId::FLOAT: - return convert>( - duckVector, veloxType, size, pool); - case LogicalTypeId::DOUBLE: - return convert>( - duckVector, veloxType, size, pool); - case LogicalTypeId::DECIMAL: { - uint8_t width; - uint8_t scale; - type.GetDecimalProperties(width, scale); - switch (type.InternalType()) { - case PhysicalType::INT16: - return convert( - duckVector, veloxType, size, pool); - case PhysicalType::INT32: - return convert( - duckVector, veloxType, size, pool); - case PhysicalType::INT64: - return convert( - duckVector, veloxType, size, pool); - case PhysicalType::INT128: - return convert( - duckVector, veloxType, size, pool); - default: - throw std::runtime_error( - "unrecognized internal type for decimal (this shouldn't happen"); - } - } - case LogicalTypeId::VARCHAR: - return convert(duckVector, veloxType, size, pool); - case LogicalTypeId::BLOB: - return convert(duckVector, veloxType, size, pool); - case LogicalTypeId::DATE: - return convert(duckVector, veloxType, size, pool); - case LogicalTypeId::TIMESTAMP: - return convert( - duckVector, veloxType, size, pool); - default: - throw std::runtime_error( - "Unsupported vector type for conversion: " + type.ToString()); - } -} - -VectorPtr DuckResult::getVector(size_t columnIdx) { - VELOX_CHECK_LT(columnIdx, columnCount()); - VELOX_CHECK( - currentChunk_, - "no chunk available: did you call next() and did it return true?"); - auto& duckVector = currentChunk_->data[columnIdx]; - auto resultType = getType(columnIdx); - return toVeloxVector( - currentChunk_->size(), duckVector, resultType, context_->pool()); -} - -bool DuckResult::next() { - currentChunk_ = queryResult_->Fetch(); - if (!currentChunk_) { - return false; - } - 
currentChunk_->Normalify(); - return currentChunk_->size() > 0; -} - -} // namespace facebook::velox::duckdb diff --git a/velox/duckdb/conversion/DuckWrapper.h b/velox/duckdb/conversion/DuckWrapper.h deleted file mode 100644 index 5465caf4a23cd..0000000000000 --- a/velox/duckdb/conversion/DuckWrapper.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include "velox/core/QueryCtx.h" -#include "velox/type/Type.h" -#include "velox/vector/BaseVector.h" -#include "velox/vector/ComplexVector.h" - -namespace duckdb { -class DuckDB; -class Connection; -class DataChunk; -class QueryResult; -class Vector; -} // namespace duckdb - -namespace facebook::velox::duckdb { - -class DuckResult; - -class DuckDBWrapper { - public: - //! Opens a DuckDB database object with an optional path to the physical - //! database (default: in-memory only) - explicit DuckDBWrapper(core::ExecCtx* context, const char* path = nullptr); - ~DuckDBWrapper(); - - //! Execute a SQL query in the loaded database, and return the result as a - //! DuckResult - std::unique_ptr execute(const std::string& query); - - //! Execute a SQL query in the loaded database and print the result to stdout - void print(const std::string& query); - - private: - core::ExecCtx* context_; - std::unique_ptr<::duckdb::DuckDB> db_; - std::unique_ptr<::duckdb::Connection> connection_; -}; - -class DuckResult { - public: - DuckResult( - core::ExecCtx* context, - std::unique_ptr<::duckdb::QueryResult> query_result); - ~DuckResult(); - - public: - //! Returns true if the query succeeded, or false if it failed - bool success(); - - //! Returns the error message in case the query failed - std::string errorMessage(); - - //! Returns the number of columns in the result - size_t columnCount() { - return type_ ? type_->size() : 0; - } - - //! Get the type of the result as a row - std::shared_ptr getType() { - return type_; - } - - //! Gets a vector of the query result; returns nullptr if the result has no - //! data (i.e. when next() has returned false, or when next() has not been - //! called yet) - RowVectorPtr getVector(); - - //! Fetches the next chunk from the result set; returns true if there is more - //! data, or false if finished Next needs to be called before any data is - //! fetched from the result - bool next(); - - private: - core::ExecCtx* context_; - std::shared_ptr type_; - - std::unique_ptr<::duckdb::QueryResult> queryResult_; - std::unique_ptr<::duckdb::DataChunk> currentChunk_; - - private: - //! Returns the type of the column at the specified column index; index should - //! be in range [0, column_count) - TypePtr getType(size_t columnIdx); - - //! Returns the name of the column at the specified column index; index should - //! be in range [0, column_count) - std::string getName(size_t columnIdx); - - //! Fetch a single vector of the result; index should - //! 
be in range [0, column_count)
-  VectorPtr getVector(size_t columnIdx);
-};
-
-VectorPtr toVeloxVector(
-    int32_t size,
-    ::duckdb::Vector& duckVector,
-    const TypePtr& veloxType,
-    memory::MemoryPool* pool);
-
-} // namespace facebook::velox::duckdb
diff --git a/velox/duckdb/conversion/tests/CMakeLists.txt b/velox/duckdb/conversion/tests/CMakeLists.txt
index 1ee92eac424a9..3a4607dd8a898 100644
--- a/velox/duckdb/conversion/tests/CMakeLists.txt
+++ b/velox/duckdb/conversion/tests/CMakeLists.txt
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-add_executable(velox_duckdb_conversion_test
-  DuckConversionTest.cpp DuckParserTest.cpp DuckWrapperTest.cpp)
+add_executable(velox_duckdb_conversion_test DuckConversionTest.cpp
+               DuckParserTest.cpp)

 add_test(velox_duckdb_conversion_test velox_duckdb_conversion_test)

@@ -24,6 +24,6 @@ target_link_libraries(
   velox_functions_prestosql
   velox_functions_lib
   velox_functions_test_lib
-  gtest
-  gtest_main
+  GTest::gtest
+  GTest::gtest_main
   gflags::gflags)
diff --git a/velox/duckdb/conversion/tests/DuckConversionTest.cpp b/velox/duckdb/conversion/tests/DuckConversionTest.cpp
index c53c956bc297c..d73f222b81872 100644
--- a/velox/duckdb/conversion/tests/DuckConversionTest.cpp
+++ b/velox/duckdb/conversion/tests/DuckConversionTest.cpp
@@ -16,7 +16,6 @@
 #include "velox/duckdb/conversion/DuckConversion.h"
 #include
 #include
-#include "velox/external/duckdb/duckdb.hpp"
 #include "velox/type/Variant.h"

 using namespace facebook::velox;
@@ -86,12 +85,18 @@ TEST(DuckConversionTest, duckValueToVariant) {
 }

 TEST(DuckConversionTest, duckValueToVariantUnsupported) {
+  /// We use ::duckdb::TransformStringToLogicalType() for scalar types instead
+  /// of LogicalType:: due to this bug in GCC
+  /// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101957. The scalar types are
+  /// defined as static constexpr const, causing a double definition only in
+  /// the debug build.
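// A minimal sketch of the failure class the comment above refers to (the
// type and value are illustrative, not DuckDB's actual declarations):
// before C++17 made constexpr static data members implicitly inline,
// odr-using such a member in a non-optimized build required an
// out-of-line definition:
//
//   struct LogicalType {
//     static constexpr int TIME = 1; // in-class declaration
//   };
//   constexpr int LogicalType::TIME; // out-of-line definition
//
// Under GCC bug 101957 that definition can end up emitted more than once
// in the debug build, producing a duplicate-definition link error;
// TransformStringToLogicalType("time") avoids odr-using the constant
// entirely.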
std::vector unsupported = { - LogicalType::TIME, - LogicalType::INTERVAL, - LogicalType::LIST({LogicalType::INTEGER}), + ::duckdb::TransformStringToLogicalType("time"), + ::duckdb::TransformStringToLogicalType("interval"), + LogicalType::LIST({::duckdb::TransformStringToLogicalType("integer")}), LogicalType::STRUCT( - {{"a", LogicalType::INTEGER}, {"b", LogicalType::TINYINT}})}; + {{"a", ::duckdb::TransformStringToLogicalType("integer")}, + {"b", ::duckdb::TransformStringToLogicalType("tinyint")}})}; for (const auto& i : unsupported) { EXPECT_THROW(duckValueToVariant(Value(i)), std::runtime_error); @@ -135,10 +140,10 @@ TEST(DuckConversionTest, createTable) { auto testCreateTable = [&](const RowTypePtr& rowType) { auto result = con.Query("DROP TABLE IF EXISTS t"); - VELOX_CHECK(result->success, "{}", result->error); + VELOX_CHECK(!result->HasError(), "{}", result->GetError()); result = con.Query(makeCreateTableSql("t", *rowType)); - VELOX_CHECK(result->success, "{}", result->error); + VELOX_CHECK(!result->HasError(), "{}", result->GetError()); }; testCreateTable( diff --git a/velox/duckdb/conversion/tests/DuckParserTest.cpp b/velox/duckdb/conversion/tests/DuckParserTest.cpp index ab72297239a1e..b3608e6a571af 100644 --- a/velox/duckdb/conversion/tests/DuckParserTest.cpp +++ b/velox/duckdb/conversion/tests/DuckParserTest.cpp @@ -16,6 +16,8 @@ #include "velox/duckdb/conversion/DuckParser.h" #include "velox/common/base/tests/GTestUtils.h" #include "velox/core/PlanNode.h" +#include "velox/functions/prestosql/types/JsonType.h" +#include "velox/functions/prestosql/types/TimestampWithTimeZoneType.h" #include "velox/parse/Expressions.h" using namespace facebook::velox; @@ -367,6 +369,22 @@ TEST(DuckParserTest, cast) { parseExpr("cast(c0 as struct(a bigint, b real, c varchar))")->toString()); } +TEST(DuckParserTest, castToJson) { + registerJsonType(); + EXPECT_EQ("cast(\"c0\", JSON)", parseExpr("cast(c0 as json)")->toString()); + EXPECT_EQ("cast(\"c0\", JSON)", parseExpr("cast(c0 as JSON)")->toString()); +} + +TEST(DuckParserTest, castToTimestampWithTimeZone) { + registerTimestampWithTimeZoneType(); + EXPECT_EQ( + "cast(\"c0\", TIMESTAMP WITH TIME ZONE)", + parseExpr("cast(c0 as timestamp with time zone)")->toString()); + EXPECT_EQ( + "cast(\"c0\", TIMESTAMP WITH TIME ZONE)", + parseExpr("cast(c0 as TIMESTAMP WITH TIME ZONE)")->toString()); +} + TEST(DuckParserTest, ifCase) { EXPECT_EQ("if(99,1,0)", parseExpr("if(99, 1, 0)")->toString()); EXPECT_EQ( diff --git a/velox/duckdb/conversion/tests/DuckWrapperTest.cpp b/velox/duckdb/conversion/tests/DuckWrapperTest.cpp deleted file mode 100644 index 52b00bbb9e8ab..0000000000000 --- a/velox/duckdb/conversion/tests/DuckWrapperTest.cpp +++ /dev/null @@ -1,325 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "velox/duckdb/conversion/DuckWrapper.h" -#include "velox/external/duckdb/duckdb.hpp" -#include "velox/vector/tests/utils/VectorMaker.h" - -#include - -using namespace facebook::velox; -using namespace facebook::velox::duckdb; - -class BaseDuckWrapperTest : public testing::Test { - public: - template - void verifyUnaryResult( - const std::string& query, - const std::vector& expectedOutput, - const std::vector& nulls) { - auto result = db_->execute(query); - ASSERT_EQ(result->success(), true) - << "Query failed: " << result->errorMessage(); - ASSERT_EQ(result->columnCount(), 1); - size_t currentOffset = 0; - while (result->next()) { - auto rowVector = result->getVector(); - auto simpleVector = rowVector->childAt(0)->as>(); - ASSERT_NE(simpleVector, nullptr); - for (auto i = 0; i < simpleVector->size(); i++) { - auto rowNr = currentOffset + i; - ASSERT_LE(rowNr, expectedOutput.size()); - if (nulls[rowNr]) { - ASSERT_EQ(simpleVector->isNullAt(i), true); - } else { - ASSERT_EQ(simpleVector->isNullAt(i), false); - ASSERT_EQ(simpleVector->valueAt(i), expectedOutput[rowNr]); - } - } - currentOffset += simpleVector->size(); - } - ASSERT_EQ(currentOffset, expectedOutput.size()); - } - - template - void verifyUnaryResult( - const std::string& query, - const std::vector& expectedOutput) { - std::vector nulls(expectedOutput.size(), false); - verifyUnaryResult( - std::move(query), std::move(expectedOutput), std::move(nulls)); - } - - template - void verifyDuckToVeloxDecimal( - const std::string& query, - const std::vector>& expected) { - auto result = db_->execute(query); - ASSERT_EQ(result->success(), true) - << "Query failed: " << result->errorMessage(); - ASSERT_EQ(result->columnCount(), 1); - size_t currentOffset = 0; - while (result->next()) { - auto rowVector = result->getVector(); - auto simpleVector = rowVector->childAt(0)->as>(); - ASSERT_NE(simpleVector, nullptr); - for (auto i = 0; i < simpleVector->size(); i++) { - if (simpleVector->isNullAt(i)) { - ASSERT_FALSE(expected[i].has_value()); - continue; - } - ASSERT_EQ(simpleVector->valueAt(i), expected[i]); - } - } - } - - void execute(const std::string& query) { - auto result = db_->execute(query); - ASSERT_EQ(result->success(), true) - << "Query failed: " << result->errorMessage(); - } - - std::shared_ptr queryCtx_{std::make_shared()}; - std::shared_ptr pool_{memory::addDefaultLeafMemoryPool()}; - std::unique_ptr execCtx_{ - std::make_unique(pool_.get(), queryCtx_.get())}; - std::unique_ptr db_{ - std::make_unique(execCtx_.get())}; -}; - -TEST_F(BaseDuckWrapperTest, simpleSelect) { - // scalar query - verifyUnaryResult("SELECT 42::INTEGER", {42}); - - // bit more complex - verifyUnaryResult( - "SELECT a::INTEGER FROM (VALUES (1), (2), (3)) tbl(a)", {1, 2, 3}); - - // now with a table - execute("CREATE TABLE integers(i INTEGER)"); - execute("INSERT INTO integers VALUES (1), (2), (3), (NULL)"); - - verifyUnaryResult( - "SELECT * FROM integers", {1, 2, 3, 0}, {false, false, false, true}); -} - -TEST_F(BaseDuckWrapperTest, scalarTypes) { - // test various types - // boolean types - execute("CREATE TABLE booleans(i BOOLEAN)"); - execute("INSERT INTO booleans VALUES (true), (true), (false), (false)"); - verifyUnaryResult("SELECT * FROM booleans", {true, true, false, false}); - - // integer types - verifyUnaryResult("SELECT 42::TINYINT", {42}); - verifyUnaryResult("SELECT 42::SMALLINT", {42}); - verifyUnaryResult("SELECT 42::INTEGER", {42}); - verifyUnaryResult("SELECT 42::BIGINT", {42}); - - // hugeint is cast to double - 
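// The literals in this test encode Velox's physical representations:
// HUGEINT has no direct Velox type, so it is narrowed to double via
// NumericCastToDouble in DuckWrapper.cpp; DATE is days since the Unix
// epoch and TIMESTAMP is (seconds, nanos) since the epoch. Worked
// arithmetic for the expectations below:
//
//   // 1992-01-01 = 22 years * 365 days + 5 leap days (1972..1988)
//   22 * 365 + 5 == 8035
//   // 1992-01-01 13:04:20 UTC
//   8035 * 86400 + 13 * 3600 + 4 * 60 + 20 == 694271060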
verifyUnaryResult("SELECT 42::HUGEINT", {42}); - - // numeric types - verifyUnaryResult("SELECT 1::FLOAT", {1.0}); - verifyUnaryResult("SELECT 1::DOUBLE", {1.0}); - - // date/timestamp - verifyUnaryResult("SELECT DATE '1992-01-01'", {8035}); - verifyUnaryResult( - "SELECT TIMESTAMP '1992-01-01 13:04:20'", {Timestamp(694271060, 0)}); - - // varchar - verifyUnaryResult("SELECT 'shortstr'", {StringView("shortstr")}); - verifyUnaryResult( - "SELECT '12characters'", {StringView("12characters")}); - verifyUnaryResult( - "SELECT 'this is a long, non-inlined, example string'", - {StringView("this is a long, non-inlined, example string")}); - - // blob - verifyUnaryResult("SELECT '\\xFF'::BLOB", {StringView("\xFF")}); -} - -TEST_F(BaseDuckWrapperTest, types) { - // test various types - // integer types - verifyUnaryResult( - "SELECT i::TINYINT FROM (VALUES (1), (2), (3), (NULL)) tbl(i)", - {1, 2, 3, 0}, - {false, false, false, true}); - verifyUnaryResult( - "SELECT i::SMALLINT FROM (VALUES (1), (2), (3), (NULL)) tbl(i)", - {1, 2, 3, 0}, - {false, false, false, true}); - verifyUnaryResult( - "SELECT i::INTEGER FROM (VALUES (1), (2), (3), (NULL)) tbl(i)", - {1, 2, 3, 0}, - {false, false, false, true}); - verifyUnaryResult( - "SELECT i::BIGINT FROM (VALUES (1), (2), (3), (NULL)) tbl(i)", - {1, 2, 3, 0}, - {false, false, false, true}); - - // hugeint is cast to double - verifyUnaryResult( - "SELECT i::HUGEINT FROM (VALUES (1), (2), (4), (NULL)) tbl(i)", - {1, 2, 4, 0}, - {false, false, false, true}); - - // numeric types - verifyUnaryResult( - "SELECT i::FLOAT FROM (VALUES (1), (2), (4), (NULL)) tbl(i)", - {1, 2, 4, 0}, - {false, false, false, true}); - verifyUnaryResult( - "SELECT i::DOUBLE FROM (VALUES (1), (2), (4), (NULL)) tbl(i)", - {1, 2, 4, 0}, - {false, false, false, true}); - - // date/timestamp - verifyUnaryResult( - "SELECT i FROM (VALUES (DATE '1992-01-01'), (NULL)) tbl(i)", - {8035, 0}, - {false, true}); - verifyUnaryResult( - "SELECT i FROM (VALUES (TIMESTAMP '1992-01-01 13:04:20'), (NULL)) tbl(i)", - {Timestamp(694271060, 0), Timestamp(0, 0)}, - {false, true}); - - // varchar - verifyUnaryResult( - "SELECT * FROM (VALUES ('shortstr'), ('12characters'), ('this is a long, non-inlined, example string'), (NULL)) tbl(i)", - {StringView("shortstr"), - StringView("12characters"), - StringView("this is a long, non-inlined, example string"), - StringView("")}, - {false, false, false, true}); -} - -TEST_F(BaseDuckWrapperTest, tpchEmpty) { - // test TPC-H loading and querying with an empty database - execute("CALL dbgen(sf=0)"); - verifyUnaryResult( - "SELECT l_orderkey FROM lineitem WHERE l_orderkey=1", {}); -} - -TEST_F(BaseDuckWrapperTest, tpchSF1) { - // test TPC-H loading and querying SF0.01 - execute("CALL dbgen(sf=0.01)"); - // test conversion of date, decimal and string - verifyUnaryResult("SELECT l_discount FROM lineitem LIMIT 1", {4}); - verifyUnaryResult("SELECT l_shipdate FROM lineitem LIMIT 1", {9568}); - verifyUnaryResult( - "SELECT l_comment FROM lineitem LIMIT 1", - {StringView("egular courts above the")}); -} - -TEST_F(BaseDuckWrapperTest, duckToVeloxDecimal) { - // Test SMALLINT decimal to UnscaledShortDecimal conversion. - verifyDuckToVeloxDecimal( - "select * from (values (NULL), ('1.2'::decimal(2,1))," - "('2.2'::decimal(2,1)),('-4.2'::decimal(2,1)), (NULL))", - {std::nullopt, 12, 22, -42, std::nullopt}); - - // Test INTEGER decimal to UnscaledShortDecimal conversion. 
- verifyDuckToVeloxDecimal( - "select * from (values ('1111.1111'::decimal(8,4))," - "('2222.2222'::decimal(8,4)),('-3333.3333'::decimal(8,4)))", - {11111111, 22222222, -33333333}); - - // Test BIGINT decimal to UnscaledLongDecimal conversion. - verifyDuckToVeloxDecimal( - "select * from (values ('-111111.111111'::decimal(12,6))," - "('222222.222222'::decimal(12,6)),('333333.333333'::decimal(12,6)))", - {-111111111111, 222222222222, 333333333333}); - - verifyDuckToVeloxDecimal( - "select * from (values (NULL)," - "('12345678901234.789'::decimal(18,3) * 10000.555::decimal(20,3))," - "('-55555555555555.789'::decimal(18,3) * 10000.555::decimal(20,3)), (NULL)," - "('-22222222222222.789'::decimal(18,3) * 10000.555::decimal(20,3)))", - {std::nullopt, - HugeInt::build(0X1a24, 0Xfa35bb8777ffff77), - HugeInt::build(0XFFFFFFFFFFFF8A59, 0X99FC706655BFAC11), - std::nullopt, - HugeInt::build(0XFFFFFFFFFFFFD0F0, 0XA3FE935B081D8D69)}); -} - -TEST_F(BaseDuckWrapperTest, decimalDictCoversion) { - constexpr int32_t size = 6; - ::duckdb::LogicalType* duckDecimalType = - static_cast<::duckdb::LogicalType*>(duckdb_create_decimal_type(4, 2)); - ::duckdb::Vector data(*duckDecimalType, size); - auto dataPtr = reinterpret_cast(data.GetBuffer()->GetData()); - // Make dirty data which shouldn't be accessed. - memset(dataPtr, 0xAB, sizeof(int16_t) * size); - dataPtr[0] = 5000; - dataPtr[2] = 1000; - dataPtr[4] = 2000; - // Turn vector into dictionary. - ::duckdb::SelectionVector sel(size); - sel.set_index(0, 2); - sel.set_index(1, 4); - sel.set_index(2, 0); - sel.set_index(3, 4); - sel.set_index(4, 2); - sel.set_index(5, 0); - data.Slice(sel, size); - - auto decimalType = DECIMAL(4, 2); - auto actual = toVeloxVector(size, data, decimalType, pool_.get()); - std::vector expectedData({1000, 2000, 5000, 2000, 1000, 5000}); - - test::VectorMaker maker(pool_.get()); - auto expectedFlatVector = maker.flatVector(size, decimalType); - - for (auto i = 0; i < expectedData.size(); ++i) { - expectedFlatVector->set(i, expectedData[i]); - } - - for (auto i = 0; i < actual->size(); i++) { - ASSERT_TRUE(expectedFlatVector->equalValueAt(actual.get(), i, i)); - } - delete duckDecimalType; -} - -TEST_F(BaseDuckWrapperTest, dictConversion) { - ::duckdb::Vector data(::duckdb::LogicalTypeId::VARCHAR, 5); - auto dataPtr = - reinterpret_cast<::duckdb::string_t*>(data.GetBuffer()->GetData()); - // Make dirty data which shouldn't be accessed. - memset(dataPtr, 0xAB, sizeof(::duckdb::string_t) * 5); - dataPtr[2] = ::duckdb::string_t("value1"); - dataPtr[4] = ::duckdb::string_t("value2"); - - // Turn vector into dictionary. - ::duckdb::SelectionVector sel(5); - sel.set_index(0, 2); - sel.set_index(1, 4); - sel.set_index(2, 2); - sel.set_index(3, 4); - sel.set_index(4, 2); - data.Slice(sel, 5); - - auto actual = toVeloxVector(5, data, VarcharType::create(), pool_.get()); - - test::VectorMaker maker(pool_.get()); - std::vector expectedData( - {"value1", "value2", "value1", "value2", "value1"}); - auto expected = maker.flatVector(expectedData); - for (auto i = 0; i < actual->size(); i++) { - ASSERT_TRUE(expected->equalValueAt(actual.get(), i, i)); - } -} diff --git a/velox/dwio/CMakeLists.txt b/velox/dwio/CMakeLists.txt index 2dd3ea5fac27f..efcb3c06bebe3 100644 --- a/velox/dwio/CMakeLists.txt +++ b/velox/dwio/CMakeLists.txt @@ -11,25 +11,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-add_library(velox_link_libs INTERFACE) -target_link_libraries( +velox_add_library(velox_link_libs INTERFACE) +velox_link_libraries( velox_link_libs - INTERFACE velox_caching - velox_dwio_catalog_fbhive - velox_dwio_common - velox_dwio_common_exception - velox_encode - velox_exception - velox_memory - velox_process - velox_serialization - velox_type - velox_type_fbhive - velox_vector - Folly::folly - fmt::fmt) + INTERFACE + velox_caching + velox_dwio_catalog_fbhive + velox_dwio_common + velox_dwio_common_exception + velox_encode + velox_exception + velox_memory + velox_process + velox_serialization + velox_type + velox_type_fbhive + velox_vector + Folly::folly + fmt::fmt) add_subdirectory(common) add_subdirectory(catalog) add_subdirectory(dwrf) +add_subdirectory(orc) add_subdirectory(parquet) diff --git a/velox/dwio/catalog/fbhive/CMakeLists.txt b/velox/dwio/catalog/fbhive/CMakeLists.txt index f901b5727b335..e892107323684 100644 --- a/velox/dwio/catalog/fbhive/CMakeLists.txt +++ b/velox/dwio/catalog/fbhive/CMakeLists.txt @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_dwio_catalog_fbhive FileUtils.cpp) -target_link_libraries(velox_dwio_catalog_fbhive velox_dwio_common_exception - fmt::fmt Folly::folly) +velox_add_library(velox_dwio_catalog_fbhive FileUtils.cpp) +velox_link_libraries(velox_dwio_catalog_fbhive velox_dwio_common_exception + fmt::fmt Folly::folly) if(${VELOX_BUILD_TESTING}) add_subdirectory(test) diff --git a/velox/dwio/catalog/fbhive/FileUtils.cpp b/velox/dwio/catalog/fbhive/FileUtils.cpp index 792679d15abe1..94c75dffd995b 100644 --- a/velox/dwio/catalog/fbhive/FileUtils.cpp +++ b/velox/dwio/catalog/fbhive/FileUtils.cpp @@ -15,9 +15,12 @@ */ #include "FileUtils.h" -#include + #include -#include "folly/container/Array.h" + +#include +#include + #include "velox/dwio/common/exception/Exception.h" namespace facebook { @@ -29,7 +32,6 @@ namespace fbhive { namespace { constexpr size_t HEX_WIDTH = 2; -const std::string DEFAULT_PARTITION_VALUE{"__HIVE_DEFAULT_PARTITION__"}; constexpr auto charsToEscape = folly::make_array( '"', @@ -94,7 +96,7 @@ std::vector> extractPartitionKeyValues( parserFunc(part, entries); } return entries; -}; +} // Strong assumption that all expressions in the form of a=b means a partition // key value pair in '/' separated tokens. 
We could have stricter validation @@ -158,7 +160,8 @@ std::string FileUtils::unescapePathName(const std::string& data) { } std::string FileUtils::makePartName( - const std::vector>& entries) { + const std::vector>& entries, + bool partitionPathAsLowerCase) { size_t size = 0; size_t escapeCount = 0; std::for_each(entries.begin(), entries.end(), [&](auto& pair) { @@ -166,9 +169,10 @@ std::string FileUtils::makePartName( DWIO_ENSURE_GT(keySize, 0); size += keySize; escapeCount += countEscape(pair.first); + auto valSize = pair.second.size(); if (valSize == 0) { - size += DEFAULT_PARTITION_VALUE.size(); + size += kDefaultPartitionValue.size(); } else { size += valSize; escapeCount += countEscape(pair.second); @@ -182,10 +186,15 @@ std::string FileUtils::makePartName( if (ret.size() > 0) { ret += "/"; } - ret += escapePathName(toLower(pair.first)); + if (partitionPathAsLowerCase) { + ret += escapePathName(toLower(pair.first)); + } else { + ret += escapePathName(pair.first); + } + ret += "="; if (pair.second.size() == 0) { - ret += DEFAULT_PARTITION_VALUE; + ret += kDefaultPartitionValue; } else { ret += escapePathName(pair.second); } @@ -211,7 +220,7 @@ std::vector> FileUtils::parsePartKeyValues( std::string FileUtils::extractPartitionName(const std::string& filePath) { const auto& partitionParts = extractPartitionKeyValues(filePath); - return partitionParts.empty() ? "" : makePartName(partitionParts); + return partitionParts.empty() ? "" : makePartName(partitionParts, false); } } // namespace fbhive diff --git a/velox/dwio/catalog/fbhive/FileUtils.h b/velox/dwio/catalog/fbhive/FileUtils.h index 1c759ffb19e03..a8ca8bf07efdf 100644 --- a/velox/dwio/catalog/fbhive/FileUtils.h +++ b/velox/dwio/catalog/fbhive/FileUtils.h @@ -40,7 +40,8 @@ class FileUtils { /// Creates the partition directory path from the list of partition key/value /// pairs, will do url-encoding when needed. static std::string makePartName( - const std::vector>& entries); + const std::vector>& entries, + bool partitionPathAsLowerCase); /// Converts the hive-metastore-compliant path name back to the corresponding /// partition key/value pairs. @@ -49,6 +50,9 @@ class FileUtils { /// Converts a path name to a hive-metastore-compliant path name. 
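// For reference, the url-encoding used by this class escapes each character
// in charsToEscape as '%' followed by HEX_WIDTH (2) uppercase hex digits,
// and an empty partition value maps to kDefaultPartitionValue, so the
// makePartName()/parsePartKeyValues() round trip is lossless. A worked
// example with the pairs from FileUtilsTests.cpp:
//
//   {{"ds", "2016-01-01"}, {"FOO", ""}, {"a\nb:c", "a#b=c"}}
//   makePartName(pairs, /*partitionPathAsLowerCase=*/true) ==
//       "ds=2016-01-01/foo=__HIVE_DEFAULT_PARTITION__/a%0Ab%3Ac=a%23b%3Dc"
//   // '\n' -> %0A, ':' -> %3A, '#' -> %23, '=' -> %3D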
static std::string extractPartitionName(const std::string& filePath); + + inline static const std::string kDefaultPartitionValue = + "__HIVE_DEFAULT_PARTITION__"; }; } // namespace fbhive diff --git a/velox/dwio/catalog/fbhive/test/CMakeLists.txt b/velox/dwio/catalog/fbhive/test/CMakeLists.txt index b3dc1b7d6211e..34b279ffdb8c8 100644 --- a/velox/dwio/catalog/fbhive/test/CMakeLists.txt +++ b/velox/dwio/catalog/fbhive/test/CMakeLists.txt @@ -14,5 +14,10 @@ add_executable(file_utils_test FileUtilsTests.cpp) add_test(file_utils_test file_utils_test) -target_link_libraries(file_utils_test velox_dwio_catalog_fbhive - velox_dwio_common_exception gtest gtest_main gmock) +target_link_libraries( + file_utils_test + velox_dwio_catalog_fbhive + velox_dwio_common_exception + GTest::gtest + GTest::gtest_main + GTest::gmock) diff --git a/velox/dwio/catalog/fbhive/test/FileUtilsTests.cpp b/velox/dwio/catalog/fbhive/test/FileUtilsTests.cpp index 4abe1c4905e09..b2909cc0da368 100644 --- a/velox/dwio/catalog/fbhive/test/FileUtilsTests.cpp +++ b/velox/dwio/catalog/fbhive/test/FileUtilsTests.cpp @@ -27,8 +27,11 @@ TEST(FileUtilsTests, MakePartName) { std::vector> pairs{ {"ds", "2016-01-01"}, {"FOO", ""}, {"a\nb:c", "a#b=c"}}; ASSERT_EQ( - FileUtils::makePartName(pairs), + FileUtils::makePartName(pairs, true), "ds=2016-01-01/foo=__HIVE_DEFAULT_PARTITION__/a%0Ab%3Ac=a%23b%3Dc"); + ASSERT_EQ( + FileUtils::makePartName(pairs, false), + "ds=2016-01-01/FOO=__HIVE_DEFAULT_PARTITION__/a%0Ab%3Ac=a%23b%3Dc"); } TEST(FileUtilsTests, ParsePartKeyValues) { diff --git a/velox/dwio/common/BitConcatenation.cpp b/velox/dwio/common/BitConcatenation.cpp index 2a2e86a45ec96..5f37053f66a6e 100644 --- a/velox/dwio/common/BitConcatenation.cpp +++ b/velox/dwio/common/BitConcatenation.cpp @@ -19,7 +19,7 @@ namespace facebook::velox::dwio::common { void BitConcatenation::append( - const uint64_t* FOLLY_NULLABLE bits, + const uint64_t* bits, int32_t begin, int32_t end) { int32_t numBits = end - begin; @@ -46,7 +46,7 @@ void BitConcatenation::appendOnes(int32_t numOnes) { } } -uint64_t* FOLLY_NONNULL BitConcatenation::ensureSpace(int32_t numBits) { +uint64_t* BitConcatenation::ensureSpace(int32_t numBits) { if (!*buffer_) { *buffer_ = AlignedBuffer::allocate(numBits_ + numBits, &pool_, true); } else if (numBits_ + numBits > (*buffer_)->capacity() * 8) { diff --git a/velox/dwio/common/BitConcatenation.h b/velox/dwio/common/BitConcatenation.h index cc812035d8b82..18109114ae665 100644 --- a/velox/dwio/common/BitConcatenation.h +++ b/velox/dwio/common/BitConcatenation.h @@ -38,7 +38,7 @@ class BitConcatenation { /// Appends 'bits' between bit offset 'begin' and 'end' to the result. /// A nullptr 'bits' is treated as a bit range with all bits set. - void append(const uint64_t* FOLLY_NULLABLE bits, int32_t begin, int32_t end); + void append(const uint64_t* bits, int32_t begin, int32_t end); /// Appends 'numOnes' ones. void appendOnes(int32_t numOnes); @@ -55,16 +55,16 @@ class BitConcatenation { private: // Allocates or reallocates '*buffer' to have space for 'numBits_ + newBits' // bits. Retuns a pointer to the first word of 'buffer_'. 
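// A note on the bits::divRoundUp() substitutions in this diff:
// divRoundUp(n, k) is ceiling division, so divRoundUp(numBits, 8) is the
// number of bytes needed to hold numBits bits -- arithmetically identical
// to the old bits::roundUp(numBits, 8) / 8 spelling, without materializing
// the rounded bit count first. For example:
//
//   bits::divRoundUp(13, 8) == 2   // 13 bits fit in 2 bytes
//   bits::roundUp(13, 8) / 8 == 16 / 8 == 2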
- uint64_t* FOLLY_NONNULL ensureSpace(int32_t newBits); + uint64_t* ensureSpace(int32_t newBits); void setSize() { if (*buffer_) { - (*buffer_)->setSize(bits::roundUp(numBits_, 8) / 8); + (*buffer_)->setSize(bits::divRoundUp(numBits_, 8)); } } memory::MemoryPool& pool_; - BufferPtr* FOLLY_NULLABLE buffer_{nullptr}; + BufferPtr* buffer_{nullptr}; int32_t numBits_{0}; bool hasZeros_{false}; }; diff --git a/velox/dwio/common/BitPackDecoder.cpp b/velox/dwio/common/BitPackDecoder.cpp index b8f9b3aadd259..917b0ab972c60 100644 --- a/velox/dwio/common/BitPackDecoder.cpp +++ b/velox/dwio/common/BitPackDecoder.cpp @@ -35,7 +35,7 @@ auto as4x64(__m256i x) { } template -void store8Ints(__m256i eightInts, int32_t i, T* FOLLY_NONNULL result) { +void store8Ints(__m256i eightInts, int32_t i, T* result) { if (sizeof(T) == 4) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(result + i), eightInts); } else { @@ -200,7 +200,7 @@ void unpack( bitOffset -= rowBias * bitWidth; if (bitOffset < 0) { // Decrement the pointer by enough bytes to have a non-negative bitOffset. - auto bytes = bits::roundUp(-bitOffset, 8) / 8; + auto bytes = bits::divRoundUp(-bitOffset, 8); bitOffset += bytes * 8; bits = reinterpret_cast( reinterpret_cast(bits) - bytes); @@ -213,12 +213,11 @@ void unpack( } return; } - auto FOLLY_NONNULL lastSafe = bufferEnd - sizeof(uint64_t); int32_t numSafeRows = numRows; bool anyUnsafe = false; if (bufferEnd) { const char* endByte = reinterpret_cast(bits) + - bits::roundUp(bitOffset + (rows.back() + 1) * bitWidth, 8) / 8; + bits::divRoundUp(bitOffset + (rows.back() + 1) * bitWidth, 8); // redzone is the number of bytes at the end of the accessed range that // could overflow the buffer if accessed 64 its wide. int64_t redZone = @@ -281,15 +280,15 @@ void unpack( if (anyUnsafe) { auto lastSafeWord = bufferEnd - sizeof(uint64_t); VELOX_DCHECK(lastSafeWord); - for (auto i = numSafeRows; i < numRows; ++i) { - auto bit = bitOffset + (rows[i]) * bitWidth; + for (auto i_2 = numSafeRows; i_2 < numRows; ++i_2) { + auto bit = bitOffset + (rows[i_2]) * bitWidth; auto byte = bit / 8; auto shift = bit & 7; - result[i] = safeLoadBits( - reinterpret_cast(bits) + byte, - shift, - bitWidth, - lastSafeWord) & + result[i_2] = safeLoadBits( + reinterpret_cast(bits) + byte, + shift, + bitWidth, + lastSafeWord) & mask; } } diff --git a/velox/dwio/common/BitPackDecoder.h b/velox/dwio/common/BitPackDecoder.h index fc87fc369e121..2aa785e2d29fe 100644 --- a/velox/dwio/common/BitPackDecoder.h +++ b/velox/dwio/common/BitPackDecoder.h @@ -21,7 +21,7 @@ #include "velox/vector/TypeAliases.h" #include -#include // @manual +#include namespace facebook::velox::dwio::common { @@ -90,13 +90,13 @@ static const uint32_t BITPACK_MASKS[] = { /// stay under 'bufferEnd'. template void unpack( - const uint64_t* FOLLY_NULLABLE bits, + const uint64_t* bits, int32_t bitOffset, RowSet rows, int32_t rowBias, uint8_t bitWidth, - const char* FOLLY_NULLABLE bufferEnd, - T* FOLLY_NONNULL result); + const char* bufferEnd, + T* result); /// Unpack numValues number of input values from inputBuffer. The results /// will be written to result. numValues must be a multiple of 8. The @@ -793,17 +793,17 @@ inline void unpack( // sure not to access bytes past lastSafeWord + 7. The definition is put here // because it's inlined. 
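// Worked example of the contract documented above: the fast path does an
// unaligned 8-byte load and is taken only while ptr < lastSafeWord, i.e.
// while all 8 bytes stay inside the buffer. Near the end, e.g. with
// bitOffset == 5 and bitWidth == 7,
//
//   byteWidth = bits::divRoundUp(5 + 7, 8) == 2
//
// so the value is assembled from a 2-byte loadPartialWord() shifted right
// by 5, never reading past the buffer.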
inline uint64_t safeLoadBits( - const char* FOLLY_NONNULL ptr, + const char* ptr, int32_t bitOffset, uint8_t bitWidth, - const char* FOLLY_NONNULL lastSafeWord) { + const char* lastSafeWord) { VELOX_DCHECK_GE(7, bitOffset); VELOX_DCHECK_GE(56, bitWidth); if (ptr < lastSafeWord) { return *reinterpret_cast(ptr) >> bitOffset; } int32_t byteWidth = - facebook::velox::bits::roundUp(bitOffset + bitWidth, 8) / 8; + facebook::velox::bits::divRoundUp(bitOffset + bitWidth, 8); return facebook::velox::bits::loadPartialWord( reinterpret_cast(ptr), byteWidth) >> bitOffset; diff --git a/velox/dwio/common/BufferUtil.h b/velox/dwio/common/BufferUtil.h index 92b468b0e5bcf..738d81efa1672 100644 --- a/velox/dwio/common/BufferUtil.h +++ b/velox/dwio/common/BufferUtil.h @@ -24,10 +24,30 @@ template inline void ensureCapacity( BufferPtr& data, size_t capacity, - velox::memory::MemoryPool* pool) { - if (!data || !data->isMutable() || - data->capacity() < BaseVector::byteSize(capacity)) { + velox::memory::MemoryPool* pool, + bool preserveOldData = false, + bool clearBits = false) { + size_t oldSize = 0; + if (!data) { data = AlignedBuffer::allocate(capacity, pool); + } else { + oldSize = data->size(); + if (!data->isMutable() || + data->capacity() < BaseVector::byteSize(capacity)) { + auto newData = AlignedBuffer::allocate(capacity, pool); + if (preserveOldData) { + std::memcpy( + newData->template asMutable(), + data->as(), + oldSize); + } + data = newData; + } + } + + if (clearBits) { + std::memset( + (void*)(data->asMutable() + oldSize), 0L, capacity - oldSize); } } diff --git a/velox/dwio/common/BufferedInput.cpp b/velox/dwio/common/BufferedInput.cpp index 3b89140aeff41..8791e418f49e1 100644 --- a/velox/dwio/common/BufferedInput.cpp +++ b/velox/dwio/common/BufferedInput.cpp @@ -15,6 +15,8 @@ */ #include +#include +#include #include "folly/io/Cursor.h" #include "velox/dwio/common/BufferedInput.h" @@ -25,6 +27,23 @@ using ::facebook::velox::common::Region; namespace facebook::velox::dwio::common { +static_assert(std::is_move_constructible()); + +namespace { +void copyIOBufToMemory(folly::IOBuf&& iobuf, folly::Range allocated) { + folly::io::Cursor cursor(&iobuf); + VELOX_CHECK_EQ(cursor.totalLength(), allocated.size(), "length mismatch."); + cursor.pull(allocated.data(), allocated.size()); +} +} // namespace + +uint64_t BufferedInput::nextFetchSize() const { + return std::accumulate( + regions_.cbegin(), regions_.cend(), 0L, [](uint64_t a, const Region& b) { + return a + b.length; + }); +} + void BufferedInput::load(const LogType logType) { // no regions to load if (regions_.size() == 0) { @@ -47,38 +66,44 @@ void BufferedInput::load(const LogType logType) { std::vector iobufs(regions_.size()); input_->vread(regions_, {iobufs.data(), iobufs.size()}, logType); for (size_t i = 0; i < regions_.size(); ++i) { - const auto& region = regions_[i]; - auto iobuf = std::move(iobufs[i]); - - auto allocated = allocate(region); - folly::io::Cursor cursor(&iobuf); - DWIO_ENSURE_EQ( - cursor.totalLength(), allocated.size(), "length mismatch."); - cursor.pull(allocated.data(), allocated.size()); + copyIOBufToMemory(std::move(iobufs[i]), allocate(regions_[i])); } - } else { for (const auto& region : regions_) { - auto allocated = allocate(region); - input_->read(allocated.data(), allocated.size(), region.offset, logType); + readToBuffer(region.offset, allocate(region), logType); } } - // clear the loaded regions + // clear the loaded regions. 
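// Clearing 'regions_' here ends the batch: the bytes just read stay
// reachable through 'offsets_'/'buffers_' (see readBuffer()), while any
// later enqueue() calls start a new batch. Of the two branches above, the
// vread() path issues a single vectored IO for all sorted-and-merged
// regions and copies each returned IOBuf into its pre-allocated buffer;
// the fallback path issues one read() per region, with readToBuffer()
// accounting the bytes and latency in the input's IoStatistics.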
regions_.clear(); } +void BufferedInput::readToBuffer( + uint64_t offset, + folly::Range allocated, + const LogType logType) { + uint64_t usec = 0; + { + MicrosecondTimer timer(&usec); + input_->read(allocated.data(), allocated.size(), offset, logType); + } + if (auto* stats = input_->getStats()) { + stats->read().increment(allocated.size()); + stats->queryThreadIoLatency().increment(usec); + } +} + std::unique_ptr BufferedInput::enqueue( Region region, - const dwio::common::StreamIdentifier* /*si*/) { + const dwio::common::StreamIdentifier* /*sid*/) { if (region.length == 0) { return std::make_unique( static_cast(nullptr), 0); } - // if the region is already in buffer - such as metadata + // If the region is already in buffer - such as metadata. auto ret = readBuffer(region.offset, region.length); - if (ret) { + if (ret != nullptr) { return ret; } @@ -88,7 +113,13 @@ std::unique_ptr BufferedInput::enqueue( // Save "i", the position in which this region was enqueued. This will // help faster lookup using enqueuedToBufferOffset_ later. [region, this, i = regions_.size() - 1]() { - return readInternal(region.offset, region.length, i); + auto result = readInternal(region.offset, region.length, i); + VELOX_CHECK( + std::get<1>(result) != MAX_UINT64, + "Fail to read region offset={} length={}", + region.offset, + region.length); + return result; }); } @@ -129,8 +160,8 @@ void BufferedInput::sortRegions() { void BufferedInput::mergeRegions() { auto& r = regions_; + VELOX_CHECK(!r.empty(), "Assumes that there's at least one region"); auto& e = enqueuedToBufferOffset_; - size_t ia = 0; // We want to map here where each region ended in the final merged regions // vector. // For example, if this is the regions vector: {{6, 3}, {24, 3}, {3, 3}, {0, @@ -141,13 +172,12 @@ void BufferedInput::mergeRegions() { // position 0. The original region 1, became region 1, and original region 4 // became region 2 std::vector te(e.size()); - - DWIO_ENSURE(!r.empty(), "Assumes that there's at least one region"); - DWIO_ENSURE_GT(r[ia].length, 0, "invalid region"); - te[e[0]] = 0; + + size_t ia = 0; + VELOX_CHECK_GT(r[ia].length, 0, "invalid region"); for (size_t ib = 1; ib < r.size(); ++ib) { - DWIO_ENSURE_GT(r[ib].length, 0, "invalid region"); + VELOX_CHECK_GT(r[ib].length, 0, "invalid region"); if (!tryMerge(r[ia], r[ib])) { r[++ia] = r[ib]; } @@ -159,7 +189,7 @@ void BufferedInput::mergeRegions() { } bool BufferedInput::tryMerge(Region& first, const Region& second) { - DWIO_ENSURE_GE(second.offset, first.offset, "regions should be sorted."); + VELOX_CHECK_GE(second.offset, first.offset, "regions should be sorted."); const int64_t gap = second.offset - first.offset - first.length; // Duplicate regions (extension==0) is the only case allowed to merge for @@ -178,10 +208,8 @@ bool BufferedInput::tryMerge(Region& first, const Region& second) { input_->getStats()->incRawOverreadBytes(gap); } } - return true; } - return false; } @@ -189,12 +217,10 @@ std::unique_ptr BufferedInput::readBuffer( uint64_t offset, uint64_t length) const { const auto result = readInternal(offset, length); - - auto size = std::get<1>(result); + const auto size = std::get<1>(result); if (size == MAX_UINT64) { return {}; } - return std::make_unique(std::get<0>(result), size); } @@ -209,7 +235,7 @@ std::tuple BufferedInput::readInternal( std::optional index; if (i.has_value()) { - auto vi = i.value(); + const auto vi = i.value(); // There's a possibility that our user enqueued, then tried to read before // calling load(). 
In that case, enqueuedToBufferOffset_ will be empty or // have the values from a previous load. So I want to make sure that he ends @@ -218,14 +244,16 @@ std::tuple BufferedInput::readInternal( if (vi < enqueuedToBufferOffset_.size() && enqueuedToBufferOffset_[vi] < offsets_.size() && offsets_[enqueuedToBufferOffset_[vi]] <= offset) { - index = enqueuedToBufferOffset_[i.value()]; + index = enqueuedToBufferOffset_[vi]; } } + if (!index.has_value()) { // Binary search to get the first fileOffset for which: offset < fileOffset - auto it = std::upper_bound(offsets_.cbegin(), offsets_.cend(), offset); + const auto it = + std::upper_bound(offsets_.cbegin(), offsets_.cend(), offset); // If the first element was already greater than the target offset we don't - // have it + // have it. if (it != offsets_.cbegin()) { index = std::distance(offsets_.cbegin(), it) - 1; } @@ -235,14 +263,13 @@ std::tuple BufferedInput::readInternal( const uint64_t bufferOffset = offsets_[index.value()]; const auto& buffer = buffers_[index.value()]; if (bufferOffset + buffer.size() >= offset + length) { - DWIO_ENSURE_LE(bufferOffset, offset, "Invalid offset for readInternal"); - DWIO_ENSURE_LE( + VELOX_CHECK_LE(bufferOffset, offset, "Invalid offset for readInternal"); + VELOX_CHECK_LE( (offset - bufferOffset) + length, buffer.size(), "Invalid readOffset for read Internal ", fmt::format( "{} {} {} {}", offset, bufferOffset, length, buffer.size())); - return std::make_tuple(buffer.data() + (offset - bufferOffset), length); } } diff --git a/velox/dwio/common/BufferedInput.h b/velox/dwio/common/BufferedInput.h index 9adf325789ddc..7b12f601183f3 100644 --- a/velox/dwio/common/BufferedInput.h +++ b/velox/dwio/common/BufferedInput.h @@ -33,25 +33,25 @@ class BufferedInput { std::shared_ptr readFile, memory::MemoryPool& pool, const MetricsLogPtr& metricsLog = MetricsLog::voidLog(), - IoStatistics* FOLLY_NULLABLE stats = nullptr, + IoStatistics* stats = nullptr, uint64_t maxMergeDistance = kMaxMergeDistance, std::optional wsVRLoad = std::nullopt) - : input_{std::make_shared( - std::move(readFile), - metricsLog, - stats)}, - pool_{pool}, - maxMergeDistance_{maxMergeDistance}, - wsVRLoad_{wsVRLoad}, - allocPool_{std::make_unique(&pool)} {} + : BufferedInput( + std::make_shared( + std::move(readFile), + metricsLog, + stats), + pool, + maxMergeDistance, + wsVRLoad) {} BufferedInput( std::shared_ptr input, memory::MemoryPool& pool, uint64_t maxMergeDistance = kMaxMergeDistance, std::optional wsVRLoad = std::nullopt) - : input_(std::move(input)), - pool_(pool), + : input_{std::move(input)}, + pool_{&pool}, maxMergeDistance_{maxMergeDistance}, wsVRLoad_{wsVRLoad}, allocPool_{std::make_unique(&pool)} {} @@ -67,17 +67,21 @@ class BufferedInput { return input_->getName(); } - // The previous API was taking a vector of regions - // Now we allow callers to enqueue region any time/place - // and we do final load into buffer in 2 steps (enqueue....load) - // 'si' allows tracking which streams actually get read. This may control - // read-ahead and caching for BufferedInput implementations supporting - // these. + /// The previous API was taking a vector of regions. Now we allow callers to + /// enqueue region any time/place and we do final load into buffer in 2 steps + /// (enqueue....load). 'si' allows tracking which streams actually get read. + /// This may control read-ahead and caching for BufferedInput implementations + /// supporting these. 
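// A minimal sketch of the two-step enqueue/load pattern this API implies
// ('file' and 'pool' are assumed to exist; offsets are illustrative):
//
//   BufferedInput input(file, pool);
//   auto stream = input.enqueue({/*offset=*/100, /*length=*/1024});
//   input.load(LogType::FILE); // one coalesced IO for all enqueued regions
//   const void* data;
//   int size;
//   while (stream->Next(&data, &size)) {
//     // consume 'size' bytes at 'data'
//   }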
virtual std::unique_ptr enqueue( velox::common::Region region, - const StreamIdentifier* FOLLY_NULLABLE si = nullptr); + const StreamIdentifier* sid = nullptr); - // load all regions to be read in an optimized way (IO efficiency) + /// Returns true if load synchronously. + virtual bool supportSyncLoad() const { + return true; + } + + /// load all regions to be read in an optimized way (IO efficiency) virtual void load(const LogType); virtual bool isBuffered(uint64_t offset, uint64_t length) const { @@ -87,15 +91,15 @@ class BufferedInput { virtual std::unique_ptr read(uint64_t offset, uint64_t length, LogType logType) const { std::unique_ptr ret = readBuffer(offset, length); - if (!ret) { - VLOG(1) << "Unplanned read. Offset: " << offset << ", Length: " << length; - // We cannot do enqueue/load here because load() clears previously - // loaded data. TODO: figure out how we can use the data cache for - // this access. - ret = std::make_unique( - input_, offset, length, pool_, logType, input_->getNaturalReadSize()); + if (ret != nullptr) { + return ret; } - return ret; + VLOG(1) << "Unplanned read. Offset: " << offset << ", Length: " << length; + // We cannot do enqueue/load here because load() clears previously + // loaded data. TODO: figure out how we can use the data cache for + // this access. + return std::make_unique( + input_, offset, length, *pool_, logType, input_->getNaturalReadSize()); } // True if there is free memory for prefetching the stripe. This is @@ -119,7 +123,8 @@ class BufferedInput { // Create a new (clean) instance of BufferedInput sharing the same // underlying file and memory pool. The enqueued regions are NOT copied. virtual std::unique_ptr clone() const { - return std::make_unique(input_, pool_); + return std::make_unique( + input_, *pool_, maxMergeDistance_, wsVRLoad_); } std::unique_ptr loadCompleteFile() { @@ -137,45 +142,31 @@ class BufferedInput { return input_; } - virtual folly::Executor* FOLLY_NULLABLE executor() const { + virtual folly::Executor* executor() const { return nullptr; } - virtual int64_t prefetchSize() const { - return 0; - } + virtual uint64_t nextFetchSize() const; protected: - std::shared_ptr input_; - memory::MemoryPool& pool_; + const std::shared_ptr input_; + memory::MemoryPool* const pool_; private: - uint64_t maxMergeDistance_; - std::optional wsVRLoad_; - std::unique_ptr allocPool_; - - // Regions enqueued for reading - std::vector regions_; - - // Offsets in the file to which the corresponding Region belongs - std::vector offsets_; - - // Buffers allocated for reading each Region. - std::vector> buffers_; - - // Maps the position in which the Region was originally enqueued to the - // position that it went to after sorting and merging. Thus this maps from the - // enqueued position to its corresponding buffer offset. 
- std::vector enqueuedToBufferOffset_; - std::unique_ptr readBuffer( uint64_t offset, uint64_t length) const; + std::tuple readInternal( uint64_t offset, uint64_t length, std::optional i = std::nullopt) const; + void readToBuffer( + uint64_t offset, + folly::Range allocated, + const LogType logType); + folly::Range allocate(const velox::common::Region& region) { // Save the file offset and the buffer to which we'll read it offsets_.push_back(region.offset); @@ -192,6 +183,24 @@ class BufferedInput { bool tryMerge( velox::common::Region& first, const velox::common::Region& second); + + uint64_t maxMergeDistance_; + std::optional wsVRLoad_; + std::unique_ptr allocPool_; + + // Regions enqueued for reading + std::vector regions_; + + // Offsets in the file to which the corresponding Region belongs + std::vector offsets_; + + // Buffers allocated for reading each Region. + std::vector> buffers_; + + // Maps the position in which the Region was originally enqueued to the + // position that it went to after sorting and merging. Thus this maps from the + // enqueued position to its corresponding buffer offset. + std::vector enqueuedToBufferOffset_; }; } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/CMakeLists.txt b/velox/dwio/common/CMakeLists.txt index dc7d02ada034d..55fadf6c2f210 100644 --- a/velox/dwio/common/CMakeLists.txt +++ b/velox/dwio/common/CMakeLists.txt @@ -21,55 +21,64 @@ elseif(${VELOX_BUILD_TEST_UTILS}) add_subdirectory(tests/utils) endif() -add_library( +velox_add_library( velox_dwio_common BitConcatenation.cpp BitPackDecoder.cpp BufferedInput.cpp - CachedBufferedInput.cpp CacheInputStream.cpp + CachedBufferedInput.cpp + ColumnLoader.cpp ColumnSelector.cpp DataBufferHolder.cpp DecoderUtil.cpp + DirectBufferedInput.cpp DirectDecoder.cpp + DirectInputStream.cpp DwioMetricsLog.cpp + ExecutorBarrier.cpp FileSink.cpp FlatMapHelper.cpp + OnDemandUnitLoader.cpp InputStream.cpp IntDecoder.cpp MetadataFilter.cpp Options.cpp OutputStream.cpp + ParallelFor.cpp Range.cpp Reader.cpp ReaderFactory.cpp ScanSpec.cpp - ColumnLoader.cpp + SeekableInputStream.cpp SelectiveByteRleColumnReader.cpp SelectiveColumnReader.cpp SelectiveRepeatedColumnReader.cpp SelectiveStructColumnReader.cpp - SeekableInputStream.cpp + SortingWriter.cpp + SortingWriter.h + Throttler.cpp TypeUtils.cpp TypeWithId.cpp - WriterFactory.cpp - SortingWriter.cpp - SortingWriter.h) + Writer.cpp + WriterFactory.cpp) -target_include_directories(velox_dwio_common PRIVATE ${Protobuf_INCLUDE_DIRS}) +velox_include_directories(velox_dwio_common PRIVATE ${Protobuf_INCLUDE_DIRS}) -target_link_libraries( +velox_link_libraries( velox_dwio_common velox_buffer velox_caching velox_common_io velox_common_compression + velox_common_config velox_dwio_common_encryption velox_dwio_common_exception velox_exception velox_expression velox_memory - velox_exec + velox_type_tz Boost::regex Folly::folly - glog::glog) + glog::glog + protobuf::libprotobuf) diff --git a/velox/dwio/common/CacheInputStream.cpp b/velox/dwio/common/CacheInputStream.cpp index c9665926a4eb4..ca05a178e3b63 100644 --- a/velox/dwio/common/CacheInputStream.cpp +++ b/velox/dwio/common/CacheInputStream.cpp @@ -35,20 +35,42 @@ CacheInputStream::CacheInputStream( const Region& region, std::shared_ptr input, uint64_t fileNum, + bool noCacheRetention, std::shared_ptr tracker, TrackingId trackingId, uint64_t groupId, int32_t loadQuantum) : bufferedInput_(bufferedInput), cache_(bufferedInput_->cache()), - ioStats_(ioStats), - input_(std::move(input)), + 
noCacheRetention_(noCacheRetention), region_(region), fileNum_(fileNum), tracker_(std::move(tracker)), trackingId_(trackingId), groupId_(groupId), - loadQuantum_(loadQuantum) {} + loadQuantum_(loadQuantum), + ioStats_(ioStats), + input_(std::move(input)) {} + +CacheInputStream::~CacheInputStream() { + clearCachePin(); + makeCacheEvictable(); +} + +void CacheInputStream::makeCacheEvictable() { + if (!noCacheRetention_) { + return; + } + // Walks through the potential prefetch or access cache space of this cache + // input stream, and marks those exist cache entries as immediate evictable. + uint64_t position = 0; + while (position < region_.length) { + const auto nextRegion = nextQuantizedLoadRegion(position); + const cache::RawFileCacheKey key{fileNum_, nextRegion.offset}; + cache_->makeEvictable(key); + position = nextRegion.offset + nextRegion.length; + } +} bool CacheInputStream::Next(const void** buffer, int32_t* size) { if (position_ >= region_.length) { @@ -60,56 +82,58 @@ bool CacheInputStream::Next(const void** buffer, int32_t* size) { *size = 0; return false; } + loadPosition(); *buffer = reinterpret_cast(run_ + offsetInRun_); *size = runSize_ - offsetInRun_; if (window_.has_value()) { - auto window = window_.value(); - if (position_ + *size > window.offset + window.length) { - *size = window.offset + window.length - position_; + if (position_ + *size > window_->offset + window_->length) { + *size = window_->offset + window_->length - position_; } } if (position_ + *size > region_.length) { *size = region_.length - position_; } offsetInRun_ += *size; + if (prefetchPct_ < 100) { - auto offsetInQuantum = position_ % loadQuantum_; - auto nextQuantum = position_ - offsetInQuantum + loadQuantum_; - auto prefetchThreshold = loadQuantum_ * prefetchPct_ / 100; - if (!prefetchStarted_ && offsetInQuantum + *size > prefetchThreshold && - position_ - offsetInQuantum + loadQuantum_ < region_.length) { + const auto offsetInQuantum = position_ % loadQuantum_; + const auto nextQuantumOffset = position_ - offsetInQuantum + loadQuantum_; + const auto prefetchThreshold = loadQuantum_ * prefetchPct_ / 100; + if (!prefetchStarted_ && (offsetInQuantum + *size > prefetchThreshold) && + (position_ - offsetInQuantum + loadQuantum_ < region_.length)) { // We read past 'prefetchPct_' % of the current load quantum and the // current load quantum is not the last in the region. Prefetch the next // load quantum. 
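// Worked example of the trigger above, assuming an 8 MiB loadQuantum_ and
// prefetchPct_ == 80 (illustrative values): a Next() that moves the
// position from 6.2 MiB to 6.6 MiB within the first quantum satisfies
// offsetInQuantum + *size == 6.6 MiB > 0.8 * 8 MiB == 6.4 MiB, so if the
// region extends past 8 MiB, the [8 MiB, 16 MiB) quantum is scheduled for
// read-ahead; prefetchStarted_ keeps it from being issued twice.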
- auto prefetchSize = - std::min(region_.length, nextQuantum + loadQuantum_) - nextQuantum; + const auto prefetchSize = + std::min(region_.length, nextQuantumOffset + loadQuantum_) - + nextQuantumOffset; prefetchStarted_ = bufferedInput_->prefetch( - Region{region_.offset + nextQuantum, prefetchSize}); + Region{region_.offset + nextQuantumOffset, prefetchSize}); } } position_ += *size; - if (tracker_) { + if (tracker_ != nullptr) { tracker_->recordRead(trackingId_, *size, fileNum_, groupId_); } return true; } void CacheInputStream::BackUp(int32_t count) { - DWIO_ENSURE_GE(count, 0, "can't backup negative distances"); + VELOX_CHECK_GE(count, 0, "can't backup negative distances"); - uint64_t unsignedCount = static_cast(count); - DWIO_ENSURE(unsignedCount <= offsetInRun_, "Can't backup that much!"); + const uint64_t unsignedCount = static_cast(count); + VELOX_CHECK_LE(unsignedCount, offsetInRun_, "Can't backup that much!"); position_ -= unsignedCount; } -bool CacheInputStream::Skip(int32_t count) { +bool CacheInputStream::SkipInt64(int64_t count) { if (count < 0) { return false; } - uint64_t unsignedCount = static_cast(count); + const uint64_t unsignedCount = static_cast(count); if (unsignedCount + position_ <= region_.length) { position_ += unsignedCount; return true; @@ -127,7 +151,13 @@ void CacheInputStream::seekToPosition(PositionProvider& seekPosition) { } std::string CacheInputStream::getName() const { - return fmt::format("CacheInputStream {} of {}", position_, region_.length); + std::string result = + fmt::format("CacheInputStream {} of {}", position_, region_.length); + const auto ssdFile = ssdFileName(); + if (!ssdFile.empty()) { + result += fmt::format(" ssdFile={}", ssdFile); + } + return result; } size_t CacheInputStream::positionSize() { @@ -162,85 +192,96 @@ std::vector> makeRanges( return buffers; } } // namespace -void CacheInputStream::loadSync(Region region) { - // rawBytesRead is the number of bytes touched. Whether they come - // from disk, ssd or memory is itemized in different counters. A + +void CacheInputStream::loadSync(const Region& region) { process::TraceContext trace("loadSync"); int64_t hitSize = region.length; if (window_.has_value()) { - int64_t regionEnd = region.offset + region.length; - int64_t windowStart = region_.offset + window_.value().offset; - int64_t windowEnd = + const int64_t regionEnd = region.offset + region.length; + const int64_t windowStart = region_.offset + window_.value().offset; + const int64_t windowEnd = region_.offset + window_.value().offset + window_.value().length; hitSize = std::min(windowEnd, regionEnd) - std::max(windowStart, region.offset); } - // coalesced read from InputStream removes itself from this count - // so as not to double count when the individual parts are - // hit. + // rawBytesRead is the number of bytes touched. Whether they come from disk, + // ssd or memory is itemized in different counters. A coalesced read from + // InputStream removes itself from this count so as not to double count when + // the individual parts are hit. 
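// Worked example of the window clamping above: for a clone over
// region_ = {offset=1000, length=100} with window_ = {offset=40, length=20}
// (relative to region_), loading the quantum region = {offset=1032,
// length=32} counts
//
//   hitSize = min(1060, 1064) - max(1040, 1032) == 20
//
// bytes, i.e. only the slice of the load that this stream can expose.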
ioStats_->incRawBytesRead(hitSize); prefetchStarted_ = false; do { - folly::SemiFuture wait(false); + folly::SemiFuture cacheLoadWait(false); cache::RawFileCacheKey key{fileNum_, region.offset}; - if (noRetention_ && !pin_.empty()) { - pin_.checkedEntry()->makeEvictable(); - } - pin_.clear(); - pin_ = cache_->findOrCreate(key, region.length, &wait); + clearCachePin(); + pin_ = cache_->findOrCreate(key, region.length, &cacheLoadWait); if (pin_.empty()) { - VELOX_CHECK(wait.valid()); - auto& exec = folly::QueuedImmediateExecutor::instance(); - uint64_t usec = 0; + VELOX_CHECK(cacheLoadWait.valid()); + uint64_t waitUs{0}; { - MicrosecondTimer timer(&usec); - std::move(wait).via(&exec).wait(); + MicrosecondTimer timer(&waitUs); + std::move(cacheLoadWait) + .via(&folly::QueuedImmediateExecutor::instance()) + .wait(); } - ioStats_->queryThreadIoLatency().increment(usec); + ioStats_->queryThreadIoLatency().increment(waitUs); continue; } - auto entry = pin_.checkedEntry(); - if (entry->isExclusive()) { - // Missed memory cache. Trying to load from ssd cache, and if again - // missed, fall back to remote fetching. - entry->setGroupId(groupId_); - entry->setTrackingId(trackingId_); - if (loadFromSsd(region, *entry)) { - return; - } - auto ranges = makeRanges(entry, region.length); - uint64_t usec = 0; - { - MicrosecondTimer timer(&usec); - input_->read(ranges, region.offset, LogType::FILE); - } - ioStats_->read().increment(region.length); - ioStats_->queryThreadIoLatency().increment(usec); - entry->setExclusiveToShared(); - } else { + + auto* entry = pin_.checkedEntry(); + if (!entry->getAndClearFirstUseFlag()) { // Hit memory cache. - if (!entry->getAndClearFirstUseFlag()) { - ioStats_->ramHit().increment(hitSize); - } + ioStats_->ramHit().increment(hitSize); + } + if (!entry->isExclusive()) { + return; + } + + // Missed memory cache. Trying to load from ssd cache, and if again + // missed, fall back to remote fetching. + entry->setGroupId(groupId_); + entry->setTrackingId(trackingId_); + if (loadFromSsd(region, *entry)) { return; } + const auto ranges = makeRanges(entry, region.length); + uint64_t storageReadUs{0}; + { + MicrosecondTimer timer(&storageReadUs); + input_->read(ranges, region.offset, LogType::FILE); + } + ioStats_->read().increment(region.length); + ioStats_->queryThreadIoLatency().increment(storageReadUs); + ioStats_->incTotalScanTime(storageReadUs * 1'000); + entry->setExclusiveToShared(!noCacheRetention_); } while (pin_.empty()); } +void CacheInputStream::clearCachePin() { + if (pin_.empty()) { + return; + } + if (noCacheRetention_) { + pin_.checkedEntry()->makeEvictable(); + } + pin_.clear(); +} + bool CacheInputStream::loadFromSsd( - Region region, + const Region& region, cache::AsyncDataCacheEntry& entry) { - auto ssdCache = cache_->ssdCache(); - if (!ssdCache) { + auto* ssdCache = cache_->ssdCache(); + if (ssdCache == nullptr) { return false; } + auto& file = ssdCache->file(fileNum_); auto ssdPin = file.find(cache::RawFileCacheKey{fileNum_, region.offset}); - if (ssdPin.empty()) { return false; } + if (ssdPin.run().size() < entry.size()) { LOG(INFO) << fmt::format( "IOERR: Ssd entry for {} shorter than requested {}", @@ -248,79 +289,83 @@ bool CacheInputStream::loadFromSsd( ssdPin.run().size()); return false; } - uint64_t usec = 0; - // SsdFile::load wants vectors of pins. Put the pins in a - // temp vector and then put 'pin_' back in 'this'. 'pin_' - // is exclusive and not movable. + + uint64_t ssdLoadUs{0}; + // SsdFile::load wants vectors of pins. 
Put the pins in a temp vector and then + // put 'pin_' back in 'this'. 'pin_' is exclusive and not movable. std::vector ssdPins; ssdPins.push_back(std::move(ssdPin)); std::vector pins; pins.push_back(std::move(pin_)); try { - MicrosecondTimer timer(&usec); + MicrosecondTimer timer(&ssdLoadUs); file.load(ssdPins, pins); } catch (const std::exception& e) { - try { - LOG(ERROR) << "IOERR: Failed SSD loadSync " << entry.toString() - << e.what() << process::TraceContext::statusLine() - << fmt::format( - "stream region {} {}b, start of load {} file {}", - region_.offset, - region_.length, - region.offset - region_.offset, - fileIds().string(fileNum_)); - // Remove the non-loadable entry so that next access goes to - // storage. - file.erase(cache::RawFileCacheKey{fileNum_, region.offset}); - } catch (const std::exception&) { - // Ignore error inside logging the error. - } - throw; + LOG(ERROR) << "IOERR: Failed SSD loadSync " << entry.toString() << ' ' + << e.what() << process::TraceContext::statusLine() + << fmt::format( + "stream region {} {}b, start of load {} file {}", + region_.offset, + region_.length, + region.offset - region_.offset, + fileIds().string(fileNum_)); + // Remove the non-loadable entry so that next access goes to storage. + file.erase(cache::RawFileCacheKey{fileNum_, region.offset}); + pin_ = std::move(pins[0]); + return false; } + + VELOX_CHECK(pin_.empty()); pin_ = std::move(pins[0]); ioStats_->ssdRead().increment(region.length); - ioStats_->queryThreadIoLatency().increment(usec); + ioStats_->queryThreadIoLatency().increment(ssdLoadUs); + // Skip no-cache retention setting as data is loaded from ssd. entry.setExclusiveToShared(); return true; } +std::string CacheInputStream::ssdFileName() const { + auto ssdCache = cache_->ssdCache(); + if (!ssdCache) { + return ""; + } + return ssdCache->file(fileNum_).fileName(); +} + void CacheInputStream::loadPosition() { - auto offset = region_.offset; + const auto offset = region_.offset; if (pin_.empty()) { auto load = bufferedInput_->coalescedLoad(this); - if (load) { + if (load != nullptr) { folly::SemiFuture waitFuture(false); - uint64_t usec = 0; + uint64_t loadUs{0}; { - MicrosecondTimer timer(&usec); + MicrosecondTimer timer(&loadUs); try { - if (!load->loadOrFuture(&waitFuture)) { - auto& exec = folly::QueuedImmediateExecutor::instance(); - std::move(waitFuture).via(&exec).wait(); + if (!load->loadOrFuture(&waitFuture, !noCacheRetention_)) { + waitFuture.wait(); } } catch (const std::exception& e) { - // Log the error and continue. The error, if it persists, will be hit + // Log the error and continue. The error, if it persists, will be hit // again in looking up the specific entry and thrown from there. LOG(ERROR) << "IOERR: error in coalesced load " << e.what(); } } - ioStats_->queryThreadIoLatency().increment(usec); + ioStats_->queryThreadIoLatency().increment(loadUs); } - auto loadRegion = region_; - // Quantize position to previous multiple of 'loadQuantum_'. - loadRegion.offset += (position_ / loadQuantum_) * loadQuantum_; - // Set length to be the lesser of 'loadQuantum_' and distance to end of - // 'region_' - loadRegion.length = std::min( - loadQuantum_, region_.length - (loadRegion.offset - region_.offset)); - loadSync(loadRegion); + + const auto nextLoadRegion = nextQuantizedLoadRegion(position_); + // There is no need to update the metric in the loadData method because + // loadSync is always executed regardless and updates the metric. 
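// Worked example of the quantization in nextQuantizedLoadRegion(), with an
// illustrative 8 MiB (8'388'608-byte) loadQuantum_: for
// region_ = {offset=1000, length=20'000'000} and position_ == 10'000'000,
//
//   nextRegion.offset = 1000 + (10'000'000 / 8'388'608) * 8'388'608
//                     = 1000 + 8'388'608
//   nextRegion.length = min(8'388'608, 20'000'000 - 8'388'608)
//                     = 8'388'608
//
// so every load starts on a quantum boundary relative to region_ and is
// clipped to its end.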
+ loadSync(nextLoadRegion); } + auto* entry = pin_.checkedEntry(); - uint64_t positionInFile = offset + position_; + const uint64_t positionInFile = offset + position_; if (entry->offset() <= positionInFile && entry->offset() + entry->size() > positionInFile) { // The position is inside the range of 'entry'. - auto offsetInEntry = positionInFile - entry->offset(); + const auto offsetInEntry = positionInFile - entry->offset(); if (entry->data().numPages() == 0) { run_ = reinterpret_cast(entry->tinyData()); runSize_ = entry->size(); @@ -329,16 +374,28 @@ void CacheInputStream::loadPosition() { } else { entry->data().findRun(offsetInEntry, &runIndex_, &offsetInRun_); offsetOfRun_ = offsetInEntry - offsetInRun_; - auto run = entry->data().runAt(runIndex_); + const auto run = entry->data().runAt(runIndex_); run_ = run.data(); - runSize_ = run.numPages() * memory::AllocationTraits::kPageSize; + runSize_ = memory::AllocationTraits::pageBytes(run.numPages()); if (offsetOfRun_ + runSize_ > entry->size()) { runSize_ = entry->size() - offsetOfRun_; } } } else { - pin_.clear(); + clearCachePin(); loadPosition(); } } + +velox::common::Region CacheInputStream::nextQuantizedLoadRegion( + uint64_t prevLoadedPosition) const { + auto nextRegion = region_; + // Quantize position to previous multiple of 'loadQuantum_'. + nextRegion.offset += (prevLoadedPosition / loadQuantum_) * loadQuantum_; + // Set length to be the lesser of 'loadQuantum_' and distance to end of + // 'region_' + nextRegion.length = std::min( + loadQuantum_, region_.length - (nextRegion.offset - region_.offset)); + return nextRegion; +} } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/CacheInputStream.h b/velox/dwio/common/CacheInputStream.h index 141396d91d524..54650d9ebc435 100644 --- a/velox/dwio/common/CacheInputStream.h +++ b/velox/dwio/common/CacheInputStream.h @@ -35,29 +35,31 @@ class CacheInputStream : public SeekableInputStream { const velox::common::Region& region, std::shared_ptr input, uint64_t fileNum, + bool noCacheRetention, std::shared_ptr tracker, cache::TrackingId trackingId, uint64_t groupId, int32_t loadQuantum); + ~CacheInputStream() override; + bool Next(const void** data, int* size) override; void BackUp(int count) override; - bool Skip(int count) override; + bool SkipInt64(int64_t count) override; google::protobuf::int64 ByteCount() const override; void seekToPosition(PositionProvider& position) override; std::string getName() const override; size_t positionSize() override; - /// Returns a copy of 'this', ranging over the same bytes. The clone - /// is initially positioned at the position of 'this' and can be - /// moved independently within 'region_'. This is used for first - /// caching a range of bytes from a file and then giving out - /// delimited subranges of this to callers. skip() and - /// setRemainingBytes() set the bounds of the window exposed by the - /// clone. In specific, reading protocol buffers requires a stream - /// that begins and ends at the exact start and end of the - /// serialization. Reading these from cache requires an exactly - /// delimited stream. + /// Returns a copy of 'this', ranging over the same bytes. The clone is + /// initially positioned at the position of 'this' and can be moved + /// independently within 'region_'. This is used for first caching a range of + /// bytes from a file and then giving out delimited subranges of this to + /// callers. skip() and setRemainingBytes() set the bounds of the window + /// exposed by the clone. 
Specifically, reading protocol buffers requires a + /// stream that begins and ends at the exact start and end of the + /// serialization. Reading these from cache requires an exactly delimited + /// stream. std::unique_ptr clone() { auto copy = std::make_unique( bufferedInput_, @@ -65,6 +67,7 @@ class CacheInputStream : public SeekableInputStream { region_, input_, fileNum_, + noCacheRetention_, tracker_, trackingId_, groupId_, @@ -76,7 +79,7 @@ class CacheInputStream : public SeekableInputStream { /// Sets the stream to range over a window that starts at the current position /// and is 'remainingBytes' bytes in size. 'remainingBytes' must be <= /// 'region_.length - position_'. The stream cannot be used for reading - /// outside of the window. Use together wiht clone() and skip(). + /// outside of the window. Use together with clone() and skip(). void setRemainingBytes(uint64_t remainingBytes); /// Causes the next load quantum to be scheduled for read-ahead when @@ -90,33 +93,47 @@ class CacheInputStream : public SeekableInputStream { prefetchPct_ = pct; } - /// Enables a mode where cache entries are made immediately evictable after - /// unpinning. - void setNoRetention() { - noRetention_ = true; + bool testingNoCacheRetention() const { + return noCacheRetention_; } private: // Ensures that the current position is covered by 'pin_'. void loadPosition(); + // Returns the next quantized region to load with given loaded position. + velox::common::Region nextQuantizedLoadRegion( + uint64_t prevLoadedPosition) const; + // Synchronously sets 'pin_' to cover 'region'. - void loadSync(velox::common::Region region); + void loadSync(const velox::common::Region& region); // Returns true if there is an SSD cache and 'entry' is present there and // successfully loaded. bool loadFromSsd( - velox::common::Region region, + const velox::common::Region& region, cache::AsyncDataCacheEntry& entry); + // Invoked to clear the cache pin of the accessed cache entry and mark it as + // immediately evictable if the 'noCacheRetention_' flag is set. + void clearCachePin(); + + void makeCacheEvictable(); + + // Returns the SSD cache file path if it exists; returns an empty string if + // there is no SSD cache file. + std::string ssdFileName() const; + CachedBufferedInput* const bufferedInput_; cache::AsyncDataCache* const cache_; - IoStatistics* ioStats_; - std::shared_ptr input_; + // True if a pin should be set to the lowest retention score after + // unpinning. This applies to sequential reads where a second access + // to the page is not expected. + const bool noCacheRetention_; // The region of 'input' 'this' ranges over. const velox::common::Region region_; const uint64_t fileNum_; - std::shared_ptr tracker_; + const std::shared_ptr tracker_; const cache::TrackingId trackingId_; const uint64_t groupId_; @@ -124,6 +141,9 @@ class CacheInputStream : public SeekableInputStream { // pin_.entry()->size(). const int32_t loadQuantum_; + IoStatistics* const ioStats_; + const std::shared_ptr input_; + // Handle of cache entry. cache::CachePin pin_; @@ -143,21 +163,16 @@ class CacheInputStream : public SeekableInputStream { uint64_t position_ = 0; // A restricted view over 'region'. offset is relative to 'region_'. A cloned - // CacheInputStream can cover a subrange of the range of the original. + // CacheInputStream can cover a sub-range of the range of the original. std::optional window_; // Percentage of 'loadQuantum_' at which the next load quantum gets scheduled. // Over 100 means no prefetch.
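The clone()/skip()/setRemainingBytes() contract documented above hands out exactly delimited sub-windows of one cached byte range, which is what protobuf parsing needs. A toy model of that windowing protocol, runnable on its own (WindowedStream is illustrative only and unrelated to the real class):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <memory>
#include <string>

// Toy stream over a fixed buffer, mimicking the clone()/skip()/
// setRemainingBytes() protocol for handing out delimited sub-ranges.
class WindowedStream {
 public:
  explicit WindowedStream(std::string data) : data_(std::move(data)) {}

  std::unique_ptr<WindowedStream> clone() const {
    auto copy = std::make_unique<WindowedStream>(data_);
    copy->position_ = position_;
    copy->end_ = end_;
    return copy;
  }

  void skip(uint64_t n) {
    position_ = std::min<uint64_t>(position_ + n, data_.size());
  }

  // Restrict the stream to [position_, position_ + remaining).
  void setRemainingBytes(uint64_t remaining) {
    end_ = std::min<uint64_t>(position_ + remaining, data_.size());
  }

  std::string readAll() {
    auto result = data_.substr(position_, end_ - position_);
    position_ = end_;
    return result;
  }

 private:
  std::string data_;
  uint64_t position_ = 0;
  uint64_t end_ = std::string::npos;
};

int main() {
  // One cached range holds two back-to-back serialized messages of 5 and 3
  // bytes; each consumer receives an exactly delimited clone.
  WindowedStream cached("helloabc");
  auto first = cached.clone();
  first->setRemainingBytes(5);
  auto second = cached.clone();
  second->skip(5);
  second->setRemainingBytes(3);
  std::cout << first->readAll() << ' ' << second->readAll() << '\n';
}
```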
int32_t prefetchPct_{200}; - // True if prefetch f the next 'loadQuantum_' has been started. Cleared when + // True if prefetch the next 'loadQuantum_' has been started. Cleared when // moving to the next load quantum. bool prefetchStarted_{false}; - - // True if a pin should be set to lowest retention score after - // unpinning. This applies to sequential reads where a second access - // to the page is not expected. - bool noRetention_{false}; }; } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/CachedBufferedInput.cpp b/velox/dwio/common/CachedBufferedInput.cpp index 87f537552243e..d734663360328 100644 --- a/velox/dwio/common/CachedBufferedInput.cpp +++ b/velox/dwio/common/CachedBufferedInput.cpp @@ -39,20 +39,20 @@ using memory::MemoryAllocator; std::unique_ptr CachedBufferedInput::enqueue( Region region, - const StreamIdentifier* si = nullptr) { + const StreamIdentifier* sid = nullptr) { if (region.length == 0) { return std::make_unique( static_cast(nullptr), 0); } TrackingId id; - if (si) { - id = TrackingId(si->getId()); + if (sid != nullptr) { + id = TrackingId(sid->getId()); } VELOX_CHECK_LE(region.offset + region.length, fileSize_); requests_.emplace_back( RawFileCacheKey{fileNum_, region.offset}, region.length, id); - if (tracker_) { + if (tracker_ != nullptr) { tracker_->recordReference(id, region.length, fileNum_, groupId_); } auto stream = std::make_unique( @@ -61,6 +61,7 @@ std::unique_ptr CachedBufferedInput::enqueue( region, input_, fileNum_, + options_.noCacheRetention(), tracker_, id, groupId_, @@ -75,26 +76,25 @@ bool CachedBufferedInput::isBuffered(uint64_t /*offset*/, uint64_t /*length*/) } bool CachedBufferedInput::shouldPreload(int32_t numPages) { - // True if after scheduling this for preload, half the capacity - // would be in a loading but not yet accessed state. - if (requests_.empty() && !numPages) { + // True if after scheduling this for preload, half the capacity would be in a + // loading but not yet accessed state. + if (requests_.empty() && (numPages == 0)) { return false; } - for (auto& request : requests_) { - numPages += bits::roundUp( - std::min(request.size, options_.loadQuantum()), - memory::AllocationTraits::kPageSize) / - memory::AllocationTraits::kPageSize; + for (const auto& request : requests_) { + numPages += memory::AllocationTraits::numPages( + std::min(request.size, options_.loadQuantum())); } - auto cachePages = cache_->incrementCachedPages(0); - auto allocator = cache_->allocator(); - auto maxPages = memory::AllocationTraits::numPages(allocator->capacity()); - auto allocatedPages = allocator->numAllocated(); + const auto cachePages = cache_->incrementCachedPages(0); + auto* allocator = cache_->allocator(); + const auto maxPages = + memory::AllocationTraits::numPages(allocator->capacity()); + const auto allocatedPages = allocator->numAllocated(); if (numPages < maxPages - allocatedPages) { // There is free space for the read-ahead. return true; } - auto prefetchPages = cache_->incrementPrefetchPages(0); + const auto prefetchPages = cache_->incrementPrefetchPages(0); if (numPages + prefetchPages < cachePages / 2) { // The planned prefetch plus other prefetches are under half the cache. 
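shouldPreload above admits a read-ahead when it fits in currently free space, or when this prefetch plus all already-planned prefetches stay under half of the cached pages. The same decision reduced to plain arithmetic, with made-up page counts that are not taken from the PR:

```cpp
#include <cstdint>
#include <iostream>

// Mirrors the shouldPreload decision: allow the read-ahead if it fits in
// free memory, or if all planned prefetches stay under half of the cache.
bool shouldPreload(
    int64_t numPages,
    int64_t maxPages,
    int64_t allocatedPages,
    int64_t prefetchPages,
    int64_t cachePages) {
  if (numPages < maxPages - allocatedPages) {
    return true;  // Fits in currently free space.
  }
  return numPages + prefetchPages < cachePages / 2;
}

int main() {
  // 1000-page allocator with 990 pages allocated: a 50-page preload no longer
  // fits in free space, but 50 new + 30 planned = 80 < 400 (half of the 800
  // cached pages), so it is allowed.
  std::cout << shouldPreload(50, 1000, 990, 30, 800) << '\n';   // 1
  // With 500 prefetch pages already planned, 550 >= 400: rejected.
  std::cout << shouldPreload(50, 1000, 990, 500, 800) << '\n';  // 0
}
```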
return true; @@ -103,7 +103,6 @@ bool CachedBufferedInput::shouldPreload(int32_t numPages) { } namespace { - bool isPrefetchPct(int32_t pct) { return pct >= FLAGS_cache_prefetch_min_pct; } @@ -120,17 +119,17 @@ std::vector makeRequestParts( // Large columns will be part of coalesced reads if the access frequency // qualifies for read ahead and if over 80% of the column gets accessed. Large // metadata columns (empty no trackingData) always coalesce. - bool prefetchOne = + const bool prefetchOne = request.trackingId.id() == StreamIdentifier::sequentialFile().id_; - auto readPct = + const auto readPct = (100 * trackingData.numReads) / (1 + trackingData.numReferences); - auto readDensity = + const auto readDensity = (100 * trackingData.readBytes) / (1 + trackingData.referencedBytes); - bool prefetch = trackingData.referencedBytes > 0 && + const bool prefetch = trackingData.referencedBytes > 0 && (isPrefetchPct(readPct) && readDensity >= 80); std::vector parts; for (uint64_t offset = 0; offset < request.size; offset += loadQuantum) { - int32_t size = std::min(loadQuantum, request.size - offset); + const int32_t size = std::min(loadQuantum, request.size - offset); extraRequests.push_back(std::make_unique( RawFileCacheKey{request.key.fileNum, request.key.offset + offset}, size, @@ -154,21 +153,21 @@ int32_t adjustedReadPct(const cache::TrackingData& trackingData) { } } // namespace -void CachedBufferedInput::load(const LogType) { +void CachedBufferedInput::load(const LogType /*unused*/) { // 'requests_ is cleared on exit. auto requests = std::move(requests_); - cache::SsdFile* FOLLY_NULLABLE ssdFile = nullptr; - auto ssdCache = cache_->ssdCache(); - if (ssdCache) { + cache::SsdFile* ssdFile{nullptr}; + auto* ssdCache = cache_->ssdCache(); + if (ssdCache != nullptr) { ssdFile = &ssdCache->file(fileNum_); } - // Extra requests made for preloadable regions that are larger then + + // Extra requests made for pre-loadable regions that are larger than // 'loadQuantum'. std::vector> extraRequests; - // We loop over access frequency buckets. For example readPct 80 - // will get all streams where 80% or more of the referenced data is - // actually loaded. - for (auto readPct : std::vector{80, 50, 20, 0}) { + // We loop over access frequency buckets. For example readPct 80 will get all + // streams where 80% or more of the referenced data is actually loaded. + for (const auto readPct : std::vector{80, 50, 20, 0}) { std::vector storageLoad; std::vector ssdLoad; for (auto& request : requests) { @@ -176,9 +175,9 @@ void CachedBufferedInput::load(const LogType) { continue; } cache::TrackingData trackingData; - bool prefetchAnyway = request.trackingId.empty() || + const bool prefetchAnyway = request.trackingId.empty() || request.trackingId.id() == StreamIdentifier::sequentialFile().id_; - if (!prefetchAnyway && tracker_) { + if (!prefetchAnyway && (tracker_ != nullptr)) { trackingData = tracker_->trackingData(request.trackingId); } if (prefetchAnyway || adjustedReadPct(trackingData) >= readPct) { @@ -189,7 +188,7 @@ void CachedBufferedInput::load(const LogType) { if (cache_->exists(part->key)) { continue; } - if (ssdFile) { + if (ssdFile != nullptr) { part->ssdPin = ssdFile->find(part->key); if (!part->ssdPin.empty() && part->ssdPin.run().size() < part->size) { @@ -217,8 +216,8 @@ void CachedBufferedInput::makeLoads( if (requests.empty() || (requests.size() < 2 && !prefetch)) { return; } - bool isSsd = !requests[0]->ssdPin.empty(); - int32_t maxDistance = isSsd ? 
20000 : options_.maxCoalesceDistance(); + const bool isSsd = !requests[0]->ssdPin.empty(); + const int32_t maxDistance = isSsd ? 20000 : options_.maxCoalesceDistance(); std::sort( requests.begin(), requests.end(), @@ -229,21 +228,21 @@ void CachedBufferedInput::makeLoads( return left->key.offset < right->key.offset; } }); - // Combine adjacent short reads. + // Combine adjacent short reads. int32_t numNewLoads = 0; int64_t coalescedBytes = 0; coalesceIo( requests, maxDistance, - // Break batches up. Better load more short ones i parallel. + // Break batches up. Better load more short ones in parallel. 40, [&](int32_t index) { return isSsd ? requests[index]->ssdPin.run().offset() : requests[index]->key.offset; }, [&](int32_t index) { - auto size = requests[index]->size; + const auto size = requests[index]->size; coalescedBytes += size; return size; }, @@ -266,20 +265,22 @@ void CachedBufferedInput::makeLoads( ++numNewLoads; readRegion(ranges, prefetch); }); - if (prefetch && executor_) { + + if (prefetch && (executor_ != nullptr)) { std::vector doneIndices; for (auto i = 0; i < allCoalescedLoads_.size(); ++i) { auto& load = allCoalescedLoads_[i]; if (load->state() == CoalescedLoad::State::kPlanned) { - prefetchSize_ += load->size(); - executor_->add([pendingLoad = load]() { - process::TraceContext trace("Read Ahead"); - pendingLoad->loadOrFuture(nullptr); - }); + executor_->add( + [pendingLoad = load, ssdSavable = !options_.noCacheRetention()]() { + process::TraceContext trace("Read Ahead"); + pendingLoad->loadOrFuture(nullptr, ssdSavable); + }); } else { doneIndices.push_back(i); } } + // Remove the loads that were complete. There can be done loads if the same // CachedBufferedInput has multiple cycles of enqueues and loads. for (int32_t i = doneIndices.size() - 1; i >= 0; --i) { @@ -302,7 +303,8 @@ class DwioCoalescedLoadBase : public cache::CoalescedLoad { cache_(cache), ioStats_(std::move(ioStats)), groupId_(groupId) { - for (auto& request : requests) { + requests_.reserve(requests.size()); + for (const auto& request : requests) { size_ += request->size; requests_.push_back(std::move(*request)); } @@ -318,31 +320,33 @@ class DwioCoalescedLoadBase : public cache::CoalescedLoad { std::string toString() const override { int32_t payload = 0; - assert(!requests_.empty()); + VELOX_CHECK(!requests_.empty()); + int32_t total = requests_.back().key.offset + requests_.back().size - requests_[0].key.offset; - for (auto& request : requests_) { + for (const auto& request : requests_) { payload += request.size; } return fmt::format( "", requests_.size(), - total, - total - payload); + succinctBytes(total), + succinctBytes(total - payload)); } protected: - void updateStats(const CoalesceIoStats& stats, bool isPrefetch, bool isSsd) { - if (ioStats_) { - ioStats_->incRawOverreadBytes(stats.extraBytes); - if (isSsd) { - ioStats_->ssdRead().increment(stats.payloadBytes); - } else { - ioStats_->read().increment(stats.payloadBytes); - } - if (isPrefetch) { - ioStats_->prefetch().increment(stats.payloadBytes); - } + void updateStats(const CoalesceIoStats& stats, bool prefetch, bool ssd) { + if (ioStats_ == nullptr) { + return; + } + ioStats_->incRawOverreadBytes(stats.extraBytes); + if (ssd) { + ioStats_->ssdRead().increment(stats.payloadBytes); + } else { + ioStats_->read().increment(stats.payloadBytes); + } + if (prefetch) { + ioStats_->prefetch().increment(stats.payloadBytes); } } @@ -386,14 +390,14 @@ class DwioCoalescedLoad : public DwioCoalescedLoadBase { input_(std::move(input)), 
maxCoalesceDistance_(maxCoalesceDistance) {} - std::vector loadData(bool isPrefetch) override { + std::vector loadData(bool prefetch) override { std::vector pins; pins.reserve(keys_.size()); cache_.makePins( keys_, [&](int32_t index) { return sizes_[index]; }, [&](int32_t /*index*/, CachePin pin) { - if (isPrefetch) { + if (prefetch) { pin.checkedEntry()->setPrefetch(true); } pins.push_back(std::move(pin)); @@ -413,7 +417,7 @@ class DwioCoalescedLoad : public DwioCoalescedLoadBase { const std::vector>& buffers) { input_->read(buffers, offset, LogType::FILE); }); - updateStats(stats, isPrefetch, false); + updateStats(stats, prefetch, false); return pins; } @@ -431,14 +435,14 @@ class SsdLoad : public DwioCoalescedLoadBase { std::vector requests) : DwioCoalescedLoadBase(cache, ioStats, groupId, std::move(requests)) {} - std::vector loadData(bool isPrefetch) override { + std::vector loadData(bool prefetch) override { std::vector ssdPins; std::vector pins; cache_.makePins( keys_, [&](int32_t index) { return sizes_[index]; }, [&](int32_t index, CachePin pin) { - if (isPrefetch) { + if (prefetch) { pin.checkedEntry()->setPrefetch(true); } pins.push_back(std::move(pin)); @@ -448,8 +452,8 @@ class SsdLoad : public DwioCoalescedLoadBase { return pins; } assert(!ssdPins.empty()); // for lint. - auto stats = ssdPins[0].file()->load(ssdPins, pins); - updateStats(stats, isPrefetch, true); + const auto stats = ssdPins[0].file()->load(ssdPins, pins); + updateStats(stats, prefetch, true); return pins; } }; @@ -457,11 +461,12 @@ class SsdLoad : public DwioCoalescedLoadBase { } // namespace void CachedBufferedInput::readRegion( - std::vector requests, + const std::vector& requests, bool prefetch) { if (requests.empty() || (requests.size() == 1 && !prefetch)) { return; } + std::shared_ptr load; if (!requests[0]->ssdPin.empty()) { load = std::make_shared(*cache_, ioStats_, groupId_, requests); @@ -491,7 +496,7 @@ std::shared_ptr CachedBufferedInput::coalescedLoad( return nullptr; } auto load = std::move(it->second); - auto dwioLoad = dynamic_cast(load.get()); + auto* dwioLoad = dynamic_cast(load.get()); for (auto& request : dwioLoad->requests()) { loads.erase(request.stream); } @@ -510,6 +515,7 @@ std::unique_ptr CachedBufferedInput::read( Region{offset, length}, input_, fileNum_, + options_.noCacheRetention(), nullptr, TrackingId(), 0, @@ -517,9 +523,7 @@ std::unique_ptr CachedBufferedInput::read( } bool CachedBufferedInput::prefetch(Region region) { - int32_t numPages = - bits::roundUp(region.length, memory::AllocationTraits::kPageSize) / - memory::AllocationTraits::kPageSize; + const int32_t numPages = memory::AllocationTraits::numPages(region.length); if (!shouldPreload(numPages)) { return false; } diff --git a/velox/dwio/common/CachedBufferedInput.h b/velox/dwio/common/CachedBufferedInput.h index 5a8280e06fa8c..89f9a9aee0eb4 100644 --- a/velox/dwio/common/CachedBufferedInput.h +++ b/velox/dwio/common/CachedBufferedInput.h @@ -46,12 +46,12 @@ struct CacheRequest { bool processed{false}; - // True if this should be coalesced into a CoalescedLoad with other - // nearby requests with a similar load probability. This is false - // for sparsely accessed large columns where hitting one piece - // should not load the adjacent pieces. + /// True if this should be coalesced into a CoalescedLoad with other nearby + /// requests with a similar load probability. This is false for sparsely + /// accessed large columns where hitting one piece should not load the + /// adjacent pieces. 
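coalescedLoad() above is consume-once: the first stream that asks receives the shared load, and the mapping for every sibling stream the load covers is erased, so a correlated load is triggered exactly once. A minimal sketch of that registry pattern (takeLoad and the int stream ids are hypothetical simplifications):

```cpp
#include <iostream>
#include <map>
#include <memory>
#include <vector>

struct Load {
  std::vector<int> streams;  // Streams whose data this load covers.
};

// Consume-once lookup: the first stream to ask receives the shared load and
// every sibling stream's mapping is erased, so the load runs exactly once.
std::shared_ptr<Load> takeLoad(
    std::map<int, std::shared_ptr<Load>>& registry,
    int stream) {
  auto it = registry.find(stream);
  if (it == registry.end()) {
    return nullptr;
  }
  auto load = std::move(it->second);
  for (int s : load->streams) {
    registry.erase(s);
  }
  return load;
}

int main() {
  auto load = std::make_shared<Load>(Load{{1, 2, 3}});
  std::map<int, std::shared_ptr<Load>> registry{{1, load}, {2, load}, {3, load}};
  std::cout << (takeLoad(registry, 2) != nullptr) << '\n';  // 1: first caller gets it.
  std::cout << (takeLoad(registry, 1) != nullptr) << '\n';  // 0: siblings see nullptr.
}
```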
bool coalesces{true}; - const SeekableInputStream* FOLLY_NONNULL stream; + const SeekableInputStream* stream; }; class CachedBufferedInput : public BufferedInput { @@ -60,15 +60,15 @@ class CachedBufferedInput : public BufferedInput { std::shared_ptr readFile, const MetricsLogPtr& metricsLog, uint64_t fileNum, - cache::AsyncDataCache* FOLLY_NONNULL cache, + cache::AsyncDataCache* cache, std::shared_ptr tracker, uint64_t groupId, std::shared_ptr ioStats, - folly::Executor* FOLLY_NULLABLE executor, + folly::Executor* executor, const io::ReaderOptions& readerOptions) : BufferedInput( std::move(readFile), - readerOptions.getMemoryPool(), + readerOptions.memoryPool(), metricsLog), cache_(cache), fileNum_(fileNum), @@ -77,18 +77,20 @@ class CachedBufferedInput : public BufferedInput { ioStats_(std::move(ioStats)), executor_(executor), fileSize_(input_->getLength()), - options_(readerOptions) {} + options_(readerOptions) { + checkLoadQuantum(); + } CachedBufferedInput( std::shared_ptr input, uint64_t fileNum, - cache::AsyncDataCache* FOLLY_NONNULL cache, + cache::AsyncDataCache* cache, std::shared_ptr tracker, uint64_t groupId, std::shared_ptr ioStats, - folly::Executor* FOLLY_NULLABLE executor, + folly::Executor* executor, const io::ReaderOptions& readerOptions) - : BufferedInput(std::move(input), readerOptions.getMemoryPool()), + : BufferedInput(std::move(input), readerOptions.memoryPool()), cache_(cache), fileNum_(fileNum), tracker_(std::move(tracker)), @@ -96,7 +98,9 @@ class CachedBufferedInput : public BufferedInput { ioStats_(std::move(ioStats)), executor_(executor), fileSize_(input_->getLength()), - options_(readerOptions) {} + options_(readerOptions) { + checkLoadQuantum(); + } ~CachedBufferedInput() override { for (auto& load : allCoalescedLoads_) { @@ -106,11 +110,15 @@ class CachedBufferedInput : public BufferedInput { std::unique_ptr enqueue( velox::common::Region region, - const StreamIdentifier* FOLLY_NULLABLE si) override; + const StreamIdentifier* sid) override; - void load(const LogType) override; + bool supportSyncLoad() const override { + return false; + } - bool isBuffered(uint64_t offset, uint64_t length) const override; + void load(const LogType /*unused*/) override; + + bool isBuffered(uint64_t /*unused*/, uint64_t /*unused*/) const override; std::unique_ptr read(uint64_t offset, uint64_t length, LogType logType) const override; @@ -144,23 +152,22 @@ class CachedBufferedInput : public BufferedInput { options_); } - cache::AsyncDataCache* FOLLY_NONNULL cache() const { + cache::AsyncDataCache* cache() const { return cache_; } - // Returns the CoalescedLoad that contains the correlated loads for - // 'stream' or nullptr if none. Returns nullptr on all but first - // call for 'stream' since the load is to be triggered by the first - // access. + /// Returns the CoalescedLoad that contains the correlated loads for 'stream' + /// or nullptr if none. Returns nullptr on all but first call for 'stream' + /// since the load is to be triggered by the first access. std::shared_ptr coalescedLoad( - const SeekableInputStream* FOLLY_NONNULL stream); + const SeekableInputStream* stream); - folly::Executor* FOLLY_NULLABLE executor() const override { + folly::Executor* executor() const override { return executor_; } - int64_t prefetchSize() const override { - return prefetchSize_; + uint64_t nextFetchSize() const override { + VELOX_NYI(); } private: @@ -168,19 +175,31 @@ class CachedBufferedInput : public BufferedInput { // is true, starts background loading. 
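Both constructors above now call checkLoadQuantum(), defined further down in this header, to fail fast when the configured load quantum cannot be represented by an SSD cache entry. A standalone sketch of such a constructor-time invariant follows; the 23-bit (8MB) limit mirrors what the 8MB comment suggests for cache::SsdRun::kSizeBits, but the exact constant is an assumption of this example:

```cpp
#include <cstdint>
#include <stdexcept>

// Stand-in for cache::SsdRun::kSizeBits (assumed value: 23, i.e. 8MB).
constexpr int kSsdRunSizeBits = 23;

// Sketch of a constructor-time invariant like checkLoadQuantum(): reject a
// configuration whose read size cannot fit in an SSD cache entry.
struct Input {
  Input(int64_t loadQuantum, bool hasSsdCache) {
    if (hasSsdCache && loadQuantum > (int64_t{1} << kSsdRunSizeBits)) {
      throw std::invalid_argument(
          "Load quantum exceeds SSD cache entry size limit.");
    }
  }
};

int main() {
  Input ok{8 << 20, /*hasSsdCache=*/true};  // 8MB: accepted.
  try {
    Input bad{16 << 20, /*hasSsdCache=*/true};  // 16MB: rejected at construction.
  } catch (const std::invalid_argument&) {
  }
}
```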
void makeLoads(std::vector requests, bool prefetch); - // Makes a CoalescedLoad for 'requests' to be read together, coalescing - // IO is appropriate. If 'prefetch' is set, schedules the CoalescedLoad - // on 'executor_'. Links the CoalescedLoad to all CacheInputStreams that it + // Makes a CoalescedLoad for 'requests' to be read together, coalescing IO as + // appropriate. If 'prefetch' is set, schedules the CoalescedLoad on + // 'executor_'. Links the CoalescedLoad to all CacheInputStreams that it // concerns. + void readRegion(const std::vector& requests, bool prefetch); + + // We only support up to 8MB load quantum size on SSD and there is no need for + // larger SSD read size performance-wise. + void checkLoadQuantum() { + if (cache_->ssdCache() != nullptr) { + VELOX_CHECK_LE( + options_.loadQuantum(), + 1 << cache::SsdRun::kSizeBits, + "Load quantum exceeded SSD cache entry size limit."); + } + } - void readRegion(std::vector requests, bool prefetch); - - cache::AsyncDataCache* FOLLY_NONNULL cache_; + cache::AsyncDataCache* const cache_; const uint64_t fileNum_; - std::shared_ptr tracker_; + const std::shared_ptr tracker_; const uint64_t groupId_; - std::shared_ptr ioStats_; - folly::Executor* const FOLLY_NULLABLE executor_; + const std::shared_ptr ioStats_; + folly::Executor* const executor_; + const uint64_t fileSize_; + const io::ReaderOptions options_; // Regions that are candidates for loading. std::vector requests_; @@ -193,10 +212,6 @@ class CachedBufferedInput : public BufferedInput { // Distinct coalesced loads in 'coalescedLoads_'. std::vector> allCoalescedLoads_; - - const uint64_t fileSize_; - int64_t prefetchSize_{0}; - io::ReaderOptions options_; }; } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/Closeable.h b/velox/dwio/common/Closeable.h index 4572e08cf98a1..bdb8254378891 100644 --- a/velox/dwio/common/Closeable.h +++ b/velox/dwio/common/Closeable.h @@ -18,10 +18,7 @@ #include "velox/dwio/common/exception/Exception.h" -namespace facebook { -namespace velox { -namespace dwio { -namespace common { +namespace facebook::velox::dwio::common { // Base class for closeable object which need to be explicitly closed before // being destructed @@ -67,7 +64,4 @@ class Closeable { bool closed_; }; -} // namespace common -} // namespace dwio -} // namespace velox -} // namespace facebook +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/ColumnLoader.cpp b/velox/dwio/common/ColumnLoader.cpp index c04cb74db2715..8fd859f7a2b83 100644 --- a/velox/dwio/common/ColumnLoader.cpp +++ b/velox/dwio/common/ColumnLoader.cpp @@ -16,6 +16,8 @@ #include "velox/dwio/common/ColumnLoader.h" +#include "velox/common/process/TraceContext.h" + namespace facebook::velox::dwio::common { // Wraps '*result' in a dictionary to make the contiguous values @@ -45,13 +47,14 @@ void ColumnLoader::loadInternal( ValueHook* hook, vector_size_t resultSize, VectorPtr* result) { + process::TraceContext trace("ColumnLoader::loadInternal"); VELOX_CHECK_EQ( version_, structReader_->numReads(), "Loading LazyVector after the enclosing reader has moved"); - auto offset = structReader_->lazyVectorReadOffset(); - auto incomingNulls = structReader_->nulls(); - auto outputRows = structReader_->outputRows(); + const auto offset = structReader_->lazyVectorReadOffset(); + const auto* incomingNulls = structReader_->nulls(); + const auto outputRows = structReader_->outputRows(); raw_vector selectedRows; RowSet effectiveRows; ExceptionContextSetter exceptionContext( diff --git
a/velox/dwio/common/ColumnLoader.h b/velox/dwio/common/ColumnLoader.h index 4234ff32578da..ed883d23a143f 100644 --- a/velox/dwio/common/ColumnLoader.h +++ b/velox/dwio/common/ColumnLoader.h @@ -38,8 +38,8 @@ class ColumnLoader : public velox::VectorLoader { VectorPtr* result) override; private: - SelectiveStructColumnReaderBase* structReader_; - SelectiveColumnReader* fieldReader_; + SelectiveStructColumnReaderBase* const structReader_; + SelectiveColumnReader* const fieldReader_; // This is checked against the version of 'structReader' on load. If // these differ, 'structReader' has been advanced since the creation // of 'this' and 'this' is no longer loadable. diff --git a/velox/dwio/common/ColumnSelector.cpp b/velox/dwio/common/ColumnSelector.cpp index 48d2030a658cd..ef57fff7ab0f6 100644 --- a/velox/dwio/common/ColumnSelector.cpp +++ b/velox/dwio/common/ColumnSelector.cpp @@ -91,7 +91,7 @@ FilterTypePtr ColumnSelector::buildNode( // column selector filter tree nodes_.reserve(nodes_.size() + type->size()); if (node.node == 0) { - auto rowType = type->asRow(); + auto& rowType = type->asRow(); for (size_t i = 0, size = type->size(); i < size; ++i) { bool inData = contentType && i < contentType->size(); current->addChild(buildNode( @@ -163,20 +163,17 @@ void ColumnSelector::copy( } } -/** - * Copy the selector tree and apply file schema to handle schema mismatch - */ ColumnSelector ColumnSelector::apply( const std::shared_ptr& origin, const std::shared_ptr& fileSchema) { - // current instance maybe null + // The current instance may be null. if (origin == nullptr) { return ColumnSelector(fileSchema); } // if selector has no schema, we just build a new tree with file schema // selector.getProjection will carry all logic information including nodes - auto onlyFilter = !origin->hasSchema(); + const bool onlyFilter = !origin->hasSchema(); ColumnSelector cs( onlyFilter ? fileSchema : origin->getSchema(), origin->getNodeFilter(), @@ -318,15 +315,15 @@ const FilterTypePtr& ColumnSelector::process(const std::string& column, bool) { std::pair extractColumnName( const std::string_view& name) { // right now this is the only supported expression for MAP key filter - auto pos = name.find('#'); - if (pos != std::string::npos) { - // special map column handling - auto colName = name.substr(0, pos); - auto expr = name.substr(pos + 1); - return std::make_pair(colName, expr); + const auto pos = name.find('#'); + if (pos == std::string::npos) { + return std::make_pair(name, ""); } - return std::make_pair(name, ""); + // Special map column handling.
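extractColumnName above splits a selector string at the first '#': the prefix is the column name and the suffix is the MAP key filter expression. The same split as a self-contained function:

```cpp
#include <iostream>
#include <string>
#include <string_view>
#include <utility>

// Same contract as extractColumnName: split at the first '#'; everything
// after it is the (optional) MAP key filter expression.
std::pair<std::string_view, std::string_view> extractColumnName(
    std::string_view name) {
  const auto pos = name.find('#');
  if (pos == std::string_view::npos) {
    return {name, ""};
  }
  return {name.substr(0, pos), name.substr(pos + 1)};
}

int main() {
  auto [col, expr] = extractColumnName("features#[1,2]");
  std::cout << col << " / " << expr << '\n';  // features / [1,2]
  auto [plain, none] = extractColumnName("id");
  std::cout << plain << " / " << (none.empty() ? "<none>" : none) << '\n';
}
```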
+ const auto colName = name.substr(0, pos); + const auto expr = name.substr(pos + 1); + return std::make_pair(colName, expr); } void ColumnSelector::logFilter() const { @@ -342,4 +339,27 @@ void ColumnSelector::logFilter() const { getLog()->logColumnFilter(filter_, numColumns, numNodes, hasSchema()); } +std::shared_ptr ColumnSelector::fromScanSpec( + const velox::common::ScanSpec& spec, + const RowTypePtr& rowType) { + std::vector columnNames; + for (auto& child : spec.children()) { + if (child->isConstant()) { + continue; + } + std::string name = child->fieldName(); + if (!child->flatMapFeatureSelection().empty()) { + name += "#["; + name += folly::join(',', child->flatMapFeatureSelection()); + name += ']'; + } + columnNames.push_back(std::move(name)); + } + if (columnNames.empty()) { + static const RowTypePtr kEmpty{ROW({}, {})}; + return std::make_shared(kEmpty); + } + return std::make_shared(rowType, columnNames); +} + } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/ColumnSelector.h b/velox/dwio/common/ColumnSelector.h index b5bca13905f2f..62408e55a2134 100644 --- a/velox/dwio/common/ColumnSelector.h +++ b/velox/dwio/common/ColumnSelector.h @@ -18,6 +18,7 @@ #include "velox/dwio/common/FilterNode.h" #include "velox/dwio/common/MetricsLog.h" +#include "velox/dwio/common/ScanSpec.h" #include "velox/dwio/common/TypeWithId.h" namespace facebook::velox::dwio::common { @@ -133,8 +134,8 @@ class ColumnSelector { checkSelectColNonDuplicate(fileColumnNamesReadAsLowerCase); } - // set a specific node to read state - // only means we only enable exact the node only. + /// Sets a specific node to read state. + /// 'only' means we enable exactly that node and nothing else. void setRead(const FilterTypePtr& node, bool only = false); /** @@ -144,7 +145,8 @@ class ColumnSelector { * @return the id in the tree */ const FilterTypePtr& getNode(size_t id) const { - DWIO_ENSURE(inRange(id), "node is out of range"); + VELOX_CHECK( + inRange(id), "node: {} is out of range of {}", id, nodes_.size()); return nodes_[id]; } @@ -251,23 +253,23 @@ class ColumnSelector { return FilterType::getInvalid(); } - // build selected schema based on current filter + /// Builds selected schema based on current filter. std::shared_ptr buildSelected() const; - // build selected schema based on current filter and reorder columns according - // to what filter specifies + /// Builds selected schema based on current filter and reorders columns + /// according to what the filter specifies. std::shared_ptr buildSelectedReordered() const; - // build a column filter out of filter tree - // This only returns top columns today and can be extended to node level + /// Builds a column filter out of the filter tree. + /// This only returns top columns today and can be extended to node level. const ColumnFilter& getProjection() const; - // a filter lambda function accept column index for query + /// A filter lambda function that accepts a column index for query. std::function getFilter() const { return [this](uint64_t column) { return shouldReadColumn(column); }; } - // this is essentially the effective schema when column selector was built + /// This is essentially the effective schema when the column selector was + /// built. bool hasSchema() const { return schema_ != nullptr; } @@ -287,11 +289,15 @@ class ColumnSelector { const std::vector& keys, const std::vector& values); - // Create a file selector based on a logic selector and disk schema + /// Creates a file selector based on a logic selector and disk schema.
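fromScanSpec above goes the other way: each non-constant ScanSpec child becomes a selector name, with any flat-map feature selection appended as '#[k1,k2]'. A sketch of that name building with simplified stand-in types (Child here is not the Velox ScanSpec):

```cpp
#include <iostream>
#include <string>
#include <vector>

// Simplified stand-in for a ScanSpec child.
struct Child {
  std::string fieldName;
  bool isConstant{false};
  std::vector<std::string> flatMapFeatures;  // Selected flat-map keys, if any.
};

// Mirrors fromScanSpec's name building: skip constants, append the flat-map
// feature selection as "#[k1,k2]".
std::vector<std::string> selectorNames(const std::vector<Child>& children) {
  std::vector<std::string> names;
  for (const auto& child : children) {
    if (child.isConstant) {
      continue;
    }
    std::string name = child.fieldName;
    if (!child.flatMapFeatures.empty()) {
      name += "#[";
      for (size_t i = 0; i < child.flatMapFeatures.size(); ++i) {
        name += (i == 0 ? "" : ",");
        name += child.flatMapFeatures[i];
      }
      name += ']';
    }
    names.push_back(std::move(name));
  }
  return names;
}

int main() {
  const std::vector<Child> children{
      {"id"}, {"ds", /*isConstant=*/true}, {"features", false, {"7", "9"}}};
  for (const auto& n : selectorNames(children)) {
    std::cout << n << '\n';  // id, then features#[7,9]
  }
}
```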
static ColumnSelector apply( const std::shared_ptr& origin, const std::shared_ptr& fileSchema); + static std::shared_ptr fromScanSpec( + const velox::common::ScanSpec& spec, + const RowTypePtr& rowType); + private: // visit the tree with disk type static void copy( diff --git a/velox/dwio/common/ColumnVisitors.h b/velox/dwio/common/ColumnVisitors.h index 3ac3d5e219f74..346a44bf69e1d 100644 --- a/velox/dwio/common/ColumnVisitors.h +++ b/velox/dwio/common/ColumnVisitors.h @@ -49,29 +49,32 @@ struct DropValues { } }; -template -struct ExtractToReader { +class ExtractToReader { + public: using HookType = dwio::common::NoHook; static constexpr bool kSkipNulls = false; - explicit ExtractToReader(TReader* readerIn) : reader(readerIn) {} + explicit ExtractToReader(SelectiveColumnReader* reader) : reader_(reader) {} bool acceptsNulls() const { return true; } template - void addNull(vector_size_t rowIndex); + void addNull(vector_size_t /*rowIndex*/) { + reader_->template addNull(); + } template void addValue(vector_size_t /*rowIndex*/, V value) { - reader->addValue(value); + reader_->addValue(value); } - TReader* reader; - dwio::common::NoHook& hook() { return noHook(); } + + private: + SelectiveColumnReader* const reader_; }; template @@ -94,7 +97,7 @@ class ExtractToHook { template void addValue(vector_size_t rowIndex, V value) { - hook_.addValue(rowIndex, &value); + hook_.addValueTyped(rowIndex, value); } auto& hook() { @@ -123,7 +126,7 @@ class ExtractToGenericHook { template void addValue(vector_size_t rowIndex, V value) { - hook_->addValue(rowIndex, &value); + hook_->addValueTyped(rowIndex, value); } ValueHook& hook() { @@ -150,6 +153,11 @@ class ColumnVisitor { using DataType = T; static constexpr bool dense = isDense; static constexpr bool kHasBulkPath = true; + static constexpr bool kHasFilter = + !std::is_same_v; + static constexpr bool kHasHook = !std::is_same_v; + static constexpr bool kFilterOnly = std::is_same_v; + ColumnVisitor( TFilter& filter, SelectiveColumnReader* reader, @@ -269,7 +277,7 @@ class ColumnVisitor { } if (++rowIndex_ >= numRows_) { atEnd = true; - return rows_[numRows_ - 1] - previous; + return rowAt(numRows_ - 1) - previous; } if (TFilter::deterministic && isDense) { return 0; @@ -301,12 +309,12 @@ class ColumnVisitor { if (isDense) { return 0; } - return currentRow() - rows_[rowIndex_ - 1] - 1; + return currentRow() - rowAt(rowIndex_ - 1) - 1; } FOLLY_ALWAYS_INLINE vector_size_t process(T value, bool& atEnd) { if (!TFilter::deterministic) { - auto previous = currentRow(); + const auto previous = currentRow(); if (velox::common::applyFilter(filter_, value)) { filterPassed(value); } else { @@ -314,10 +322,11 @@ class ColumnVisitor { } if (++rowIndex_ >= numRows_) { atEnd = true; - return rows_[numRows_ - 1] - previous; + return rowAt(numRows_ - 1) - previous; } return currentRow() - previous - 1; } + // The filter passes or fails and we go to the next row if any. if (velox::common::applyFilter(filter_, value)) { filterPassed(value); @@ -331,7 +340,7 @@ class ColumnVisitor { if (isDense) { return 0; } - return currentRow() - rows_[rowIndex_ - 1] - 1; + return currentRow() - rowAt(rowIndex_ - 1) - 1; } // Returns space for 'size' items of T for a scan to fill. 
The scan @@ -341,26 +350,34 @@ class ColumnVisitor { return reader_->mutableValues(size); } - int32_t numRows() const { - return reader_->numRows(); - } - SelectiveColumnReader& reader() const { return *reader_; } - inline vector_size_t rowAt(vector_size_t index) { + inline vector_size_t rowAt(vector_size_t index) const { if (isDense) { return index; } return rows_[index]; } - bool atEnd() { + vector_size_t rowIndex() const { + return rowIndex_; + } + + void setRowIndex(vector_size_t index) { + rowIndex_ = index; + } + + void addRowIndex(vector_size_t size) { + rowIndex_ += size; + } + + bool atEnd() const { return rowIndex_ >= numRows_; } - vector_size_t currentRow() { + vector_size_t currentRow() const { if (isDense) { return rowIndex_; } @@ -371,7 +388,7 @@ class ColumnVisitor { return rows_; } - vector_size_t numRows() { + vector_size_t numRows() const { return numRows_; } @@ -402,6 +419,10 @@ class ColumnVisitor { return reader_->mutableOutputRows(size); } + int32_t numValuesBias() const { + return numValuesBias_; + } + void setNumValuesBias(int32_t bias) { numValuesBias_ = bias; } @@ -413,6 +434,14 @@ class ColumnVisitor { } } + void addNumValues(int size) { + auto numValues = reader_->numValues() + size; + reader_->setNumValues(numValues); + if constexpr (kHasFilter) { + reader_->setNumRows(numValues); + } + } + ExtractValues extractValues() const { return values_; } @@ -456,8 +485,8 @@ class ColumnVisitor { StringDictionaryColumnVisitor toStringDictionaryColumnVisitor(); - // Use for replacing *coall rows with non-null rows for fast path with - // processRun and processRle. + // Use for replacing all rows with non-null rows for fast path with processRun + // and processRle. void setRows(folly::Range newRows) { rows_ = newRows.data(); numRows_ = newRows.size(); @@ -477,8 +506,8 @@ class ColumnVisitor { template FOLLY_ALWAYS_INLINE void ColumnVisitor::filterFailed() { - auto preceding = filter_.getPrecedingPositionsToFail(); - auto succeeding = filter_.getSucceedingPositionsToFail(); + const auto preceding = filter_.getPrecedingPositionsToFail(); + const auto succeeding = filter_.getSucceedingPositionsToFail(); if (preceding) { reader_->dropResults(preceding); } @@ -490,12 +519,12 @@ ColumnVisitor::filterFailed() { template inline void ColumnVisitor::addResult( T value) { - values_.addValue(rowIndex_, value); + values_.addValue(rowIndex_ + numValuesBias_, value); } template inline void ColumnVisitor::addNull() { - values_.template addNull(rowIndex_); + values_.template addNull(rowIndex_ + numValuesBias_); } template @@ -504,12 +533,6 @@ inline void ColumnVisitor::addOutputRow( reader_->addOutputRow(row); } -template -template -void ExtractToReader::addNull(vector_size_t /*rowIndex*/) { - reader->template addNull(); -} - enum FilterResult { kUnknown = 0x40, kSuccess = 0x80, kFailure = 0 }; namespace detail { @@ -699,18 +722,18 @@ class DictionaryColumnVisitor DictionaryColumnVisitor( TFilter& filter, SelectiveColumnReader* reader, - RowSet rows, + const RowSet& rows, ExtractValues values) : ColumnVisitor( filter, reader, rows, values), - state_(reader->scanState().rawState), width_( reader->fileType().type()->kind() == TypeKind::BIGINT ? 8 : reader->fileType().type()->kind() == TypeKind::INTEGER ? 
4 - : 2) {} + : 2), + state_(reader->scanState().rawState) {} FOLLY_ALWAYS_INLINE bool isInDict() { if (inDict()) { @@ -735,10 +758,11 @@ class DictionaryColumnVisitor } return super::process(signedValue, atEnd); } - vector_size_t previous = + + const vector_size_t previous = isDense && TFilter::deterministic ? 0 : super::currentRow(); - T valueInDictionary = dict()[value]; - if (std::is_same_v) { + const T valueInDictionary = dict()[value]; + if constexpr (!hasFilter()) { super::filterPassed(valueInDictionary); } else { // check the dictionary cache @@ -763,6 +787,7 @@ class DictionaryColumnVisitor } } } + if (++super::rowIndex_ >= super::numRows_) { atEnd = true; return (isDense && TFilter::deterministic) @@ -793,30 +818,43 @@ class DictionaryColumnVisitor T* values, int32_t& numValues) { DCHECK_EQ(input, values + numValues); - if (!hasFilter) { + if constexpr (!DictionaryColumnVisitor::hasFilter()) { if (hasHook) { translateByDict(input, numInput, values); super::values_.hook().addValues( scatter ? scatterRows + super::rowIndex_ - : velox::iota(super::numRows_, super::innerNonNullRows()) + + : velox::iota( + super::numRows_, + super::innerNonNullRows(), + super::numValuesBias_) + super::rowIndex_, values, - numInput, - sizeof(T)); + numInput); super::rowIndex_ += numInput; return; } - if (inDict()) { - translateScatter( - input, numInput, scatterRows, numValues, values); + if constexpr (std::is_same_v) { + auto* begin = (scatter ? scatterRows : super::rows_) + super::rowIndex_; + std::copy(begin, begin + numInput, filterHits + numValues); + if constexpr (!super::kFilterOnly) { + translateByDict(input, numInput, values + numValues); + } + numValues += numInput; } else { - translateScatter( - input, numInput, scatterRows, numValues, values); + if (inDict()) { + translateScatter( + input, numInput, scatterRows, numValues, values); + } else { + translateScatter( + input, numInput, scatterRows, numValues, values); + } + numValues = scatter ? scatterRows[super::rowIndex_ + numInput - 1] + 1 + : numValues + numInput; } super::rowIndex_ += numInput; - numValues = scatter ? scatterRows[super::rowIndex_ - 1] + 1 - : numValues + numInput; return; + } else { + static_assert(hasFilter); } // The filter path optionally extracts values but always sets // filterHits. It first loads a vector of indices. It translates @@ -828,8 +866,7 @@ class DictionaryColumnVisitor // written, the passing bitmap is used to load a permute mask to // permute the passing values to the left of a vector register and // write the whole register to the end of 'values' - constexpr bool kFilterOnly = - std::is_same_v; + constexpr bool kFilterOnly = super::kFilterOnly; constexpr int32_t kWidth = xsimd::batch::size; int32_t last = numInput & ~(kWidth - 1); for (auto i = 0; i < numInput; i += kWidth) { @@ -1074,8 +1111,15 @@ class DictionaryColumnVisitor return state_.filterCache; } - RawScanState state_; + static constexpr bool hasFilter() { + // Dictionary values cannot be null. See the explanation in + // `DictionaryValues::hasFilter'. + return !std::is_same_v && + !std::is_same_v; + } + const uint8_t width_; + RawScanState state_; }; template @@ -1124,7 +1168,7 @@ class StringDictionaryColumnVisitor } vector_size_t previous = isDense && TFilter::deterministic ? 
0 : super::currentRow(); - if (std::is_same_v) { + if constexpr (!DictSuper::hasFilter()) { super::filterPassed(index); } else { // check the dictionary cache @@ -1137,7 +1181,7 @@ class StringDictionaryColumnVisitor super::filterFailed(); } else { if (velox::common::applyFilter( - super::filter_, valueInDictionary(value, inStrideDict))) { + super::filter_, valueInDictionary(index))) { super::filterPassed(index); if (TFilter::deterministic) { DictSuper::filterCache()[index] = FilterResult::kSuccess; @@ -1177,25 +1221,30 @@ class StringDictionaryColumnVisitor int32_t& numValues) { DCHECK(input == values + numValues); setByInDict(values + numValues, numInput); - if (!hasFilter) { + if constexpr (!DictSuper::hasFilter()) { if (hasHook) { for (auto i = 0; i < numInput; ++i) { - auto value = input[i]; super::values_.addValue( scatterRows ? scatterRows[super::rowIndex_ + i] : super::rowIndex_ + i, - value); + valueInDictionary(input[i])); } } - DCHECK_EQ(input, values + numValues); - if (scatter) { + if constexpr (std::is_same_v) { + auto* begin = (scatter ? scatterRows : super::rows_) + super::rowIndex_; + std::copy(begin, begin + numInput, filterHits + numValues); + numValues += numInput; + } else if constexpr (scatter) { dwio::common::scatterDense( input, scatterRows + super::rowIndex_, numInput, values); + numValues = scatterRows[super::rowIndex_ + numInput - 1] + 1; + } else { + numValues += numInput; } - numValues = scatter ? scatterRows[super::rowIndex_ + numInput - 1] + 1 - : numValues + numInput; super::rowIndex_ += numInput; return; + } else { + static_assert(hasFilter); } constexpr bool filterOnly = std::is_same_v; @@ -1223,16 +1272,7 @@ class StringDictionaryColumnVisitor while (bits) { int index = bits::getAndClearLastSetBit(bits); int32_t value = input[i + index]; - bool result; - if (value >= DictSuper::dictionarySize()) { - result = applyFilter( - super::filter_, - valueInDictionary(value - DictSuper::dictionarySize(), true)); - } else { - result = - applyFilter(super::filter_, valueInDictionary(value, false)); - } - if (result) { + if (applyFilter(super::filter_, valueInDictionary(value))) { DictSuper::filterCache()[value] = FilterResult::kSuccess; passed |= 1 << index; } else { @@ -1312,67 +1352,17 @@ class StringDictionaryColumnVisitor } } - folly::StringPiece valueInDictionary(int64_t index, bool inStrideDict) { - if (inStrideDict) { - return folly::StringPiece(reinterpret_cast( - DictSuper::state_.dictionary2.values)[index]); + folly::StringPiece valueInDictionary(int64_t index) { + auto stripeDictSize = DictSuper::state_.dictionary.numValues; + if (index < stripeDictSize) { + return reinterpret_cast( + DictSuper::state_.dictionary.values)[index]; } - return folly::StringPiece(reinterpret_cast( - DictSuper::state_.dictionary.values)[index]); + return reinterpret_cast( + DictSuper::state_.dictionary2.values)[index - stripeDictSize]; } }; -class ExtractStringDictionaryToGenericHook { - public: - static constexpr bool kSkipNulls = true; - using HookType = ValueHook; - - ExtractStringDictionaryToGenericHook( - ValueHook* hook, - RowSet rows, - RawScanState state) - - : hook_(hook), rows_(rows), state_(state) {} - - bool acceptsNulls() { - return hook_->acceptsNulls(); - } - - template - void addNull(vector_size_t rowIndex) { - hook_->addNull(rowIndex); - } - - void addValue(vector_size_t rowIndex, int32_t value) { - // We take the string from the stripe or stride dictionary - // according to the index. Stride dictionary indices are offset up - // by the stripe dict size. 
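The unified valueInDictionary(index) above treats the stripe and stride dictionaries as a single index space: indices below the stripe dictionary size resolve in the stripe dictionary, the rest in 'dictionary2' after subtracting that size. A toy version of the two-tier lookup:

```cpp
#include <cassert>
#include <string>
#include <vector>

// Two dictionaries addressed through one index space: stride-dictionary
// indices are offset up by the stripe dictionary size.
struct TwoTierDict {
  std::vector<std::string> stripe;
  std::vector<std::string> stride;

  const std::string& at(size_t index) const {
    return index < stripe.size() ? stripe[index]
                                 : stride[index - stripe.size()];
  }
};

int main() {
  const TwoTierDict dict{{"a", "b", "c"}, {"x", "y"}};
  assert(dict.at(1) == "b");  // Stripe dictionary.
  assert(dict.at(4) == "y");  // Stride dictionary: 4 - 3 = index 1.
}
```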
- if (value < dictionarySize()) { - auto view = folly::StringPiece( - reinterpret_cast(state_.dictionary.values)[value]); - hook_->addValue(rowIndex, &view); - } else { - VELOX_DCHECK(state_.inDictionary); - auto view = folly::StringPiece(reinterpret_cast( - state_.dictionary2.values)[value - dictionarySize()]); - hook_->addValue(rowIndex, &view); - } - } - - ValueHook& hook() { - return *hook_; - } - - private: - int32_t dictionarySize() const { - return state_.dictionary.numValues; - } - - ValueHook* const hook_; - RowSet const rows_; - RawScanState state_; -}; - template class DirectRleColumnVisitor : public ColumnVisitor { @@ -1390,13 +1380,6 @@ class DirectRleColumnVisitor rows, values) {} - // Use for replacing all rows with non-null rows for fast path with - // processRun and processRle. - void setRows(folly::Range newRows) { - super::rows_ = newRows.data(); - super::numRows_ = newRows.size(); - } - // Processes 'numInput' T's in 'input'. Sets 'values' and // 'numValues'' to the resulting values. 'scatterRows' may be // non-null if there is no filter and the decoded values should be @@ -1479,4 +1462,115 @@ class DirectRleColumnVisitor } }; +template +class StringColumnReadWithVisitorHelper { + public: + StringColumnReadWithVisitorHelper(SelectiveColumnReader& reader, RowSet rows) + : reader_(reader), rows_(rows) {} + + template + auto operator()(F&& readWithVisitor) { + const bool isDense = rows_.back() == rows_.size() - 1; + if (reader_.scanSpec()->keepValues()) { + if (auto* hook = reader_.scanSpec()->valueHook()) { + if (isDense) { + readHelper( + &alwaysTrue(), + ExtractToGenericHook(hook), + std::forward(readWithVisitor)); + } else { + readHelper( + &alwaysTrue(), + ExtractToGenericHook(hook), + std::forward(readWithVisitor)); + } + } else { + if (isDense) { + processFilter( + ExtractToReader(&reader_), std::forward(readWithVisitor)); + } else { + processFilter( + ExtractToReader(&reader_), std::forward(readWithVisitor)); + } + } + } else { + if (isDense) { + processFilter(DropValues(), std::forward(readWithVisitor)); + } else { + processFilter(DropValues(), std::forward(readWithVisitor)); + } + } + } + + private: + template + void readHelper( + velox::common::Filter* filter, + ExtractValues extractValues, + F readWithVisitor) { + readWithVisitor( + ColumnVisitor( + *static_cast(filter), &reader_, rows_, extractValues)); + } + + template + void processFilter(ExtractValues extractValues, F&& readWithVisitor) { + using FilterValueT = + std::conditional_t; + auto* filter = reader_.scanSpec()->filter(); + if (filter == nullptr) { + readHelper( + &alwaysTrue(), extractValues, std::forward(readWithVisitor)); + return; + } + switch (filter->kind()) { + case velox::common::FilterKind::kAlwaysTrue: + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + break; + case velox::common::FilterKind::kIsNull: + if constexpr (kEncodingHasNulls) { + reader_.filterNulls( + rows_, true, !std::is_same_v); + } else { + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + } + break; + case velox::common::FilterKind::kIsNotNull: + if constexpr ( + kEncodingHasNulls && std::is_same_v) { + reader_.filterNulls(rows_, false, false); + } else { + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + } + break; + case velox::common::FilterKind::kBytesRange: + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + break; + case velox::common::FilterKind::kNegatedBytesRange: + readHelper( + filter, extractValues, 
std::forward(readWithVisitor)); + break; + case velox::common::FilterKind::kBytesValues: + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + break; + case velox::common::FilterKind::kNegatedBytesValues: + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + break; + default: + readHelper( + filter, extractValues, std::forward(readWithVisitor)); + break; + } + } + + SelectiveColumnReader& reader_; + const RowSet rows_; +}; + } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/DecoderUtil.cpp b/velox/dwio/common/DecoderUtil.cpp index 7cdf943db3902..7b47fb2b9adbd 100644 --- a/velox/dwio/common/DecoderUtil.cpp +++ b/velox/dwio/common/DecoderUtil.cpp @@ -54,7 +54,7 @@ bool nonNullRowsFromSparse( RowSet rows, raw_vector& innerRows, raw_vector& outerRows, - uint64_t* resultNulls, + uint8_t* resultNullBytes, int32_t& tailSkip) { constexpr int32_t kStep = xsimd::batch::size; bool anyNull = false; @@ -66,7 +66,6 @@ bool nonNullRowsFromSparse( int32_t numNulls = 0; int32_t numInner = 0; int32_t lastNonNull = -1; - auto resultNullBytes = reinterpret_cast(resultNulls); // Returns the index in terms of non-null rows for // 'rows[i]'. Assumes that i is increasing between calls. @@ -92,16 +91,7 @@ bool nonNullRowsFromSparse( if (isDense(rows.data() + i, width)) { uint16_t flags = load8Bits(nulls, rows[i]) & widthMask; if (outputNulls) { - if constexpr (kStep == 8) { - resultNullBytes[i / 8] = flags; - } else { - VELOX_DCHECK_EQ(kStep, 4); - if (i % 8 == 0) { - resultNullBytes[i / 8] = flags; - } else { - resultNullBytes[i / 8] |= flags << 4; - } - } + bits::storeBitsToByte(flags, resultNullBytes, i); anyNull |= flags != widthMask; } if (!flags) { @@ -131,16 +121,7 @@ bool nonNullRowsFromSparse( auto next8Rows = xsimd::load_unaligned(rows.data() + i); uint16_t flags = simd::gather8Bits(nulls, next8Rows, width); if (outputNulls) { - if constexpr (kStep == 8) { - resultNullBytes[i / 8] = flags; - } else { - VELOX_DCHECK_EQ(kStep, 4); - if (i % 8 == 0) { - resultNullBytes[i / 8] = flags; - } else { - resultNullBytes[i / 8] |= flags << 4; - } - } + bits::storeBitsToByte(flags, resultNullBytes, i); anyNull |= flags != widthMask; } if (!flags) { @@ -176,7 +157,7 @@ template bool nonNullRowsFromSparse( RowSet rows, raw_vector& innerRows, raw_vector& outerRows, - uint64_t* resultNulls, + uint8_t* resultNullBytes, int32_t& tailSkip); template bool nonNullRowsFromSparse( @@ -184,7 +165,7 @@ template bool nonNullRowsFromSparse( RowSet rows, raw_vector& innerRows, raw_vector& outerRows, - uint64_t* resultNulls, + uint8_t* resultNullBytes, int32_t& tailSkip); template bool nonNullRowsFromSparse( @@ -192,7 +173,7 @@ template bool nonNullRowsFromSparse( RowSet rows, raw_vector& innerRows, raw_vector& outerRows, - uint64_t* resultNulls, + uint8_t* resultNullBytes, int32_t& tailSkip); template diff --git a/velox/dwio/common/DecoderUtil.h b/velox/dwio/common/DecoderUtil.h index dd393e452b7ad..34822647fc3b7 100644 --- a/velox/dwio/common/DecoderUtil.h +++ b/velox/dwio/common/DecoderUtil.h @@ -180,7 +180,7 @@ void fixedWidthScan( [&](T value, int32_t rowIndex) { if (!hasFilter) { if (hasHook) { - hook.addValue(scatterRows[rowIndex], &value); + hook.addValueTyped(scatterRows[rowIndex], value); } else { auto targetRow = scatter ? 
scatterRows[rowIndex] : rowIndex; rawValues[targetRow] = value; @@ -214,8 +214,7 @@ void fixedWidthScan( hook.addValues( scatterRows + rowIndex, buffer + firstRow - rowOffset, - kStep, - sizeof(T)); + kStep); } else { if (scatter) { scatterDense( @@ -266,7 +265,9 @@ void fixedWidthScan( if (!hasFilter) { if (hasHook) { hook.addValues( - scatterRows + rowIndex, &values, kWidth, sizeof(T)); + scatterRows + rowIndex, + reinterpret_cast(&values), + kWidth); } else { if (scatter) { scatterDense( @@ -321,8 +322,9 @@ void fixedWidthScan( } if (!hasFilter) { if (hasHook) { - hook.addValues( - scatterRows + rowIndex, &values, width, sizeof(T)); + T values2[values.size]; + values.store_unaligned(values2); + hook.addValues(scatterRows + rowIndex, values2, width); } else { if (scatter) { scatterDense( @@ -385,15 +387,28 @@ bool nonNullRowsFromSparse( RowSet rows, raw_vector& innerRows, raw_vector& outerRows, - uint64_t* resultNulls, + uint8_t* resultNulls, int32_t& tailSkip); +template +bool nonNullRowsFromSparse( + const uint64_t* nulls, + RowSet rows, + raw_vector& innerRows, + raw_vector& outerRows, + uint64_t* resultNulls, + int32_t& tailSkip) { + auto* resultNullBytes = reinterpret_cast(resultNulls); + return nonNullRowsFromSparse( + nulls, rows, innerRows, outerRows, resultNullBytes, tailSkip); +} + // See SelectiveColumnReader::useBulkPath. template bool useFastPath(Visitor& visitor) { - return (!std::is_same_v)&&process:: - hasAvx2() && - Visitor::FilterType::deterministic && Visitor::kHasBulkPath && + return (!std::is_same_v) && + process::hasAvx2() && Visitor::FilterType::deterministic && + Visitor::kHasBulkPath && (std:: is_same_v || !hasNulls || !visitor.allowNulls()) && @@ -401,6 +416,12 @@ bool useFastPath(Visitor& visitor) { Visitor::HookType::kSkipNulls); } +template +bool useFastPath(Visitor& visitor, bool hasNulls) { + return hasNulls ? useFastPath(visitor) + : useFastPath(visitor); +} + // Scatters 'numValues' elements of 'data' starting at data[sourceBegin] to // indices given starting with target[targetBegin]. The scatter is done from // last to first so as not to overwrite source data when copying from lower to @@ -454,7 +475,7 @@ void processFixedWidthRun( constexpr bool hasHook = !std::is_same_v; if (!hasFilter) { if (hasHook) { - hook.addValues(scatterRows + rowIndex, values, rows.size(), sizeof(T)); + hook.addValues(scatterRows + rowIndex, values, rows.size()); } else if (scatter) { scatterNonNulls(rowIndex, numInput, numValues, scatterRows, values); numValues = scatterRows[rowIndex + numInput - 1] + 1; diff --git a/velox/dwio/common/DirectBufferedInput.cpp b/velox/dwio/common/DirectBufferedInput.cpp new file mode 100644 index 0000000000000..caf6678580809 --- /dev/null +++ b/velox/dwio/common/DirectBufferedInput.cpp @@ -0,0 +1,326 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/dwio/common/DirectBufferedInput.h" +#include "velox/common/memory/Allocation.h" +#include "velox/common/process/TraceContext.h" +#include "velox/dwio/common/DirectInputStream.h" + +DECLARE_int32(cache_prefetch_min_pct); + +using ::facebook::velox::common::Region; + +namespace facebook::velox::dwio::common { + +using cache::CoalescedLoad; +using cache::ScanTracker; +using cache::TrackingId; + +std::unique_ptr DirectBufferedInput::enqueue( + Region region, + const StreamIdentifier* sid = nullptr) { + if (!coalescedLoads_.empty()) { + // Results of the previous load are no longer available here. + coalescedLoads_.clear(); + streamToCoalescedLoad_.wlock()->clear(); + } + if (region.length == 0) { + return std::make_unique( + static_cast(nullptr), 0); + } + + TrackingId id; + if (sid != nullptr) { + id = TrackingId(sid->getId()); + } + VELOX_CHECK_LE(region.offset + region.length, fileSize_); + requests_.emplace_back(region, id); + if (tracker_) { + tracker_->recordReference(id, region.length, fileNum_, groupId_); + } + auto stream = std::make_unique( + this, + ioStats_.get(), + region, + input_, + fileNum_, + tracker_, + id, + groupId_, + options_.loadQuantum()); + requests_.back().stream = stream.get(); + return stream; +} + +bool DirectBufferedInput::isBuffered(uint64_t /*offset*/, uint64_t /*length*/) + const { + return false; +} + +bool DirectBufferedInput::shouldPreload(int32_t numPages) { + return false; +} + +namespace { + +// True if the percentage is high enough to warrant prefetch. +bool isPrefetchablePct(int32_t pct) { + return pct >= FLAGS_cache_prefetch_min_pct; +} + +int32_t adjustedReadPct(const cache::TrackingData& trackingData) { + // When called, there will be one more reference than read, since references + // are counted before reading. + if (trackingData.numReferences < 2) { + return 0; + } + return (100 * trackingData.numReads) / (trackingData.numReferences - 1); +} +} // namespace + +void DirectBufferedInput::load(const LogType /*unused*/) { + // After load, new requests cannot be merged into pre-load ones. + auto requests = std::move(requests_); + + // We loop over access frequency buckets. For example readPct 80 + // will get all streams where 80% or more of the referenced data is + // actually loaded. + for (auto readPct : std::vector{80, 50, 20, 0}) { + std::vector storageLoad; + for (auto& request : requests) { + if (request.processed) { + continue; + } + cache::TrackingData trackingData; + const bool prefetchAnyway = request.trackingId.empty() || + request.trackingId.id() == StreamIdentifier::sequentialFile().id_; + if (!prefetchAnyway && tracker_) { + trackingData = tracker_->trackingData(request.trackingId); + } + if (prefetchAnyway || adjustedReadPct(trackingData) >= readPct) { + request.processed = true; + storageLoad.push_back(&request); + } + } + makeLoads(std::move(storageLoad), isPrefetchablePct(readPct)); + } +} + +void DirectBufferedInput::makeLoads( + std::vector requests, + bool prefetch) { + if (requests.empty() || (requests.size() < 2 && !prefetch)) { + // A single request has no other requests to coalesce with and is not + // eligible to prefetch. This will be loaded by itself on first use. + return; + } + const int32_t maxDistance = options_.maxCoalesceDistance(); + const auto loadQuantum = options_.loadQuantum(); + // If the access is dense, coalesce into large reads for best throughput; if + // sparse, coalesce only up to the load quantum to reduce overread. Not all + // sparse access is correlated. + const auto maxCoalesceBytes = prefetch ?
options_.maxCoalesceBytes() : loadQuantum; + std::sort( + requests.begin(), + requests.end(), + [&](const LoadRequest* left, const LoadRequest* right) { + return left->region.offset < right->region.offset; + }); + + // Combine adjacent short reads. + int32_t numNewLoads = 0; + int64_t coalescedBytes = 0; + coalesceIo( + requests, + maxDistance, + // Break batches up. Better to load more short ones in parallel. + 1000, // limit coalesce by size, not count. + [&](int32_t index) { return requests[index]->region.offset; }, + [&](int32_t index) -> int32_t { + auto size = requests[index]->region.length; + if (size > loadQuantum) { + coalescedBytes += loadQuantum; + return loadQuantum; + } + coalescedBytes += size; + return size; + }, + [&](int32_t index) { + if (coalescedBytes > maxCoalesceBytes) { + coalescedBytes = 0; + return kNoCoalesce; + } + return 1; + }, + [&](LoadRequest* request, std::vector& ranges) { + ranges.push_back(request); + }, + [&](int32_t /*gap*/, std::vector /*ranges*/) { /*no op*/ }, + [&](const std::vector& /*requests*/, + int32_t /*begin*/, + int32_t /*end*/, + uint64_t /*offset*/, + const std::vector& ranges) { + ++numNewLoads; + readRegion(ranges, prefetch); + }); + if (prefetch && executor_) { + for (auto i = 0; i < coalescedLoads_.size(); ++i) { + auto& load = coalescedLoads_[i]; + if (load->state() == CoalescedLoad::State::kPlanned) { + executor_->add([pendingLoad = load]() { + process::TraceContext trace("Read Ahead"); + pendingLoad->loadOrFuture(nullptr); + }); + } + } + } +} + +void DirectBufferedInput::readRegion( + std::vector requests, + bool prefetch) { + if (requests.empty() || (requests.size() == 1 && !prefetch)) { + return; + } + auto load = std::make_shared( + input_, ioStats_, groupId_, requests, *pool_, options_.loadQuantum()); + coalescedLoads_.push_back(load); + streamToCoalescedLoad_.withWLock([&](auto& loads) { + for (auto& request : requests) { + loads[request->stream] = load; + } + }); +} + +std::shared_ptr DirectBufferedInput::coalescedLoad( + const SeekableInputStream* stream) { + return streamToCoalescedLoad_.withWLock( + [&](auto& loads) -> std::shared_ptr { + auto it = loads.find(stream); + if (it == loads.end()) { + return nullptr; + } + auto load = std::move(it->second); + loads.erase(it); + return load; + }); +} + +std::unique_ptr DirectBufferedInput::read( + uint64_t offset, + uint64_t length, + LogType /*logType*/) const { + VELOX_CHECK_LE(offset + length, fileSize_); + return std::make_unique( + const_cast(this), + ioStats_.get(), + Region{offset, length}, + input_, + fileNum_, + nullptr, + TrackingId(), + 0, + options_.loadQuantum()); +} + +namespace { +void appendRanges( + memory::Allocation& allocation, + size_t length, + std::vector>& buffers) { + uint64_t offsetInRuns = 0; + for (int i = 0; i < allocation.numRuns(); ++i) { + auto run = allocation.runAt(i); + const uint64_t bytes = memory::AllocationTraits::pageBytes(run.numPages()); + const uint64_t readSize = std::min(bytes, length - offsetInRuns); + buffers.push_back(folly::Range(run.data(), readSize)); + offsetInRuns += readSize; + } +} +} // namespace + +std::vector DirectCoalescedLoad::loadData(bool prefetch) { + std::vector> buffers; + int64_t lastEnd = requests_[0].region.offset; + int64_t size = 0; + int64_t overread = 0; + + for (auto& request : requests_) { + const auto& region = request.region; + if (region.offset > lastEnd) { + buffers.push_back(folly::Range( + nullptr, + reinterpret_cast( + static_cast(region.offset - lastEnd)))); + overread += buffers.back().size(); + } + 
+ if (region.length > DirectBufferedInput::kTinySize) { + if (&request != &requests_.back()) { + // Case where request is a little over quantum but is followed by + // another within the max distance. Coalesces and allows reading the + // region of max quantum + max distance in one piece. + request.loadSize = region.length; + } else { + request.loadSize = std::min(region.length, loadQuantum_); + } + const auto numPages = + memory::AllocationTraits::numPages(request.loadSize); + pool_.allocateNonContiguous(numPages, request.data); + appendRanges(request.data, request.loadSize, buffers); + } else { + request.loadSize = region.length; + request.tinyData.resize(region.length); + buffers.push_back(folly::Range(request.tinyData.data(), region.length)); + } + lastEnd = region.offset + request.loadSize; + size += request.loadSize; + } + + uint64_t usecs = 0; + { + MicrosecondTimer timer(&usecs); + input_->read(buffers, requests_[0].region.offset, LogType::FILE); + } + + ioStats_->read().increment(size + overread); + ioStats_->incRawBytesRead(size); + ioStats_->incTotalScanTime(usecs * 1'000); + ioStats_->queryThreadIoLatency().increment(usecs); + ioStats_->incRawOverreadBytes(overread); + if (prefetch) { + ioStats_->prefetch().increment(size + overread); + } + return {}; +} + +int32_t DirectCoalescedLoad::getData( + int64_t offset, + memory::Allocation& data, + std::string& tinyData) { + for (auto& request : requests_) { + if (request.region.offset == offset) { + data = std::move(request.data); + tinyData = std::move(request.tinyData); + return request.loadSize; + } + } + return 0; +} + +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/DirectBufferedInput.h b/velox/dwio/common/DirectBufferedInput.h new file mode 100644 index 0000000000000..561b2d795f595 --- /dev/null +++ b/velox/dwio/common/DirectBufferedInput.h @@ -0,0 +1,236 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "velox/common/caching/AsyncDataCache.h" +#include "velox/common/caching/FileGroupStats.h" +#include "velox/common/caching/ScanTracker.h" +#include "velox/common/io/IoStatistics.h" +#include "velox/common/io/Options.h" +#include "velox/dwio/common/BufferedInput.h" +#include "velox/dwio/common/CacheInputStream.h" +#include "velox/dwio/common/InputStream.h" + +namespace facebook::velox::dwio::common { + +struct LoadRequest { + LoadRequest() = default; + LoadRequest(velox::common::Region& _region, cache::TrackingId _trackingId) + : region(_region), trackingId(_trackingId) {} + + velox::common::Region region; + cache::TrackingId trackingId; + bool processed{false}; + + const SeekableInputStream* stream; + + /// Buffers to be handed to 'stream' after load. + memory::Allocation data; + std::string tinyData; + /// Number of bytes in 'data/tinyData'. + int32_t loadSize{0}; +}; + +/// Represents planned loads that should be performed as a single IO. 
+class DirectCoalescedLoad : public cache::CoalescedLoad { + public: + DirectCoalescedLoad( + std::shared_ptr input, + std::shared_ptr ioStats, + uint64_t groupId, + const std::vector& requests, + memory::MemoryPool& pool, + int32_t loadQuantum) + : CoalescedLoad({}, {}), + ioStats_(ioStats), + groupId_(groupId), + input_(std::move(input)), + loadQuantum_(loadQuantum), + pool_(pool) { + requests_.reserve(requests.size()); + for (auto i = 0; i < requests.size(); ++i) { + requests_.push_back(std::move(*requests[i])); + } + }; + + /// Loads the regions. Returns {} since no cache entries are made. The loaded + /// data is retrieved with getData(). + std::vector loadData(bool prefetch) override; + + /// Returns the buffer for 'region' in either 'data' or 'tinyData'. 'region' + /// must match a region given to DirectBufferedInput::enqueue(). + int32_t + getData(int64_t offset, memory::Allocation& data, std::string& tinyData); + + const std::vector& requests() { + return requests_; + } + + int64_t size() const override { + int64_t size = 0; + for (auto& request : requests_) { + size += request.region.length; + } + return size; + } + + private: + const std::shared_ptr ioStats_; + const uint64_t groupId_; + const std::shared_ptr input_; + const int32_t loadQuantum_; + memory::MemoryPool& pool_; + std::vector requests_; +}; + +class DirectBufferedInput : public BufferedInput { + public: + static constexpr int32_t kTinySize = 2'000; + + DirectBufferedInput( + std::shared_ptr readFile, + const MetricsLogPtr& metricsLog, + uint64_t fileNum, + std::shared_ptr tracker, + uint64_t groupId, + std::shared_ptr ioStats, + folly::Executor* executor, + const io::ReaderOptions& readerOptions) + : BufferedInput( + std::move(readFile), + readerOptions.memoryPool(), + metricsLog), + fileNum_(fileNum), + tracker_(std::move(tracker)), + groupId_(groupId), + ioStats_(std::move(ioStats)), + executor_(executor), + fileSize_(input_->getLength()), + options_(readerOptions) {} + + ~DirectBufferedInput() override { + for (auto& load : coalescedLoads_) { + load->cancel(); + } + } + + std::unique_ptr enqueue( + velox::common::Region region, + const StreamIdentifier* sid) override; + + bool supportSyncLoad() const override { + return false; + } + + void load(const LogType /*unused*/) override; + + bool isBuffered(uint64_t offset, uint64_t length) const override; + + bool shouldPreload(int32_t numPages = 0) override; + + bool shouldPrefetchStripes() const override { + return false; + } + + void setNumStripes(int32_t numStripes) override { + auto* stats = tracker_->fileGroupStats(); + if (stats) { + stats->recordFile(fileNum_, groupId_, numStripes); + } + } + + virtual std::unique_ptr clone() const override { + return std::unique_ptr(new DirectBufferedInput( + input_, fileNum_, tracker_, groupId_, ioStats_, executor_, options_)); + } + + memory::MemoryPool* pool() const { + return pool_; + } + + /// Returns the CoalescedLoad that contains the correlated loads for + /// 'stream' or nullptr if none. Returns nullptr on all but first + /// call for 'stream' since the load is to be triggered by the first + /// access. + std::shared_ptr coalescedLoad( + const SeekableInputStream* stream); + + std::unique_ptr + read(uint64_t offset, uint64_t length, LogType logType) const override; + + folly::Executor* executor() const override { + return executor_; + } + + uint64_t nextFetchSize() const override { + VELOX_NYI(); + } + + private: + /// Constructor used by clone(). 
+ DirectBufferedInput( + std::shared_ptr input, + uint64_t fileNum, + std::shared_ptr tracker, + uint64_t groupId, + std::shared_ptr ioStats, + folly::Executor* executor, + const io::ReaderOptions& readerOptions) + : BufferedInput(std::move(input), readerOptions.memoryPool()), + fileNum_(fileNum), + tracker_(std::move(tracker)), + groupId_(groupId), + ioStats_(std::move(ioStats)), + executor_(executor), + fileSize_(input_->getLength()), + options_(readerOptions) {} + + // Sorts requests and makes CoalescedLoads for nearby requests. If 'prefetch' + // is true, starts background loading. + void makeLoads(std::vector requests, bool prefetch); + + // Makes a CoalescedLoad for 'requests' to be read together, coalescing IO if + // appropriate. If 'prefetch' is set, schedules the CoalescedLoad on + // 'executor_'. Links the CoalescedLoad to all DirectInputStreams that it + // covers. + void readRegion(std::vector requests, bool prefetch); + + const uint64_t fileNum_; + const std::shared_ptr tracker_; + const uint64_t groupId_; + const std::shared_ptr ioStats_; + folly::Executor* const executor_; + const uint64_t fileSize_; + + // Regions that are candidates for loading. + std::vector requests_; + + // Maps each stream to the coalesced load covering it. A single load can + // span multiple streams in one IO. + folly::Synchronized>> + streamToCoalescedLoad_; + + // Distinct coalesced loads in 'streamToCoalescedLoad_'. + std::vector> coalescedLoads_; + + io::ReaderOptions options_; +}; + +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/DirectDecoder.cpp b/velox/dwio/common/DirectDecoder.cpp index 8365a5ec21e1c..69a631b2b94d4 100644 --- a/velox/dwio/common/DirectDecoder.cpp +++ b/velox/dwio/common/DirectDecoder.cpp @@ -23,10 +23,11 @@ namespace facebook::velox::dwio::common { template void DirectDecoder::seekToRowGroup( dwio::common::PositionProvider& location) { - // move the input stream - IntDecoder::inputStream->seekToPosition(location); - // force a re-read from the stream - IntDecoder::bufferEnd = IntDecoder::bufferStart; + // Moves the input stream. + IntDecoder::inputStream_->seekToPosition(location); + // Forces a re-read from the stream. 
+ IntDecoder::bufferEnd_ = IntDecoder::bufferStart_; + this->pendingSkip_ = 0; } template void DirectDecoder::seekToRowGroup( @@ -34,20 +35,13 @@ template void DirectDecoder::seekToRowGroup( template void DirectDecoder::seekToRowGroup( dwio::common::PositionProvider& location); -template -void DirectDecoder::skip(uint64_t numValues) { - IntDecoder::skipLongs(numValues); -} - -template void DirectDecoder::skip(uint64_t numValues); -template void DirectDecoder::skip(uint64_t numValues); - template template void DirectDecoder::nextValues( T* data, uint64_t numValues, const uint64_t* nulls) { + skipPending(); uint64_t position = 0; // skipNulls() if (nulls) { @@ -60,7 +54,7 @@ void DirectDecoder::nextValues( // this is gross and very not DRY, but helps avoid branching if (position < numValues) { if (nulls) { - if (!IntDecoder::useVInts) { + if (!IntDecoder::useVInts_) { if constexpr (std::is_same_v) { VELOX_NYI(); } @@ -77,7 +71,7 @@ void DirectDecoder::nextValues( } } } else { - if (!IntDecoder::useVInts) { + if (!IntDecoder::useVInts_) { if constexpr (std::is_same_v) { VELOX_NYI(); } diff --git a/velox/dwio/common/DirectDecoder.h b/velox/dwio/common/DirectDecoder.h index 71831eced02fe..644e41348c45b 100644 --- a/velox/dwio/common/DirectDecoder.h +++ b/velox/dwio/common/DirectDecoder.h @@ -16,7 +16,6 @@ #pragma once -#include "velox/common/base/Nulls.h" #include "velox/dwio/common/DecoderUtil.h" #include "velox/dwio/common/IntDecoder.h" @@ -36,40 +35,27 @@ class DirectDecoder : public IntDecoder { void seekToRowGroup(dwio::common::PositionProvider&) override; - void skip(uint64_t numValues) override; + using IntDecoder::skip; + + void skipPending() final { + const auto toSkip = this->pendingSkip_; + this->pendingSkip_ = 0; + this->skipLongs(toSkip); + } template - void nextValues( - T* FOLLY_NONNULL data, - uint64_t numValues, - const uint64_t* FOLLY_NULLABLE nulls); + void nextValues(T* data, uint64_t numValues, const uint64_t* nulls); - void next( - int64_t* FOLLY_NONNULL data, - uint64_t numValues, - const uint64_t* FOLLY_NULLABLE nulls) override { + void next(int64_t* data, uint64_t numValues, const uint64_t* nulls) override { nextValues(data, numValues, nulls); } - template - inline void skip( - int32_t numValues, - int32_t current, - const uint64_t* FOLLY_NULLABLE nulls) { - if (!numValues) { - return; - } - if (hasNulls) { - numValues = bits::countNonNulls(nulls, current, current + numValues); - } - IntDecoder::skipLongsFast(numValues); - } - template void readWithVisitor( - const uint64_t* FOLLY_NULLABLE nulls, + const uint64_t* nulls, Visitor visitor, bool useFastPath = true) { + skipPending(); if constexpr (!std::is_same_v) { if (useFastPath && dwio::common::useFastPath(visitor)) { @@ -77,8 +63,9 @@ class DirectDecoder : public IntDecoder { return; } } + int32_t current = visitor.start(); - skip(current, 0, nulls); + this->template skip(current, 0, nulls); const bool allowNulls = hasNulls && visitor.allowNulls(); for (;;) { bool atEnd = false; @@ -87,7 +74,7 @@ class DirectDecoder : public IntDecoder { if (!allowNulls) { toSkip = visitor.checkAndSkipNulls(nulls, current, atEnd); if (!Visitor::dense) { - skip(toSkip, current, nullptr); + this->template skip(toSkip, current, nullptr); } if (atEnd) { return; @@ -110,10 +97,11 @@ class DirectDecoder : public IntDecoder { } else { toSkip = visitor.process(super::template readInt(), atEnd); } + skip: ++current; if (toSkip) { - skip(toSkip, current, nulls); + this->template skip(toSkip, current, nulls); current += toSkip; } if (atEnd) { @@ 
-140,23 +128,24 @@ class DirectDecoder : public IntDecoder { // Returns a pointer to the next element of 'size' bytes in the // buffer. If the element would straddle buffers, it is copied to // *temp and temp is returned. - const void* FOLLY_NONNULL readFixed(int32_t size, void* FOLLY_NONNULL temp) { - auto ptr = super::bufferStart; - if (ptr && ptr + size <= super::bufferEnd) { - super::bufferStart += size; + const void* readFixed(int32_t size, void* temp) { + skipPending(); + auto ptr = super::bufferStart_; + if (ptr && ptr + size <= super::bufferEnd_) { + super::bufferStart_ += size; return ptr; } readBytes( size, - super::inputStream.get(), + super::inputStream_.get(), temp, - super::bufferStart, - super::bufferEnd); + super::bufferStart_, + super::bufferEnd_); return temp; } template - void fastPath(const uint64_t* FOLLY_NULLABLE nulls, Visitor& visitor) { + void fastPath(const uint64_t* nulls, Visitor& visitor) { using T = typename Visitor::DataType; constexpr bool hasFilter = !std:: @@ -200,18 +189,23 @@ class DirectDecoder : public IntDecoder { visitor.setHasNulls(); } if (innerVector->empty()) { - skip(tailSkip, 0, nullptr); + this->template skip(tailSkip, 0, nullptr); visitor.setAllNull(hasFilter ? 0 : numRows); return; } } - if (super::useVInts) { + if (hasHook && visitor.numValuesBias() > 0) { + for (auto& row : *outerVector) { + row += visitor.numValuesBias(); + } + } + if (super::useVInts_) { if (Visitor::dense) { super::bulkRead(numNonNull, data); } else { super::bulkReadRows(*innerVector, data); } - skip(tailSkip, 0, nullptr); + this->template skip(tailSkip, 0, nullptr); auto dataRows = innerVector ? folly::Range(innerVector->data(), innerVector->size()) : folly::Range(rows, outerVector->size()); @@ -234,15 +228,15 @@ class DirectDecoder : public IntDecoder { visitor.rawValues(numRows), hasFilter ? visitor.outputRows(numRows) : nullptr, numValues, - *super::inputStream, - super::bufferStart, - super::bufferEnd, + *super::inputStream_, + super::bufferStart_, + super::bufferEnd_, visitor.filter(), visitor.hook()); - skip(tailSkip, 0, nullptr); + this->template skip(tailSkip, 0, nullptr); } } else { - if (super::useVInts) { + if (super::useVInts_) { if (Visitor::dense) { super::bulkRead(numRows, visitor.rawValues(numRows)); } else { @@ -255,7 +249,10 @@ class DirectDecoder : public IntDecoder { rowsAsRange, 0, rowsAsRange.size(), - hasHook ? velox::iota(numRows, visitor.innerNonNullRows()) + hasHook ? velox::iota( + numRows, + visitor.innerNonNullRows(), + visitor.numValuesBias()) : nullptr, visitor.rawValues(numRows), hasFilter ? visitor.outputRows(numRows) : nullptr, @@ -265,14 +262,17 @@ class DirectDecoder : public IntDecoder { } else { dwio::common::fixedWidthScan( rowsAsRange, - hasHook ? velox::iota(numRows, visitor.innerNonNullRows()) + hasHook ? velox::iota( + numRows, + visitor.innerNonNullRows(), + visitor.numValuesBias()) : nullptr, visitor.rawValues(numRows), hasFilter ? visitor.outputRows(numRows) : nullptr, numValues, - *super::inputStream, - super::bufferStart, - super::bufferEnd, + *super::inputStream_, + super::bufferStart_, + super::bufferEnd_, visitor.filter(), visitor.hook()); } diff --git a/velox/dwio/common/DirectInputStream.cpp b/velox/dwio/common/DirectInputStream.cpp new file mode 100644 index 0000000000000..3d8c8a13f636c --- /dev/null +++ b/velox/dwio/common/DirectInputStream.cpp @@ -0,0 +1,219 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "velox/common/process/TraceContext.h" +#include "velox/common/time/Timer.h" +#include "velox/dwio/common/DirectBufferedInput.h" +#include "velox/dwio/common/DirectInputStream.h" + +using ::facebook::velox::common::Region; + +namespace facebook::velox::dwio::common { + +using velox::cache::ScanTracker; +using velox::cache::TrackingId; +using velox::memory::MemoryAllocator; + +DirectInputStream::DirectInputStream( + DirectBufferedInput* bufferedInput, + IoStatistics* ioStats, + const Region& region, + std::shared_ptr input, + uint64_t fileNum, + std::shared_ptr tracker, + TrackingId trackingId, + uint64_t groupId, + int32_t loadQuantum) + : bufferedInput_(bufferedInput), + ioStats_(ioStats), + input_(std::move(input)), + region_(region), + fileNum_(fileNum), + tracker_(std::move(tracker)), + trackingId_(trackingId), + groupId_(groupId), + loadQuantum_(loadQuantum) {} + +bool DirectInputStream::Next(const void** buffer, int32_t* size) { + if (offsetInRegion_ >= region_.length) { + *size = 0; + return false; + } + loadPosition(); + + *buffer = reinterpret_cast(run_ + offsetInRun_); + *size = runSize_ - offsetInRun_; + if (offsetInRegion_ + *size > region_.length) { + *size = region_.length - offsetInRegion_; + } + offsetInRun_ += *size; + offsetInRegion_ += *size; + + if (tracker_) { + tracker_->recordRead(trackingId_, *size, fileNum_, groupId_); + } + return true; +} + +void DirectInputStream::BackUp(int32_t count) { + VELOX_CHECK_GE(count, 0, "can't backup negative distances"); + + const uint64_t unsignedCount = static_cast(count); + VELOX_CHECK_LE(unsignedCount, offsetInRun_, "Can't backup that much!"); + offsetInRegion_ -= unsignedCount; +} + +bool DirectInputStream::SkipInt64(int64_t count) { + if (count < 0) { + return false; + } + const uint64_t unsignedCount = static_cast(count); + if (unsignedCount + offsetInRegion_ <= region_.length) { + offsetInRegion_ += unsignedCount; + return true; + } + offsetInRegion_ = region_.length; + return false; +} + +google::protobuf::int64 DirectInputStream::ByteCount() const { + return static_cast(offsetInRegion_); +} + +void DirectInputStream::seekToPosition(PositionProvider& seekPosition) { + offsetInRegion_ = seekPosition.next(); + VELOX_CHECK_LE(offsetInRegion_, region_.length); +} + +std::string DirectInputStream::getName() const { + return fmt::format( + "DirectInputStream {} of {}", offsetInRegion_, region_.length); +} + +size_t DirectInputStream::positionSize() { + // not compressed, so only need 1 position (uncompressed position) + return 1; +} + +namespace { +std::vector> +makeRanges(size_t size, memory::Allocation& data, std::string& tinyData) { + std::vector> buffers; + if (data.numPages() > 0) { + buffers.reserve(data.numRuns()); + uint64_t offsetInRuns = 0; + for (int i = 0; i < data.numRuns(); ++i) { + auto run = data.runAt(i); + uint64_t bytes = memory::AllocationTraits::pageBytes(run.numPages()); + uint64_t readSize = std::min(bytes, size - 
offsetInRuns); + buffers.push_back(folly::Range(run.data(), readSize)); + offsetInRuns += readSize; + } + } else { + buffers.push_back(folly::Range(tinyData.data(), size)); + } + return buffers; +} +} // namespace + +void DirectInputStream::loadSync() { + if (region_.length < DirectBufferedInput::kTinySize && + data_.numPages() == 0) { + tinyData_.resize(region_.length); + } else { + const auto numPages = + memory::AllocationTraits::numPages(loadedRegion_.length); + if (numPages > data_.numPages()) { + bufferedInput_->pool()->allocateNonContiguous(numPages, data_); + } + } + + process::TraceContext trace("DirectInputStream::loadSync"); + + ioStats_->incRawBytesRead(loadedRegion_.length); + auto ranges = makeRanges(loadedRegion_.length, data_, tinyData_); + uint64_t usecs = 0; + { + MicrosecondTimer timer(&usecs); + input_->read(ranges, loadedRegion_.offset, LogType::FILE); + } + ioStats_->read().increment(loadedRegion_.length); + ioStats_->queryThreadIoLatency().increment(usecs); + ioStats_->incTotalScanTime(usecs * 1'000); +} + +void DirectInputStream::loadPosition() { + VELOX_CHECK_LT(offsetInRegion_, region_.length); + if (!loaded_) { + loaded_ = true; + auto load = bufferedInput_->coalescedLoad(this); + if (load != nullptr) { + folly::SemiFuture waitFuture(false); + uint64_t loadUs = 0; + { + MicrosecondTimer timer(&loadUs); + if (!load->loadOrFuture(&waitFuture)) { + waitFuture.wait(); + } + loadedRegion_.offset = region_.offset; + loadedRegion_.length = load->getData(region_.offset, data_, tinyData_); + } + ioStats_->queryThreadIoLatency().increment(loadUs); + } else { + // Standalone stream, not part of coalesced load. + loadedRegion_.offset = 0; + loadedRegion_.length = 0; + } + } + + // Check if the position is outside the loaded bounds. + if (loadedRegion_.length == 0 || + region_.offset + offsetInRegion_ < loadedRegion_.offset || + region_.offset + offsetInRegion_ >= + loadedRegion_.offset + loadedRegion_.length) { + loadedRegion_.offset = region_.offset + offsetInRegion_; + loadedRegion_.length = (offsetInRegion_ + loadQuantum_ <= region_.length) + ? loadQuantum_ + : (region_.length - offsetInRegion_); + + // loadSync() updates the IO metrics but runs only on this conditional + // path, so DirectCoalescedLoad::loadData() updates the same metrics for + // the coalesced path. + loadSync(); + } + + const auto offsetInData = + offsetInRegion_ - (loadedRegion_.offset - region_.offset); + if (data_.numPages() == 0) { + run_ = reinterpret_cast(tinyData_.data()); + runSize_ = tinyData_.size(); + offsetInRun_ = offsetInData; + offsetOfRun_ = 0; + } else { + data_.findRun(offsetInData, &runIndex_, &offsetInRun_); + offsetOfRun_ = offsetInData - offsetInRun_; + auto run = data_.runAt(runIndex_); + run_ = run.data(); + runSize_ = memory::AllocationTraits::pageBytes(run.numPages()); + if (offsetOfRun_ + runSize_ > loadedRegion_.length) { + runSize_ = loadedRegion_.length - offsetOfRun_; + } + } + VELOX_CHECK_LT(offsetInRun_, runSize_); +} + +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/DirectInputStream.h b/velox/dwio/common/DirectInputStream.h new file mode 100644 index 0000000000000..3715da6666822 --- /dev/null +++ b/velox/dwio/common/DirectInputStream.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "velox/common/caching/FileIds.h" +#include "velox/common/caching/ScanTracker.h" +#include "velox/common/io/IoStatistics.h" +#include "velox/dwio/common/InputStream.h" +#include "velox/dwio/common/SeekableInputStream.h" + +namespace facebook::velox::dwio::common { + +class DirectBufferedInput; + +/// An input stream over possibly coalesced loads. Created by +/// DirectBufferedInput. Similar to CacheInputStream but does not use cache. +class DirectInputStream : public SeekableInputStream { + public: + DirectInputStream( + DirectBufferedInput* bufferedInput, + IoStatistics* ioStats, + const velox::common::Region& region, + std::shared_ptr input, + uint64_t fileNum, + std::shared_ptr tracker, + cache::TrackingId trackingId, + uint64_t groupId, + int32_t loadQuantum); + + bool Next(const void** data, int* size) override; + void BackUp(int count) override; + bool SkipInt64(int64_t count) override; + google::protobuf::int64 ByteCount() const override; + + void seekToPosition(PositionProvider& position) override; + std::string getName() const override; + size_t positionSize() override; + + /// Testing function to access loaded state. + void testingData( + velox::common::Region& loadedRegion, + memory::Allocation*& data, + std::string*& tinyData) { + loadedRegion = loadedRegion_; + data = &data_; + tinyData = &tinyData_; + } + + private: + // Ensures that the current position is covered by 'data_'. + void loadPosition(); + + // Synchronously sets 'data_' to cover 'loadedRegion_'. + void loadSync(); + + DirectBufferedInput* const bufferedInput_; + IoStatistics* const ioStats_; + const std::shared_ptr input_; + // The region of 'input_' that 'this' ranges over. + const velox::common::Region region_; + const uint64_t fileNum_; + std::shared_ptr tracker_; + const cache::TrackingId trackingId_; + const uint64_t groupId_; + + // Maximum number of bytes read from 'input_' at a time. + const int32_t loadQuantum_; + + // The part of 'region_' that is loaded into 'data_'/'tinyData_'. Relative to + // file start. + velox::common::Region loadedRegion_; + + // Allocation with loaded data. Has space for region.length or loadQuantum_ + // bytes, whichever is less. + memory::Allocation data_; + + // Contains the data if the range is too small for Allocation. + std::string tinyData_; + + // Pointer to the start of the current run in 'data_' or 'tinyData_'. + uint8_t* run_{nullptr}; + + // Offset of the current run from the start of 'data_'. + uint64_t offsetOfRun_; + + // Position of stream relative to 'run_'. + int offsetInRun_{0}; + + // Index of the run in 'data_'. + int runIndex_ = -1; + + // Number of valid bytes starting at 'run_'. + uint32_t runSize_ = 0; + // Position relative to 'region_.offset'. + uint64_t offsetInRegion_ = 0; + + // Set to true when data is first loaded. 
+ bool loaded_{false}; +}; + +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/ExecutorBarrier.cpp b/velox/dwio/common/ExecutorBarrier.cpp new file mode 100644 index 0000000000000..4d5d106f6dccb --- /dev/null +++ b/velox/dwio/common/ExecutorBarrier.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/dwio/common/ExecutorBarrier.h" + +namespace facebook::velox::dwio::common { + +namespace { + +class BarrierElement { + public: + BarrierElement(size_t& count, std::mutex& mutex, std::condition_variable& cv) + : count_{count}, mutex_{&mutex}, cv_{cv} { + std::lock_guard lock{*mutex_}; + ++count_; + } + + BarrierElement(BarrierElement&& other) noexcept + : count_{other.count_}, mutex_{other.mutex_}, cv_{other.cv_} { + // Move away + other.mutex_ = nullptr; + } + + BarrierElement(const BarrierElement& other) = delete; + BarrierElement& operator=(BarrierElement&& other) = delete; + BarrierElement& operator=(const BarrierElement& other) = delete; + + ~BarrierElement() { + // If this object wasn't moved away + if (mutex_) { + std::lock_guard lock{*mutex_}; + if (--count_ == 0) { + cv_.notify_all(); + } + } + } + + private: + size_t& count_; + std::mutex* mutex_; + std::condition_variable& cv_; +}; + +} // namespace + +auto ExecutorBarrier::wrapMethod(folly::Func f) { + return [f = std::move(f), + this, + barrierElement = BarrierElement(count_, mutex_, cv_)]() mutable { + try { + f(); + } catch (...) { + std::lock_guard lock{mutex_}; + if (!exception_.has_exception_ptr()) { + exception_ = folly::exception_wrapper(std::current_exception()); + } + } + }; +} + +void ExecutorBarrier::add(folly::Func f) { + executor_.add(wrapMethod(std::move(f))); +} + +void ExecutorBarrier::addWithPriority(folly::Func f, int8_t priority) { + executor_.addWithPriority(wrapMethod(std::move(f)), priority); +} + +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/ExecutorBarrier.h b/velox/dwio/common/ExecutorBarrier.h new file mode 100644 index 0000000000000..c3004ea1e489a --- /dev/null +++ b/velox/dwio/common/ExecutorBarrier.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include "folly/ExceptionWrapper.h" +#include "folly/Executor.h" + +namespace facebook::velox::dwio::common { + +class ExecutorBarrier : public folly::Executor { + public: + explicit ExecutorBarrier(folly::Executor& executor) + : executor_{executor}, count_{0} {} + + explicit ExecutorBarrier(std::shared_ptr executor) + : owned_{std::move(executor)}, executor_{*owned_}, count_{0} {} + + ~ExecutorBarrier() override { + // Wait for pending tasks. If this object were destroyed while tasks are + // still pending, those tasks would access freed memory in this object. + std::unique_lock lock{mutex_}; + cv_.wait(lock, [&]() { return count_ == 0; }); + // The destructor must not throw, so the captured exception is not checked + // here; there is also no need to clear it since the object is going away. + } + + /// Enqueue a function to be executed by this executor. This and all + /// variants must be thread-safe. + void add(folly::Func) override; + + /// Enqueue a function with a given priority, where 0 is the medium + /// priority. Enforcement is up to the implementation. + void addWithPriority(folly::Func, int8_t priority) override; + + uint8_t getNumPriorities() const override { + return executor_.getNumPriorities(); + } + + void waitAll() { + std::unique_lock lock{mutex_}; + cv_.wait(lock, [&]() { return count_ == 0; }); + if (exception_.has_exception_ptr()) { + folly::exception_wrapper ew; + // Clear the exception for the next time + std::swap(ew, exception_); + ew.throw_exception(); + } + } + + private: + auto wrapMethod(folly::Func f); + + std::shared_ptr owned_; + folly::Executor& executor_; + size_t count_; + std::mutex mutex_; + std::condition_variable cv_; + folly::exception_wrapper exception_; +}; + +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/FileSink.cpp b/velox/dwio/common/FileSink.cpp index 26873abbfc040..cebfe54c3bbf2 100644 --- a/velox/dwio/common/FileSink.cpp +++ b/velox/dwio/common/FileSink.cpp @@ -66,13 +66,20 @@ void FileSink::writeImpl( std::vector>& buffers, const std::function&)>& callback) { DWIO_ENSURE(!isClosed(), "Cannot write to closed sink."); - uint64_t size = 0; - for (auto& buf : buffers) { - size += callback(buf); + const uint64_t oldSize = size_; + uint64_t writeTimeUs{0}; + { + MicrosecondTimer timer(&writeTimeUs); + for (auto& buf : buffers) { + // NOTE: we need to update 'size_' after each 'callback' invocation as + // some file sink implementations, like MemorySink, depend on the + // updated 'size_' for the next write. + size_ += callback(buf); + } } - size_ += size; if (stats_ != nullptr) { - stats_->incRawBytesWritten(size); + stats_->incRawBytesWritten(size_ - oldSize); + stats_->incWriteIOTimeUs(writeTimeUs); } // Writing buffer is treated as transferring ownership. So clearing the // buffers after all buffers are written. 
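The ExecutorBarrier above lets a caller fan work out to an executor and then block until every task has finished, with the first captured exception rethrown on the waiting thread. Below is a minimal usage sketch, assuming folly's CPUThreadPoolExecutor as the wrapped executor; the worker function and task counts are hypothetical:

```
#include <memory>

#include <folly/executors/CPUThreadPoolExecutor.h>

#include "velox/dwio/common/ExecutorBarrier.h"

// Hypothetical worker, defined elsewhere.
void doWorkOnChunk(int chunk);

void runParallelWork() {
  // The barrier takes shared ownership of the wrapped executor.
  auto executor = std::make_shared<folly::CPUThreadPoolExecutor>(4);
  facebook::velox::dwio::common::ExecutorBarrier barrier{executor};
  for (int i = 0; i < 8; ++i) {
    barrier.add([i]() {
      // An exception thrown here is captured by the barrier instead of
      // being lost on a pool thread.
      doWorkOnChunk(i);
    });
  }
  // Blocks until all 8 tasks have finished, then rethrows the first
  // captured exception, if any, on this thread.
  barrier.waitAll();
}
```

Note that only the first failure is kept: wrapMethod() drops later exceptions from the same batch, and waitAll() clears the stored exception so the barrier can be reused.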
@@ -184,8 +191,7 @@ MemorySink::MemorySink(size_t capacity, const Options& options) void MemorySink::write(std::vector>& buffers) { writeImpl(buffers, [&](auto& buffer) { const auto size = buffer.size(); - DWIO_ENSURE_LE(size_ + size, data_.capacity()); - ::memcpy(data_.data() + size_, buffer.data(), size); + data_.extendAppend(size_, buffer.data(), size); return size; }); } diff --git a/velox/dwio/common/FileSink.h b/velox/dwio/common/FileSink.h index f37120a284d52..fd9fec9fa8602 100644 --- a/velox/dwio/common/FileSink.h +++ b/velox/dwio/common/FileSink.h @@ -18,16 +18,13 @@ #include +#include "velox/common/config/Config.h" #include "velox/common/file/File.h" #include "velox/common/io/IoStatistics.h" #include "velox/dwio/common/Closeable.h" #include "velox/dwio/common/DataBuffer.h" #include "velox/dwio/common/MetricsLog.h" -namespace facebook::velox { -class Config; -} - namespace facebook::velox::dwio::common { using namespace facebook::velox::io; @@ -40,7 +37,12 @@ class FileSink : public Closeable { bool bufferWrite{true}; /// Connector properties are required to create a FileSink on FileSystems /// such as S3. - const std::shared_ptr& connectorProperties{nullptr}; + const std::shared_ptr& connectorProperties{ + nullptr}; + /// Config used to create sink files. It is passed verbatim to the + /// underlying file system; its format is free form and defined by that + /// file system. + const std::string fileCreateConfig{""}; memory::MemoryPool* pool{nullptr}; MetricsLogPtr metricLogger{MetricsLog::voidLog()}; IoStatistics* stats{nullptr}; @@ -107,7 +109,7 @@ class FileSink : public Closeable { const std::function&)>& callback); const std::string name_; - const std::shared_ptr connectorProperties_; + const std::shared_ptr connectorProperties_; memory::MemoryPool* const pool_; const MetricsLogPtr metricLogger_; IoStatistics* const stats_; @@ -152,10 +154,6 @@ class WriteFileSink final : public FileSink { class LocalFileSink : public FileSink { public: LocalFileSink(const std::string& name, const Options& options); -#ifdef VELOX_ENABLE_BACKWARD_COMPATIBILITY - LocalFileSink(const std::string& name, MetricsLogPtr metricLogger) - : LocalFileSink(name, {.metricLogger = std::move(metricLogger)}) {} -#endif ~LocalFileSink() override { destroy(); diff --git a/velox/dwio/common/FilterNode.h b/velox/dwio/common/FilterNode.h index dd4696d3ae86e..5aafd0964e283 100644 --- a/velox/dwio/common/FilterNode.h +++ b/velox/dwio/common/FilterNode.h @@ -102,14 +102,14 @@ struct FilterNode { return name == other.name; } - bool match(const std::string_view& name) const { + bool match(const std::string_view& name_to_match_2) const { // no match if any is invalid if (!valid()) { // even current is invalid return false; } - return this->name == name; + return name == name_to_match_2; } // expect the incoming list has all valid nodes @@ -117,8 +117,8 @@ struct FilterNode { std::vector::const_iterator in( const std::vector& list) const { return std::find_if( - list.cbegin(), list.cend(), [this](const FilterNode& node) { - return node.match(*this); + list.cbegin(), list.cend(), [this](const FilterNode& filter_node_2) { + return filter_node_2.match(*this); }); } @@ -155,21 +155,6 @@ using ColumnFilter = std::vector; class FilterType; using FilterTypePtr = std::shared_ptr; class FilterType { - private: - const FilterNode node_; - const std::weak_ptr parent_; - std::vector children_; - // a flat to decide if current node is needed - bool read_; - // a flag to indicate if current node is in 
content - bool inContent_; - // request type in the filter tree node - std::shared_ptr requestType_; - // data type in the filter tree node - std::shared_ptr dataType_; - // sequence filter for given node - empty if no filter - SeqFilter seqFilter_; - public: // a single value indicating not found (invalid node) static const FilterTypePtr& getInvalid() { @@ -191,7 +176,7 @@ class FilterType { read_{node.node == 0}, inContent_{inContent}, requestType_{std::move(type)}, - dataType_{std::move(contentType)}, + fileType_{std::move(contentType)}, seqFilter_{std::make_shared>()} {} FilterType( @@ -214,7 +199,6 @@ class FilterType { return node_; } - public: inline void setRead() { read_ = true; } @@ -236,11 +220,11 @@ class FilterType { } inline const std::shared_ptr& getDataType() const { - return dataType_; + return fileType_; } - inline void setDataType(const std::shared_ptr& dataType) { - dataType_ = dataType; + inline void setDataType(const std::shared_ptr& fileType) { + fileType_ = fileType; } inline bool valid() const { @@ -256,10 +240,10 @@ class FilterType { return requestType_->kind(); } - // return node ID in the type tree + /// Returns node ID in the type tree inline uint64_t getId() const { // Cannot get ID for invalid node - DWIO_ENSURE_EQ(valid(), true); + VELOX_CHECK(valid()); return node_.node; } @@ -267,8 +251,8 @@ class FilterType { return node_.node == 0; } - inline void addChild(const FilterTypePtr& child) { - children_.push_back(child); + inline void addChild(FilterTypePtr child) { + children_.push_back(std::move(child)); } inline void setSequenceFilter(const SeqFilter& seqFilter) { @@ -287,6 +271,21 @@ class FilterType { return seqFilter_->empty() || seqFilter_->find(sequence) != seqFilter_->end(); } + + private: + const FilterNode node_; + const std::weak_ptr parent_; + std::vector children_; + // a flag to decide if current node is needed + bool read_; + // a flag to indicate if current node is in content + bool inContent_; + // request type in the filter tree node + std::shared_ptr requestType_; + // data type in the filter tree node + std::shared_ptr fileType_; + // sequence filter for given node - empty if no filter + SeqFilter seqFilter_; }; } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/FlatMapHelper.cpp b/velox/dwio/common/FlatMapHelper.cpp index 7abc8ee3ac6b9..8f97779efc206 100644 --- a/velox/dwio/common/FlatMapHelper.cpp +++ b/velox/dwio/common/FlatMapHelper.cpp @@ -43,6 +43,7 @@ void reset(VectorPtr& vector, vector_size_t size, bool hasNulls) { void initializeStringVector( VectorPtr& vector, + const TypePtr& type, memory::MemoryPool& pool, const std::vector& vectors) { vector_size_t size = 0; @@ -64,7 +65,7 @@ void initializeStringVector( } } initializeFlatVector( - vector, pool, VARCHAR(), size, hasNulls, std::move(buffers)); + vector, pool, type, size, hasNulls, std::move(buffers)); } } // namespace detail @@ -103,19 +104,19 @@ void initializeVectorImpl( template <> void initializeVectorImpl( VectorPtr& vector, - const TypePtr& /* type */, + const TypePtr& type, memory::MemoryPool& pool, const std::vector& vectors) { - detail::initializeStringVector(vector, pool, vectors); + detail::initializeStringVector(vector, type, pool, vectors); } template <> void initializeVectorImpl( VectorPtr& vector, - const TypePtr& /* type */, + const TypePtr& type, memory::MemoryPool& pool, const std::vector& vectors) { - detail::initializeStringVector(vector, pool, vectors); + detail::initializeStringVector(vector, type, pool, vectors); } namespace { 
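The FlatMapHelper change above threads the caller-supplied type through detail::initializeStringVector() instead of hard-coding VARCHAR(), so both string specializations keep the requested type (for example VARBINARY). A minimal sketch of the pattern under that assumption, using a hypothetical helper rather than the exact velox signatures:

```
#include "velox/type/Type.h"
#include "velox/vector/BaseVector.h"

using namespace facebook::velox;

// Hypothetical helper mirroring the fix: the requested string type is a
// parameter instead of being assumed to be VARCHAR().
VectorPtr makeStringVector(
    const TypePtr& type, // VARCHAR() or VARBINARY(), chosen by the caller.
    vector_size_t size,
    memory::MemoryPool* pool) {
  // Before the fix, the equivalent path always passed VARCHAR() here, so a
  // VARBINARY column came back mistyped as VARCHAR.
  return BaseVector::create(type, size, pool);
}
```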
diff --git a/velox/dwio/common/FlatMapHelper.h b/velox/dwio/common/FlatMapHelper.h index ad67f12aeade4..ac97b5c3b1347 100644 --- a/velox/dwio/common/FlatMapHelper.h +++ b/velox/dwio/common/FlatMapHelper.h @@ -37,14 +37,6 @@ void resetIfNotWritable(VectorPtr& vector, const T&... buffer) { FOLLY_POP_WARNING } -// Initialize string vector. -void initializeStringVector( - VectorPtr& vector, - memory::MemoryPool& pool, - vector_size_t size, - bool hasNulls, - std::vector&& stringBuffers); - } // namespace detail // Struct for keeping track flatmap key stream metrics. diff --git a/velox/dwio/common/FormatData.h b/velox/dwio/common/FormatData.h index 369e899effcfe..bff3c420bc507 100644 --- a/velox/dwio/common/FormatData.h +++ b/velox/dwio/common/FormatData.h @@ -17,9 +17,8 @@ #pragma once #include "velox/common/memory/Memory.h" -#include "velox/dwio/common/ColumnSelector.h" +#include "velox/dwio/common/PositionProvider.h" #include "velox/dwio/common/ScanSpec.h" -#include "velox/dwio/common/SeekableInputStream.h" #include "velox/dwio/common/Statistics.h" #include "velox/dwio/common/TypeWithId.h" #include "velox/type/Filter.h" @@ -34,14 +33,14 @@ class FormatData { template T& as() { - return *reinterpret_cast(this); + return *static_cast(this); } /// Reads nulls if the format has nulls separate from the encoded /// data. If there are no nulls, 'nulls' is set to nullptr, else to /// a suitable sized and padded Buffer. 'incomingNulls' may be given /// if there are enclosing level nulls that should be merged into - /// the read reasult. If provided, this has 'numValues' bits and + /// the read result. If provided, this has 'numValues' bits and /// each zero marks an incoming null for which no bit is read from /// the nulls stream of 'this'. For Parquet, 'nulls' is always set /// to nullptr because nulls are represented by the data pages @@ -51,7 +50,7 @@ class FormatData { /// of a column are of interest, e.g. is null filter. 
virtual void readNulls( vector_size_t numValues, - const uint64_t* FOLLY_NULLABLE incomingNulls, + const uint64_t* incomingNulls, BufferPtr& nulls, bool nullsOnly = false) = 0; diff --git a/velox/dwio/common/InputStream.cpp b/velox/dwio/common/InputStream.cpp index afd76c7cb6add..e2dd496c16d88 100644 --- a/velox/dwio/common/InputStream.cpp +++ b/velox/dwio/common/InputStream.cpp @@ -16,15 +16,10 @@ #include "velox/dwio/common/InputStream.h" -#include #include #include #include -#include -#include -#include #include -#include #include #include #include @@ -39,6 +34,16 @@ using ::facebook::velox::common::Region; namespace facebook::velox::dwio::common { +namespace { +int64_t totalBufferSize(const std::vector>& buffers) { + int64_t bufferSize = 0; + for (auto& buffer : buffers) { + bufferSize += buffer.size(); + } + return bufferSize; +} +} // namespace + folly::SemiFuture InputStream::readAsync( const std::vector>& buffers, uint64_t offset, @@ -67,50 +72,43 @@ void ReadFileInputStream::read( uint64_t length, uint64_t offset, MetricsLog::MetricsType purpose) { - if (!buf) { - throw std::invalid_argument("Buffer is null"); - } + VELOX_CHECK_NOT_NULL(buf); logRead(offset, length, purpose); - auto readStartMicros = getCurrentTimeMicro(); - std::string_view data_read = readFile_->pread(offset, length, buf); + uint64_t readTimeUs{0}; + std::string_view readData; + { + MicrosecondTimer timer(&readTimeUs); + readData = readFile_->pread(offset, length, buf); + } if (stats_) { stats_->incRawBytesRead(length); - stats_->incTotalScanTime((getCurrentTimeMicro() - readStartMicros) * 1000); + stats_->incTotalScanTime(readTimeUs * 1'000); } - DWIO_ENSURE_EQ( - data_read.size(), + VELOX_CHECK_EQ( + readData.size(), length, - "Should read exactly as requested. File name: ", + "Should read exactly as requested. File name: {}, offset: {}, length: {}, read: {}", getName(), - ", offset: ", offset, - ", length: ", length, - ", read: ", - data_read.size()); + readData.size()); } void ReadFileInputStream::read( const std::vector>& buffers, uint64_t offset, LogType logType) { - int64_t bufferSize = 0; - for (auto& buffer : buffers) { - bufferSize += buffer.size(); - } + const int64_t bufferSize = totalBufferSize(buffers); logRead(offset, bufferSize, logType); - auto size = readFile_->preadv(offset, buffers); - DWIO_ENSURE_EQ( + const auto size = readFile_->preadv(offset, buffers); + VELOX_CHECK_EQ( size, bufferSize, - "Should read exactly as requested. File name: ", + "Should read exactly as requested. 
File name: {}, offset: {}, length: {}, read: {}", getName(), - ", offset: ", offset, - ", length: ", bufferSize, - ", read: ", size); } @@ -118,10 +116,7 @@ folly::SemiFuture ReadFileInputStream::readAsync( const std::vector>& buffers, uint64_t offset, LogType logType) { - int64_t bufferSize = 0; - for (auto& buffer : buffers) { - bufferSize += buffer.size(); - } + const int64_t bufferSize = totalBufferSize(buffers); logRead(offset, bufferSize, logType); return readFile_->preadvAsync(offset, buffers); } @@ -134,18 +129,18 @@ void ReadFileInputStream::vread( folly::Range regions, folly::Range iobufs, const LogType purpose) { - DWIO_ENSURE_GT(regions.size(), 0, "regions to read can't be empty"); + VELOX_CHECK_GT(regions.size(), 0, "regions to read can't be empty"); const size_t length = std::accumulate( regions.cbegin(), regions.cend(), size_t(0), [&](size_t acc, const auto& r) { return acc + r.length; }); logRead(regions[0].offset, length, purpose); - auto readStartMs = getCurrentTimeMs(); + auto readStartMicros = getCurrentTimeMicro(); readFile_->preadv(regions, iobufs); if (stats_) { stats_->incRawBytesRead(length); - stats_->incTotalScanTime(getCurrentTimeMs() - readStartMs); + stats_->incTotalScanTime((getCurrentTimeMicro() - readStartMicros) * 1000); } } diff --git a/velox/dwio/common/InputStream.h b/velox/dwio/common/InputStream.h index 4c1753cdc7f04..19bf4dcccda0e 100644 --- a/velox/dwio/common/InputStream.h +++ b/velox/dwio/common/InputStream.h @@ -46,7 +46,7 @@ class InputStream { explicit InputStream( const std::string& path, const MetricsLogPtr& metricsLog = MetricsLog::voidLog(), - IoStatistics* FOLLY_NULLABLE stats = nullptr) + IoStatistics* stats = nullptr) : path_{path}, metricsLog_{metricsLog}, stats_(stats) {} virtual ~InputStream() = default; @@ -54,7 +54,7 @@ class InputStream { /** * Get the stats object */ - IoStatistics* FOLLY_NULLABLE getStats() const { + IoStatistics* getStats() const { return stats_; } @@ -76,7 +76,7 @@ class InputStream { * @param length the number of bytes to read. * @param offset the position in the stream to read from. */ - virtual void read(void* FOLLY_NONNULL, uint64_t, uint64_t, LogType) = 0; + virtual void read(void*, uint64_t, uint64_t, LogType) = 0; /** * Read starting at offset into buffers, filling the buffers left to right. A @@ -124,9 +124,6 @@ class InputStream { folly::Range iobufs, const LogType purpose) = 0; - // case insensitive find - static uint32_t ifind(const std::string& src, const std::string& target); - const std::string& getName() const; virtual void logRead(uint64_t offset, uint64_t length, LogType purpose); @@ -134,19 +131,19 @@ class InputStream { protected: std::string path_; MetricsLogPtr metricsLog_; - IoStatistics* FOLLY_NULLABLE stats_; + IoStatistics* stats_; }; -// An input stream that reads from an already opened ReadFile. +/// An input stream that reads from an already opened ReadFile. class ReadFileInputStream final : public InputStream { public: - // Take shared ownership of |readFile|. + /// Takes shared ownership of |readFile|. 
explicit ReadFileInputStream( std::shared_ptr, const MetricsLogPtr& metricsLog = MetricsLog::voidLog(), - IoStatistics* FOLLY_NULLABLE stats = nullptr); + IoStatistics* stats = nullptr); - virtual ~ReadFileInputStream() {} + ~ReadFileInputStream() override = default; uint64_t getLength() const final override { return readFile_->size(); @@ -156,7 +153,7 @@ class ReadFileInputStream final : public InputStream { return readFile_->getNaturalReadSize(); } - void read(void* FOLLY_NONNULL, uint64_t, uint64_t, LogType) override; + void read(void*, uint64_t, uint64_t, LogType) override; void read( const std::vector>& buffers, diff --git a/velox/dwio/common/IntDecoder.cpp b/velox/dwio/common/IntDecoder.cpp index a8e1243ca339c..9ee24dd7cf77c 100644 --- a/velox/dwio/common/IntDecoder.cpp +++ b/velox/dwio/common/IntDecoder.cpp @@ -30,21 +30,23 @@ FOLLY_ALWAYS_INLINE void IntDecoder::skipVarints(uint64_t items) { template FOLLY_ALWAYS_INLINE uint64_t IntDecoder::skipVarintsInBuffer(uint64_t items) { + VELOX_DCHECK_EQ(pendingSkip_, 0); static constexpr uint64_t kVarintMask = 0x8080808080808080L; - if (bufferStart == bufferEnd) { + if (bufferStart_ == bufferEnd_) { const void* bufferPointer; int32_t size; - if (!inputStream->Next(&bufferPointer, &size)) { - VELOX_CHECK(false, "Skipping past end of strean"); + if (!inputStream_->Next(&bufferPointer, &size)) { + VELOX_FAIL("Skipping past end of stream"); } - bufferStart = static_cast(bufferPointer); - bufferEnd = bufferStart + size; + bufferStart_ = static_cast(bufferPointer); + bufferEnd_ = bufferStart_ + size; } + uint64_t toSkip = items; - while (bufferEnd - bufferStart >= sizeof(uint64_t)) { + while (bufferEnd_ - bufferStart_ >= sizeof(uint64_t)) { uint64_t controlBits = - (~*reinterpret_cast(bufferStart) & kVarintMask); - auto endCount = __builtin_popcountll(controlBits); + (~*reinterpret_cast(bufferStart_) & kVarintMask); + const auto endCount = __builtin_popcountll(controlBits); if (endCount >= toSkip) { // The range to skip ends within 'word'. 
Clear all but the // last end marker bits and count trailing zeros to see what @@ -52,71 +54,74 @@ IntDecoder::skipVarintsInBuffer(uint64_t items) { for (int32_t i = 1; i < toSkip; ++i) { controlBits &= controlBits - 1; } - auto zeros = __builtin_ctzll(controlBits); - bufferStart += (zeros + 1) / 8; + const auto trailingZeros = __builtin_ctzll(controlBits); + bufferStart_ += (trailingZeros + 1) / 8; return items; } toSkip -= endCount; - bufferStart += sizeof(uint64_t); + bufferStart_ += sizeof(uint64_t); } - while (toSkip && bufferEnd > bufferStart) { - if ((*reinterpret_cast(bufferStart) & 0x80) == 0) { + while ((toSkip > 0) && bufferEnd_ > bufferStart_) { + if ((*reinterpret_cast(bufferStart_) & 0x80) == 0) { --toSkip; } - ++bufferStart; + ++bufferStart_; } return items - toSkip; } template -void IntDecoder::skipLongsFast(uint64_t numValues) { - if (useVInts) { +void IntDecoder::skipLongs(uint64_t numValues) { + VELOX_DCHECK_EQ(pendingSkip_, 0); + if (useVInts_) { skipVarints(numValues); } else { - skipBytes(numValues * numBytes, inputStream.get(), bufferStart, bufferEnd); + skipBytes( + numValues * numBytes_, inputStream_.get(), bufferStart_, bufferEnd_); } } -template void IntDecoder::skipLongsFast(uint64_t numValues); -template void IntDecoder::skipLongsFast(uint64_t numValues); +template void IntDecoder::skipLongs(uint64_t numValues); +template void IntDecoder::skipLongs(uint64_t numValues); template template void IntDecoder::bulkReadFixed(uint64_t size, T* result) { + VELOX_DCHECK_EQ(pendingSkip_, 0); if (isSigned) { - switch (numBytes) { + switch (numBytes_) { case 2: dwio::common::readContiguous( - size, *inputStream, result, bufferStart, bufferEnd); + size, *inputStream_, result, bufferStart_, bufferEnd_); break; case 4: dwio::common::readContiguous( - size, *inputStream, result, bufferStart, bufferEnd); + size, *inputStream_, result, bufferStart_, bufferEnd_); break; case 8: dwio::common::readContiguous( - size, *inputStream, result, bufferStart, bufferEnd); + size, *inputStream_, result, bufferStart_, bufferEnd_); break; default: - VELOX_FAIL("Bad fixed width {}", numBytes); + VELOX_FAIL("Bad fixed width {}", numBytes_); } } else { - switch (numBytes) { + switch (numBytes_) { case 2: dwio::common::readContiguous( - size, *inputStream, result, bufferStart, bufferEnd); + size, *inputStream_, result, bufferStart_, bufferEnd_); break; case 4: dwio::common::readContiguous( - size, *inputStream, result, bufferStart, bufferEnd); + size, *inputStream_, result, bufferStart_, bufferEnd_); break; case 8: dwio::common::readContiguous( - size, *inputStream, result, bufferStart, bufferEnd); + size, *inputStream_, result, bufferStart_, bufferEnd_); break; default: - VELOX_FAIL("Bad fixed width {}", numBytes); + VELOX_FAIL("Bad fixed width {}", numBytes_); } } } @@ -127,39 +132,40 @@ void IntDecoder::bulkReadRowsFixed( RowSet rows, int32_t initialRow, T* result) { + VELOX_DCHECK_EQ(pendingSkip_, 0); if (isSigned) { - switch (numBytes) { + switch (numBytes_) { case 2: dwio::common::readRows( - rows, initialRow, *inputStream, result, bufferStart, bufferEnd); + rows, initialRow, *inputStream_, result, bufferStart_, bufferEnd_); break; case 4: dwio::common::readRows( - rows, initialRow, *inputStream, result, bufferStart, bufferEnd); + rows, initialRow, *inputStream_, result, bufferStart_, bufferEnd_); break; case 8: dwio::common::readRows( - rows, initialRow, *inputStream, result, bufferStart, bufferEnd); + rows, initialRow, *inputStream_, result, bufferStart_, bufferEnd_); break; default: - 
VELOX_FAIL("Bad fixed width {}", numBytes); + VELOX_FAIL("Bad fixed width {}", numBytes_); } } else { - switch (numBytes) { + switch (numBytes_) { case 2: dwio::common::readRows( - rows, initialRow, *inputStream, result, bufferStart, bufferEnd); + rows, initialRow, *inputStream_, result, bufferStart_, bufferEnd_); break; case 4: dwio::common::readRows( - rows, initialRow, *inputStream, result, bufferStart, bufferEnd); + rows, initialRow, *inputStream_, result, bufferStart_, bufferEnd_); break; case 8: dwio::common::readRows( - rows, initialRow, *inputStream, result, bufferStart, bufferEnd); + rows, initialRow, *inputStream_, result, bufferStart_, bufferEnd_); break; default: - VELOX_FAIL("Bad fixed width {}", numBytes); + VELOX_FAIL("Bad fixed width {}", numBytes_); } } } @@ -915,7 +921,8 @@ FOLLY_ALWAYS_INLINE void varintSwitch( template template void IntDecoder::bulkRead(uint64_t size, T* result) { - if (!useVInts) { + skipPending(); + if (!useVInts_) { bulkReadFixed(size, result); return; } @@ -924,14 +931,14 @@ void IntDecoder::bulkRead(uint64_t size, T* result) { uint64_t carryover = 0; int32_t carryoverBits = 0; auto output = result; - const char* pos = bufferStart; + const char* pos = bufferStart_; auto end = result + size; if (pos) { // Decrement only if non-null to avoid asan error. pos -= maskSize; } while (output < end) { - while (end >= output + 8 && bufferEnd - pos >= 8 + maskSize) { + while (end >= output + 8 && bufferEnd_ - pos >= 8 + maskSize) { pos += maskSize; const auto word = folly::loadUnaligned(pos); const uint64_t controlBits = bits::extractBits(word, mask); @@ -940,16 +947,16 @@ void IntDecoder::bulkRead(uint64_t size, T* result) { if (pos) { pos += maskSize; } - bufferStart = pos; + bufferStart_ = pos; if (output < end) { *output++ = (readVuLong() << carryoverBits) | carryover; carryover = 0; carryoverBits = 0; while (output < end) { *output++ = readVuLong(); - if (output + 8 <= end && bufferEnd - bufferStart > 8 + maskSize) { + if (output + 8 <= end && bufferEnd_ - bufferStart_ > 8 + maskSize) { // Go back to fast loop after refilling the buffer. - pos = bufferStart - maskSize; + pos = bufferStart_ - maskSize; break; } } @@ -966,7 +973,8 @@ void IntDecoder::bulkReadRows( RowSet rows, T* result, int32_t initialRow) { - if (!useVInts) { + skipPending(); + if (!useVInts_) { bulkReadRowsFixed(rows, initialRow, result); return; } @@ -975,7 +983,7 @@ void IntDecoder::bulkReadRows( uint64_t carryover = 0; int32_t carryoverBits = 0; auto output = result; - const char* pos = bufferStart; + const char* pos = bufferStart_; int32_t nextRowIndex = 0; int32_t nextRow = rows[0]; int32_t row = initialRow; @@ -986,7 +994,7 @@ void IntDecoder::bulkReadRows( pos -= maskSize; } while (nextRowIndex < rows.size()) { - while (row + 8 <= endRow && bufferEnd - pos >= 8 + maskSize) { + while (row + 8 <= endRow && bufferEnd_ - pos >= 8 + maskSize) { pos += maskSize; const auto word = folly::loadUnaligned(pos); if (nextRow >= row + 8) { @@ -2305,7 +2313,7 @@ void IntDecoder::bulkReadRows( if (pos) { pos += maskSize; } - bufferStart = pos; + bufferStart_ = pos; DCHECK(!carryover || row == nextRow); while (nextRowIndex < endRowIndex) { skipVarints(nextRow - row); @@ -2317,9 +2325,9 @@ void IntDecoder::bulkReadRows( } row = nextRow + 1; nextRow = rows[nextRowIndex]; - if (endRow - row >= 8 && bufferEnd - bufferStart > 8 + maskSize) { + if (endRow - row >= 8 && bufferEnd_ - bufferStart_ > 8 + maskSize) { // Go back to fast loop after refilling the buffer. 
- pos = bufferStart - maskSize; + pos = bufferStart_ - maskSize; break; } } diff --git a/velox/dwio/common/IntDecoder.h b/velox/dwio/common/IntDecoder.h index 118bde04280cd..016e8b1f8244d 100644 --- a/velox/dwio/common/IntDecoder.h +++ b/velox/dwio/common/IntDecoder.h @@ -19,6 +19,7 @@ #include #include #include +#include "velox/common/base/Nulls.h" #include "velox/common/encode/Coding.h" #include "velox/dwio/common/IntCodecCommon.h" #include "velox/dwio/common/SeekableInputStream.h" @@ -38,46 +39,43 @@ class IntDecoder { bool useVInts, uint32_t numBytes, bool bigEndian = false) - : inputStream(std::move(input)), - bufferStart(nullptr), - bufferEnd(bufferStart), - useVInts(useVInts), - numBytes(numBytes), - bigEndian(bigEndian) {} + : inputStream_(std::move(input)), + bufferStart_(nullptr), + bufferEnd_(bufferStart_), + useVInts_(useVInts), + numBytes_(numBytes), + bigEndian_(bigEndian) {} - // Constructs for use in Parquet /Alphawhere the buffer is always preloaded. - IntDecoder(const char* FOLLY_NONNULL start, const char* FOLLY_NONNULL end) - : bufferStart(start), bufferEnd(end), useVInts(false), numBytes(0) {} + /// Constructs for use in Parquet /Alphawhere the buffer is always preloaded. + IntDecoder(const char* start, const char* end) + : bufferStart_(start), bufferEnd_(end), useVInts_(false), numBytes_(0) {} virtual ~IntDecoder() = default; - /** - * Seek to a specific row group. - */ + /// Seeks to a specific row group. Should not read the underlying input + /// stream to avoid decoding same data multiple times. virtual void seekToRowGroup( dwio::common::PositionProvider& positionProvider) = 0; - /** - * Seek over a given number of values. - */ - virtual void skip(uint64_t numValues) = 0; + /// Seeks over a given number of values. Does not decode the underlying input + /// stream. + void skip(uint64_t numValues) { + pendingSkip_ += numValues; + } /** - * Read a number of values into the batch. + * Read a number of values into the batch. Should call skipPending() in the + * beginning. + * * @param data the array to read into * @param numValues the number of values to read * @param nulls If the pointer is null, all values are read. If the * pointer is not null, positions that are true are skipped. */ - virtual void next( - int64_t* FOLLY_NONNULL data, - uint64_t numValues, - const uint64_t* FOLLY_NULLABLE nulls) = 0; - - virtual void next( - int32_t* FOLLY_NONNULL data, - uint64_t numValues, - const uint64_t* FOLLY_NULLABLE nulls) { + virtual void + next(int64_t* data, uint64_t numValues, const uint64_t* nulls) = 0; + + virtual void next(int32_t* data, uint64_t numValues, const uint64_t* nulls) { if (numValues <= 4) { int64_t temp[4]; next(temp, numValues, nulls); @@ -93,23 +91,17 @@ class IntDecoder { } } - virtual void nextInts( - int32_t* FOLLY_NONNULL data, - uint64_t numValues, - const uint64_t* FOLLY_NULLABLE nulls) { + virtual void + nextInts(int32_t* data, uint64_t numValues, const uint64_t* nulls) { narrow(data, numValues, nulls); } - virtual void nextShorts( - int16_t* FOLLY_NONNULL data, - uint64_t numValues, - const uint64_t* FOLLY_NULLABLE nulls) { + virtual void + nextShorts(int16_t* data, uint64_t numValues, const uint64_t* nulls) { narrow(data, numValues, nulls); } - virtual void nextLengths( - int32_t* FOLLY_NONNULL /*values*/, - int32_t /*numValues*/) { + virtual void nextLengths(int32_t* /*values*/, int32_t /*numValues*/) { VELOX_FAIL("A length decoder should be a RLEv1"); } @@ -117,37 +109,43 @@ class IntDecoder { * Load RowIndex values for the stream being read. 
* @return updated start index after this stream's index values. */ - size_t loadIndices(size_t startIndex) { - return inputStream->positionSize() + startIndex + 1; + size_t loadIndices(size_t startIndex) const { + return inputStream_->positionSize() + startIndex + 1; } - void skipLongs(uint64_t numValues) { - skipLongsFast(numValues); - } - - // Optimized variant of skipLongs using popcnt. Used on selective - // path only pending validation. - void skipLongsFast(uint64_t numValues); - // Reads 'size' consecutive T' and stores then in 'result'. template - void bulkRead(uint64_t size, T* FOLLY_NONNULL result); + void bulkRead(uint64_t size, T* result); // Reads data at positions 'rows' to 'result'. 'initialRow' is the // row number of the first unread element of 'this'. if rows is {10} // and 'initialRow' is 9, then this skips one element and reads the // next element into 'result'. template - void - bulkReadRows(RowSet rows, T* FOLLY_NONNULL result, int32_t initialRow = 0); + void bulkReadRows(RowSet rows, T* result, int32_t initialRow = 0); protected: + // Actually skip the pending entries. + virtual void skipPending() = 0; + + template + inline void skip(int32_t numValues, int32_t current, const uint64_t* nulls) { + if constexpr (kHasNulls) { + numValues = bits::countNonNulls(nulls, current, current + numValues); + } + pendingSkip_ += numValues; + if (pendingSkip_ > 0) { + skipPending(); + } + } + + void skipLongs(uint64_t numValues); + template - void bulkReadFixed(uint64_t size, T* FOLLY_NONNULL result); + void bulkReadFixed(uint64_t size, T* result); template - void - bulkReadRowsFixed(RowSet rows, int32_t initialRow, T* FOLLY_NONNULL result); + void bulkReadRowsFixed(RowSet rows, int32_t initialRow, T* result); template T readInt(); @@ -163,75 +161,66 @@ class IntDecoder { template cppType readLittleEndianFromBigEndian(); - // Applies 'visitor to 'numRows' consecutive values. 
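The `skip()`/`skipPending()` split above makes skipping lazy: callers only bump `pendingSkip_`, and the accumulated count is materialized once, immediately before the next actual read (hence the `VELOX_DCHECK_EQ(pendingSkip_, 0)` guards on the read paths). A minimal standalone model of the pattern, with illustrative names rather than the real decoder:

```cpp
#include <cstdint>

// Toy decoder demonstrating deferred skips: consecutive skip() calls collapse
// into a single pendingSkip_ adjustment, and the cost is paid at most once,
// right before data is actually decoded.
class LazySkipDecoder {
 public:
  void skip(uint64_t numValues) {
    pendingSkip_ += numValues; // no decoding work happens here
  }

  uint64_t read() {
    skipPending(); // what the real decoders do at the top of next()/bulkRead()
    return position_++;
  }

 private:
  void skipPending() {
    position_ += pendingSkip_; // a real decoder walks runs/pages/buffers here
    pendingSkip_ = 0;
  }

  int64_t pendingSkip_{0};
  uint64_t position_{0};
};
```

Besides batching adjacent skips into one pass, this lets a `seekToRowGroup()` discard pending skips instead of decoding through data that is about to be abandoned, which is exactly what the updated seek contract above asks for.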
- template - void readDense(int32_t numRows, Visitor& visitor) { - auto data = visitor.mutableValues(numRows); - bulkRead(numRows, data); - visitor.processN(data, numRows); - } - private: uint64_t skipVarintsInBuffer(uint64_t items); void skipVarints(uint64_t items); int128_t readVsHugeInt(); uint128_t readVuHugeInt(); - protected: - // note: there is opportunity for performance gains here by avoiding - // this by directly supporting deserialization into the correct - // target data type + // NOTE: there is opportunity for performance gains here by avoiding this by + // directly supporting deserialization into the correct target data type template - void narrow( - T* FOLLY_NONNULL const data, - const uint64_t numValues, - const uint64_t* FOLLY_NULLABLE const nulls) { - DWIO_ENSURE_LE(numBytes, sizeof(T)) + void narrow(T* data, uint64_t numValues, const uint64_t* nulls) { + VELOX_CHECK_LE(numBytes_, sizeof(T)); std::array buf; uint64_t remain = numValues; T* dataPtr = data; const uint64_t* nullsPtr = nulls; - while (remain != 0) { - uint64_t num = std::min(remain, static_cast(buf.size())); + while (remain > 0) { + const uint64_t num = std::min(remain, static_cast(buf.size())); next(buf.data(), num, nullsPtr); for (uint64_t i = 0; i < num; ++i) { *(dataPtr++) = (T)buf[i]; } remain -= num; if (remain != 0 && nullsPtr) { - DWIO_ENSURE(num % 64 == 0); + VELOX_CHECK_EQ(num % 64, 0); nullsPtr += num / 64; } } } - const std::unique_ptr inputStream; - const char* FOLLY_NULLABLE bufferStart; - const char* FOLLY_NULLABLE bufferEnd; - const bool useVInts; - const uint32_t numBytes; - bool bigEndian; + protected: + const std::unique_ptr inputStream_; + const char* bufferStart_; + const char* bufferEnd_; + const bool useVInts_; + const uint32_t numBytes_; + const bool bigEndian_; + int64_t pendingSkip_{0}; }; template FOLLY_ALWAYS_INLINE signed char IntDecoder::readByte() { - if (UNLIKELY(bufferStart == bufferEnd)) { + VELOX_DCHECK_EQ(pendingSkip_, 0); + + if (UNLIKELY(bufferStart_ == bufferEnd_)) { int32_t bufferLength; const void* bufferPointer; - DWIO_ENSURE( - inputStream->Next(&bufferPointer, &bufferLength), - "bad read in readByte, ", - inputStream->getName()); - bufferStart = static_cast(bufferPointer); - bufferEnd = bufferStart + bufferLength; + const bool ret = inputStream_->Next(&bufferPointer, &bufferLength); + VELOX_CHECK(ret, "bad read in readByte, ", inputStream_->getName()); + bufferStart_ = static_cast(bufferPointer); + bufferEnd_ = bufferStart_ + bufferLength; } - return *(bufferStart++); + return *(bufferStart_++); } template FOLLY_ALWAYS_INLINE uint64_t IntDecoder::readVuLong() { - if (LIKELY(bufferEnd - bufferStart >= folly::kMaxVarintLength64)) { - const char* p = bufferStart; + VELOX_DCHECK_EQ(pendingSkip_, 0); + + if (LIKELY(bufferEnd_ - bufferStart_ >= folly::kMaxVarintLength64)) { + const char* p = bufferStart_; uint64_t val; do { int64_t b; @@ -285,29 +274,30 @@ FOLLY_ALWAYS_INLINE uint64_t IntDecoder::readVuLong() { if (LIKELY(b >= 0)) { break; } else { - DWIO_RAISE(fmt::format( + VELOX_FAIL( "Invalid encoding: likely corrupt data. 
bytes remaining: {} , useVInts: {}, numBytes: {}, Input Stream Name: {}, byte: {}, val: {}", - bufferEnd - bufferStart, - useVInts, - numBytes, - inputStream->getName(), + bufferEnd_ - bufferStart_, + useVInts_, + numBytes_, + inputStream_->getName(), b, - val)); + val); } } while (false); - bufferStart = p; + + bufferStart_ = p; return val; - } else { - int64_t result = 0; - int64_t offset = 0; - signed char ch; - do { - ch = readByte(); - result |= (ch & BASE_128_MASK) << offset; - offset += 7; - } while (ch < 0); - return result; } + + int64_t result = 0; + int64_t offset = 0; + signed char ch; + do { + ch = readByte(); + result |= (ch & BASE_128_MASK) << offset; + offset += 7; + } while (ch < 0); + return result; } template @@ -317,39 +307,41 @@ FOLLY_ALWAYS_INLINE int64_t IntDecoder::readVsLong() { template inline int64_t IntDecoder::readLongLE() { + VELOX_DCHECK_EQ(pendingSkip_, 0); int64_t result = 0; - if (bufferStart && bufferStart + sizeof(int64_t) <= bufferEnd) { - bufferStart += numBytes; - if (numBytes == 8) { - return *reinterpret_cast(bufferStart - 8); + if (bufferStart_ && bufferStart_ + sizeof(int64_t) <= bufferEnd_) { + bufferStart_ += numBytes_; + if (numBytes_ == 8) { + return *reinterpret_cast(bufferStart_ - 8); } - if (numBytes == 4) { + if (numBytes_ == 4) { if (isSigned) { - return *reinterpret_cast(bufferStart - 4); + return *reinterpret_cast(bufferStart_ - 4); } - return *reinterpret_cast(bufferStart - 4); + return *reinterpret_cast(bufferStart_ - 4); } if (isSigned) { - return *reinterpret_cast(bufferStart - 2); + return *reinterpret_cast(bufferStart_ - 2); } - return *reinterpret_cast(bufferStart - 2); + return *reinterpret_cast(bufferStart_ - 2); } + char b; int64_t offset = 0; - for (uint32_t i = 0; i < numBytes; ++i) { + for (uint32_t i = 0; i < numBytes_; ++i) { b = readByte(); result |= (b & BASE_256_MASK) << offset; offset += 8; } - if (isSigned && numBytes < 8) { - if (numBytes == 2) { + if (isSigned && numBytes_ < 8) { + if (numBytes_ == 2) { return static_cast(result); } - if (numBytes == 4) { + if (numBytes_ == 4) { return static_cast(result); } - DCHECK(false) << "Bad width for signed fixed width: " << numBytes; + VELOX_DCHECK(false, "Bad width for signed fixed width: {}", numBytes_); } return result; } @@ -357,19 +349,22 @@ inline int64_t IntDecoder::readLongLE() { template template inline cppType IntDecoder::readLittleEndianFromBigEndian() { + VELOX_DCHECK_EQ(pendingSkip_, 0); + cppType bigEndianValue = 0; // Input is in Big Endian layout of size numBytes. - if (bufferStart && bufferStart + sizeof(int64_t) <= bufferEnd) { - bufferStart += numBytes; - auto valueOffset = bufferStart - numBytes; + if (bufferStart_ && (bufferStart_ + sizeof(int64_t) <= bufferEnd_)) { + bufferStart_ += numBytes_; + const auto valueOffset = bufferStart_ - numBytes_; // Use first byte to initialize bigEndianValue. bigEndianValue = *(reinterpret_cast(valueOffset)) >= 0 ? 0 : -1; // Copy numBytes input to the bigEndianValue. - memcpy( - reinterpret_cast(&bigEndianValue) + (sizeof(cppType) - numBytes), + ::memcpy( + reinterpret_cast(&bigEndianValue) + + (sizeof(cppType) - numBytes_), reinterpret_cast(valueOffset), - numBytes); + numBytes_); // Convert bigEndianValue to little endian value and return. 
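The buffered branch above is compact but subtle: it sign-fills the destination from the first (most significant) big-endian byte, copies the `numBytes_` payload into the top of the little-endian representation, and then fixes everything with a single byte swap. A self-contained 64-bit restatement of the technique (assumes a little-endian host and `__builtin_bswap64`; not the templated Velox code):

```cpp
#include <cstdint>
#include <cstring>

int64_t decodeSignedBigEndian(const char* input, uint32_t numBytes) {
  // Big-endian stores the sign bit in the first byte: pre-fill with 0 or -1
  // so the unused low bytes become the sign extension after the swap.
  int64_t value = static_cast<int8_t>(input[0]) >= 0 ? 0 : -1;
  // Land the payload in the *high* bytes of the little-endian image...
  std::memcpy(
      reinterpret_cast<char*>(&value) + (sizeof(value) - numBytes),
      input,
      numBytes);
  // ...so one bswap moves payload and sign fill into their final places.
  return static_cast<int64_t>(__builtin_bswap64(static_cast<uint64_t>(value)));
}

// decodeSignedBigEndian("\xff\xfe", 2) == -2
// decodeSignedBigEndian("\x01\x02", 2) == 258
```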
if constexpr (sizeof(cppType) == 16) { return bits::builtin_bswap128(bigEndianValue); @@ -377,11 +372,12 @@ inline cppType IntDecoder::readLittleEndianFromBigEndian() { return __builtin_bswap64(bigEndianValue); } } + char b; cppType offset = 0; cppType numBytesBigEndian = 0; // Read numBytes input into numBytesBigEndian. - for (uint32_t i = 0; i < numBytes; ++i) { + for (uint32_t i = 0; i < numBytes_; ++i) { b = readByte(); if constexpr (sizeof(cppType) == 16) { numBytesBigEndian |= (b & INT128_BASE_256_MASK) << offset; @@ -394,10 +390,10 @@ inline cppType IntDecoder::readLittleEndianFromBigEndian() { bigEndianValue = (reinterpret_cast(&numBytesBigEndian)[0]) >= 0 ? 0 : -1; // Copy numBytes input to the bigEndianValue. - memcpy( - reinterpret_cast(&bigEndianValue) + (sizeof(cppType) - numBytes), + ::memcpy( + reinterpret_cast(&bigEndianValue) + (sizeof(cppType) - numBytes_), reinterpret_cast(&numBytesBigEndian), - numBytes); + numBytes_); // Convert bigEndianValue to little endian value and return. if constexpr (sizeof(cppType) == 16) { return bits::builtin_bswap128(bigEndianValue); @@ -413,6 +409,8 @@ inline int128_t IntDecoder::readVsHugeInt() { template inline uint128_t IntDecoder::readVuHugeInt() { + VELOX_DCHECK_EQ(pendingSkip_, 0); + uint128_t value = 0; uint128_t work; uint32_t offset = 0; @@ -433,10 +431,10 @@ inline uint128_t IntDecoder::readVuHugeInt() { template template inline T IntDecoder::readInt() { - if (useVInts) { + if (useVInts_) { return readVInt(); } - if (bigEndian) { + if (bigEndian_) { return readLittleEndianFromBigEndian(); } else { if constexpr (std::is_same_v) { @@ -466,9 +464,7 @@ inline T IntDecoder::readVInt() { template <> template <> -inline void IntDecoder::bulkRead( - uint64_t /*size*/, - double* FOLLY_NONNULL /*result*/) { +inline void IntDecoder::bulkRead(uint64_t /*size*/, double* /*result*/) { VELOX_UNREACHABLE(); } @@ -476,16 +472,14 @@ template <> template <> inline void IntDecoder::bulkReadRows( RowSet /*rows*/, - double* FOLLY_NONNULL /*result*/, + double* /*result*/, int32_t /*initialRow*/) { VELOX_UNREACHABLE(); } template <> template <> -inline void IntDecoder::bulkRead( - uint64_t /*size*/, - double* FOLLY_NONNULL /*result*/) { +inline void IntDecoder::bulkRead(uint64_t /*size*/, double* /*result*/) { VELOX_UNREACHABLE(); } @@ -493,16 +487,14 @@ template <> template <> inline void IntDecoder::bulkReadRows( RowSet /*rows*/, - double* FOLLY_NONNULL /*result*/, + double* /*result*/, int32_t /*initialRow*/) { VELOX_UNREACHABLE(); } template <> template <> -inline void IntDecoder::bulkRead( - uint64_t /*size*/, - float* FOLLY_NONNULL /*result*/) { +inline void IntDecoder::bulkRead(uint64_t /*size*/, float* /*result*/) { VELOX_UNREACHABLE(); } @@ -510,16 +502,14 @@ template <> template <> inline void IntDecoder::bulkReadRows( RowSet /*rows*/, - float* FOLLY_NONNULL /*result*/, + float* /*result*/, int32_t /*initialRow*/) { VELOX_UNREACHABLE(); } template <> template <> -inline void IntDecoder::bulkRead( - uint64_t /*size*/, - float* FOLLY_NONNULL /*result*/) { +inline void IntDecoder::bulkRead(uint64_t /*size*/, float* /*result*/) { VELOX_UNREACHABLE(); } @@ -527,7 +517,7 @@ template <> template <> inline void IntDecoder::bulkReadRows( RowSet /*rows*/, - float* FOLLY_NONNULL /*result*/, + float* /*result*/, int32_t /*initialRow*/) { VELOX_UNREACHABLE(); } @@ -536,7 +526,7 @@ template <> template <> inline void IntDecoder::bulkRead( uint64_t /*size*/, - int128_t* FOLLY_NONNULL /*result*/) { + int128_t* /*result*/) { VELOX_UNREACHABLE(); } @@ -544,7 
+534,7 @@ template <> template <> inline void IntDecoder::bulkReadRows( RowSet /*rows*/, - int128_t* FOLLY_NONNULL /*result*/, + int128_t* /*result*/, int32_t /*initialRow*/) { VELOX_UNREACHABLE(); } @@ -553,7 +543,7 @@ template <> template <> inline void IntDecoder::bulkRead( uint64_t /*size*/, - int128_t* FOLLY_NONNULL /*result*/) { + int128_t* /*result*/) { VELOX_UNREACHABLE(); } @@ -561,7 +551,7 @@ template <> template <> inline void IntDecoder::bulkReadRows( RowSet /*rows*/, - int128_t* FOLLY_NONNULL /*result*/, + int128_t* /*result*/, int32_t /*initialRow*/) { VELOX_UNREACHABLE(); } diff --git a/velox/dwio/common/MeasureTime.h b/velox/dwio/common/MeasureTime.h new file mode 100644 index 0000000000000..c4eaf1281f379 --- /dev/null +++ b/velox/dwio/common/MeasureTime.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace facebook { +namespace velox { +namespace dwio { +namespace common { + +class MeasureTime { + public: + explicit MeasureTime( + const std::function& + callback) + : callback_{callback}, + startTime_{std::chrono::high_resolution_clock::now()} {} + + MeasureTime(const MeasureTime&) = delete; + MeasureTime(MeasureTime&&) = delete; + MeasureTime& operator=(const MeasureTime&) = delete; + MeasureTime& operator=(MeasureTime&& other) = delete; + + ~MeasureTime() { + callback_(std::chrono::high_resolution_clock::now() - startTime_); + } + + private: + const std::function& + callback_; + const std::chrono::time_point startTime_; +}; + +// Make sure you don't pass a lambda to this function, because that will cause a +// std::function to be created on the fly (implicitly), and when we return from +// this function that std::function won't exist anymore. So when MeasureTime is +// destroyed, it will try to access a non-existing std::function. 
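The lifetime warning above is worth a concrete illustration, since `MeasureTime` stores only a `const` reference to the callback. A simplified stand-in follows; the exact Velox signatures are elided in this patch's rendering (the `std::function` template arguments were lost), so the duration and class names here are assumptions:

```cpp
#include <chrono>
#include <functional>
#include <optional>

using Callback = std::function<void(std::chrono::nanoseconds)>;

class TimerGuard { // stand-in for MeasureTime
 public:
  explicit TimerGuard(const Callback& callback)
      : callback_{callback}, start_{std::chrono::steady_clock::now()} {}

  ~TimerGuard() {
    callback_(std::chrono::duration_cast<std::chrono::nanoseconds>(
        std::chrono::steady_clock::now() - start_));
  }

 private:
  const Callback& callback_; // reference only: the caller must keep it alive
  const std::chrono::steady_clock::time_point start_;
};

std::optional<TimerGuard> makeTimerIfCallback(const Callback& callback) {
  if (callback) {
    return std::make_optional<TimerGuard>(callback);
  }
  return std::nullopt;
}

void ok(const Callback& named) {
  auto guard = makeTimerIfCallback(named); // fine: 'named' outlives the guard
}

// Dangling: makeTimerIfCallback([](auto) {}) would wrap the lambda in a
// *temporary* std::function that dies at the end of the statement, so the
// guard's destructor would invoke an already-destroyed object.
```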
+inline std::optional measureTimeIfCallback( + const std::function& + callback) { + if (callback) { + return std::make_optional(callback); + } + return std::nullopt; +} + +} // namespace common +} // namespace dwio +} // namespace velox +} // namespace facebook diff --git a/velox/dwio/common/MetadataFilter.cpp b/velox/dwio/common/MetadataFilter.cpp index 6b5d943d92536..62bf79107e10a 100644 --- a/velox/dwio/common/MetadataFilter.cpp +++ b/velox/dwio/common/MetadataFilter.cpp @@ -149,13 +149,6 @@ struct MetadataFilter::OrNode : Node { namespace { -const core::FieldAccessTypedExpr* asField( - const core::ITypedExpr* expr, - int index) { - return dynamic_cast( - expr->inputs()[index].get()); -} - const core::CallTypedExpr* asCall(const core::ITypedExpr* expr) { return dynamic_cast(expr); } @@ -221,6 +214,7 @@ void MetadataFilter::eval( if (!root_) { return; } + LeafResults leafResults; for (auto& [leaf, result] : leafNodeResults) { VELOX_CHECK_EQ( @@ -233,7 +227,7 @@ void MetadataFilter::eval( "Duplicate results: {}", leaf->field().toString()); } - auto bitCount = finalResult.size() * 64; + const auto bitCount = finalResult.size() * 64; if (auto* combined = root_->eval(leafResults, bitCount)) { bits::orBits(finalResult.data(), combined, 0, bitCount); } diff --git a/velox/dwio/common/MetadataFilter.h b/velox/dwio/common/MetadataFilter.h index b44ae69cbd554..62b604b14407d 100644 --- a/velox/dwio/common/MetadataFilter.h +++ b/velox/dwio/common/MetadataFilter.h @@ -49,9 +49,9 @@ class MetadataFilter { std::string toString() const; private: - class Node; - class AndNode; - class OrNode; + struct Node; + struct AndNode; + struct OrNode; std::shared_ptr root_; }; diff --git a/velox/dwio/common/MetricsLog.h b/velox/dwio/common/MetricsLog.h index 3c1d9f26594b2..555e92f24d63a 100644 --- a/velox/dwio/common/MetricsLog.h +++ b/velox/dwio/common/MetricsLog.h @@ -72,7 +72,7 @@ class MetricsLog { }; // read path logging methods - virtual void logRead(const ReadMetrics& metrics) const {}; + virtual void logRead(const ReadMetrics& metrics) const {} virtual void logColumnFilter( const ColumnFilter& filter, diff --git a/velox/dwio/common/Mutation.h b/velox/dwio/common/Mutation.h index a3181df0e11a2..de5ff469c71a7 100644 --- a/velox/dwio/common/Mutation.h +++ b/velox/dwio/common/Mutation.h @@ -16,6 +16,8 @@ #pragma once +#include "velox/common/base/RandomUtil.h" + #include namespace facebook::velox::dwio::common { @@ -23,6 +25,12 @@ namespace facebook::velox::dwio::common { struct Mutation { /// Bit masks for row numbers to be deleted. const uint64_t* deletedRows = nullptr; + + random::RandomSkipTracker* randomSkip = nullptr; }; +inline bool hasDeletion(const Mutation* mutation) { + return mutation && (mutation->deletedRows || mutation->randomSkip); +} + } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/OnDemandUnitLoader.cpp b/velox/dwio/common/OnDemandUnitLoader.cpp new file mode 100644 index 0000000000000..d4ef4f0a5ef29 --- /dev/null +++ b/velox/dwio/common/OnDemandUnitLoader.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/dwio/common/OnDemandUnitLoader.h" + +#include + +#include "velox/common/base/Exceptions.h" +#include "velox/dwio/common/MeasureTime.h" +#include "velox/dwio/common/UnitLoaderTools.h" + +using facebook::velox::dwio::common::measureTimeIfCallback; + +namespace facebook::velox::dwio::common { + +namespace { + +class OnDemandUnitLoader : public UnitLoader { + public: + OnDemandUnitLoader( + std::vector> loadUnits, + std::function + blockedOnIoCallback) + : loadUnits_{std::move(loadUnits)}, + blockedOnIoCallback_{std::move(blockedOnIoCallback)} {} + + ~OnDemandUnitLoader() override = default; + + LoadUnit& getLoadedUnit(uint32_t unit) override { + VELOX_CHECK_LT(unit, loadUnits_.size(), "Unit out of range"); + + if (loadedUnit_.has_value()) { + if (loadedUnit_.value() == unit) { + return *loadUnits_[unit]; + } + + loadUnits_[*loadedUnit_]->unload(); + loadedUnit_.reset(); + } + + { + auto measure = measureTimeIfCallback(blockedOnIoCallback_); + loadUnits_[unit]->load(); + } + loadedUnit_ = unit; + + return *loadUnits_[unit]; + } + + void onRead(uint32_t unit, uint64_t rowOffsetInUnit, uint64_t /* rowCount */) + override { + VELOX_CHECK_LT(unit, loadUnits_.size(), "Unit out of range"); + VELOX_CHECK_LT( + rowOffsetInUnit, loadUnits_[unit]->getNumRows(), "Row out of range"); + } + + void onSeek(uint32_t unit, uint64_t rowOffsetInUnit) override { + VELOX_CHECK_LT(unit, loadUnits_.size(), "Unit out of range"); + VELOX_CHECK_LE( + rowOffsetInUnit, loadUnits_[unit]->getNumRows(), "Row out of range"); + } + + private: + const std::vector> loadUnits_; + const std::function + blockedOnIoCallback_; + std::optional loadedUnit_; +}; + +} // namespace + +std::unique_ptr OnDemandUnitLoaderFactory::create( + std::vector> loadUnits, + uint64_t rowsToSkip) { + const auto totalRows = std::accumulate( + loadUnits.cbegin(), loadUnits.cend(), 0UL, [](uint64_t sum, auto& unit) { + return sum + unit->getNumRows(); + }); + VELOX_CHECK_LE( + rowsToSkip, + totalRows, + "Can only skip up to the past-the-end row of the file."); + return std::make_unique( + std::move(loadUnits), blockedOnIoCallback_); +} + +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/OnDemandUnitLoader.h b/velox/dwio/common/OnDemandUnitLoader.h new file mode 100644 index 0000000000000..e2d6d4ab3efe5 --- /dev/null +++ b/velox/dwio/common/OnDemandUnitLoader.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
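To make the loading policy above concrete: `getLoadedUnit()` keeps at most one unit resident, re-requesting the resident unit is free, and switching units unloads the old one before the (timed) load of the new one. A standalone model of that policy, not the Velox classes; member names are illustrative:

```cpp
#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

struct Unit {
  void load() { ++loads; }     // the real unit would do IO/decoding here
  void unload() { ++unloads; } // ...and release buffers here
  int loads{0};
  int unloads{0};
};

class OnDemandModel {
 public:
  explicit OnDemandModel(size_t numUnits) : units_(numUnits) {}

  Unit& get(uint32_t unit) {
    if (resident_ && *resident_ == unit) {
      return units_[unit]; // already loaded: no IO
    }
    if (resident_) {
      units_[*resident_].unload(); // evict before loading the next one
    }
    units_[unit].load(); // the real loader times this via measureTimeIfCallback
    resident_ = unit;
    return units_[unit];
  }

 private:
  std::vector<Unit> units_;
  std::optional<uint32_t> resident_;
};

int main() {
  OnDemandModel loader(2);
  loader.get(0);
  loader.get(0); // cached, no reload
  loader.get(1); // unloads unit 0, loads unit 1
  assert(loader.get(1).loads == 1);
}
```

Keeping a single resident unit bounds peak memory to one stripe-sized unit at the cost of strictly serial IO, which is the trade-off this "on demand" factory appears designed around.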
+ */ + +#pragma once + +#include +#include + +#include "velox/dwio/common/UnitLoader.h" + +namespace facebook::velox::dwio::common { + +class OnDemandUnitLoaderFactory + : public velox::dwio::common::UnitLoaderFactory { + public: + explicit OnDemandUnitLoaderFactory( + std::function + blockedOnIoCallback) + : blockedOnIoCallback_{std::move(blockedOnIoCallback)} {} + ~OnDemandUnitLoaderFactory() override = default; + + std::unique_ptr create( + std::vector> loadUnits, + uint64_t rowsToSkip) override; + + private: + std::function + blockedOnIoCallback_; +}; + +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/Options.cpp b/velox/dwio/common/Options.cpp index 32672da0d88d9..faab7802a42e4 100644 --- a/velox/dwio/common/Options.cpp +++ b/velox/dwio/common/Options.cpp @@ -16,12 +16,9 @@ #include "velox/dwio/common/Options.h" -namespace facebook { -namespace velox { -namespace dwio { -namespace common { +namespace facebook::velox::dwio::common { -FileFormat toFileFormat(std::string s) { +FileFormat toFileFormat(std::string_view s) { if (s == "dwrf") { return FileFormat::DWRF; } else if (s == "rc") { @@ -36,15 +33,15 @@ FileFormat toFileFormat(std::string s) { return FileFormat::JSON; } else if (s == "parquet") { return FileFormat::PARQUET; - } else if (s == "alpha") { - return FileFormat::ALPHA; + } else if (s == "nimble" || s == "alpha") { + return FileFormat::NIMBLE; } else if (s == "orc") { return FileFormat::ORC; } return FileFormat::UNKNOWN; } -std::string toString(FileFormat fmt) { +std::string_view toString(FileFormat fmt) { switch (fmt) { case FileFormat::DWRF: return "dwrf"; @@ -60,8 +57,8 @@ std::string toString(FileFormat fmt) { return "json"; case FileFormat::PARQUET: return "parquet"; - case FileFormat::ALPHA: - return "alpha"; + case FileFormat::NIMBLE: + return "nimble"; case FileFormat::ORC: return "orc"; default: @@ -69,7 +66,4 @@ std::string toString(FileFormat fmt) { } } -} // namespace common -} // namespace dwio -} // namespace velox -} // namespace facebook +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/Options.h b/velox/dwio/common/Options.h index 0a66bd7b5a9bf..0a4a9dbcff36e 100644 --- a/velox/dwio/common/Options.h +++ b/velox/dwio/common/Options.h @@ -20,7 +20,10 @@ #include #include +#include "velox/common/base/RandomUtil.h" +#include "velox/common/base/SpillConfig.h" #include "velox/common/compression/Compression.h" +#include "velox/common/config/Config.h" #include "velox/common/io/Options.h" #include "velox/common/memory/Memory.h" #include "velox/dwio/common/ColumnSelector.h" @@ -29,12 +32,12 @@ #include "velox/dwio/common/FlushPolicy.h" #include "velox/dwio/common/InputStream.h" #include "velox/dwio/common/ScanSpec.h" +#include "velox/dwio/common/UnitLoader.h" #include "velox/dwio/common/encryption/Encryption.h" +#include "velox/type/Timestamp.h" +#include "velox/type/tz/TimeZoneMap.h" -namespace facebook { -namespace velox { -namespace dwio { -namespace common { +namespace facebook::velox::dwio::common { enum class FileFormat { UNKNOWN = 0, @@ -45,12 +48,12 @@ enum class FileFormat { TEXT = 5, JSON = 6, PARQUET = 7, - ALPHA = 8, + NIMBLE = 8, ORC = 9, }; -FileFormat toFileFormat(std::string s); -std::string toString(FileFormat fmt); +FileFormat toFileFormat(std::string_view s); +std::string_view toString(FileFormat fmt); FOLLY_ALWAYS_INLINE std::ostream& operator<<( std::ostream& output, @@ -59,9 +62,7 @@ FOLLY_ALWAYS_INLINE std::ostream& operator<<( return output; } -/** - * Formatting options for 
serialization. - */ +/// Formatting options for serialization. enum class SerDeSeparator { FIELD_DELIM = 0, COLLECTION_DELIM = 1, @@ -79,6 +80,7 @@ class SerDeOptions { inline static const std::string kFieldDelim{"field.delim"}; inline static const std::string kCollectionDelim{"collection.delim"}; inline static const std::string kMapKeyDelim{"mapkey.delim"}; + inline static const std::string kEscapeChar{"escape.delim"}; explicit SerDeOptions( uint8_t fieldDelim = '\1', @@ -95,177 +97,156 @@ class SerDeOptions { }; struct TableParameter { + /// If present in the table parameters, the option is passed to the row reader + /// to instruct it to skip the number of rows from the current position. Used + /// to skip the column header row(s). static constexpr const char* kSkipHeaderLineCount = "skip.header.line.count"; + /// If present in the table parameters, the option overrides the default value + /// of the SerDeOptions::nullString. It causes any field read from the file + /// (usually of the TEXT format) to be considered NULL if it is equal to this + /// string. + static constexpr const char* kSerializationNullFormat = + "serialization.null.format"; }; -/** - * Options for creating a RowReader. - */ -class RowReaderOptions { - private: - uint64_t dataStart; - uint64_t dataLength; - bool preloadStripe; - bool projectSelectedType; - bool returnFlatVector_ = false; - ErrorTolerance errorTolerance_; - std::shared_ptr selector_; - std::shared_ptr scanSpec_ = nullptr; - std::shared_ptr metadataFilter_; - // Node id for map column to a list of keys to be projected as a struct. - std::unordered_map> flatmapNodeIdAsStruct_; - // Optional executors to enable internal reader parallelism. - // 'decodingExecutor' allow parallelising the vector decoding process. - // 'ioExecutor' enables parallelism when performing file system read - // operations. - std::shared_ptr decodingExecutor_; - bool appendRowNumberColumn_ = false; - // Function to populate metrics related to feature projection stats - // in Koski. This gets fired in FlatMapColumnReader. - // This is a bit of a hack as there is (by design) no good way - // To propogate information from column reader to Koski - std::function - keySelectionCallback_; - bool eagerFirstStripeLoad = true; - uint64_t skipRows_ = 0; +struct RowNumberColumnInfo { + column_index_t insertPosition; + std::string name; + // This flag is used to distinguish the explicit and implicit use cases. In + // explicit case, row index column is declared in the output type or used in + // subfield filters or remaining filter. In implicit case, it's not declared + // in the output columns but only in the split reader. + bool isExplicit; +}; + +class FormatSpecificOptions { + public: + virtual ~FormatSpecificOptions() = default; +}; +/// Options for creating a RowReader. +class RowReaderOptions { public: RowReaderOptions() noexcept - : dataStart(0), - dataLength(std::numeric_limits::max()), - preloadStripe(false), - projectSelectedType(false) {} - - /** - * For files that have structs as the top-level object, select the fields - * to read. The first field is 0, the second 1, and so on. By default, - * all columns are read. This option clears any previous setting of - * the selected columns. - * @param include a list of fields to read - * @return this - */ + : dataStart_(0), + dataLength_(std::numeric_limits::max()), + preloadStripe_(false), + projectSelectedType_(false) {} + + /// For files that have structs as the top-level object, select the fields + /// to read. 
The first field is 0, the second 1, and so on. By default, + /// all columns are read. This option clears any previous setting of + /// the selected columns. + /// @param include a list of fields to read + /// @return this RowReaderOptions& select(const std::shared_ptr& selector) { selector_ = selector; + if (selector) { + VELOX_CHECK_NULL(requestedType_); + requestedType_ = selector->getSchema(); + } return *this; } - /** - * Set the section of the file to process. - * @param offset the starting byte offset - * @param length the number of bytes to read - * @return this - */ + /// Sets the section of the file to process. + /// @param offset the starting byte offset + /// @param length the number of bytes to read + /// @return this RowReaderOptions& range(uint64_t offset, uint64_t length) { - dataStart = offset; - dataLength = length; + dataStart_ = offset; + dataLength_ = length; return *this; } - /** - * Get the list of selected field or type ids to read. - */ - const std::shared_ptr& getSelector() const { + /// Gets the list of selected field or type ids to read. + const std::shared_ptr& selector() const { return selector_; } - /** - * Get the start of the range for the data being processed. - * @return if not set, return 0 - */ - uint64_t getOffset() const { - return dataStart; + /// Gets the start of the range for the data being processed. + /// @return if not set, return 0 + uint64_t offset() const { + return dataStart_; } - /** - * Get the length of the range for the data being processed. - * @return if not set, return the maximum unsigned long. - */ - uint64_t getLength() const { - return dataLength; + /// Gets the length of the range for the data being processed. + /// @return if not set, return the maximum unsigned long. + uint64_t length() const { + return dataLength_; } - /** - * Get the limit of the range (lowest offset not in the range). - * @return if not set, return the maximum unsigned long. - */ - uint64_t getLimit() const { - return ((std::numeric_limits::max() - dataStart) > dataLength) - ? (dataStart + dataLength) + /// Gets the limit of the range (lowest offset not in the range). + /// @return if not set, return the maximum unsigned long. + uint64_t limit() const { + return ((std::numeric_limits::max() - dataStart_) > dataLength_) + ? (dataStart_ + dataLength_) : std::numeric_limits::max(); } - /** - * Request that stripes be pre-loaded. - */ + /// Requests that stripes be pre-loaded. void setPreloadStripe(bool preload) { - preloadStripe = preload; + preloadStripe_ = preload; } - /** - * Are stripes to be pre-loaded? - */ - bool getPreloadStripe() const { - return preloadStripe; + /// Are stripes to be pre-loaded? + bool preloadStripe() const { + return preloadStripe_; } - /* - * Will load the first stripe on RowReader creation, if true. - * This behavior is already happening in DWRF, but isn't desired for some use - * cases. So this flag allows us to turn it off. - */ + /// Will load the first stripe on RowReader creation, if true. + /// This behavior is already happening in DWRF, but isn't desired for some use + /// cases. So this flag allows us to turn it off. void setEagerFirstStripeLoad(bool load) { - eagerFirstStripeLoad = load; + eagerFirstStripeLoad_ = load; } - /* - * Will load the first stripe on RowReader creation, if true. - * This behavior is already happening in DWRF, but isn't desired for some use - * cases. So this flag allows us to turn it off. 
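Since the accessors above were renamed from `getX()` to `x()` while the fluent setters kept returning `*this`, a short hypothetical caller may help; only methods shown in this header are used, and the selector is assumed to be built elsewhere:

```cpp
#include <memory>

// facebook::velox::dwio::common is assumed to be in scope.
void configureRowReader(
    RowReaderOptions& options,
    const std::shared_ptr<ColumnSelector>& selector) {
  options.select(selector) // also captures selector->getSchema() as the
                           // requested type, per select() above
      .range(0, 64 << 20); // only process the first 64MB of the file
  options.setPreloadStripe(true);
  options.setEagerFirstStripeLoad(false); // defer the first stripe load
}
```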
- */ - bool getEagerFirstStripeLoad() const { - return eagerFirstStripeLoad; + /// Will load the first stripe on RowReader creation, if true. + /// This behavior is already happening in DWRF, but isn't desired for some use + /// cases. So this flag allows us to turn it off. + bool eagerFirstStripeLoad() const { + return eagerFirstStripeLoad_; } - // For flat map, return flat vector representation - bool getReturnFlatVector() const { + /// For flat map, return flat vector representation + bool returnFlatVector() const { return returnFlatVector_; } - // For flat map, request that flat vector representation is used + /// For flat map, request that flat vector representation is used void setReturnFlatVector(bool value) { returnFlatVector_ = value; } - /** - * Request that the selected type be projected. - */ - void setProjectSelectedType(bool vProjectSelectedType) { - projectSelectedType = vProjectSelectedType; + /// Requests that the selected type be projected. + void setProjectSelectedType(bool value) { + projectSelectedType_ = value; } - /** - * Is the selected type to be projected? - */ - bool getProjectSelectedType() const { - return projectSelectedType; + /// Is the selected type to be projected? + bool projectSelectedType() const { + return projectSelectedType_; } - /** - * set RowReader error tolerance. - */ + /// Set RowReader error tolerance. void setErrorTolerance(const ErrorTolerance& errorTolerance) { errorTolerance_ = errorTolerance; } - /** - * get RowReader error tolerance. - */ - const ErrorTolerance& getErrorTolerance() const { + /// Get RowReader error tolerance. + const ErrorTolerance& errorTolerance() const { return errorTolerance_; } - const std::shared_ptr& getScanSpec() const { + const RowTypePtr& requestedType() const { + return requestedType_; + } + + void setRequestedType(RowTypePtr requestedType) { + VELOX_CHECK_NULL(selector_); + requestedType_ = std::move(requestedType); + } + + const std::shared_ptr& scanSpec() const { return scanSpec_; } @@ -273,8 +254,7 @@ class RowReaderOptions { scanSpec_ = std::move(scanSpec); } - const std::shared_ptr& getMetadataFilter() - const { + const std::shared_ptr& metadataFilter() const { return metadataFilter_; } @@ -296,7 +276,7 @@ class RowReaderOptions { } const std::unordered_map>& - getMapColumnIdAsStruct() const { + mapColumnIdAsStruct() const { return flatmapNodeIdAsStruct_; } @@ -304,18 +284,17 @@ class RowReaderOptions { decodingExecutor_ = executor; } - /* - * Set to true, if you want to add a new column to the results containing the - * row numbers. These row numbers are relative to the beginning of file (0 as - * first row) and does not affected by filtering or deletion during the read - * (it always counts all rows in the file). 
- */ - void setAppendRowNumberColumn(bool value) { - appendRowNumberColumn_ = value; + void setDecodingParallelismFactor(size_t factor) { + decodingParallelismFactor_ = factor; } - bool getAppendRowNumberColumn() const { - return appendRowNumberColumn_; + void setRowNumberColumnInfo( + std::optional rowNumberColumnInfo) { + rowNumberColumnInfo_ = std::move(rowNumberColumnInfo); + } + + const std::optional& rowNumberColumnInfo() const { + return rowNumberColumnInfo_; } void setKeySelectionCallback( @@ -327,114 +306,168 @@ class RowReaderOptions { const std::function< void(facebook::velox::dwio::common::flatmap::FlatMapKeySelectionStats)> - getKeySelectionCallback() const { + keySelectionCallback() const { return keySelectionCallback_; } + void setBlockedOnIoCallback( + std::function + blockedOnIoCallback) { + blockedOnIoCallback_ = std::move(blockedOnIoCallback); + } + + const std::function + blockedOnIoCallback() const { + return blockedOnIoCallback_; + } + + void setDecodingTimeCallback( + std::function + decodingTime) { + decodingTimeCallback_ = std::move(decodingTime); + } + + std::function + decodingTimeCallback() const { + return decodingTimeCallback_; + } + + void setStripeCountCallback( + std::function stripeCountCallback) { + stripeCountCallback_ = std::move(stripeCountCallback); + } + + std::function stripeCountCallback() const { + return stripeCountCallback_; + } + void setSkipRows(uint64_t skipRows) { skipRows_ = skipRows; } - bool getSkipRows() const { + uint64_t skipRows() const { return skipRows_; } - const std::shared_ptr& getDecodingExecutor() const { + void setUnitLoaderFactory( + std::shared_ptr unitLoaderFactory) { + unitLoaderFactory_ = std::move(unitLoaderFactory); + } + + const std::shared_ptr& unitLoaderFactory() const { + return unitLoaderFactory_; + } + + const std::shared_ptr& decodingExecutor() const { return decodingExecutor_; } -}; -/** - * Options for creating a Reader. - */ -class ReaderOptions : public io::ReaderOptions { + size_t decodingParallelismFactor() const { + return decodingParallelismFactor_; + } + + TimestampPrecision timestampPrecision() const { + return timestampPrecision_; + } + + void setTimestampPrecision(TimestampPrecision precision) { + timestampPrecision_ = precision; + } + + const std::shared_ptr& formatSpecificOptions() const { + return formatSpecificOptions_; + } + + void setFormatSpecificOptions( + std::shared_ptr options) { + formatSpecificOptions_ = std::move(options); + } + private: - uint64_t tailLocation; - FileFormat fileFormat; - RowTypePtr fileSchema; - SerDeOptions serDeOptions; - std::shared_ptr decrypterFactory_; - uint64_t directorySizeGuess{kDefaultDirectorySizeGuess}; - uint64_t filePreloadThreshold{kDefaultFilePreloadThreshold}; - bool fileColumnNamesReadAsLowerCase{false}; - bool useColumnNamesForColumnMapping_{false}; - std::shared_ptr ioExecutor_; + uint64_t dataStart_; + uint64_t dataLength_; + bool preloadStripe_; + bool projectSelectedType_; + bool returnFlatVector_ = false; + ErrorTolerance errorTolerance_; + std::shared_ptr selector_; + RowTypePtr requestedType_; + std::shared_ptr scanSpec_{nullptr}; + std::shared_ptr metadataFilter_; + // Node id for map column to a list of keys to be projected as a struct. + std::unordered_map> flatmapNodeIdAsStruct_; + // Optional executors to enable internal reader parallelism. + // 'decodingExecutor' allow parallelising the vector decoding process. + // 'ioExecutor' enables parallelism when performing file system read + // operations. 
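One plausible wiring of the callback setters above (the `std::function` template arguments were lost in this patch's rendering, so the `std::chrono::nanoseconds` parameter below is an assumption). Note that, unlike `measureTimeIfCallback`, the options object stores these callbacks by value, so passing a lambda here is safe:

```cpp
#include <atomic>
#include <chrono>
#include <cstdint>

struct ScanStats {
  std::atomic<int64_t> blockedOnIoNs{0};
  std::atomic<int64_t> decodingNs{0};
};

// facebook::velox::dwio::common is assumed to be in scope; 'stats' must
// outlive the row reader since the lambdas capture it by reference.
void attachStats(RowReaderOptions& options, ScanStats& stats) {
  options.setBlockedOnIoCallback([&stats](std::chrono::nanoseconds d) {
    stats.blockedOnIoNs += d.count(); // fired around each unit load
  });
  options.setDecodingTimeCallback([&stats](std::chrono::nanoseconds d) {
    stats.decodingNs += d.count();
  });
}
```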
+ std::shared_ptr decodingExecutor_; + size_t decodingParallelismFactor_{0}; + std::optional rowNumberColumnInfo_{std::nullopt}; + // Function to populate metrics related to feature projection stats + // in Koski. This gets fired in FlatMapColumnReader. + // This is a bit of a hack as there is (by design) no good way + // To propogate information from column reader to Koski + std::function + keySelectionCallback_; + + // Function to track how much time we spend waiting on IO before reading rows + // (in dwrf row reader). todo: encapsulate this and keySelectionCallBack_ in a + // struct + std::function + blockedOnIoCallback_; + std::function + decodingTimeCallback_; + std::function stripeCountCallback_; + bool eagerFirstStripeLoad_{true}; + uint64_t skipRows_{0}; + + std::shared_ptr unitLoaderFactory_; + + TimestampPrecision timestampPrecision_ = TimestampPrecision::kMilliseconds; + + std::shared_ptr formatSpecificOptions_; +}; + +/// Options for creating a Reader. +class ReaderOptions : public io::ReaderOptions { public: - static constexpr uint64_t kDefaultDirectorySizeGuess = 1024 * 1024; // 1MB + static constexpr uint64_t kDefaultFooterEstimatedSize = 1024 * 1024; // 1MB static constexpr uint64_t kDefaultFilePreloadThreshold = 1024 * 1024 * 8; // 8MB explicit ReaderOptions(velox::memory::MemoryPool* pool) : io::ReaderOptions(pool), - tailLocation(std::numeric_limits::max()), - fileFormat(FileFormat::UNKNOWN), - fileSchema(nullptr) {} - - ReaderOptions& operator=(const ReaderOptions& other) { - io::ReaderOptions::operator=(other); - tailLocation = other.tailLocation; - fileFormat = other.fileFormat; - if (other.fileSchema != nullptr) { - fileSchema = other.getFileSchema(); - } else { - fileSchema = nullptr; - } - serDeOptions = other.serDeOptions; - decrypterFactory_ = other.decrypterFactory_; - directorySizeGuess = other.directorySizeGuess; - filePreloadThreshold = other.filePreloadThreshold; - fileColumnNamesReadAsLowerCase = other.fileColumnNamesReadAsLowerCase; - useColumnNamesForColumnMapping_ = other.useColumnNamesForColumnMapping_; - return *this; - } + tailLocation_(std::numeric_limits::max()), + fileFormat_(FileFormat::UNKNOWN), + fileSchema_(nullptr) {} - ReaderOptions(const ReaderOptions& other) - : io::ReaderOptions(other), - tailLocation(other.tailLocation), - fileFormat(other.fileFormat), - fileSchema(other.fileSchema), - serDeOptions(other.serDeOptions), - decrypterFactory_(other.decrypterFactory_), - directorySizeGuess(other.directorySizeGuess), - filePreloadThreshold(other.filePreloadThreshold), - fileColumnNamesReadAsLowerCase(other.fileColumnNamesReadAsLowerCase), - useColumnNamesForColumnMapping_(other.useColumnNamesForColumnMapping_) { - } - - /** - * Set the format of the file, such as "rc" or "dwrf". The - * default is "dwrf". - */ + /// Sets the format of the file, such as "rc" or "dwrf". The default is + /// "dwrf". ReaderOptions& setFileFormat(FileFormat format) { - fileFormat = format; + fileFormat_ = format; return *this; } - /** - * Set the schema of the file (a Type tree). - * For "dwrf" format, a default schema is derived from the file. - * For "rc" format, there is no default schema. - */ + /// Sets the schema of the file (a Type tree). For "dwrf" format, a default + /// schema is derived from the file. For "rc" format, there is no default + /// schema. 
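The `ReaderOptions` setters keep their fluent, return-`*this` style, so configuration still chains; a sketch using only setters visible in this diff (note `setFooterEstimatedSize` replaces the old `setDirectorySizeGuess`, and the literal sizes below simply restate the declared defaults):

```cpp
// facebook::velox::dwio::common is assumed to be in scope.
void configureDwrfReader(ReaderOptions& options) {
  options.setFileFormat(FileFormat::DWRF)
      .setFooterEstimatedSize(1 << 20)  // 1MB, kDefaultFooterEstimatedSize
      .setFilePreloadThreshold(8 << 20) // preload files smaller than 8MB
      .setFileColumnNamesReadAsLowerCase(true);
}
```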
ReaderOptions& setFileSchema(const RowTypePtr& schema) { - fileSchema = schema; + fileSchema_ = schema; return *this; } - /** - * Set the location of the tail as defined by the logical length of the - * file. - */ + /// Sets the location of the tail as defined by the logical length of the + /// file. ReaderOptions& setTailLocation(uint64_t offset) { - tailLocation = offset; + tailLocation_ = offset; return *this; } - /** - * Modify the serialization-deserialization options. - */ - ReaderOptions& setSerDeOptions(const SerDeOptions& sdo) { - serDeOptions = sdo; + /// Modifies the serialization-deserialization options. + ReaderOptions& setSerDeOptions(const SerDeOptions& serdeOpts) { + serDeOptions_ = serdeOpts; return *this; } @@ -444,18 +477,18 @@ class ReaderOptions : public io::ReaderOptions { return *this; } - ReaderOptions& setDirectorySizeGuess(uint64_t size) { - directorySizeGuess = size; + ReaderOptions& setFooterEstimatedSize(uint64_t size) { + footerEstimatedSize_ = size; return *this; } ReaderOptions& setFilePreloadThreshold(uint64_t threshold) { - filePreloadThreshold = threshold; + filePreloadThreshold_ = threshold; return *this; } ReaderOptions& setFileColumnNamesReadAsLowerCase(bool flag) { - fileColumnNamesReadAsLowerCase = flag; + fileColumnNamesReadAsLowerCase_ = flag; return *this; } @@ -469,70 +502,151 @@ class ReaderOptions : public io::ReaderOptions { return *this; } - /** - * Get the desired tail location. - * @return if not set, return the maximum long. - */ - uint64_t getTailLocation() const { - return tailLocation; + ReaderOptions& setSessionTimezone(const tz::TimeZone* sessionTimezone) { + sessionTimezone_ = sessionTimezone; + return *this; } - /** - * Get the file format. - */ - FileFormat getFileFormat() const { - return fileFormat; + /// Gets the desired tail location. + uint64_t tailLocation() const { + return tailLocation_; } - /** - * Get the file schema. - */ - const std::shared_ptr& getFileSchema() const { - return fileSchema; + /// Gets the file format. + FileFormat fileFormat() const { + return fileFormat_; } - SerDeOptions& getSerDeOptions() { - return serDeOptions; + /// Gets the file schema. 
+ const std::shared_ptr& fileSchema() const { + return fileSchema_; } - const SerDeOptions& getSerDeOptions() const { - return serDeOptions; + SerDeOptions& serDeOptions() { + return serDeOptions_; } - const std::shared_ptr getDecrypterFactory() - const { + const SerDeOptions& serDeOptions() const { + return serDeOptions_; + } + + const std::shared_ptr decrypterFactory() const { return decrypterFactory_; } - uint64_t getDirectorySizeGuess() const { - return directorySizeGuess; + uint64_t footerEstimatedSize() const { + return footerEstimatedSize_; } - uint64_t getFilePreloadThreshold() const { - return filePreloadThreshold; + uint64_t filePreloadThreshold() const { + return filePreloadThreshold_; } - const std::shared_ptr& getIOExecutor() const { + const std::shared_ptr& ioExecutor() const { return ioExecutor_; } - bool isFileColumnNamesReadAsLowerCase() const { - return fileColumnNamesReadAsLowerCase; + const tz::TimeZone* getSessionTimezone() const { + return sessionTimezone_; } - bool isUseColumnNamesForColumnMapping() const { + bool fileColumnNamesReadAsLowerCase() const { + return fileColumnNamesReadAsLowerCase_; + } + + bool useColumnNamesForColumnMapping() const { return useColumnNamesForColumnMapping_; } + + const std::shared_ptr& randomSkip() const { + return randomSkip_; + } + + void setRandomSkip(std::shared_ptr randomSkip) { + randomSkip_ = std::move(randomSkip); + } + + bool noCacheRetention() const { + return noCacheRetention_; + } + + void setNoCacheRetention(bool noCacheRetention) { + noCacheRetention_ = noCacheRetention; + } + + const std::shared_ptr& scanSpec() const { + return scanSpec_; + } + + void setScanSpec(std::shared_ptr scanSpec) { + scanSpec_ = std::move(scanSpec); + } + + bool selectiveNimbleReaderEnabled() const { + return selectiveNimbleReaderEnabled_; + } + + void setSelectiveNimbleReaderEnabled(bool value) { + selectiveNimbleReaderEnabled_ = value; + } + + private: + uint64_t tailLocation_; + FileFormat fileFormat_; + RowTypePtr fileSchema_; + SerDeOptions serDeOptions_; + std::shared_ptr decrypterFactory_; + uint64_t footerEstimatedSize_{kDefaultFooterEstimatedSize}; + uint64_t filePreloadThreshold_{kDefaultFilePreloadThreshold}; + bool fileColumnNamesReadAsLowerCase_{false}; + bool useColumnNamesForColumnMapping_{false}; + std::shared_ptr ioExecutor_; + std::shared_ptr randomSkip_; + std::shared_ptr scanSpec_; + const tz::TimeZone* sessionTimezone_{nullptr}; + bool selectiveNimbleReaderEnabled_{false}; }; struct WriterOptions { - TypePtr schema; - velox::memory::MemoryPool* memoryPool; - velox::memory::SetMemoryReclaimer setMemoryReclaimer{nullptr}; + TypePtr schema{nullptr}; + velox::memory::MemoryPool* memoryPool{nullptr}; + const velox::common::SpillConfig* spillConfig{nullptr}; + tsan_atomic* nonReclaimableSection{nullptr}; + + /// A ready-to-use default memory reclaimer factory. It shall be provided by + /// the system that creates writers to ensure a smooth memory system + /// integration (e.g. graceful suspension upon arbitration request). Writer + /// can choose to implement its custom memory reclaimer if needed and not use + /// this default one. 
+ std::function()> + memoryReclaimerFactory{[]() { return nullptr; }}; + std::optional compressionKind; + std::optional orcMinCompressionSize{std::nullopt}; + std::optional maxStripeSize{std::nullopt}; + std::optional orcLinearStripeSizeHeuristics{std::nullopt}; + std::optional maxDictionaryMemory{std::nullopt}; + std::optional orcWriterIntegerDictionaryEncodingEnabled{std::nullopt}; + std::optional orcWriterStringDictionaryEncodingEnabled{std::nullopt}; + std::map serdeParameters; + std::optional zlibCompressionLevel; + std::optional zstdCompressionLevel; + + std::function()> + flushPolicyFactory; + + virtual ~WriterOptions() = default; }; -} // namespace common -} // namespace dwio -} // namespace velox -} // namespace facebook +} // namespace facebook::velox::dwio::common + +template <> +struct fmt::formatter + : fmt::formatter { + template + auto format(facebook::velox::dwio::common::FileFormat fmt, FormatContext& ctx) + const { + return formatter::format( + facebook::velox::dwio::common::toString(fmt), ctx); + } +}; diff --git a/velox/dwio/common/ParallelFor.cpp b/velox/dwio/common/ParallelFor.cpp new file mode 100644 index 0000000000000..793b2f486acfe --- /dev/null +++ b/velox/dwio/common/ParallelFor.cpp @@ -0,0 +1,112 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/dwio/common/ParallelFor.h" +#include "velox/common/base/Exceptions.h" +#include "velox/dwio/common/ExecutorBarrier.h" + +namespace facebook::velox::dwio::common { + +namespace { + +std::vector> +splitRange(size_t from, size_t to, size_t factor) { + VELOX_CHECK_LE(from, to); + std::vector> ranges; + + if (from == to) { + return ranges; + } + + if (factor <= 1) { + ranges.emplace_back(from, to); + return ranges; + } + + auto rangeSize = to - from; + auto chunkSize = rangeSize / factor; + auto remainder = rangeSize % factor; + auto start = from; + for (size_t i = 0; i < factor; ++i) { + auto end = start + chunkSize; + if (remainder > 0) { + --remainder; + ++end; + } + // If `factor > (to - from)`, the rest of the chunks will be empty + if (end > start) { + ranges.emplace_back(start, end); + } else { + break; + } + start = end; + } + return ranges; +} + +} // namespace + +ParallelFor::ParallelFor( + folly::Executor* executor, + size_t from, + size_t to, + size_t parallelismFactor) + : executor_(executor), + ranges_{splitRange(from, to, (executor_ ? 
parallelismFactor : 0))} {} + +ParallelFor::ParallelFor( + std::shared_ptr executor, + size_t from, + size_t to, + size_t parallelismFactor) + : ParallelFor{executor.get(), from, to, parallelismFactor} { + owned_ = std::move(executor); +} + +void ParallelFor::execute(std::function func) { + // Otherwise from == to + if (ranges_.empty()) { + return; + } + if (ranges_.size() == 1) { + for (size_t i = ranges_[0].first, end = ranges_[0].second; i < end; ++i) { + func(i); + } + } else { + VELOX_CHECK( + executor_, + "Executor wasn't provided so we shouldn't have more than 1 range"); + ExecutorBarrier barrier(*executor_); + const size_t last = ranges_.size() - 1; + // First N-1 ranges in executor threads + for (size_t r = 0; r < last; ++r) { + auto& range = ranges_[r]; + barrier.add([begin = range.first, end = range.second, &func]() { + for (size_t i = begin; i < end; ++i) { + func(i); + } + }); + } + // Last range in calling thread + auto& range = ranges_[last]; + for (size_t i = range.first, end = range.second; i < end; ++i) { + func(i); + } + barrier.waitAll(); + } +} + +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/ParallelFor.h b/velox/dwio/common/ParallelFor.h new file mode 100644 index 0000000000000..f6debc9f14fdc --- /dev/null +++ b/velox/dwio/common/ParallelFor.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include "folly/Executor.h" + +namespace facebook::velox::dwio::common { + +/* + * A helper class that allows to run a function on a range of indices in + * multiple threads. + * The range (from, to] is split into equal-sized chunks and each chunk is + * scheduled in a different thread. The number of threads is: parallelismFactor + * It means that if parallelismFactor == 1 (or 0), the function will be executed + * in the calling thread for the entire range. If parallelismFactor == 2, the + * function will be called for half of the range in one thread in the executor, + * and for the last half in the calling thread (and so on). If no executor is + * passed (nullptr), the function will be executed in the calling thread for the + * entire range. + */ +class ParallelFor { + public: + ParallelFor( + folly::Executor* executor, + size_t from, // start index + size_t to, // past end index + // number of threads. + size_t parallelismFactor); + + ParallelFor( + std::shared_ptr executor, + size_t from, // start index + size_t to, // past end index + // number of threads + size_t parallelismFactor); + + void execute(std::function func); + + private: + std::shared_ptr owned_; + folly::Executor* executor_; + std::vector> ranges_; +}; + +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/PositionProvider.h b/velox/dwio/common/PositionProvider.h new file mode 100644 index 0000000000000..7be3bc7a16029 --- /dev/null +++ b/velox/dwio/common/PositionProvider.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. 
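The chunking arithmetic in `splitRange()` above deserves a worked restatement: the remainder of `rangeSize / factor` is spread one element at a time over the leading chunks, so chunk sizes differ by at most one, and when `factor` exceeds the range size the surplus chunks collapse to empty and are dropped. A standalone equivalent (same behavior, reorganized loop):

```cpp
#include <cstddef>
#include <utility>
#include <vector>

std::vector<std::pair<size_t, size_t>>
splitRangeSketch(size_t from, size_t to, size_t factor) {
  std::vector<std::pair<size_t, size_t>> ranges;
  if (from == to) {
    return ranges; // nothing to split
  }
  if (factor <= 1) {
    ranges.emplace_back(from, to); // serial: one range for the calling thread
    return ranges;
  }
  const size_t chunk = (to - from) / factor;
  size_t remainder = (to - from) % factor;
  for (size_t start = from; start < to;) {
    // The first 'remainder' chunks absorb one extra element each.
    const size_t end = start + chunk + (remainder > 0 ? 1 : 0);
    if (remainder > 0) {
      --remainder;
    }
    ranges.emplace_back(start, end);
    start = end;
  }
  return ranges;
}

// splitRangeSketch(0, 10, 4) -> {0,3} {3,6} {6,8} {8,10}
// splitRangeSketch(0, 2, 4)  -> {0,1} {1,2}   (empty chunks are dropped)
```

`execute()` then schedules the first N-1 chunks on the executor behind an `ExecutorBarrier` and runs the last chunk inline on the calling thread, so a null executor (which forces the factor to 0 in the constructor above) degrades gracefully to a plain serial loop.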
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace facebook::velox::dwio::common { + +class PositionProvider { + public: + explicit PositionProvider(const std::vector& positions) + : position_{positions.begin()}, end_{positions.end()} {} + + uint64_t next(); + + bool hasNext() const; + + private: + std::vector::const_iterator position_; + std::vector::const_iterator end_; +}; + +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/RandGen.h b/velox/dwio/common/RandGen.h index f84f0c666576c..b83743bbcca18 100644 --- a/velox/dwio/common/RandGen.h +++ b/velox/dwio/common/RandGen.h @@ -27,7 +27,7 @@ namespace common { class RandGen { public: RandGen() - : rand_{}, mt_{rand_()}, dist_(0, std::numeric_limits::max()){}; + : rand_{}, mt_{rand_()}, dist_(0, std::numeric_limits::max()) {} template T gen(int32_t max) { diff --git a/velox/dwio/common/Range.cpp b/velox/dwio/common/Range.cpp index ff4e5fa66d4d6..509fba4f36225 100644 --- a/velox/dwio/common/Range.cpp +++ b/velox/dwio/common/Range.cpp @@ -18,12 +18,13 @@ namespace facebook::velox::common { -Ranges Ranges::filter(std::function func) const { +Ranges Ranges::filter(const std::function& func) const { Ranges ret; - for (auto& r : ranges_) { + for (auto& range : ranges_) { bool inRun = false; size_t runStart = 0; - for (auto cur = std::get<0>(r), end = std::get<1>(r); cur != end; ++cur) { + for (auto cur = std::get<0>(range), end = std::get<1>(range); cur != end; + ++cur) { if (func(cur)) { if (!inRun) { inRun = true; @@ -35,9 +36,10 @@ Ranges Ranges::filter(std::function func) const { inRun = false; } } + if (inRun) { - ret.ranges_.emplace_back(runStart, std::get<1>(r)); - ret.size_ += (std::get<1>(r) - runStart); + ret.ranges_.emplace_back(runStart, std::get<1>(range)); + ret.size_ += (std::get<1>(range) - runStart); } } return ret; diff --git a/velox/dwio/common/Range.h b/velox/dwio/common/Range.h index 71445f33941bc..8f12abff182f6 100644 --- a/velox/dwio/common/Range.h +++ b/velox/dwio/common/Range.h @@ -16,33 +16,31 @@ #pragma once +#include "velox/common/base/Exceptions.h" #include "velox/common/base/GTestMacros.h" -#include "velox/dwio/common/exception/Exception.h" namespace facebook::velox::common { -/** Utility class to represent ranges of input used by DWRF writer. -This class does not dedepe overlapping ranges because for encoded input, the -overlapping range should be processed the same amount of time as specified -rather than just once. -This class does not support representing empty ranges. It -is the Caller's responsibility to avoid using the class when empty ranges are -possible. -*/ +/// Utility class to represent ranges of input used by DWRF writer.This class +/// does not dedepe overlapping ranges because for encoded input, the +/// overlapping range should be processed the same amount of time as specified +/// rather than just once. This class does not support representing empty +/// ranges. 
 class Ranges {
  public:
   void add(size_t begin, size_t end) {
     if (begin == end) {
       return;
     }
-    DWIO_ENSURE_LT(begin, end);
+    VELOX_CHECK_LT(begin, end);
     size_ += (end - begin);
-    if (ranges_.size()) {
-      // try merge with last
+    if (!ranges_.empty()) {
+      // Try to merge with the last range.
       auto& last = ranges_.back();
-      auto& e = std::get<1>(last);
-      if (e == begin) {
-        e = end;
+      auto& lastEnd = std::get<1>(last);
+      if (lastEnd == begin) {
+        lastEnd = end;
         return;
       }
     }
@@ -51,22 +49,23 @@ class Ranges {
     ranges_.emplace_back(begin, end);
   }
 
-  // returns another instance with ranges meet the filter criteria
-  Ranges filter(std::function<bool(size_t)> func) const;
+  /// Returns another instance with the ranges that meet the filter criteria.
+  Ranges filter(const std::function<bool(size_t)>& func) const;
 
   class Iterator {
    public:
     Iterator(
        std::vector<std::tuple<size_t, size_t>>::const_iterator cur,
        std::vector<std::tuple<size_t, size_t>>::const_iterator end)
-        : cur_{cur}, end_{end}, val_{0} {
+        : end_{end}, cur_{cur}, val_{0} {
       if (cur_ != end_) {
         val_ = std::get<0>(*cur_);
       }
     }
 
     bool operator==(const Iterator& other) const {
-      return cur_ == other.cur_ && end_ == other.end_ && val_ == other.val_;
+      return std::tie(cur_, end_, val_) ==
+          std::tie(other.cur_, other.end_, other.val_);
     }
 
     bool operator!=(const Iterator& other) const {
@@ -74,7 +73,7 @@ class Ranges {
     }
 
     Iterator& operator++() {
-      DCHECK(cur_ != end_);
+      VELOX_DCHECK(cur_ != end_);
       if (++val_ == std::get<1>(*cur_)) {
         val_ = (++cur_ != end_ ? std::get<0>(*cur_) : 0);
       }
@@ -82,13 +81,13 @@ class Ranges {
     }
 
     const size_t& operator*() const {
-      DCHECK(cur_ != end_);
+      VELOX_DCHECK(cur_ != end_);
       return val_;
     }
 
    private:
+    const std::vector<std::tuple<size_t, size_t>>::const_iterator end_;
     std::vector<std::tuple<size_t, size_t>>::const_iterator cur_;
-    std::vector<std::tuple<size_t, size_t>>::const_iterator end_;
     size_t val_;
   };
 
@@ -114,9 +113,9 @@ class Ranges {
   }
 
   static Ranges of(size_t begin, size_t end) {
-    Ranges r;
-    r.add(begin, end);
-    return r;
+    Ranges ranges;
+    ranges.add(begin, end);
+    return ranges;
   }
 
  private:
diff --git a/velox/dwio/common/Reader.cpp b/velox/dwio/common/Reader.cpp
index 85e56bc44c8b0..1beec1b02491c 100644
--- a/velox/dwio/common/Reader.cpp
+++ b/velox/dwio/common/Reader.cpp
@@ -18,9 +18,73 @@
 
 namespace facebook::velox::dwio::common {
 
+using namespace velox::common;
+
+namespace {
+
+template <TypeKind kKind>
+bool filterSimpleVectorRow(
+    const BaseVector& vector,
+    Filter& filter,
+    vector_size_t index) {
+  using T = typename TypeTraits<kKind>::NativeType;
+  auto* simpleVector = vector.asUnchecked<SimpleVector<T>>();
+  return applyFilter(filter, simpleVector->valueAt(index));
+}
+
+bool filterRow(const BaseVector& vector, Filter& filter, vector_size_t index) {
+  if (vector.isNullAt(index)) {
+    return filter.testNull();
+  }
+  switch (vector.typeKind()) {
+    case TypeKind::ARRAY:
+    case TypeKind::MAP:
+    case TypeKind::ROW:
+      VELOX_USER_CHECK(
+          filter.kind() == FilterKind::kIsNull ||
+              filter.kind() == FilterKind::kIsNotNull,
+          "Complex type can only take null filter, got {}",
+          filter.toString());
+      return filter.testNonNull();
+    default:
+      return VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(
+          filterSimpleVectorRow, vector.typeKind(), vector, filter, index);
+  }
+}
+
+void applyFilter(
+    const BaseVector& vector,
+    const ScanSpec& spec,
+    uint64_t* result) {
+  if (spec.filter()) {
+    bits::forEachSetBit(result, 0, vector.size(), [&](auto i) {
+      if (!filterRow(vector, *spec.filter(), i)) {
+        bits::clearBit(result, i);
+      }
+    });
+  }
+  if (!vector.type()->isRow()) {
+    // Filters on MAP or ARRAY children are pruning, and won't affect correctness
+    // of the
result. + return; + } + auto& rowType = vector.type()->asRow(); + auto* rowVector = vector.as(); + // Should not have any lazy from non-selective reader. + VELOX_CHECK_NOT_NULL(rowVector); + for (auto& childSpec : spec.children()) { + auto child = + rowVector->childAt(rowType.getChildIdx(childSpec->fieldName())); + applyFilter(*child, *childSpec, result); + } +} + +} // namespace + VectorPtr RowReader::projectColumns( const VectorPtr& input, - const velox::common::ScanSpec& spec) { + const ScanSpec& spec, + const Mutation* mutation) { auto* inputRow = input->as(); VELOX_CHECK_NOT_NULL(inputRow); auto& inputRowType = input->type()->asRow(); @@ -31,27 +95,175 @@ VectorPtr RowReader::projectColumns( std::vector names(numColumns); std::vector types(numColumns); std::vector children(numColumns); + std::vector passed(bits::nwords(input->size()), -1); + if (mutation) { + if (mutation->deletedRows) { + bits::andWithNegatedBits( + passed.data(), mutation->deletedRows, 0, input->size()); + } + if (mutation->randomSkip) { + bits::forEachSetBit(passed.data(), 0, input->size(), [&](auto i) { + if (!mutation->randomSkip->testOne()) { + bits::clearBit(passed.data(), i); + } + }); + } + } for (auto& childSpec : spec.children()) { + VectorPtr child; + if (childSpec->isConstant()) { + child = BaseVector::wrapInConstant( + input->size(), 0, childSpec->constantValue()); + } else { + child = + inputRow->childAt(inputRowType.getChildIdx(childSpec->fieldName())); + applyFilter(*child, *childSpec, passed.data()); + } if (!childSpec->projectOut()) { continue; } auto i = childSpec->channel(); names[i] = childSpec->fieldName(); - if (childSpec->isConstant()) { - children[i] = BaseVector::wrapInConstant( - input->size(), 0, childSpec->constantValue()); - } else { - children[i] = - inputRow->childAt(inputRowType.getChildIdx(childSpec->fieldName())); + types[i] = child->type(); + children[i] = std::move(child); + } + auto rowType = ROW(std::move(names), std::move(types)); + auto size = bits::countBits(passed.data(), 0, input->size()); + if (size == 0) { + return RowVector::createEmpty(rowType, input->pool()); + } + if (size < input->size()) { + auto indices = allocateIndices(size, input->pool()); + auto* rawIndices = indices->asMutable(); + vector_size_t j = 0; + bits::forEachSetBit( + passed.data(), 0, input->size(), [&](auto i) { rawIndices[j++] = i; }); + for (auto& child : children) { + child->disableMemo(); + child = BaseVector::wrapInDictionary( + nullptr, indices, size, std::move(child)); } - types[i] = children[i]->type(); } return std::make_shared( - input->pool(), - ROW(std::move(names), std::move(types)), - nullptr, - input->size(), - std::move(children)); + input->pool(), rowType, nullptr, size, std::move(children)); +} + +namespace { +void fillRowNumberVector( + VectorPtr& rowNumVector, + bool contiguousRowNumbers, + uint64_t previousRow, + uint64_t rowsToRead, + const dwio::common::SelectiveColumnReader* columnReader, + VectorPtr& result) { + FlatVector* flatRowNum{nullptr}; + if (rowNumVector && BaseVector::isVectorWritable(rowNumVector)) { + flatRowNum = rowNumVector->asFlatVector(); + } + if (flatRowNum) { + flatRowNum->clearAllNulls(); + flatRowNum->resize(result->size()); + } else { + rowNumVector = std::make_shared>( + result->pool(), + BIGINT(), + nullptr, + result->size(), + AlignedBuffer::allocate(result->size(), result->pool()), + std::vector()); + flatRowNum = rowNumVector->asUnchecked>(); + } + auto* rawRowNum = flatRowNum->mutableRawValues(); + if (contiguousRowNumbers) { + 
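To make the row-filtering half of projectColumns above easier to follow before fillRowNumberVector continues below, here is the same bitmask-to-dictionary pattern as a standalone sketch (keepPassedRows is a hypothetical helper, not part of the patch; the bits/allocateIndices/wrapInDictionary calls are the ones projectColumns itself uses):

```cpp
#include "velox/common/base/BitUtil.h"
#include "velox/vector/BaseVector.h"

namespace facebook::velox {

// Wrap 'child' so that only the rows whose bits are set in 'passed' remain,
// mirroring what projectColumns does after filters and mutations have
// cleared the bits of dropped rows.
VectorPtr keepPassedRows(
    const VectorPtr& child,
    const uint64_t* passed,
    vector_size_t inputSize) {
  const vector_size_t size = bits::countBits(passed, 0, inputSize);
  if (size == inputSize) {
    return child; // Nothing was filtered out; no wrapping needed.
  }
  // The positions of the set bits become the dictionary indices.
  auto indices = allocateIndices(size, child->pool());
  auto* rawIndices = indices->asMutable<vector_size_t>();
  vector_size_t j = 0;
  bits::forEachSetBit(
      passed, 0, inputSize, [&](auto i) { rawIndices[j++] = i; });
  return BaseVector::wrapInDictionary(
      /*nulls=*/nullptr, indices, size, child);
}

} // namespace facebook::velox
```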
VELOX_DCHECK_EQ(rowsToRead, result->size()); + std::iota(rawRowNum, rawRowNum + rowsToRead, previousRow); + } else { + const auto rowOffsets = columnReader->outputRows(); + VELOX_DCHECK_EQ(rowOffsets.size(), result->size()); + for (int i = 0; i < rowOffsets.size(); ++i) { + rawRowNum[i] = previousRow + rowOffsets[i]; + } + } +} +} // namespace + +void RowReader::readWithRowNumber( + std::unique_ptr& columnReader, + const dwio::common::RowReaderOptions& options, + uint64_t previousRow, + uint64_t rowsToRead, + const dwio::common::Mutation* mutation, + VectorPtr& result) { + const auto& rowNumberColumnInfo = options.rowNumberColumnInfo(); + VELOX_CHECK(rowNumberColumnInfo.has_value()); + const auto rowNumberColumnIndex = rowNumberColumnInfo->insertPosition; + const auto& rowNumberColumnName = rowNumberColumnInfo->name; + column_index_t numChildren{0}; + column_index_t numNotReadFromFileChildren{0}; + for (auto& column : options.scanSpec()->children()) { + if (column->projectOut()) { + ++numChildren; + if (column->isConstant() || column->isExplicitRowNumber()) { + ++numNotReadFromFileChildren; + } + } + } + VELOX_CHECK_LE(rowNumberColumnIndex, numChildren); + const bool contiguousRowNumbers = + (numChildren == numNotReadFromFileChildren) && !hasDeletion(mutation); + if (rowNumberColumnInfo->isExplicit) { + columnReader->next(rowsToRead, result, mutation); + fillRowNumberVector( + result->asUnchecked()->childAt(rowNumberColumnIndex), + contiguousRowNumbers, + previousRow, + rowsToRead, + columnReader.get(), + result); + } else { + auto* rowVector = result->asUnchecked(); + VectorPtr rowNumVector; + if (rowVector->childrenSize() != numChildren) { + VELOX_CHECK_EQ(rowVector->childrenSize(), numChildren + 1); + rowNumVector = rowVector->childAt(rowNumberColumnIndex); + const auto& rowType = rowVector->type()->asRow(); + auto names = rowType.names(); + auto types = rowType.children(); + auto children = rowVector->children(); + VELOX_DCHECK(!names.empty() && !types.empty() && !children.empty()); + names.erase(names.begin() + rowNumberColumnIndex); + types.erase(types.begin() + rowNumberColumnIndex); + children.erase(children.begin() + rowNumberColumnIndex); + result = std::make_shared( + rowVector->pool(), + ROW(std::move(names), std::move(types)), + rowVector->nulls(), + rowVector->size(), + std::move(children)); + } + columnReader->next(rowsToRead, result, mutation); + fillRowNumberVector( + rowNumVector, + contiguousRowNumbers, + previousRow, + rowsToRead, + columnReader.get(), + result); + rowVector = result->asUnchecked(); + auto& rowType = rowVector->type()->asRow(); + auto names = rowType.names(); + auto types = rowType.children(); + auto children = rowVector->children(); + names.insert(names.begin() + rowNumberColumnIndex, rowNumberColumnName); + types.insert(types.begin() + rowNumberColumnIndex, BIGINT()); + children.insert(children.begin() + rowNumberColumnIndex, rowNumVector); + result = std::make_shared( + rowVector->pool(), + ROW(std::move(names), std::move(types)), + rowVector->nulls(), + rowVector->size(), + std::move(children)); + } } } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/Reader.h b/velox/dwio/common/Reader.h index 9587acd8badcd..0c3da77d29ce8 100644 --- a/velox/dwio/common/Reader.h +++ b/velox/dwio/common/Reader.h @@ -24,6 +24,7 @@ #include "velox/dwio/common/InputStream.h" #include "velox/dwio/common/Mutation.h" #include "velox/dwio/common/Options.h" +#include "velox/dwio/common/SelectiveColumnReader.h" #include 
"velox/dwio/common/Statistics.h" #include "velox/dwio/common/TypeWithId.h" #include "velox/type/Type.h" @@ -136,15 +137,24 @@ class RowReader { */ virtual std::optional> prefetchUnits() { return std::nullopt; - }; + } /** * Helper function used by non-selective reader to project top level columns - * according to the scan spec. + * according to the scan spec and mutations. */ static VectorPtr projectColumns( const VectorPtr& input, - const velox::common::ScanSpec&); + const velox::common::ScanSpec& spec, + const Mutation* mutation); + + static void readWithRowNumber( + std::unique_ptr& columnReader, + const dwio::common::RowReaderOptions& options, + uint64_t previousRow, + uint64_t rowsToRead, + const dwio::common::Mutation*, + VectorPtr& result); }; /** diff --git a/velox/dwio/common/ReaderFactory.cpp b/velox/dwio/common/ReaderFactory.cpp index dcb120aab9076..56599c620fd92 100644 --- a/velox/dwio/common/ReaderFactory.cpp +++ b/velox/dwio/common/ReaderFactory.cpp @@ -31,7 +31,8 @@ ReaderFactoriesMap& readerFactories() { } // namespace bool registerReaderFactory(std::shared_ptr factory) { - bool ok = readerFactories().insert({factory->fileFormat(), factory}).second; + [[maybe_unused]] const bool ok = + readerFactories().insert({factory->fileFormat(), factory}).second; // NOTE: re-enable this check after Prestissimo has updated dwrf registration. #if 0 VELOX_CHECK( diff --git a/velox/dwio/common/Retry.h b/velox/dwio/common/Retry.h index a7e7f7c3cd5c2..d4d52c120f280 100644 --- a/velox/dwio/common/Retry.h +++ b/velox/dwio/common/Retry.h @@ -57,10 +57,12 @@ using RetryDuration = std::chrono::duration; namespace retrypolicy { + class IRetryPolicy { public: virtual ~IRetryPolicy() = default; virtual folly::Optional nextWaitTime() = 0; + virtual void start() {} }; class KAttempts : public IRetryPolicy { @@ -68,7 +70,7 @@ class KAttempts : public IRetryPolicy { explicit KAttempts(std::vector durations) : index_(0), durations_(std::move(durations)) {} - folly::Optional nextWaitTime() { + folly::Optional nextWaitTime() override { if (index_ < durations_.size()) { return folly::Optional(durations_[index_++]); } else { @@ -86,26 +88,32 @@ class ExponentialBackoff : public IRetryPolicy { ExponentialBackoff( RetryDuration start, RetryDuration max, - uint64_t maxRetries = std::numeric_limits::max(), - RetryDuration maxTotal = RetryDuration::zero()) + uint64_t maxRetries, + RetryDuration maxTotal, + bool countExecutionTime) : maxWait_(max), maxTotal_(maxTotal), + countExecutionTime_(countExecutionTime), nextWait_(start), - total_(0), + totalWait_(0), retriesLeft_(maxRetries) { DWIO_ENSURE_LE(start.count(), max.count()); DWIO_ENSURE(maxTotal_.count() == 0 || maxTotal_.count() > start.count()); } - folly::Optional nextWaitTime() { - if (retriesLeft_ == 0 || (maxTotal_.count() > 0 && total_ >= maxTotal_)) { + void start() override { + startTime_ = std::chrono::system_clock::now(); + } + + folly::Optional nextWaitTime() override { + if (retriesLeft_ == 0 || (maxTotal_.count() > 0 && total() >= maxTotal_)) { return folly::Optional(); } RetryDuration waitTime = nextWait_ + jitter(); nextWait_ = std::min(nextWait_ + nextWait_, maxWait_); --retriesLeft_; - total_ += waitTime; + totalWait_ += waitTime; return folly::Optional(waitTime); } @@ -114,10 +122,17 @@ class ExponentialBackoff : public IRetryPolicy { return RetryDuration(rand_.gen(folly::to(nextWait_.count()) / 2)); } + RetryDuration total() const { + return countExecutionTime_ ? 
std::chrono::system_clock::now() - startTime_ + : totalWait_; + } + const RetryDuration maxWait_; const RetryDuration maxTotal_; + const bool countExecutionTime_; + std::chrono::system_clock::time_point startTime_; RetryDuration nextWait_; - RetryDuration total_; + RetryDuration totalWait_; uint64_t retriesLeft_; RandGen rand_; }; @@ -142,27 +157,30 @@ class KAttemptsPolicyFactory : public IRetryPolicyFactory { }; class ExponentialBackoffPolicyFactory : public IRetryPolicyFactory { - private: - const RetryDuration start_; - const RetryDuration maxWait_; - const uint64_t maxRetries_; - const RetryDuration maxTotal_; - public: ExponentialBackoffPolicyFactory( RetryDuration start, RetryDuration maxWait, uint64_t maxRetries = std::numeric_limits::max(), - RetryDuration maxTotal = RetryDuration::zero()) + RetryDuration maxTotal = RetryDuration::zero(), + bool countExecutionTime = false) : start_(start), maxWait_(maxWait), maxRetries_(maxRetries), - maxTotal_(maxTotal) {} + maxTotal_(maxTotal), + countExecutionTime_(countExecutionTime) {} std::unique_ptr getRetryPolicy() const { return std::make_unique( - start_, maxWait_, maxRetries_, maxTotal_); + start_, maxWait_, maxRetries_, maxTotal_, countExecutionTime_); } + + private: + const RetryDuration start_; + const RetryDuration maxWait_; + const uint64_t maxRetries_; + const RetryDuration maxTotal_; + const bool countExecutionTime_; }; } // namespace retrypolicy @@ -174,6 +192,7 @@ class RetryModule { F func, std::unique_ptr policy, std::function abortFunc = nullptr) { + policy->start(); do { try { // If abort signal is triggered before func, no ops. diff --git a/velox/dwio/common/ScanSpec.cpp b/velox/dwio/common/ScanSpec.cpp index 4845b3242a773..a37cbc98ec6ab 100644 --- a/velox/dwio/common/ScanSpec.cpp +++ b/velox/dwio/common/ScanSpec.cpp @@ -19,52 +19,31 @@ namespace facebook::velox::common { -ScanSpec& ScanSpec::operator=(const ScanSpec& other) { - if (this != &other) { - numReads_ = other.numReads_; - subscript_ = other.subscript_; - fieldName_ = other.fieldName_; - channel_ = other.channel_; - constantValue_ = other.constantValue_; - projectOut_ = other.projectOut_; - extractValues_ = other.extractValues_; - makeFlat_ = other.makeFlat_; - filter_ = other.filter_; - metadataFilters_ = other.metadataFilters_; - selectivity_ = other.selectivity_; - enableFilterReorder_ = other.enableFilterReorder_; - children_ = other.children_; - stableChildren_ = other.stableChildren_; - childByFieldName_ = other.childByFieldName_; - valueHook_ = other.valueHook_; - isArrayElementOrMapEntry_ = other.isArrayElementOrMapEntry_; - maxArrayElementsCount_ = other.maxArrayElementsCount_; - } - return *this; +ScanSpec* ScanSpec::getOrCreateChild(const std::string& name) { + if (auto it = this->childByFieldName_.find(name); + it != this->childByFieldName_.end()) { + return it->second; + } + this->children_.push_back(std::make_unique(name)); + auto* child = this->children_.back().get(); + this->childByFieldName_[child->fieldName()] = child; + return child; } ScanSpec* ScanSpec::getOrCreateChild(const Subfield& subfield) { - auto container = this; - auto& path = subfield.path(); + auto* container = this; + const auto& path = subfield.path(); for (size_t depth = 0; depth < path.size(); ++depth) { - auto element = path[depth].get(); + const auto element = path[depth].get(); VELOX_CHECK_EQ(element->kind(), kNestedField); auto* nestedField = static_cast(element); - auto it = container->childByFieldName_.find(nestedField->name()); - if (it != 
container->childByFieldName_.end()) { - container = it->second; - } else { - container->children_.push_back(std::make_unique(*element)); - auto* child = container->children_.back().get(); - container->childByFieldName_[child->fieldName()] = child; - container = child; - } + container = container->getOrCreateChild(nestedField->name()); } return container; } uint64_t ScanSpec::newRead() { - if (!numReads_) { + if (numReads_ == 0) { reorder(); } else if (enableFilterReorder_) { for (auto i = 1; i < children_.size(); ++i) { @@ -78,13 +57,14 @@ uint64_t ScanSpec::newRead() { } } } - return numReads_++; + return ++numReads_; } void ScanSpec::reorder() { if (children_.empty()) { return; } + // Make sure 'stableChildren_' is initialized. stableChildren(); std::sort( @@ -153,36 +133,34 @@ bool ScanSpec::hasFilter() const { return false; } +bool ScanSpec::testNull() const { + if (filter_ && !filter_->testNull()) { + return false; + } + for (auto& child : children_) { + if (!child->isArrayElementOrMapEntry_ && !child->testNull()) { + return false; + } + } + return true; +} + void ScanSpec::moveAdaptationFrom(ScanSpec& other) { // moves the filters and filter order from 'other'. - std::vector> newChildren; - childByFieldName_.clear(); - for (auto& otherChild : other.children_) { - bool found = false; - for (auto& child : children_) { - if (child && child->fieldName_ == otherChild->fieldName_) { - if (!child->isConstant() && !otherChild->isConstant()) { - // If other child is constant, a possible filter on a - // constant will have been evaluated at split start time. If - // 'child' is constant there is no adaptation that can be - // received. - child->filter_ = std::move(otherChild->filter_); - child->selectivity_ = otherChild->selectivity_; - } - childByFieldName_[child->fieldName_] = child.get(); - newChildren.push_back(std::move(child)); - found = true; - break; - } + for (auto& child : children_) { + auto it = other.childByFieldName_.find(child->fieldName_); + if (it == other.childByFieldName_.end()) { + continue; + } + auto* otherChild = it->second; + if (!child->isConstant() && !otherChild->isConstant()) { + // If other child is constant, a possible filter on a + // constant will have been evaluated at split start time. If + // 'child' is constant there is no adaptation that can be + // received. + child->filter_ = std::move(otherChild->filter_); + child->selectivity_ = otherChild->selectivity_; } - VELOX_CHECK(found); - } - children_ = std::move(newChildren); - stableChildren_.clear(); - for (auto& otherChild : other.stableChildren_) { - auto child = childByName(otherChild->fieldName_); - VELOX_CHECK(child); - stableChildren_.push_back(child); } } @@ -290,8 +268,8 @@ bool testStringFilter( bool testBoolFilter( common::Filter* filter, dwio::common::BooleanColumnStatistics* boolStats) { - auto trueCount = boolStats->getTrueCount(); - auto falseCount = boolStats->getFalseCount(); + const auto trueCount = boolStats->getTrueCount(); + const auto falseCount = boolStats->getFalseCount(); if (trueCount.has_value() && falseCount.has_value()) { if (trueCount.value() == 0) { if (!filter->testBool(false)) { @@ -313,7 +291,7 @@ bool testFilter( dwio::common::ColumnStatistics* stats, uint64_t totalRows, const TypePtr& type) { - bool mayHaveNull = true; + bool mayHaveNull{true}; // Has-null statistics is often not set. Hence, we supplement it with // number-of-values statistic to detect no-null columns more often. @@ -331,6 +309,7 @@ bool testFilter( // IS NULL filter cannot pass. 
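Since testFilter's statistics handling continues below, here is the essence of its min/max pruning as a self-contained model (a simplified sketch, not the Velox API; IntStats and rangeMayPass are illustrative names):

```cpp
#include <cstdint>
#include <optional>

struct IntStats {
  std::optional<int64_t> min;
  std::optional<int64_t> max;
};

// Returns false only when we can prove that no value in the row group can
// pass a [lo, hi] range filter; missing bounds must conservatively pass.
bool rangeMayPass(const IntStats& stats, int64_t lo, int64_t hi) {
  if (stats.min.has_value() && *stats.min > hi) {
    return false; // Every value lies above the filter range.
  }
  if (stats.max.has_value() && *stats.max < lo) {
    return false; // Every value lies below the filter range.
  }
  return true;
}

// rangeMayPass({10, 20}, 30, 40) == false, so the whole row group is skipped
// without reading any data.
```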
return false; } + if (mayHaveNull && filter->testNull()) { return true; } @@ -342,23 +321,23 @@ bool testFilter( case TypeKind::INTEGER: case TypeKind::SMALLINT: case TypeKind::TINYINT: { - auto intStats = + auto* intStats = dynamic_cast(stats); return testIntFilter(filter, intStats, mayHaveNull); } case TypeKind::REAL: case TypeKind::DOUBLE: { - auto doubleStats = + auto* doubleStats = dynamic_cast(stats); return testDoubleFilter(filter, doubleStats, mayHaveNull); } case TypeKind::BOOLEAN: { - auto boolStats = + auto* boolStats = dynamic_cast(stats); return testBoolFilter(filter, boolStats); } case TypeKind::VARCHAR: { - auto stringStats = + auto* stringStats = dynamic_cast(stats); return testStringFilter(filter, stringStats, mayHaveNull); } @@ -402,24 +381,12 @@ std::string ScanSpec::toString() const { return out.str(); } -std::shared_ptr ScanSpec::removeChild(const ScanSpec* child) { - for (auto it = children_.begin(); it != children_.end(); ++it) { - if (it->get() == child) { - auto removed = std::move(*it); - children_.erase(it); - childByFieldName_.erase(removed->fieldName()); - return removed; - } - } - return nullptr; -} - void ScanSpec::addFilter(const Filter& filter) { filter_ = filter_ ? filter_->mergeWith(&filter) : filter.clone(); } ScanSpec* ScanSpec::addField(const std::string& name, column_index_t channel) { - auto child = getOrCreateChild(Subfield(name)); + auto child = getOrCreateChild(name); child->setProjectOut(true); child->setChannel(channel); return child; diff --git a/velox/dwio/common/ScanSpec.h b/velox/dwio/common/ScanSpec.h index e03b33963d0c8..14f4db86b9f93 100644 --- a/velox/dwio/common/ScanSpec.h +++ b/velox/dwio/common/ScanSpec.h @@ -44,24 +44,8 @@ class ScanSpec { static constexpr const char* kMapValuesFieldName = "values"; static constexpr const char* kArrayElementsFieldName = "elements"; - explicit ScanSpec(const Subfield::PathElement& element) { - if (element.kind() == kNestedField) { - auto field = reinterpret_cast(&element); - fieldName_ = field->name(); - - } else { - VELOX_CHECK(false, "Only nested fields are supported"); - } - } - explicit ScanSpec(const std::string& name) : fieldName_(name) {} - ScanSpec(const ScanSpec& other) { - *this = other; - } - - ScanSpec& operator=(const ScanSpec&); - // Filter to apply. If 'this' corresponds to a struct/list/map, this // can only be isNull or isNotNull, other filtering is given by // 'children'. @@ -117,6 +101,14 @@ class ScanSpec { return constantValue_ != nullptr; } + void setExplicitRowNumber(bool isExplicitRowNumber) { + isExplicitRowNumber_ = isExplicitRowNumber; + } + + bool isExplicitRowNumber() const { + return isExplicitRowNumber_; + } + // Name of the value in its container, i.e. field name in struct or // string key in map. Not all fields of 'this' apply in list/map // value cases but the overhead is manageable, the space taken is @@ -202,6 +194,10 @@ class ScanSpec { // each level of struct is mandatory. uint64_t newRead(); + /// Returns the ScanSpec corresponding to 'name'. Creates it if needed without + /// any intermediate level. + ScanSpec* getOrCreateChild(const std::string& name); + // Returns the ScanSpec corresponding to 'subfield'. Creates it if // needed, including any intermediate levels. This is used at // TableScan initialization to create the ScanSpec tree that @@ -216,10 +212,6 @@ class ScanSpec { return it->second; } - // Remove a child from this scan spec, returning the removed child. This is - // used for example to transform a flatmap scan spec into a struct scan spec. 
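The ScanSpec::addFilter above conjoins successive filters via Filter::mergeWith, so repeated addFilter calls on the same field intersect. A small sketch of that behavior (assuming the BigintRange filter from velox/type/Filter.h):

```cpp
#include "velox/common/base/Exceptions.h"
#include "velox/type/Filter.h"

using namespace facebook::velox;

void mergeExample() {
  // [0, 100] merged with [50, 200] behaves like their conjunction [50, 100].
  common::BigintRange first(0, 100, /*nullAllowed=*/false);
  common::BigintRange second(50, 200, /*nullAllowed=*/false);
  auto merged = first.mergeWith(&second);

  VELOX_CHECK(!merged->testInt64(40)); // Below the intersection.
  VELOX_CHECK(merged->testInt64(75)); // Inside the intersection.
  VELOX_CHECK(!merged->testInt64(150)); // Above the intersection.
}
```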
- std::shared_ptr removeChild(const ScanSpec* child); - SelectivityInfo& selectivity() { return selectivity_; } @@ -232,9 +224,11 @@ class ScanSpec { valueHook_ = valueHook; } - // Returns true if the corresponding reader only needs to reference - // the nulls stream. True if filter is is-null with or without value - // extraction or if filter is is-not-null and no value is extracted. + // Returns true if the corresponding reader only needs to reference the nulls + // stream. True if filter is is-null with or without value extraction or if + // filter is is-not-null and no value is extracted. Note that this does not + // apply to Nimble format leaf nodes, because nulls are mixed in the encoding + // with actual values. bool readsNullsOnly() const { if (filter_) { if (filter_->kind() == FilterKind::kIsNull) { @@ -263,6 +257,12 @@ class ScanSpec { // This may change as a result of runtime adaptation. bool hasFilter() const; + /// Assume this field is read as null constant vector (usually due to missing + /// field), check if any filter in the struct subtree would make the whole + /// vector to be filtered out. Return false when the whole vector should be + /// filtered out. + bool testNull() const; + // Resets cached values after this or children were updated, e.g. a new filter // was added or existing filter was modified. void resetCachedValues(bool doReorder) { @@ -324,6 +324,26 @@ class ScanSpec { // projected out. void addAllChildFields(const Type&); + const std::vector& flatMapFeatureSelection() const { + return flatMapFeatureSelection_; + } + + void setFlatMapFeatureSelection(std::vector features) { + flatMapFeatureSelection_ = std::move(features); + } + + /// Invoke the function provided on each node of the ScanSpec tree. + template + void visit(const Type& type, F&& f); + + bool isFlatMapAsStruct() const { + return isFlatMapAsStruct_; + } + + void setFlatMapAsStruct(bool value) { + isFlatMapAsStruct_ = value; + } + private: void reorder(); @@ -351,10 +371,12 @@ class ScanSpec { VectorPtr constantValue_; bool projectOut_ = false; bool extractValues_ = false; + + bool isExplicitRowNumber_ = false; // True if a string dictionary or flat map in this field should be // returned as flat. bool makeFlat_ = false; - std::shared_ptr filter_; + std::unique_ptr filter_; // Filters that will be only used for row group filtering based on metadata. // The conjunctions among these filters are tracked in MetadataFilter, with @@ -400,8 +422,44 @@ class ScanSpec { // Only take the first maxArrayElementsCount_ elements from each array. vector_size_t maxArrayElementsCount_ = std::numeric_limits::max(); + + // Used only for bulk reader to project flat map features. + std::vector flatMapFeatureSelection_; + + // This node represents a flat map column that need to be read as struct, + // i.e. in table schema it is a MAP, but in result vector it is ROW. + bool isFlatMapAsStruct_ = false; }; +template +void ScanSpec::visit(const Type& type, F&& f) { + f(type, *this); + if (isConstant()) { + // Child specs are not populated in this case. 
+ return; + } + switch (type.kind()) { + case TypeKind::ROW: + for (auto& child : children_) { + VELOX_CHECK_NE(child->channel(), kNoChannel); + child->visit(*type.childAt(child->channel()), std::forward(f)); + } + break; + case TypeKind::MAP: + childByName(kMapKeysFieldName) + ->visit(*type.childAt(0), std::forward(f)); + childByName(kMapValuesFieldName) + ->visit(*type.childAt(1), std::forward(f)); + break; + case TypeKind::ARRAY: + childByName(kArrayElementsFieldName) + ->visit(*type.childAt(0), std::forward(f)); + break; + default: + break; + } +} + // Returns false if no value from a range defined by stats can pass the // filter. True, otherwise. bool testFilter( diff --git a/velox/dwio/common/SeekableInputStream.cpp b/velox/dwio/common/SeekableInputStream.cpp index 5a514e50c8f95..7773445dea5d3 100644 --- a/velox/dwio/common/SeekableInputStream.cpp +++ b/velox/dwio/common/SeekableInputStream.cpp @@ -37,7 +37,7 @@ void printBuffer(std::ostream& out, const char* buffer, uint64_t length) { } uint64_t PositionProvider::next() { - uint64_t result = *position_; + const uint64_t result = *position_; ++position_; return result; } @@ -47,18 +47,18 @@ bool PositionProvider::hasNext() const { } void SeekableInputStream::readFully(char* buffer, size_t bufferSize) { - size_t posn = 0; + size_t pos = 0; size_t readLength = 0; size_t bytesToCopy = 0; - while (posn < bufferSize) { + while (pos < bufferSize) { const void* chunk; int32_t length; - DWIO_ENSURE(Next(&chunk, &length), "bad read in readFully"); + VELOX_CHECK(Next(&chunk, &length), "bad read in readFully"); readLength = static_cast(length); - bytesToCopy = std::min(readLength, bufferSize - posn); - auto bytes = reinterpret_cast(chunk); - std::copy(bytes, bytes + bytesToCopy, buffer + posn); - posn += bytesToCopy; + bytesToCopy = std::min(readLength, bufferSize - pos); + auto* bytes = reinterpret_cast(chunk); + std::copy(bytes, bytes + bytesToCopy, buffer + pos); + pos += bytesToCopy; } // return remaining bytes back to stream if (bytesToCopy < readLength) { @@ -70,64 +70,67 @@ SeekableArrayInputStream::SeekableArrayInputStream( const unsigned char* values, uint64_t size, uint64_t blkSize) - : data(reinterpret_cast(values)), dataRead{nullptr} { - length = size; - position = 0; - blockSize = blkSize == 0 ? length : blkSize; + : data_(reinterpret_cast(values)), dataRead_{nullptr} { + length_ = size; + position_ = 0; + blockSize_ = blkSize == 0 ? length_ : blkSize; } SeekableArrayInputStream::SeekableArrayInputStream( const char* values, uint64_t size, uint64_t blkSize) - : data(values), dataRead{nullptr} { - length = size; - position = 0; - blockSize = blkSize == 0 ? length : blkSize; + : data_(values), dataRead_{nullptr} { + length_ = size; + position_ = 0; + blockSize_ = blkSize == 0 ? length_ : blkSize; } SeekableArrayInputStream::SeekableArrayInputStream( std::unique_ptr values, uint64_t size, uint64_t blkSize) - : ownedData(std::move(values)), data(ownedData.get()), dataRead{nullptr} { - length = size; - position = 0; - blockSize = blkSize == 0 ? length : blkSize; + : ownedData_(std::move(values)), + data_(ownedData_.get()), + dataRead_{nullptr} { + length_ = size; + position_ = 0; + blockSize_ = blkSize == 0 ? 
length_ : blkSize; } SeekableArrayInputStream::SeekableArrayInputStream( std::function()> read, uint64_t blkSize) - : data(nullptr), dataRead{read} { - position = 0; - length = 0; - blockSize = blkSize; + : data_(nullptr), dataRead_{std::move(read)} { + position_ = 0; + length_ = 0; + blockSize_ = blkSize; } void SeekableArrayInputStream::loadIfAvailable() { - if (UNLIKELY(!!dataRead)) { - const auto result = dataRead(); - auto size = std::get<1>(result); - DWIO_ENSURE_LT(size, MAX_UINT64, "invalid data size"); - data = std::get<0>(result); - length = size; - if (blockSize == 0) { - blockSize = length; - } - - // just load once - dataRead = nullptr; + if (FOLLY_LIKELY(dataRead_ == nullptr)) { + return; + } + const auto result = dataRead_(); + auto size = std::get<1>(result); + VELOX_CHECK_LT(size, MAX_UINT64, "invalid data size"); + data_ = std::get<0>(result); + length_ = size; + if (blockSize_ == 0) { + blockSize_ = length_; } + // just load once + dataRead_ = nullptr; } bool SeekableArrayInputStream::Next(const void** buffer, int32_t* size) { loadIfAvailable(); - uint64_t currentSize = std::min(length - position, blockSize); + const uint64_t currentSize = std::min(length_ - position_, blockSize_); if (currentSize > 0) { - *buffer = data + position; + *buffer = data_ + position_; *size = static_cast(currentSize); - position += currentSize; + position_ += currentSize; + totalRead_ += currentSize; return true; } @@ -139,40 +142,38 @@ void SeekableArrayInputStream::BackUp(int32_t count) { loadIfAvailable(); if (count >= 0) { - uint64_t unsignedCount = static_cast(count); - DWIO_ENSURE( - unsignedCount <= blockSize && unsignedCount <= position, - "Can't backup that much!"); - position -= unsignedCount; + const uint64_t unsignedCount = static_cast(count); + VELOX_CHECK_LE(unsignedCount, blockSize_, "Can't backup that much!"); + VELOX_CHECK_LE(unsignedCount, position_, "Can't backup that much!"); + position_ -= unsignedCount; } } -bool SeekableArrayInputStream::Skip(int32_t count) { +bool SeekableArrayInputStream::SkipInt64(int64_t count) { loadIfAvailable(); if (count >= 0) { - uint64_t unsignedCount = static_cast(count); - if (unsignedCount + position <= length) { - position += unsignedCount; + const uint64_t unsignedCount = static_cast(count); + if (unsignedCount + position_ <= length_) { + position_ += unsignedCount; return true; - } else { - position = length; } + position_ = length_; } return false; } google::protobuf::int64 SeekableArrayInputStream::ByteCount() const { - return static_cast(position); + return static_cast(position_); } void SeekableArrayInputStream::seekToPosition(PositionProvider& position) { - this->position = position.next(); + position_ = position.next(); } std::string SeekableArrayInputStream::getName() const { return folly::to( - "SeekableArrayInputStream ", position, " of ", length); + "SeekableArrayInputStream ", position_, " of ", length_); } size_t SeekableArrayInputStream::positionSize() { @@ -188,71 +189,72 @@ SeekableFileInputStream::SeekableFileInputStream( std::shared_ptr input, uint64_t offset, uint64_t byteCount, - memory::MemoryPool& _pool, + memory::MemoryPool& pool, LogType logType, - uint64_t _blockSize) - : pool(_pool), - input(std::move(input)), - logType(logType), - start(offset), - length(byteCount), - blockSize(computeBlock(_blockSize, length)), - buffer{pool} { - position = 0; - pushBack = 0; + uint64_t blockSize) + : input_(std::move(input)), + logType_(logType), + start_(offset), + length_(byteCount), + blockSize_(computeBlock(blockSize, 
length_)), + pool_(&pool), + buffer_{pool} { + position_ = 0; + pushback_ = 0; } bool SeekableFileInputStream::Next(const void** data, int32_t* size) { uint64_t bytesRead; - if (pushBack != 0) { - *data = buffer.data() + (buffer.size() - pushBack); - bytesRead = pushBack; + if (pushback_ != 0) { + *data = buffer_.data() + (buffer_.size() - pushback_); + bytesRead = pushback_; } else { - bytesRead = std::min(length - position, blockSize); - buffer.resize(bytesRead); + bytesRead = std::min(length_ - position_, blockSize_); + buffer_.resize(bytesRead); if (bytesRead > 0) { - input->read(buffer.data(), bytesRead, start + position, logType); - *data = static_cast(buffer.data()); + input_->read(buffer_.data(), bytesRead, start_ + position_, logType_); + *data = static_cast(buffer_.data()); } } - position += bytesRead; - pushBack = 0; + position_ += bytesRead; + pushback_ = 0; *size = static_cast(bytesRead); return bytesRead != 0; } void SeekableFileInputStream::BackUp(int32_t signedCount) { - DWIO_ENSURE_GE(signedCount, 0, "can't backup negative distances"); - uint64_t count = static_cast(signedCount); - DWIO_ENSURE_EQ(pushBack, 0, "can't backup unless we just called Next"); - DWIO_ENSURE(count <= blockSize && count <= position, "can't backup that far"); - pushBack = static_cast(count); - position -= pushBack; + VELOX_CHECK_GE(signedCount, 0, "can't backup negative distances"); + VELOX_CHECK_EQ(pushback_, 0, "can't backup unless we just called Next"); + const uint64_t count = static_cast(signedCount); + VELOX_CHECK_LE(count, blockSize_, "can't backup that far"); + VELOX_CHECK_LE(count, position_, "can't backup that far"); + pushback_ = static_cast(count); + position_ -= pushback_; } -bool SeekableFileInputStream::Skip(int32_t signedCount) { +bool SeekableFileInputStream::SkipInt64(int64_t signedCount) { if (signedCount < 0) { return false; } - uint64_t count = static_cast(signedCount); - position = std::min(position + count, length); - pushBack = 0; - return position < length; + const uint64_t count = static_cast(signedCount); + position_ = std::min(position_ + count, length_); + pushback_ = 0; + return position_ < length_; } google::protobuf::int64 SeekableFileInputStream::ByteCount() const { - return static_cast(position); + return static_cast(position_); } void SeekableFileInputStream::seekToPosition(PositionProvider& location) { - position = location.next(); - DWIO_ENSURE_LE(position, length, "seek too far"); - pushBack = 0; + position_ = location.next(); + VELOX_CHECK_LE(position_, length_, "seek too far"); + pushback_ = 0; } std::string SeekableFileInputStream::getName() const { return folly::to( - input->getName(), " from ", start, " for ", length); + input_->getName(), " from ", start_, " for ", length_); } size_t SeekableFileInputStream::positionSize() { diff --git a/velox/dwio/common/SeekableInputStream.h b/velox/dwio/common/SeekableInputStream.h index 53bdb0b1fb4cc..ab402753b8c56 100644 --- a/velox/dwio/common/SeekableInputStream.h +++ b/velox/dwio/common/SeekableInputStream.h @@ -16,30 +16,15 @@ #pragma once -#include - #include "velox/dwio/common/DataBuffer.h" #include "velox/dwio/common/InputStream.h" +#include "velox/dwio/common/PositionProvider.h" #include "velox/dwio/common/wrap/zero-copy-stream-wrapper.h" namespace facebook::velox::dwio::common { void printBuffer(std::ostream& out, const char* buffer, uint64_t length); -class PositionProvider { - public: - explicit PositionProvider(const std::vector& positions) - : position_{positions.begin()}, end_{positions.end()} {} - - 
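For readers unfamiliar with the ZeroCopyInputStream contract these classes implement, a small consumption sketch (readPrefix is a hypothetical helper; only the SeekableArrayInputStream constructor, Next, and BackUp from this header are assumed):

```cpp
#include <algorithm>
#include <cstdint>
#include <string>

#include "velox/dwio/common/SeekableInputStream.h"

using facebook::velox::dwio::common::SeekableArrayInputStream;

// Copy the first 'prefix' bytes out of the stream. Next() lends internal
// buffers one block at a time; BackUp() returns the unconsumed tail so that
// ByteCount() reflects only what was actually used.
std::string readPrefix(const char* data, uint64_t size, size_t prefix) {
  SeekableArrayInputStream stream(data, size, /*blkSize=*/8);
  std::string out;
  const void* chunk;
  int32_t available;
  while (out.size() < prefix && stream.Next(&chunk, &available)) {
    const auto take = std::min<int32_t>(
        available, static_cast<int32_t>(prefix - out.size()));
    out.append(static_cast<const char*>(chunk), take);
    if (take < available) {
      stream.BackUp(available - take);
    }
  }
  return out;
}
```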
uint64_t next(); - - bool hasNext() const; - - private: - std::vector::const_iterator position_; - std::vector::const_iterator end_; -}; - /** * A subclass of Google's ZeroCopyInputStream that supports seek. * By extending Google's class, we get the ability to pass it directly @@ -57,6 +42,12 @@ class SeekableInputStream : public google::protobuf::io::ZeroCopyInputStream { // ORC/DWRF stream address. virtual size_t positionSize() = 0; + virtual bool SkipInt64(int64_t count) = 0; + + bool Skip(int32_t count) final override { + return SkipInt64(count); + } + void readFully(char* buffer, size_t bufferSize); }; @@ -64,16 +55,6 @@ class SeekableInputStream : public google::protobuf::io::ZeroCopyInputStream { * Create a seekable input stream based on a memory range. */ class SeekableArrayInputStream : public SeekableInputStream { - private: - // data may optionally be owned by *this via ownedData. - std::unique_ptr ownedData; - const char* data; - std::function()> dataRead; - uint64_t length; - uint64_t position; - uint64_t blockSize; - void loadIfAvailable(); - public: SeekableArrayInputStream( const unsigned char* list, @@ -94,30 +75,38 @@ class SeekableArrayInputStream : public SeekableInputStream { uint64_t block_size = 0); ~SeekableArrayInputStream() override = default; + virtual bool Next(const void** data, int32_t* size) override; virtual void BackUp(int32_t count) override; - virtual bool Skip(int32_t count) override; + virtual bool SkipInt64(int64_t count) override; virtual google::protobuf::int64 ByteCount() const override; virtual void seekToPosition(PositionProvider& position) override; virtual std::string getName() const override; virtual size_t positionSize() override; + + /// Return the total number of bytes returned from Next() calls. Intended to + /// be used for test validation. + int64_t totalRead() const { + return totalRead_; + } + + private: + void loadIfAvailable(); + + // data may optionally be owned by *this via ownedData. + const std::unique_ptr ownedData_; + const char* data_; + std::function()> dataRead_; + uint64_t length_; + uint64_t position_; + uint64_t blockSize_; + int64_t totalRead_ = 0; }; /** * Create a seekable input stream based on an io stream. 
*/ class SeekableFileInputStream : public SeekableInputStream { - private: - memory::MemoryPool& pool; - std::shared_ptr input; - LogType logType; - const uint64_t start; - const uint64_t length; - const uint64_t blockSize; - DataBuffer buffer; - uint64_t position; - uint64_t pushBack; - public: SeekableFileInputStream( std::shared_ptr input, @@ -130,11 +119,23 @@ class SeekableFileInputStream : public SeekableInputStream { virtual bool Next(const void** data, int32_t* size) override; virtual void BackUp(int32_t count) override; - virtual bool Skip(int32_t count) override; + virtual bool SkipInt64(int64_t count) override; virtual google::protobuf::int64 ByteCount() const override; virtual void seekToPosition(PositionProvider& position) override; virtual std::string getName() const override; virtual size_t positionSize() override; + + private: + const std::shared_ptr input_; + const LogType logType_; + const uint64_t start_; + const uint64_t length_; + const uint64_t blockSize_; + memory::MemoryPool* const pool_; + + DataBuffer buffer_; + uint64_t position_; + uint64_t pushback_; }; } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/SelectiveByteRleColumnReader.cpp b/velox/dwio/common/SelectiveByteRleColumnReader.cpp index e649a1d425f0f..97b00a71e1535 100644 --- a/velox/dwio/common/SelectiveByteRleColumnReader.cpp +++ b/velox/dwio/common/SelectiveByteRleColumnReader.cpp @@ -18,7 +18,9 @@ namespace facebook::velox::dwio::common { -void SelectiveByteRleColumnReader::getValues(RowSet rows, VectorPtr* result) { +void SelectiveByteRleColumnReader::getValues( + const RowSet& rows, + VectorPtr* result) { switch (requestedType_->kind()) { case TypeKind::BOOLEAN: getFlatValues(rows, result, requestedType_); diff --git a/velox/dwio/common/SelectiveByteRleColumnReader.h b/velox/dwio/common/SelectiveByteRleColumnReader.h index ed9a7e050dca3..43b5e3181bda0 100644 --- a/velox/dwio/common/SelectiveByteRleColumnReader.h +++ b/velox/dwio/common/SelectiveByteRleColumnReader.h @@ -24,29 +24,33 @@ class SelectiveByteRleColumnReader : public SelectiveColumnReader { public: SelectiveByteRleColumnReader( const TypePtr& requestedType, + std::shared_ptr fileType, dwio::common::FormatParams& params, - velox::common::ScanSpec& scanSpec, - std::shared_ptr type) + velox::common::ScanSpec& scanSpec) : SelectiveColumnReader( requestedType, + std::move(fileType), params, - scanSpec, - std::move(type)) {} + scanSpec) {} bool hasBulkPath() const override { return false; } - void getValues(RowSet rows, VectorPtr* result) override; + void getValues(const RowSet& rows, VectorPtr* result) override; - template + template < + typename Reader, + bool isDense, + bool kEncodingHasNulls, + typename ExtractValues> void processFilter( velox::common::Filter* filter, ExtractValues extractValues, - RowSet rows); + const RowSet& rows); template - void processValueHook(RowSet rows, ValueHook* hook); + void processValueHook(const RowSet& rows, ValueHook* hook); template < typename Reader, @@ -55,12 +59,14 @@ class SelectiveByteRleColumnReader : public SelectiveColumnReader { typename ExtractValues> void readHelper( velox::common::Filter* filter, - RowSet rows, + const RowSet& rows, ExtractValues extractValues); - template - void - readCommon(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls); + template + void readCommon( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls); }; template < @@ -70,7 +76,7 @@ template < typename ExtractValues> void 
SelectiveByteRleColumnReader::readHelper( velox::common::Filter* filter, - RowSet rows, + const RowSet& rows, ExtractValues extractValues) { reinterpret_cast(this)->readWithVisitor( rows, @@ -78,11 +84,15 @@ void SelectiveByteRleColumnReader::readHelper( *reinterpret_cast(filter), this, rows, extractValues)); } -template +template < + typename Reader, + bool isDense, + bool kEncodingHasNulls, + typename ExtractValues> void SelectiveByteRleColumnReader::processFilter( velox::common::Filter* filter, ExtractValues extractValues, - RowSet rows) { + const RowSet& rows) { using velox::common::FilterKind; switch (filter ? filter->kind() : FilterKind::kAlwaysTrue) { case FilterKind::kAlwaysTrue: @@ -90,13 +100,20 @@ void SelectiveByteRleColumnReader::processFilter( filter, rows, extractValues); break; case FilterKind::kIsNull: - filterNulls( - rows, - true, - !std::is_same_v); + if constexpr (kEncodingHasNulls) { + filterNulls( + rows, + true, + !std::is_same_v); + } else { + readHelper( + filter, rows, extractValues); + } break; case FilterKind::kIsNotNull: - if (std::is_same_v) { + if constexpr ( + kEncodingHasNulls && + std::is_same_v) { filterNulls(rows, false, false); } else { readHelper( @@ -130,31 +147,27 @@ void SelectiveByteRleColumnReader::processFilter( template void SelectiveByteRleColumnReader::processValueHook( - RowSet rows, + const RowSet& rows, ValueHook* hook) { using namespace facebook::velox::aggregate; switch (hook->kind()) { - case aggregate::AggregationHook::kSumBigintToBigint: + case aggregate::AggregationHook::kBigintSum: readHelper( - &dwio::common::alwaysTrue(), - rows, - dwio::common::ExtractToHook>(hook)); + &alwaysTrue(), rows, ExtractToHook>(hook)); break; default: readHelper( - &dwio::common::alwaysTrue(), - rows, - dwio::common::ExtractToGenericHook(hook)); + &alwaysTrue(), rows, ExtractToGenericHook(hook)); } } -template +template void SelectiveByteRleColumnReader::readCommon( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { prepareRead(offset, rows, incomingNulls); - bool isDense = rows.back() == rows.size() - 1; + const bool isDense = rows.back() == rows.size() - 1; velox::common::Filter* filter = scanSpec_->filter() ? 
scanSpec_->filter() : &dwio::common::alwaysTrue(); if (scanSpec_->keepValues()) { @@ -167,17 +180,19 @@ void SelectiveByteRleColumnReader::readCommon( return; } if (isDense) { - processFilter( + processFilter( filter, dwio::common::ExtractToReader(this), rows); } else { - processFilter( + processFilter( filter, dwio::common::ExtractToReader(this), rows); } } else { if (isDense) { - processFilter(filter, dwio::common::DropValues(), rows); + processFilter( + filter, dwio::common::DropValues(), rows); } else { - processFilter(filter, dwio::common::DropValues(), rows); + processFilter( + filter, dwio::common::DropValues(), rows); } } } diff --git a/velox/dwio/common/SelectiveColumnReader.cpp b/velox/dwio/common/SelectiveColumnReader.cpp index d4411b80727e7..c7a3ad67a4e2b 100644 --- a/velox/dwio/common/SelectiveColumnReader.cpp +++ b/velox/dwio/common/SelectiveColumnReader.cpp @@ -43,14 +43,14 @@ void ScanState::updateRawState() { SelectiveColumnReader::SelectiveColumnReader( const TypePtr& requestedType, + std::shared_ptr fileType, dwio::common::FormatParams& params, - velox::common::ScanSpec& scanSpec, - std::shared_ptr type) - : memoryPool_(params.pool()), - fileType_(type), - formatData_(params.toFormatData(type, scanSpec)), - scanSpec_(&scanSpec), - requestedType_(requestedType) {} + velox::common::ScanSpec& scanSpec) + : memoryPool_(¶ms.pool()), + requestedType_(requestedType), + fileType_(fileType), + formatData_(params.toFormatData(fileType, scanSpec)), + scanSpec_(&scanSpec) {} void SelectiveColumnReader::filterRowGroups( uint64_t rowGroupSize, @@ -70,13 +70,13 @@ void SelectiveColumnReader::seekTo(vector_size_t offset, bool readsNullsOnly) { return; } if (readOffset_ < offset) { - if (numParentNulls_) { + if (numParentNulls_ > 0) { VELOX_CHECK_LE( parentNullsRecordedTo_, offset, "Must not seek to before parentNullsRecordedTo_"); } - auto distance = offset - readOffset_ - numParentNulls_; + const auto distance = offset - readOffset_ - numParentNulls_; numParentNulls_ = 0; parentNullsRecordedTo_ = 0; if (readsNullsOnly) { @@ -86,47 +86,54 @@ void SelectiveColumnReader::seekTo(vector_size_t offset, bool readsNullsOnly) { } readOffset_ = offset; } else { - VELOX_FAIL("Seeking backward on a ColumnReader"); + VELOX_FAIL( + "Seeking backward on a ColumnReader from {} to {}", + readOffset_, + offset); + } +} + +void SelectiveColumnReader::initReturnReaderNulls(const RowSet& rows) { + if (useBulkPath() && !scanSpec_->hasFilter()) { + anyNulls_ = nullsInReadRange_ != nullptr; + const bool isDense = rows.back() == rows.size() - 1; + returnReaderNulls_ = anyNulls_ && isDense; + } else { + returnReaderNulls_ = false; } } void SelectiveColumnReader::prepareNulls( - RowSet rows, + const RowSet& rows, bool hasNulls, int32_t extraRows) { if (!hasNulls) { anyNulls_ = false; return; } - auto numRows = rows.size() + extraRows; - if (useBulkPath()) { - bool isDense = rows.back() == rows.size() - 1; - if (!scanSpec_->hasFilter()) { - anyNulls_ = nullsInReadRange_ != nullptr; - returnReaderNulls_ = anyNulls_ && isDense; - // No need for null flags if fast path - if (returnReaderNulls_) { - return; - } - } + + initReturnReaderNulls(rows); + if (returnReaderNulls_) { + // No need for null flags if fast path. + return; } + + const auto numRows = rows.size() + extraRows; if (resultNulls_ && resultNulls_->unique() && resultNulls_->capacity() >= bits::nbytes(numRows) + simd::kPadding) { - // Clear whole capacity because future uses could hit - // uncleared data between capacity() and 'numBytes'. 
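The dense-row fast paths in readCommon above and initReturnReaderNulls below all hinge on the same density test; a standalone illustration (isDense is a hypothetical helper, with the RowSet modeled as a plain vector of row numbers):

```cpp
#include <cstdint>
#include <vector>

using vector_size_t = int32_t; // Matches Velox's row index type.

// A row set holds sorted, distinct, non-negative row numbers, so it selects
// every row in [0, rows.back()] exactly when rows.back() == rows.size() - 1.
bool isDense(const std::vector<vector_size_t>& rows) {
  return !rows.empty() &&
      rows.back() == static_cast<vector_size_t>(rows.size()) - 1;
}

// isDense({0, 1, 2, 3}) == true: nulls decoded from the read range can be
// returned as-is (returnReaderNulls_). isDense({0, 2, 5}) == false: the null
// flags must first be compacted down to the selected rows.
```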
- simd::memset(rawResultNulls_, bits::kNotNullByte, resultNulls_->capacity()); - anyNulls_ = false; - return; + resultNulls_->setSize(bits::nbytes(numRows)); + } else { + resultNulls_ = AlignedBuffer::allocate( + numRows + (simd::kPadding * 8), memoryPool_); + rawResultNulls_ = resultNulls_->asMutable(); } - anyNulls_ = false; - resultNulls_ = AlignedBuffer::allocate( - numRows + (simd::kPadding * 8), &memoryPool_); - rawResultNulls_ = resultNulls_->asMutable(); + // Clear whole capacity because future uses could hit uncleared data between + // capacity() and 'numBytes'. simd::memset(rawResultNulls_, bits::kNotNullByte, resultNulls_->capacity()); } -const uint64_t* SelectiveColumnReader::shouldMoveNulls(RowSet rows) { +const uint64_t* SelectiveColumnReader::shouldMoveNulls(const RowSet& rows) { if (rows.size() == numValues_ || !anyNulls_) { // Nulls will only be moved if there is a selection on values. A cast // alone does not move nulls. @@ -137,7 +144,7 @@ const uint64_t* SelectiveColumnReader::shouldMoveNulls(RowSet rows) { if (!(resultNulls_ && resultNulls_->unique() && resultNulls_->capacity() >= rows.size() + simd::kPadding)) { resultNulls_ = AlignedBuffer::allocate( - rows.size() + (simd::kPadding * 8), &memoryPool_); + rows.size() + (simd::kPadding * 8), memoryPool_); rawResultNulls_ = resultNulls_->asMutable(); } moveFrom = nullsInReadRange_->as(); @@ -149,8 +156,40 @@ const uint64_t* SelectiveColumnReader::shouldMoveNulls(RowSet rows) { return moveFrom; } +void SelectiveColumnReader::setComplexNulls( + const RowSet& rows, + VectorPtr& result) const { + if (!nullsInReadRange_) { + if (result->isNullsWritable()) { + result->clearNulls(0, rows.size()); + } else { + result->resetNulls(); + } + return; + } + + const bool dense = 1 + rows.back() == rows.size(); + auto& nulls = result->nulls(); + if (dense && + !(nulls && nulls->isMutable() && + nulls->capacity() >= bits::nbytes(rows.size()))) { + result->setNulls(nullsInReadRange_); + return; + } + + auto* readerNulls = nullsInReadRange_->as(); + auto* resultNulls = result->mutableNulls(rows.size())->asMutable(); + if (dense) { + bits::copyBits(readerNulls, 0, resultNulls, 0, rows.size()); + return; + } + for (vector_size_t i = 0; i < rows.size(); ++i) { + bits::setBit(resultNulls, i, bits::isBitSet(readerNulls, rows[i])); + } +} + void SelectiveColumnReader::getIntValues( - RowSet rows, + const RowSet& rows, const TypePtr& requestedType, VectorPtr* result) { switch (requestedType->kind()) { @@ -220,9 +259,78 @@ void SelectiveColumnReader::getIntValues( } } +void SelectiveColumnReader::getUnsignedIntValues( + const RowSet& rows, + const TypePtr& requestedType, + VectorPtr* result) { + switch (requestedType->kind()) { + case TypeKind::TINYINT: + switch (valueSize_) { + case 1: + getFlatValues(rows, result, requestedType); + break; + case 4: + getFlatValues(rows, result, requestedType); + break; + default: + VELOX_FAIL("Unsupported value size: {}", valueSize_); + } + break; + case TypeKind::SMALLINT: + switch (valueSize_) { + case 2: + getFlatValues(rows, result, requestedType); + break; + case 4: + getFlatValues(rows, result, requestedType); + break; + default: + VELOX_FAIL("Unsupported value size: {}", valueSize_); + } + break; + case TypeKind::INTEGER: + switch (valueSize_) { + case 4: + getFlatValues(rows, result, requestedType); + break; + default: + VELOX_FAIL("Unsupported value size: {}", valueSize_); + } + break; + case TypeKind::BIGINT: + switch (valueSize_) { + case 4: + getFlatValues(rows, result, requestedType); + break; + 
case 8: + getFlatValues(rows, result, requestedType); + break; + default: + VELOX_FAIL("Unsupported value size: {}", valueSize_); + } + break; + case TypeKind::HUGEINT: + switch (valueSize_) { + case 8: + getFlatValues(rows, result, requestedType); + break; + case 16: + getFlatValues(rows, result, requestedType); + break; + default: + VELOX_FAIL("Unsupported value size: {}", valueSize_); + } + break; + default: + VELOX_FAIL( + "Not a valid type for unsigned integer reader: {}", + requestedType->toString()); + } +} + template <> void SelectiveColumnReader::getFlatValues( - RowSet rows, + const RowSet& rows, VectorPtr* result, const TypePtr& type, bool isFinal) { @@ -230,7 +338,7 @@ void SelectiveColumnReader::getFlatValues( VELOX_CHECK_EQ(valueSize_, sizeof(int8_t)); compactScalarValues(rows, isFinal); auto boolValues = - AlignedBuffer::allocate(numValues_, &memoryPool_, false); + AlignedBuffer::allocate(numValues_, memoryPool_, false); auto rawBytes = values_->as(); auto zero = xsimd::broadcast(0); if constexpr (kWidth == 32) { @@ -248,7 +356,7 @@ void SelectiveColumnReader::getFlatValues( } } *result = std::make_shared>( - &memoryPool_, + memoryPool_, type, resultNulls(), numValues_, @@ -258,7 +366,7 @@ void SelectiveColumnReader::getFlatValues( template <> void SelectiveColumnReader::compactScalarValues( - RowSet rows, + const RowSet& rows, bool isFinal) { if (!values_ || rows.size() == numValues_) { if (values_) { @@ -275,7 +383,7 @@ void SelectiveColumnReader::compactScalarValues( continue; } - VELOX_DCHECK(outputRows_[i] == nextRow); + VELOX_DCHECK_EQ(outputRows_[i], nextRow); bits::setBit(rawBits, rowIndex, bits::isBitSet(rawBits, i)); if (moveNullsFrom && rowIndex != i) { @@ -299,7 +407,7 @@ char* SelectiveColumnReader::copyStringValue(folly::StringPiece value) { uint64_t size = value.size(); if (stringBuffers_.empty() || rawStringUsed_ + size > rawStringSize_) { auto bytes = std::max(size, kStringBufferSize); - BufferPtr buffer = AlignedBuffer::allocate(bytes, &memoryPool_); + BufferPtr buffer = AlignedBuffer::allocate(bytes, memoryPool_); // Use the preferred size instead of the requested one to improve memory // efficiency. buffer->setSize(buffer->capacity()); @@ -322,17 +430,6 @@ void SelectiveColumnReader::addStringValue(folly::StringPiece value) { StringView(copy, value.size()); } -bool SelectiveColumnReader::readsNullsOnly() const { - auto filter = scanSpec_->filter(); - if (filter) { - auto kind = filter->kind(); - return kind == velox::common::FilterKind::kIsNull || - (!scanSpec_->keepValues() && - kind == velox::common::FilterKind::kIsNotNull); - } - return false; -} - void SelectiveColumnReader::setNulls(BufferPtr resultNulls) { resultNulls_ = resultNulls; rawResultNulls_ = resultNulls ? resultNulls->asMutable() : nullptr; @@ -360,8 +457,8 @@ void SelectiveColumnReader::resetFilterCaches() { void SelectiveColumnReader::addParentNulls( int32_t firstRowInNulls, const uint64_t* nulls, - RowSet rows) { - int32_t firstNullIndex = + const RowSet& rows) { + const int32_t firstNullIndex = readOffset_ < firstRowInNulls ? 0 : readOffset_ - firstRowInNulls; numParentNulls_ += nulls ? 
bits::countNulls(nulls, firstNullIndex, rows.back() + 1) : 0; diff --git a/velox/dwio/common/SelectiveColumnReader.h b/velox/dwio/common/SelectiveColumnReader.h index 4dab2518501dd..053db8884884f 100644 --- a/velox/dwio/common/SelectiveColumnReader.h +++ b/velox/dwio/common/SelectiveColumnReader.h @@ -18,7 +18,7 @@ #include "velox/common/base/RawVector.h" #include "velox/common/memory/Memory.h" #include "velox/common/process/ProcessBase.h" -#include "velox/dwio/common/ColumnSelector.h" +#include "velox/common/process/TraceHistory.h" #include "velox/dwio/common/FormatData.h" #include "velox/dwio/common/IntDecoder.h" #include "velox/dwio/common/Mutation.h" @@ -27,20 +27,20 @@ namespace facebook::velox::dwio::common { -// Generalized representation of a set of distinct values for dictionary -// encodings. +/// Generalized representation of a set of distinct values for dictionary +/// encodings. struct DictionaryValues { - // Array of values for dictionary. StringViews for string values. + /// Array of values for dictionary. StringViews for string values. BufferPtr values; - // For a string dictionary, holds the characters that are pointed to by - // StringViews in 'values'. + /// For a string dictionary, holds the characters that are pointed to by + /// StringViews in 'values'. BufferPtr strings; - // Number of valid elements in 'values'. + /// Number of valid elements in 'values'. int32_t numValues{0}; - // True if values are in ascending order. + /// True if values are in ascending order. bool sorted{false}; void clear() { @@ -49,10 +49,21 @@ struct DictionaryValues { numValues = 0; sorted = false; } + + /// Whether the dictionary values have filter on it. + static bool hasFilter(const velox::common::Filter* filter) { + // Dictionary values cannot be null. It's by design not possible in ORC and + // Parquet; in other formats even when it is possible in theory, it should + // not be used in a normal file because outside dictionary we only need 1 + // bit to encode a null, but if we move it inside dictionary, we would need + // 1 integer to encode a null. A sanity check can be added on encoding + // metadata for such formats. + return filter && filter->kind() != velox::common::FilterKind::kIsNotNull; + } }; struct RawDictionaryState { - const void* FOLLY_NULLABLE values{nullptr}; + const void* values{nullptr}; int32_t numValues{0}; }; @@ -63,22 +74,21 @@ struct RawScanState { // See comment in ScanState below. RawDictionaryState dictionary2; - const uint64_t* __restrict FOLLY_NULLABLE inDictionary{nullptr}; - uint8_t* __restrict FOLLY_NULLABLE filterCache; + const uint64_t* __restrict inDictionary{nullptr}; + uint8_t* __restrict filterCache; }; -// Maintains state for encoding between calls to readWithVisitor of -// individual readers. DWRF sets up encoding information at the -// start of a stripe and dictionaries at the start of stripes and -// optionally row groups. Other encodings can set dictionaries and -// encoding types at any time during processing a stripe. +// Maintains state for encoding between calls to readWithVisitor of individual +// readers. DWRF sets up encoding information at the start of a stripe and +// dictionaries at the start of stripes and optionally row groups. Other +// encodings can set dictionaries and encoding types at any time during +// processing a stripe. // -// This is the union of the state elements that the supported -// encodings require for keeping state. This may be augmented when -// adding formats. 
This is however inlined in the reader superclass -// ad not for example nodeled as a class hierarchy with virtual -// functions because this needs to be trivially and branchlessly -// accessible. +// This is the union of the state elements that the supported encodings require +// for keeping state. This may be augmented when adding formats. This is however +// inlined in the reader superclass and not for example modeled as a class +// hierarchy with virtual functions because this needs to be trivially and +// branchlessly accessible. struct ScanState { // Copies the owned values of 'this' into 'rawState'. void updateRawState(); @@ -94,14 +104,14 @@ struct ScanState { DictionaryValues dictionary; // If the format, like ORC/DWRF has a base dictionary completed by - // local delta dictionaries over the furst one, this represents the + // local delta dictionaries over the first one, this represents the // local values, e.g. row group dictionary in ORC. TBD: If there is // a pattern of dictionaries completed by more dictionaries in other // formats, this will be modeled as an vector of n DictionaryValues. DictionaryValues dictionary2; // Bits selecting between dictionary and dictionary2 or dictionary and - // literal. OR/DWRFC only. + // literal. ORC/DWRF only. BufferPtr inDictionary; // Copy of Visitor::rows_ adjusted to start at the current encoding @@ -124,9 +134,9 @@ class SelectiveColumnReader { SelectiveColumnReader( const TypePtr& requestedType, + std::shared_ptr<const dwio::common::TypeWithId> fileType, dwio::common::FormatParams& params, - velox::common::ScanSpec& scanSpec, - std::shared_ptr<const dwio::common::TypeWithId> type); + velox::common::ScanSpec& scanSpec); virtual ~SelectiveColumnReader() = default; @@ -140,7 +150,7 @@ /** * Read the next group of values into a RowVector. * @param numValues the number of values to read - * @param vector to read into + * @param result vector to read into */ virtual void next(uint64_t /*numValues*/, VectorPtr& /*result*/, const Mutation*) { @@ -159,23 +169,23 @@ // between this and the next call to read. virtual void read( vector_size_t offset, - RowSet rows, - const uint64_t* FOLLY_NULLABLE incomingNulls) = 0; + const RowSet& rows, + const uint64_t* incomingNulls) = 0; virtual uint64_t skip(uint64_t numValues) { return formatData_->skip(numValues); } - // Extracts the values at 'rows' into '*result'. May rewrite or - // reallocate '*result'. 'rows' must be the same set or a subset of - // 'rows' passed to the last 'read(). - virtual void getValues(RowSet rows, VectorPtr* FOLLY_NONNULL result) = 0; + /// Extracts the values at 'rows' into '*result'. May rewrite or reallocate + /// '*result'. 'rows' must be the same set or a subset of 'rows' passed to the + /// last read(). + virtual void getValues(const RowSet& rows, VectorPtr* result) = 0; // Returns the rows that were selected/visited by the last // read(). If 'this' has no filter, returns 'rows' passed to last // read(). const RowSet outputRows() const { - if (scanSpec_->hasFilter() || hasMutation()) { + if (scanSpec_->hasFilter() || hasDeletion()) { return outputRows_; } return inputRows_; @@ -185,11 +195,12 @@ // offset-th from the start of stripe. virtual void seekTo(vector_size_t offset, bool readsNullsOnly); - // Positions this at the start of 'index'th row - // group. Interpretation of 'index' depends on format. Clears counts - // of skipped enclosing struct nulls for formats where nulls are - // recorded at each nesting level, i.e. not rep-def.
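The 'filterCache' in RawScanState above is what makes dictionary encodings cheap to filter: each distinct dictionary value is tested at most once and the verdict is memoized by dictionary index. A minimal standalone sketch of the idea, with illustrative names and a plain predicate standing in for velox::common::Filter:

```
#include <cstdint>
#include <functional>
#include <vector>

enum class FilterResult : uint8_t { kUnknown, kPass, kFail };

// Test a row through the dictionary: the filter runs once per distinct
// value; every later row with the same dictionary index is a byte lookup.
bool testViaDictionary(
    uint32_t dictIndex,
    const std::vector<int64_t>& dictValues,
    const std::function<bool(int64_t)>& filter,
    std::vector<FilterResult>& filterCache) {
  if (filterCache[dictIndex] == FilterResult::kUnknown) {
    filterCache[dictIndex] = filter(dictValues[dictIndex])
        ? FilterResult::kPass
        : FilterResult::kFail;
  }
  return filterCache[dictIndex] == FilterResult::kPass;
}
```

The resetFilterCaches() call seen earlier in this patch exists for exactly this reason: once the dictionary changes, the memoized verdicts are stale and must be cleared.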
- virtual void seekToRowGroup(uint32_t /*index*/) { + /// Positions this at the start of 'index'th row group. Interpretation of + /// 'index' depends on format. Clears counts of skipped enclosing struct nulls + /// for formats where nulls are recorded at each nesting level, i.e. not + /// rep-def. + virtual void seekToRowGroup(uint32_t index) { + VELOX_TRACE_HISTORY_PUSH("seekToRowGroup %u", index); numParentNulls_ = 0; parentNullsRecordedTo_ = 0; } @@ -202,32 +213,41 @@ class SelectiveColumnReader { return *fileType_; } - // The below functions are called from ColumnVisitor to fill the result set. + /// The below functions are called from ColumnVisitor to fill the result set. inline void addOutputRow(vector_size_t row) { outputRows_.push_back(row); } // Returns a pointer to output rows with at least 'size' elements available. - vector_size_t* FOLLY_NONNULL mutableOutputRows(int32_t size) { - numOutConfirmed_ = outputRows_.size(); - outputRows_.resize(numOutConfirmed_ + size); - return outputRows_.data() + numOutConfirmed_; + vector_size_t* mutableOutputRows(int32_t size) { + auto numOutConfirmed = outputRows_.size(); + outputRows_.resize(numOutConfirmed + size); + return outputRows_.data() + numOutConfirmed; + } + + void* rawValues() { + return rawValues_; } template - T* FOLLY_NONNULL mutableValues(int32_t size) { + T* mutableValues(int32_t size) { DCHECK(values_->capacity() >= (numValues_ + size) * sizeof(T)); return reinterpret_cast(rawValues_) + numValues_; } + uint64_t valuesCapacity() const { + VELOX_DCHECK_NOT_NULL(values_); + return values_->capacity(); + } + // Returns a mutable pointer to start of result nulls // bitmap. Ensures that this has at least 'numValues_' + 'size' // capacity and is unique. If extending existing buffer, preserves // previous contents. - uint64_t* FOLLY_NONNULL mutableNulls(int32_t size) { + uint64_t* mutableNulls(int32_t size) { if (!resultNulls_->unique()) { resultNulls_ = AlignedBuffer::allocate( - numValues_ + size, &memoryPool_, bits::kNotNull); + numValues_ + size, memoryPool_, bits::kNotNull); rawResultNulls_ = resultNulls_->asMutable(); } if (resultNulls_->capacity() * 8 < numValues_ + size) { @@ -242,6 +262,10 @@ class SelectiveColumnReader { return rawResultNulls_; } + uint64_t* rawResultNulls() { + return rawResultNulls_; + } + // True if this reads contiguous rows starting at 0 and may have // nulls. If so, the nulls decoded from the nulls in encoded data // can be returned directly in the vector in getValues(). @@ -249,6 +273,8 @@ class SelectiveColumnReader { return returnReaderNulls_; } + void initReturnReaderNulls(const RowSet& rows); + void setNumValues(vector_size_t size) { numValues_ = size; } @@ -279,6 +305,10 @@ class SelectiveColumnReader { } } + bool hasNulls() const { + return anyNulls_; + } + void setHasNulls() { anyNulls_ = true; } @@ -294,30 +324,28 @@ class SelectiveColumnReader { template inline void addNull() { VELOX_DCHECK_NE(valueSize_, kNoValueSize); - VELOX_DCHECK_LE( - rawResultNulls_ && rawValues_ && (numValues_ + 1) * valueSize_, - values_->capacity()); + VELOX_DCHECK(rawResultNulls_ && rawValues_); + VELOX_DCHECK_LE((numValues_ + 1) * valueSize_, values_->capacity()); anyNulls_ = true; bits::setNull(rawResultNulls_, numValues_); - // Set the default value at the nominal width of the reader but - // calculate the index based on the actual width of the - // data. These may differ for integer and dictionary readers. 
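The comment rewritten in this hunk is worth unpacking with a standalone sketch (a hypothetical helper, not part of the patch): the default value written for a null is T-wide, the reader's nominal width, but its byte offset is computed from valueSize, the width the data is actually stored at after dictionary or integer compaction.

```
#include <cstdint>
#include <cstring>

// Write T() into slot 'index' of a buffer whose elements are 'valueSize'
// bytes wide. sizeof(T) and valueSize may differ; only the offset uses
// valueSize. This is safe in the reader because 'index' is the append
// position: nothing lives after it yet, so a nominal-width write over
// narrower slots clobbers nothing.
template <typename T>
void writeDefaultForNull(void* rawValues, int32_t index, int8_t valueSize) {
  const T defaultValue{};
  auto* bytes = reinterpret_cast<char*>(rawValues) +
      static_cast<int64_t>(valueSize) * index;
  std::memcpy(bytes, &defaultValue, sizeof(T));
}
```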
- auto valuesAsChar = reinterpret_cast(rawValues_); + // Set the default value at the nominal width of the reader but calculate + // the index based on the actual width of the data. These may differ for + // integer and dictionary readers. + auto* valuesAsChar = reinterpret_cast(rawValues_); *reinterpret_cast(valuesAsChar + valueSize_ * numValues_) = T(); - numValues_++; + ++numValues_; } template - inline void addValue(const T value) { - // @lint-ignore-every HOWTOEVEN ConstantArgumentPassByValue + inline void addValue(T value) { static_assert( std::is_pod_v, "General case of addValue is only for primitive types"); - VELOX_DCHECK_LE( - rawValues_ && (numValues_ + 1) * sizeof(T), values_->capacity()); + VELOX_DCHECK_NOT_NULL(rawValues_); + VELOX_DCHECK_LE((numValues_ + 1) * sizeof(T), values_->capacity()); reinterpret_cast(rawValues_)[numValues_] = value; - numValues_++; + ++numValues_; } void dropResults(vector_size_t count) { @@ -325,7 +353,7 @@ class SelectiveColumnReader { numValues_ -= count; } - velox::common::ScanSpec* FOLLY_NONNULL scanSpec() const { + velox::common::ScanSpec* scanSpec() const { return scanSpec_; } @@ -341,8 +369,8 @@ class SelectiveColumnReader { setReadOffset(readOffset); } - // Recursively sets 'isTopLevel_'. Recurses down non-nullable structs, - // otherwise only sets 'isTopLevel_' of 'this' + /// Recursively sets 'isTopLevel_'. Recurses down non-nullable structs, + /// otherwise only sets 'isTopLevel_' of 'this' virtual void setIsTopLevel() { isTopLevel_ = true; } @@ -376,12 +404,15 @@ class SelectiveColumnReader { return nullsInReadRange_; } - // Returns true if no filters or deterministic filters/hooks that - // discard nulls. This is used at read prepare time. useFastPath() - // in DecoderUtil.h is used at read time and is expected to produce - // the same result. - virtual bool useBulkPath() const { - auto filter = scanSpec_->filter(); + const uint64_t* rawNullsInReadRange() const { + return nullsInReadRange_ ? nullsInReadRange_->as() : nullptr; + } + + /// Returns true if no filters or deterministic filters/hooks that discard + /// nulls. This is used at read prepare time. useFastPath() in DecoderUtil.h + /// is used at read time and is expected to produce the same result. + bool useBulkPath() const { + auto* filter = scanSpec_->filter(); return hasBulkPath() && process::hasAvx2() && (!filter || (filter->isDeterministic() && @@ -409,18 +440,17 @@ class SelectiveColumnReader { // converts to direct in mid-read. virtual void dedictionarize() {} - // A reader nested inside nullable containers has fewer rows than - // the top level table. addParentNulls records how many parent nulls - // there are between the position of 'this' and 'rows.back() + 1', - // i.e. the position of the scan in top level rows. 'firstRowInNulls' is - // the top level row corresponding to the first bit in - // 'nulls'. 'nulls' is in terms of top level rows and represents all - // null parents at any enclosing level. 'nulls' is nullptr if there are no - // parent nulls. + /// A reader nested inside nullable containers has fewer rows than the top + /// level table. addParentNulls records how many parent nulls there are + /// between the position of 'this' and 'rows.back() + 1', i.e. the position of + /// the scan in top level rows. 'firstRowInNulls' is the top level row + /// corresponding to the first bit in 'nulls'. 'nulls' is in terms of top + /// level rows and represents all null parents at any enclosing level. 'nulls' + /// is nullptr if there are no parent nulls. 
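The bookkeeping that addParentNulls describes can be sketched standalone, following the bit convention the reader code relies on, where a set bit means not-null: counting parent nulls between two top-level positions tells a nested reader how far it lags the table scan. Names below are illustrative.

```
#include <cstdint>

// Count zero (null) bits in nulls[begin, end). A set bit means not-null.
int32_t countParentNulls(const uint64_t* nulls, int32_t begin, int32_t end) {
  int32_t numNulls = 0;
  for (int32_t i = begin; i < end; ++i) {
    if ((nulls[i / 64] & (uint64_t{1} << (i % 64))) == 0) {
      ++numNulls;
    }
  }
  return numNulls;
}

// A child positioned at top-level row 'row' has seen row - parentNulls
// values of its own, since null parents contribute no child rows.
int32_t childRowFor(int32_t row, int32_t parentNulls) {
  return row - parentNulls;
}
```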
void addParentNulls( int32_t firstRowInNulls, - const uint64_t* FOLLY_NULLABLE nulls, - RowSet rows); + const uint64_t* nulls, + const RowSet& rows); // When skipping rows in a struct, records how many parent nulls at // any level there are between top level row 'from' and 'to'. If @@ -432,44 +462,81 @@ class SelectiveColumnReader { static constexpr int8_t kNoValueSize = -1; static constexpr uint32_t kRowGroupNotSet = ~0; - // True if we have an is null filter and optionally return column - // values or we have an is not null filter and do not return column - // values. This means that only null flags need be accessed. - bool readsNullsOnly() const; - template void ensureValuesCapacity(vector_size_t numRows); // Prepares the result buffer for nulls for reading 'rows'. Leaves // 'extraSpace' bits worth of space in the nulls buffer. - void prepareNulls(RowSet rows, bool hasNulls, int32_t extraRows = 0); + void prepareNulls(const RowSet& rows, bool hasNulls, int32_t extraRows = 0); - protected: - // Filters 'rows' according to 'is_null'. Only applies to cases where - // readsNullsOnly() is true. + void setIsFlatMapValue(bool value) { + isFlatMapValue_ = value; + } + + /// Filters 'rows' according to 'isNull'. Only applies to cases where + /// scanSpec_->readsNullsOnly() is true. template - void filterNulls(RowSet rows, bool isNull, bool extractValues); + void filterNulls(const RowSet& rows, bool isNull, bool extractValues); + + // Temporary method for estimate total in-memory byte size and row count of + // current encoding chunk on this column for Nimble. Will be removed once + // column statistics are added for Nimble. Note that the estimations are + // based on current encoding chunk, so in multi-chunk stripe this is not + // accurate. Other formats should not use this. + virtual bool estimateMaterializedSize( + size_t& /*byteSize*/, + size_t& /*rowCount*/) const { + return false; + } + + StringView copyStringValueIfNeed(folly::StringPiece value) { + if (value.size() <= StringView::kInlineSize) { + return StringView(value); + } + auto* data = copyStringValue(value); + return StringView(data, value.size()); + } + + // Whether output rows should be filled when there is no column projected out + // and there is delete mutation. Used for row number generation. The case + // for no delete mutation is handled more efficiently outside column reader in + // `RowReader::readWithRowNumber'. + virtual void setFillMutatedOutputRows(bool /*value*/) { + VELOX_UNREACHABLE("Only struct reader supports this method"); + } + protected: template void prepareRead( vector_size_t offset, - RowSet rows, - const uint64_t* FOLLY_NULLABLE incomingNulls); + const RowSet& rows, + const uint64_t* incomingNulls); + + virtual bool readsNullsOnly() const { + return scanSpec_->readsNullsOnly(); + } - void setOutputRows(RowSet rows) { + void setOutputRows(const RowSet& rows) { outputRows_.resize(rows.size()); - if (!rows.size()) { + if (rows.empty()) { return; } - memcpy(outputRows_.data(), &rows[0], rows.size() * sizeof(vector_size_t)); + ::memcpy(outputRows_.data(), &rows[0], rows.size() * sizeof(vector_size_t)); } - // Returns integer values for 'rows' cast to the width of - // 'requestedType' in '*result'. + /// Returns integer values for 'rows' cast to the width of 'requestedType' in + /// '*result'. 
void getIntValues( - RowSet rows, + const RowSet& rows, const TypePtr& requestedType, - VectorPtr* FOLLY_NONNULL result); + VectorPtr* result); + + /// Returns integer values for 'rows' cast to the width of 'requestedType' in + /// '*result'; used when the file data type is an unsigned integer type. + void getUnsignedIntValues( + const RowSet& rows, + const TypePtr& requestedType, + VectorPtr* result); // Returns read values for 'rows' in 'vector'. This can be called // multiple times for consecutive subsets of 'rows'. If 'isFinal' is // true, this is free not to maintain the information mapping values // to rows. TODO: Consider isFinal as template parameter. template <typename T, typename TVector> void getFlatValues( - RowSet rows, - VectorPtr* FOLLY_NONNULL result, + const RowSet& rows, + VectorPtr* result, const TypePtr& type, bool isFinal = false); template <typename T, typename TVector> - void compactScalarValues(RowSet rows, bool isFinal); - - // Compacts values extracted for a complex type column with - // filter. The values for 'rows' are shifted to be consecutive at - // indices [0..rows.size() - 1]'. 'move' is a function that takes - // two indices source and target and moves the value at source to - // target. target is <= source for all calls. - template <typename Move> - void compactComplexValues(RowSet rows, Move move, bool isFinal); + void compactScalarValues(const RowSet& rows, bool isFinal); template <typename T, typename TVector> - void upcastScalarValues(RowSet rows); + void upcastScalarValues(const RowSet& rows); + + // For complex type column, we need to compact only nulls if the rows are + // shrunk. Child fields are handled recursively in their own column + // readers. + void setComplexNulls(const RowSet& rows, VectorPtr& result) const; // Return the source null bits if compactScalarValues and upcastScalarValues // should move null flags. Return nullptr if nulls does not need to be moved. // Checks consistency of nulls-related state. - const uint64_t* shouldMoveNulls(RowSet rows); + const uint64_t* shouldMoveNulls(const RowSet& rows); void addStringValue(folly::StringPiece value); // Copies 'value' to buffers owned by 'this' and returns the start of the // copy. - char* FOLLY_NONNULL copyStringValue(folly::StringPiece value); + char* copyStringValue(folly::StringPiece value); - virtual bool hasMutation() const { + virtual bool hasDeletion() const { return false; } @@ -515,8 +579,8 @@ void decodeWithVisitor( IntDecoder* intDecoder, ColumnVisitor& visitor) { - auto decoder = dynamic_cast<Decoder*>(intDecoder); - VELOX_CHECK( + auto* decoder = dynamic_cast<Decoder*>(intDecoder); + VELOX_CHECK_NOT_NULL( decoder, "Unexpected Decoder type, Expected: {}", typeid(Decoder).name()); @@ -536,21 +600,21 @@ : resultNulls_; } - memory::MemoryPool& memoryPool_; + memory::MemoryPool* const memoryPool_; + + // The requested data type + const TypePtr requestedType_; // The file data type - std::shared_ptr<const dwio::common::TypeWithId> fileType_; + const std::shared_ptr<const dwio::common::TypeWithId> fileType_; // Format specific state and functions. - std::unique_ptr<FormatData> formatData_; + const std::unique_ptr<FormatData> formatData_; // Specification of filters, value extraction, pruning etc. The // spec is assigned at construction and the contents may change at // run time based on adaptation. Owned by caller. - velox::common::ScanSpec* FOLLY_NONNULL scanSpec_; - - // The requested data type - TypePtr requestedType_; + velox::common::ScanSpec* const scanSpec_; // Row number after last read row, relative to the ORC stripe or Parquet // Rowgroup start. @@ -568,15 +632,12 @@ // The rows to process in read().
References memory supplied by // caller. The values must remain live until the next call to read(). RowSet inputRows_; - // Rows passing the filter in readWithVisitor. Must stay - // constant between consecutive calls to read(). + // Rows passing the filter in readWithVisitor. Must stay constant between + // consecutive calls to read(). raw_vector outputRows_; - // Index of last set value in outputRows. Values between this and - // size() can be used as scratchpad inside read(). - vector_size_t numOutConfirmed_; - // The row number - // corresponding to each element in 'values_' + // The row number corresponding to each element in 'values_' raw_vector valueRows_; + // The set of all nulls in the range of read(). Created when first // needed and then reused. May be referenced by result if all rows are // selected. @@ -584,11 +645,11 @@ class SelectiveColumnReader { // Nulls buffer for readWithVisitor. Not set if no nulls. 'numValues' // is the index of the first non-set bit. BufferPtr resultNulls_; - uint64_t* FOLLY_NULLABLE rawResultNulls_ = nullptr; + uint64_t* rawResultNulls_ = nullptr; // Buffer for gathering scalar values in readWithVisitor. BufferPtr values_; // Writable content in 'values' - void* FOLLY_NULLABLE rawValues_ = nullptr; + void* rawValues_ = nullptr; vector_size_t numValues_ = 0; // Size of fixed width value in 'rawValues'. For integers, values // are read at 64 bit width and can be compacted or extracted at a @@ -598,11 +659,10 @@ class SelectiveColumnReader { // true if 'this' is in a state where gatValues can be called. bool mayGetValues_ = false; - // True if row numbers of 'this' correspond 1:1 to row numbers in - // the file. This is false inside lists, maps and nullable - // structs. If true, a skip of n rows can use row group indices to - // skip long distances. Lazy vectors will only be made for results - // of top level readers. + // True if row numbers of 'this' correspond 1:1 to row numbers in the file. + // This is false inside lists, maps and nullable structs. If true, a skip of n + // rows can use row group indices to skip long distances. Lazy vectors will + // only be made for results of top level readers. bool isTopLevel_{false}; // Maps from position in non-null rows to a position in value @@ -614,12 +674,12 @@ class SelectiveColumnReader { // Buffers backing the StringViews in 'values' when reading strings. std::vector stringBuffers_; // Writable contents of 'stringBuffers_.back()'. - char* FOLLY_NULLABLE rawStringBuffer_ = nullptr; + char* rawStringBuffer_ = nullptr; // True if a vector can acquire a pin to a stream's buffer and refer // to that as its values. bool mayUseStreamBuffer_ = false; - // True if nulls and everything selected, so that nullsInReadRange - // can be returned as the null flags of the vector in getValues(). + // True if nulls and everything selected, so that nullsInReadRange can be + // returned as the null flags of the vector in getValues(). bool returnReaderNulls_ = false; // Total writable bytes in 'rawStringBuffer_'. int32_t rawStringSize_ = 0; @@ -636,6 +696,19 @@ class SelectiveColumnReader { // Encoding-related state to keep between reads, e.g. dictionaries. ScanState scanState_; + + // Whether this column reader is for a flatmap value column and the result is + // an ordinary map. If this is true, the nullsInReadRange_ and value_ will + // never be shared outside file reader and we can reuse them regardless of + // refcounts. 
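A minimal model of the reuse rule these flatmap-value members enable, using std::shared_ptr in place of Velox's BufferPtr (illustrative, not the actual API): a buffer may be overwritten in place only if the reader holds the sole reference, and the flatmap-value flag lets the reader skip that check because it knows the buffer never escapes the file reader.

```
#include <cstdint>
#include <memory>
#include <vector>

using Buffer = std::vector<uint64_t>;
using BufferPtr = std::shared_ptr<Buffer>;

BufferPtr reuseOrAllocate(BufferPtr current, size_t words, bool neverShared) {
  if (current && (neverShared || current.use_count() == 1) &&
      current->size() >= words) {
    return current; // Sole owner (or provably unshared): recycle in place.
  }
  return std::make_shared<Buffer>(words); // Otherwise allocate fresh.
}
```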
+ bool isFlatMapValue_ = false; + + // When isFlatMapValue_ is true, these fields are used to hold + // nullsInReadRange_ and value_ memory that can be reused when they switch + // between null and non-null values. + BufferPtr flatMapValueNullsInReadRange_; + VectorPtr flatMapValueFlatValues_; + VectorPtr flatMapValueConstantNullValues_; }; template <> @@ -656,14 +729,24 @@ inline void SelectiveColumnReader::addValue(const folly::StringPiece value) { addStringValue(value); } +velox::common::AlwaysTrue& alwaysTrue(); + } // namespace facebook::velox::dwio::common namespace facebook::velox::dwio::common { + // Template parameter to indicate no hook in fast scan path. This is // referenced in decoders, thus needs to be declared in a header. -struct NoHook : public ValueHook { - void addValue(vector_size_t /*row*/, const void* FOLLY_NULLABLE /*value*/) - override {} +struct NoHook final : public ValueHook { + void addValue(vector_size_t /*row*/, int64_t /*value*/) final {} + + void addValue(vector_size_t /*row*/, int128_t /*value*/) final {} + + void addValue(vector_size_t /*row*/, float /*value*/) final {} + + void addValue(vector_size_t /*row*/, double /*value*/) final {} + + void addValue(vector_size_t /*row*/, folly::StringPiece /*value*/) final {} }; } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/SelectiveColumnReaderInternal.h b/velox/dwio/common/SelectiveColumnReaderInternal.h index e4df3ebcb958b..689f54b5e818b 100644 --- a/velox/dwio/common/SelectiveColumnReaderInternal.h +++ b/velox/dwio/common/SelectiveColumnReaderInternal.h @@ -31,46 +31,41 @@ namespace facebook::velox::dwio::common { -velox::common::AlwaysTrue& alwaysTrue(); - -class Timer { - public: - Timer() : startClocks_{folly::hardware_timestamp()} {} - - uint64_t elapsedClocks() const { - return folly::hardware_timestamp() - startClocks_; - } - - private: - const uint64_t startClocks_; -}; - template void SelectiveColumnReader::ensureValuesCapacity(vector_size_t numRows) { - if (values_ && values_->unique() && + if (values_ && (isFlatMapValue_ || values_->unique()) && values_->capacity() >= BaseVector::byteSize(numRows) + simd::kPadding) { return; } values_ = AlignedBuffer::allocate( - numRows + (simd::kPadding / sizeof(T)), &memoryPool_); + numRows + (simd::kPadding / sizeof(T)), memoryPool_); rawValues_ = values_->asMutable(); } template void SelectiveColumnReader::prepareRead( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { - seekTo(offset, scanSpec_->readsNullsOnly()); - vector_size_t numRows = rows.back() + 1; + const bool readsNullsOnly = this->readsNullsOnly(); + seekTo(offset, readsNullsOnly); - // Do not re-use unless singly-referenced. - if (nullsInReadRange_ && !nullsInReadRange_->unique()) { + const vector_size_t numRows = rows.back() + 1; + if (isFlatMapValue_) { + if (!nullsInReadRange_) { + nullsInReadRange_ = std::move(flatMapValueNullsInReadRange_); + } + } else if (nullsInReadRange_ && !nullsInReadRange_->unique()) { nullsInReadRange_.reset(); } + formatData_->readNulls( - numRows, incomingNulls, nullsInReadRange_, readsNullsOnly()); + numRows, incomingNulls, nullsInReadRange_, readsNullsOnly); + if (isFlatMapValue_ && nullsInReadRange_) { + flatMapValueNullsInReadRange_ = nullsInReadRange_; + } + // We check for all nulls and no nulls. We expect both calls to // bits::isAllSet to fail early in the common case. 
We could do a // single traversal of null bits counting the bits and then compare @@ -85,18 +80,19 @@ void SelectiveColumnReader::prepareRead( nullsInReadRange_->as(), 0, numRows, bits::kNotNull)) { nullsInReadRange_ = nullptr; } + innerNonNullRows_.clear(); outerNonNullRows_.clear(); outputRows_.clear(); - // is part of read() and after read returns getValues may be called. + // Is part of read() and after read returns getValues may be called. mayGetValues_ = true; - numOutConfirmed_ = 0; numValues_ = 0; valueSize_ = sizeof(T); inputRows_ = rows; - if (scanSpec_->filter() || hasMutation()) { + if (scanSpec_->filter() || hasDeletion()) { outputRows_.reserve(rows.size()); } + ensureValuesCapacity(rows.size()); if (scanSpec_->keepValues() && !scanSpec_->valueHook()) { valueRows_.clear(); @@ -106,7 +102,7 @@ void SelectiveColumnReader::prepareRead( template void SelectiveColumnReader::getFlatValues( - RowSet rows, + const RowSet& rows, VectorPtr* result, const TypePtr& type, bool isFinal) { @@ -115,17 +111,24 @@ void SelectiveColumnReader::getFlatValues( if (isFinal) { mayGetValues_ = false; } + if (allNull_) { - *result = std::make_shared>( - &memoryPool_, - rows.size(), - true, - type, - T(), - SimpleVectorStats{}, - sizeof(TVector) * rows.size()); + if (isFlatMapValue_) { + if (flatMapValueConstantNullValues_) { + flatMapValueConstantNullValues_->resize(rows.size()); + } else { + flatMapValueConstantNullValues_ = + std::make_shared>( + memoryPool_, rows.size(), true, type, T()); + } + *result = flatMapValueConstantNullValues_; + } else { + *result = std::make_shared>( + memoryPool_, rows.size(), true, type, T()); + } return; } + if (valueSize_ == sizeof(TVector)) { compactScalarValues(rows, isFinal); } else if (sizeof(T) >= sizeof(TVector)) { @@ -134,24 +137,43 @@ void SelectiveColumnReader::getFlatValues( upcastScalarValues(rows); } valueSize_ = sizeof(TVector); - *result = std::make_shared>( - &memoryPool_, - type, - resultNulls(), - numValues_, - values_, - std::move(stringBuffers_)); + if (isFlatMapValue_) { + if (flatMapValueFlatValues_) { + auto* flat = flatMapValueFlatValues_->asUnchecked>(); + flat->unsafeSetSize(numValues_); + flat->setNulls(resultNulls()); + flat->unsafeSetValues(values_); + flat->setStringBuffers(std::move(stringBuffers_)); + } else { + flatMapValueFlatValues_ = std::make_shared>( + memoryPool_, + type, + resultNulls(), + numValues_, + values_, + std::move(stringBuffers_)); + } + *result = flatMapValueFlatValues_; + } else { + *result = std::make_shared>( + memoryPool_, + type, + resultNulls(), + numValues_, + values_, + std::move(stringBuffers_)); + } } template <> void SelectiveColumnReader::getFlatValues( - RowSet rows, + const RowSet& rows, VectorPtr* result, const TypePtr& type, bool isFinal); template -void SelectiveColumnReader::upcastScalarValues(RowSet rows) { +void SelectiveColumnReader::upcastScalarValues(const RowSet& rows) { VELOX_CHECK_LE(rows.size(), numValues_); VELOX_CHECK(!rows.empty()); if (!values_) { @@ -206,7 +228,9 @@ void SelectiveColumnReader::upcastScalarValues(RowSet rows) { } template -void SelectiveColumnReader::compactScalarValues(RowSet rows, bool isFinal) { +void SelectiveColumnReader::compactScalarValues( + const RowSet& rows, + bool isFinal) { VELOX_CHECK_LE(rows.size(), numValues_); VELOX_CHECK(!rows.empty()); if (!values_ || (rows.size() == numValues_ && sizeof(T) == sizeof(TVector))) { @@ -215,6 +239,7 @@ void SelectiveColumnReader::compactScalarValues(RowSet rows, bool isFinal) { } return; } + VELOX_CHECK_LE(sizeof(TVector), 
sizeof(T)); T* typedSourceValues = reinterpret_cast(rawValues_); TVector* typedDestValues = reinterpret_cast(rawValues_); @@ -233,15 +258,16 @@ void SelectiveColumnReader::compactScalarValues(RowSet rows, bool isFinal) { if (valueRows_.empty()) { valueRows_.resize(rows.size()); } + vector_size_t rowIndex = 0; auto nextRow = rows[rowIndex]; - auto* moveNullsFrom = shouldMoveNulls(rows); - for (size_t i = 0; i < numValues_; i++) { + const auto* moveNullsFrom = shouldMoveNulls(rows); + for (size_t i = 0; i < numValues_; ++i) { if (sourceRows[i] < nextRow) { continue; } - VELOX_DCHECK(sourceRows[i] == nextRow); + VELOX_DCHECK_EQ(sourceRows[i], nextRow); typedDestValues[rowIndex] = typedSourceValues[i]; if (moveNullsFrom && rowIndex != i) { bits::setBit(rawResultNulls_, rowIndex, bits::isBitSet(moveNullsFrom, i)); @@ -249,12 +275,13 @@ void SelectiveColumnReader::compactScalarValues(RowSet rows, bool isFinal) { if (!isFinal) { valueRows_[rowIndex] = nextRow; } - rowIndex++; + ++rowIndex; if (rowIndex >= rows.size()) { break; } nextRow = rows[rowIndex]; } + numValues_ = rows.size(); valueRows_.resize(numValues_); values_->setSize(numValues_ * sizeof(TVector)); @@ -262,7 +289,7 @@ void SelectiveColumnReader::compactScalarValues(RowSet rows, bool isFinal) { template <> void SelectiveColumnReader::compactScalarValues( - RowSet rows, + const RowSet& rows, bool isFinal); inline int32_t sizeOfIntKind(TypeKind kind) { @@ -274,72 +301,20 @@ inline int32_t sizeOfIntKind(TypeKind kind) { case TypeKind::BIGINT: return 8; default: - VELOX_FAIL("Not an integer TypeKind"); + VELOX_FAIL("Not an integer TypeKind: {}", static_cast(kind)); } } -template -void SelectiveColumnReader::compactComplexValues( - RowSet rows, - Move move, - bool isFinal) { - VELOX_CHECK_LE(rows.size(), outputRows_.size()); - VELOX_CHECK(!rows.empty()); - if (rows.size() == outputRows_.size()) { - return; - } - RowSet sourceRows; - // The row numbers corresponding to elements in 'values_' are in - // 'valueRows_' if values have been accessed before. Otherwise - // they are in 'outputRows_' if these are non-empty (there is a - // filter) and in 'inputRows_' otherwise. - if (!valueRows_.empty()) { - sourceRows = valueRows_; - } else if (!outputRows_.empty()) { - sourceRows = outputRows_; - } else { - sourceRows = inputRows_; - } - if (valueRows_.empty()) { - valueRows_.resize(rows.size()); - } - vector_size_t rowIndex = 0; - auto nextRow = rows[rowIndex]; - auto* moveNullsFrom = shouldMoveNulls(rows); - for (size_t i = 0; i < numValues_; i++) { - if (sourceRows[i] < nextRow) { - continue; - } - - VELOX_DCHECK(sourceRows[i] == nextRow); - // The value at i is moved to be the value at 'rowIndex'. - move(i, rowIndex); - if (moveNullsFrom && rowIndex != i) { - bits::setBit(rawResultNulls_, rowIndex, bits::isBitSet(moveNullsFrom, i)); - } - if (!isFinal) { - valueRows_[rowIndex] = nextRow; - } - rowIndex++; - if (rowIndex >= rows.size()) { - break; - } - nextRow = rows[rowIndex]; - } - numValues_ = rows.size(); - valueRows_.resize(numValues_); -} - template void SelectiveColumnReader::filterNulls( - RowSet rows, + const RowSet& rows, bool isNull, bool extractValues) { - bool isDense = rows.back() == rows.size() - 1; + const bool isDense = rows.back() == rows.size() - 1; // We decide is (not) null based on 'nullsInReadRange_'. This may be // set due to nulls in enclosing structs even if the column itself // does not add nulls. - auto rawNulls = + auto* rawNulls = nullsInReadRange_ ? 
nullsInReadRange_->as() : nullptr; if (isNull) { if (!rawNulls) { @@ -362,7 +337,6 @@ void SelectiveColumnReader::filterNulls( } } } - return; } diff --git a/velox/dwio/common/SelectiveFloatingPointColumnReader.h b/velox/dwio/common/SelectiveFloatingPointColumnReader.h index 2e71d7aebc2fb..97eb1e7583562 100644 --- a/velox/dwio/common/SelectiveFloatingPointColumnReader.h +++ b/velox/dwio/common/SelectiveFloatingPointColumnReader.h @@ -26,26 +26,28 @@ class SelectiveFloatingPointColumnReader : public SelectiveColumnReader { using ValueType = TRequested; SelectiveFloatingPointColumnReader( const TypePtr& requestedType, - std::shared_ptr dataType, + std::shared_ptr fileType, FormatParams& params, velox::common::ScanSpec& scanSpec) : SelectiveColumnReader( requestedType, + std::move(fileType), params, - scanSpec, - std::move(dataType)) {} + scanSpec) {} // Offers fast path only if data and result widths match. bool hasBulkPath() const override { return std::is_same_v; } - template - void - readCommon(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls); + template + void readCommon( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls); - void getValues(RowSet rows, VectorPtr* result) override { - getFlatValues(rows, result, requestedType_); + void getValues(const RowSet& rows, VectorPtr* result) override { + getFlatValues(rows, result, requestedType_); } protected: @@ -54,17 +56,23 @@ class SelectiveFloatingPointColumnReader : public SelectiveColumnReader { typename TFilter, bool isDense, typename ExtractValues> - void - readHelper(velox::common::Filter* filter, RowSet rows, ExtractValues values); + void readHelper( + velox::common::Filter* filter, + const RowSet& rows, + ExtractValues values); - template + template < + typename Reader, + bool isDense, + bool kEncodingHasNulls, + typename ExtractValues> void processFilter( velox::common::Filter* filter, - RowSet rows, + const RowSet& rows, ExtractValues extractValues); template - void processValueHook(RowSet rows, ValueHook* hook); + void processValueHook(const RowSet& rows, ValueHook* hook); }; template @@ -75,31 +83,48 @@ template < typename ExtractValues> void SelectiveFloatingPointColumnReader::readHelper( velox::common::Filter* filter, - RowSet rows, + const RowSet& rows, ExtractValues extractValues) { reinterpret_cast(this)->readWithVisitor( rows, - ColumnVisitor( + ColumnVisitor( *reinterpret_cast(filter), this, rows, extractValues)); } template -template +template < + typename Reader, + bool isDense, + bool kEncodingHasNulls, + typename ExtractValues> void SelectiveFloatingPointColumnReader::processFilter( velox::common::Filter* filter, - RowSet rows, + const RowSet& rows, ExtractValues extractValues) { - switch (filter ? 
filter->kind() : velox::common::FilterKind::kAlwaysTrue) { + if (filter == nullptr) { + readHelper( + &dwio::common::alwaysTrue(), rows, extractValues); + return; + } + + switch (filter->kind()) { case velox::common::FilterKind::kAlwaysTrue: readHelper( filter, rows, extractValues); break; case velox::common::FilterKind::kIsNull: - filterNulls( - rows, true, !std::is_same_v); + if constexpr (kEncodingHasNulls) { + filterNulls( + rows, true, !std::is_same_v); + } else { + readHelper( + filter, rows, extractValues); + } break; case velox::common::FilterKind::kIsNotNull: - if (std::is_same_v) { + if constexpr ( + kEncodingHasNulls && + std::is_same_v) { filterNulls(rows, false, false); } else { readHelper( @@ -121,30 +146,20 @@ void SelectiveFloatingPointColumnReader::processFilter( template template void SelectiveFloatingPointColumnReader::processValueHook( - RowSet rows, + const RowSet& rows, ValueHook* hook) { switch (hook->kind()) { - case aggregate::AggregationHook::kSumFloatToDouble: - readHelper( - &alwaysTrue(), - rows, - ExtractToHook>(hook)); - break; - case aggregate::AggregationHook::kSumDoubleToDouble: + case aggregate::AggregationHook::kDoubleSum: readHelper( - &alwaysTrue(), - rows, - ExtractToHook>(hook)); + &alwaysTrue(), rows, ExtractToHook>(hook)); break; - case aggregate::AggregationHook::kFloatMax: - case aggregate::AggregationHook::kDoubleMax: + case aggregate::AggregationHook::kFloatingPointMax: readHelper( &alwaysTrue(), rows, ExtractToHook>(hook)); break; - case aggregate::AggregationHook::kFloatMin: - case aggregate::AggregationHook::kDoubleMin: + case aggregate::AggregationHook::kFloatingPointMin: readHelper( &alwaysTrue(), rows, @@ -157,12 +172,12 @@ void SelectiveFloatingPointColumnReader::processValueHook( } template -template +template void SelectiveFloatingPointColumnReader::readCommon( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { - prepareRead(offset, rows, incomingNulls); + prepareRead(offset, rows, incomingNulls); bool isDense = rows.back() == rows.size() - 1; if (scanSpec_->keepValues()) { if (scanSpec_->valueHook()) { @@ -173,18 +188,20 @@ void SelectiveFloatingPointColumnReader::readCommon( } } else { if (isDense) { - processFilter( + processFilter( scanSpec_->filter(), rows, ExtractToReader(this)); } else { - processFilter( + processFilter( scanSpec_->filter(), rows, ExtractToReader(this)); } } } else { if (isDense) { - processFilter(scanSpec_->filter(), rows, DropValues()); + processFilter( + scanSpec_->filter(), rows, DropValues()); } else { - processFilter(scanSpec_->filter(), rows, DropValues()); + processFilter( + scanSpec_->filter(), rows, DropValues()); } } } diff --git a/velox/dwio/common/SelectiveIntegerColumnReader.h b/velox/dwio/common/SelectiveIntegerColumnReader.h index cb88227d1c2bd..2716f800bba83 100644 --- a/velox/dwio/common/SelectiveIntegerColumnReader.h +++ b/velox/dwio/common/SelectiveIntegerColumnReader.h @@ -20,37 +20,41 @@ namespace facebook::velox::dwio::common { -// Abstract class for format and encoding-independent parts of reading ingeger -// columns. +/// Abstract class for format and encoding-independent parts of reading integer +/// columns. 
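Throughout these integer and floating-point readers, processFilter and readCommon take isDense and kEncodingHasNulls as template rather than runtime parameters. A standalone sketch of why (all names here are illustrative, not the Velox API): hoisting a per-batch property into a template argument compiles the hot loop without the branch.

```
#include <cstdint>
#include <vector>

// Dense row sets (rows == 0..n-1) let the loop drop the indirection through
// 'rows'; making density a template parameter removes even the branch.
template <bool isDense>
int64_t sumSelected(
    const std::vector<int64_t>& values,
    const std::vector<int32_t>& rows) {
  int64_t sum = 0;
  if constexpr (isDense) {
    for (size_t i = 0; i < rows.size(); ++i) {
      sum += values[i]; // Dense: row i is exactly i.
    }
  } else {
    for (const auto row : rows) {
      sum += values[row];
    }
  }
  return sum;
}

// Callers branch once per batch, not once per row:
//   isDense ? sumSelected<true>(values, rows) : sumSelected<false>(values, rows);
```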
class SelectiveIntegerColumnReader : public SelectiveColumnReader { public: SelectiveIntegerColumnReader( const TypePtr& requestedType, dwio::common::FormatParams& params, velox::common::ScanSpec& scanSpec, - std::shared_ptr type) + std::shared_ptr fileType) : SelectiveColumnReader( requestedType, + std::move(fileType), params, - scanSpec, - std::move(type)) {} + scanSpec) {} - void getValues(RowSet rows, VectorPtr* result) override { + void getValues(const RowSet& rows, VectorPtr* result) override { getIntValues(rows, requestedType_, result); } protected: // Switches based on filter type between different readHelper instantiations. - template + template < + typename Reader, + bool isDense, + bool kEncodingHasNulls, + typename ExtractValues> void processFilter( velox::common::Filter* filter, ExtractValues extractValues, - RowSet rows); + const RowSet& rows); // Switches based on the type of ValueHook between different readWithVisitor // instantiations. - template - void processValueHook(RowSet rows, ValueHook* hook); + template + void processValueHook(const RowSet& rows, ValueHook* hook); // Instantiates a Visitor based on type, isDense, value processing. template < @@ -60,14 +64,14 @@ class SelectiveIntegerColumnReader : public SelectiveColumnReader { typename ExtractValues> void readHelper( velox::common::Filter* filter, - RowSet rows, + const RowSet& rows, ExtractValues extractValues); // The common part of integer reading. calls the appropriate // instantiation of processValueHook or processFilter based on // possible value hook, filter and denseness. - template - void readCommon(RowSet rows); + template + void readCommon(const RowSet& rows); }; template < @@ -77,7 +81,7 @@ template < typename ExtractValues> void SelectiveIntegerColumnReader::readHelper( velox::common::Filter* filter, - RowSet rows, + const RowSet& rows, ExtractValues extractValues) { switch (valueSize_) { case 2: @@ -113,22 +117,39 @@ void SelectiveIntegerColumnReader::readHelper( } } -template +template < + typename Reader, + bool isDense, + bool kEncodingHasNulls, + typename ExtractValues> void SelectiveIntegerColumnReader::processFilter( velox::common::Filter* filter, ExtractValues extractValues, - RowSet rows) { - switch (filter ? 
filter->kind() : velox::common::FilterKind::kAlwaysTrue) { + const RowSet& rows) { + if (filter == nullptr) { + readHelper( + &dwio::common::alwaysTrue(), rows, extractValues); + return; + } + + switch (filter->kind()) { case velox::common::FilterKind::kAlwaysTrue: readHelper( filter, rows, extractValues); break; case velox::common::FilterKind::kIsNull: - filterNulls( - rows, true, !std::is_same_v); + if constexpr (kEncodingHasNulls) { + filterNulls( + rows, true, !std::is_same_v); + } else { + readHelper( + filter, rows, extractValues); + } break; case velox::common::FilterKind::kIsNotNull: - if (std::is_same_v) { + if constexpr ( + kEncodingHasNulls && + std::is_same_v) { filterNulls(rows, false, false); } else { readHelper( @@ -172,14 +193,20 @@ void SelectiveIntegerColumnReader::processFilter( template void SelectiveIntegerColumnReader::processValueHook( - RowSet rows, + const RowSet& rows, ValueHook* hook) { switch (hook->kind()) { - case aggregate::AggregationHook::kSumBigintToBigint: + case aggregate::AggregationHook::kBigintSum: + readHelper( + &alwaysTrue(), + rows, + ExtractToHook>(hook)); + break; + case aggregate::AggregationHook::kBigintSumOverflow: readHelper( &alwaysTrue(), rows, - ExtractToHook>(hook)); + ExtractToHook>(hook)); break; case aggregate::AggregationHook::kBigintMax: readHelper( @@ -199,9 +226,9 @@ void SelectiveIntegerColumnReader::processValueHook( } } -template -void SelectiveIntegerColumnReader::readCommon(RowSet rows) { - bool isDense = rows.back() == rows.size() - 1; +template +void SelectiveIntegerColumnReader::readCommon(const RowSet& rows) { + const bool isDense = rows.back() == rows.size() - 1; velox::common::Filter* filter = scanSpec_->filter() ? scanSpec_->filter() : &alwaysTrue(); if (scanSpec_->keepValues()) { @@ -213,16 +240,20 @@ void SelectiveIntegerColumnReader::readCommon(RowSet rows) { } } else { if (isDense) { - processFilter(filter, ExtractToReader(this), rows); + processFilter( + filter, ExtractToReader(this), rows); } else { - processFilter(filter, ExtractToReader(this), rows); + processFilter( + filter, ExtractToReader(this), rows); } } } else { if (isDense) { - processFilter(filter, DropValues(), rows); + processFilter( + filter, DropValues(), rows); } else { - processFilter(filter, DropValues(), rows); + processFilter( + filter, DropValues(), rows); } } } diff --git a/velox/dwio/common/SelectiveRepeatedColumnReader.cpp b/velox/dwio/common/SelectiveRepeatedColumnReader.cpp index 3afbc525d698d..ce2a4a002eab0 100644 --- a/velox/dwio/common/SelectiveRepeatedColumnReader.cpp +++ b/velox/dwio/common/SelectiveRepeatedColumnReader.cpp @@ -75,21 +75,32 @@ void prepareResult( // makeOffsetsAndSizes. Child vectors are handled in child column readers. } +vector_size_t +advanceNestedRows(const RowSet& rows, vector_size_t i, vector_size_t last) { + while (i + 16 < rows.size() && rows[i + 16] < last) { + i += 16; + } + while (i < rows.size() && rows[i] < last) { + ++i; + } + return i; +} + } // namespace void SelectiveRepeatedColumnReader::makeNestedRowSet( - RowSet rows, + const RowSet& rows, int32_t maxRow) { if (!allLengthsHolder_ || allLengthsHolder_->capacity() < (maxRow + 1) * sizeof(vector_size_t)) { - allLengthsHolder_ = allocateIndices(maxRow + 1, &memoryPool_); + allLengthsHolder_ = allocateIndices(maxRow + 1, memoryPool_); allLengths_ = allLengthsHolder_->asMutable(); } - auto nulls = nullsInReadRange_ ? nullsInReadRange_->as() : nullptr; + auto* nulls = nullsInReadRange_ ? 
nullsInReadRange_->as() : nullptr; // Reads the lengths, leaves an uninitialized gap for a null // map/list. Reading these checks the null mask. readLengths(allLengths_, maxRow + 1, nulls); - vector_size_t nestedLength = 0; + vector_size_t nestedLength{0}; for (auto row : rows) { if (!nulls || !bits::isBitNull(nulls, row)) { nestedLength += @@ -97,11 +108,12 @@ void SelectiveRepeatedColumnReader::makeNestedRowSet( } } nestedRowsHolder_.resize(nestedLength); + vector_size_t currentRow = 0; vector_size_t nestedRow = 0; vector_size_t nestedOffset = 0; for (auto rowIndex = 0; rowIndex < rows.size(); ++rowIndex) { - auto row = rows[rowIndex]; + const auto row = rows[rowIndex]; // Add up the lengths of non-null rows skipped since the last // non-null. nestedOffset += sumLengths(allLengths_, nulls, currentRow, row); @@ -109,7 +121,7 @@ void SelectiveRepeatedColumnReader::makeNestedRowSet( if (nulls && bits::isBitNull(nulls, row)) { continue; } - auto lengthAtRow = + const auto lengthAtRow = std::min(scanSpec_->maxArrayElementsCount(), allLengths_[row]); std::iota( nestedRowsHolder_.data() + nestedRow, @@ -124,7 +136,7 @@ void SelectiveRepeatedColumnReader::makeNestedRowSet( } void SelectiveRepeatedColumnReader::makeOffsetsAndSizes( - RowSet rows, + const RowSet& rows, ArrayVectorBase& result) { auto* rawOffsets = result.mutableOffsets(rows.size())->asMutable(); @@ -134,33 +146,27 @@ void SelectiveRepeatedColumnReader::makeOffsetsAndSizes( vector_size_t currentOffset = 0; vector_size_t nestedRowIndex = 0; for (int i = 0; i < rows.size(); ++i) { - auto row = rows[i]; + const auto row = rows[i]; currentOffset += sumLengths(allLengths_, nulls, currentRow, row); currentRow = row + 1; - while (nestedRowIndex < nestedRows_.size() && - nestedRows_[nestedRowIndex] < currentOffset) { - ++nestedRowIndex; - } + nestedRowIndex = + advanceNestedRows(nestedRows_, nestedRowIndex, currentOffset); rawOffsets[i] = nestedRowIndex; if (nulls && bits::isBitNull(nulls, row)) { rawSizes[i] = 0; - bits::setNull(rawResultNulls_, i); anyNulls_ = true; } else { - vector_size_t length = 0; currentOffset += allLengths_[row]; - while (nestedRowIndex < nestedRows_.size() && - nestedRows_[nestedRowIndex] < currentOffset) { - ++length; - ++nestedRowIndex; - } - rawSizes[i] = length; + const auto newNestedRowIndex = + advanceNestedRows(nestedRows_, nestedRowIndex, currentOffset); + rawSizes[i] = newNestedRowIndex - nestedRowIndex; + nestedRowIndex = newNestedRowIndex; } } numValues_ = rows.size(); } -RowSet SelectiveRepeatedColumnReader::applyFilter(RowSet rows) { +RowSet SelectiveRepeatedColumnReader::applyFilter(const RowSet& rows) { if (!scanSpec_->filter()) { return rows; } @@ -180,26 +186,13 @@ RowSet SelectiveRepeatedColumnReader::applyFilter(RowSet rows) { return outputRows_; } -void SelectiveRepeatedColumnReader::setResultNulls(BaseVector& result) { - if (anyNulls_) { - resultNulls_->setSize(bits::nbytes(result.size())); - result.setNulls(resultNulls_); - } else { - result.resetNulls(); - } -} - SelectiveListColumnReader::SelectiveListColumnReader( - const std::shared_ptr& requestedType, - const std::shared_ptr& dataType, + const TypePtr& requestedType, + const std::shared_ptr& fileType, FormatParams& params, velox::common::ScanSpec& scanSpec) - : SelectiveRepeatedColumnReader( - dataType->type(), - params, - scanSpec, - dataType), - requestedType_{requestedType} {} + : SelectiveRepeatedColumnReader(requestedType, params, scanSpec, fileType) { +} uint64_t SelectiveListColumnReader::skip(uint64_t numValues) { numValues = 
formatData_->skipNulls(numValues); @@ -226,7 +219,7 @@ uint64_t SelectiveListColumnReader::skip(uint64_t numValues) { void SelectiveListColumnReader::read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { // Catch up if the child is behind the length stream. child_->seekTo(childTargetReadOffset_, false); @@ -240,39 +233,37 @@ void SelectiveListColumnReader::read( readOffset_ = offset + rows.back() + 1; } -void SelectiveListColumnReader::getValues(RowSet rows, VectorPtr* result) { +void SelectiveListColumnReader::getValues( + const RowSet& rows, + VectorPtr* result) { VELOX_DCHECK_NOT_NULL(result); - prepareResult(*result, requestedType_->type(), rows.size(), &memoryPool_); + prepareResult(*result, requestedType_, rows.size(), memoryPool_); auto* resultArray = result->get()->asUnchecked(); makeOffsetsAndSizes(rows, *resultArray); - setResultNulls(**result); + setComplexNulls(rows, *result); if (child_ && !nestedRows_.empty()) { auto& elements = resultArray->elements(); - prepareStructResult(requestedType_->type()->childAt(0), &elements); + prepareStructResult(requestedType_->childAt(0), &elements); child_->getValues(nestedRows_, &elements); } } SelectiveMapColumnReader::SelectiveMapColumnReader( - const std::shared_ptr& requestedType, - const std::shared_ptr& dataType, + const TypePtr& requestedType, + const std::shared_ptr& fileType, FormatParams& params, velox::common::ScanSpec& scanSpec) - : SelectiveRepeatedColumnReader( - dataType->type(), - params, - scanSpec, - dataType), - requestedType_{requestedType} {} + : SelectiveRepeatedColumnReader(requestedType, params, scanSpec, fileType) { +} uint64_t SelectiveMapColumnReader::skip(uint64_t numValues) { numValues = formatData_->skipNulls(numValues); if (keyReader_ || elementReader_) { std::array buffer; - uint64_t childElements = 0; - uint64_t lengthsRead = 0; + uint64_t childElements{0}; + uint64_t lengthsRead{0}; while (lengthsRead < numValues) { - uint64_t chunk = + const uint64_t chunk = std::min(numValues - lengthsRead, static_cast(kBufferSize)); readLengths(buffer.data(), chunk, nullptr); for (size_t i = 0; i < chunk; ++i) { @@ -280,6 +271,7 @@ uint64_t SelectiveMapColumnReader::skip(uint64_t numValues) { } lengthsRead += chunk; } + if (keyReader_) { keyReader_->seekTo(keyReader_->readOffset() + childElements, false); } @@ -288,7 +280,6 @@ uint64_t SelectiveMapColumnReader::skip(uint64_t numValues) { elementReader_->readOffset() + childElements, false); } childTargetReadOffset_ += childElements; - } else { VELOX_FAIL("repeated reader with no children"); } @@ -297,7 +288,7 @@ uint64_t SelectiveMapColumnReader::skip(uint64_t numValues) { void SelectiveMapColumnReader::read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { // Catch up if child readers are behind the length stream. 
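The skip path above shows why repeated-type readers must decode lengths even when skipping: the number of child values to pass over is the sum of the skipped rows' lengths, not the row count. A standalone sketch of that arithmetic, with 'readLengths' standing in for the format-specific length decoder:

```
#include <algorithm>
#include <array>
#include <cstdint>

// Sum the lengths of 'numRows' skipped rows to find how many child values
// the nested readers must seek past. Reading lengths in fixed-size chunks
// bounds the scratch buffer, mirroring the loop in the patch.
template <typename ReadLengths>
uint64_t childValuesToSkip(uint64_t numRows, ReadLengths readLengths) {
  constexpr uint64_t kBufferSize = 1024;
  std::array<int32_t, kBufferSize> buffer;
  uint64_t childElements = 0;
  uint64_t lengthsRead = 0;
  while (lengthsRead < numRows) {
    const auto chunk = std::min(numRows - lengthsRead, kBufferSize);
    readLengths(buffer.data(), static_cast<int32_t>(chunk));
    for (uint64_t i = 0; i < chunk; ++i) {
      childElements += static_cast<uint64_t>(buffer[i]);
    }
    lengthsRead += chunk;
  }
  return childElements;
}
```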
if (keyReader_) { @@ -308,7 +299,7 @@ void SelectiveMapColumnReader::read( } prepareRead(offset, rows, incomingNulls); - auto activeRows = applyFilter(rows); + const auto activeRows = applyFilter(rows); makeNestedRowSet(activeRows, rows.back()); if (keyReader_ && elementReader_ && !nestedRows_.empty()) { keyReader_->read(keyReader_->readOffset(), nestedRows_, nullptr); @@ -321,12 +312,18 @@ void SelectiveMapColumnReader::read( readOffset_ = offset + rows.back() + 1; } -void SelectiveMapColumnReader::getValues(RowSet rows, VectorPtr* result) { +void SelectiveMapColumnReader::getValues( + const RowSet& rows, + VectorPtr* result) { VELOX_DCHECK_NOT_NULL(result); - prepareResult(*result, requestedType_->type(), rows.size(), &memoryPool_); + VELOX_CHECK( + !result->get() || result->get()->type()->isMap(), + "Expect MAP result vector, got {}", + result->get()->type()->toString()); + prepareResult(*result, requestedType_, rows.size(), memoryPool_); auto* resultMap = result->get()->asUnchecked(); makeOffsetsAndSizes(rows, *resultMap); - setResultNulls(**result); + setComplexNulls(rows, *result); VELOX_CHECK( keyReader_ && elementReader_, "keyReader_ and elementReaer_ must exist in " @@ -334,7 +331,7 @@ void SelectiveMapColumnReader::getValues(RowSet rows, VectorPtr* result) { if (!nestedRows_.empty()) { keyReader_->getValues(nestedRows_, &resultMap->mapKeys()); auto& values = resultMap->mapValues(); - prepareStructResult(requestedType_->type()->childAt(1), &values); + prepareStructResult(requestedType_->childAt(1), &values); elementReader_->getValues(nestedRows_, &values); } } diff --git a/velox/dwio/common/SelectiveRepeatedColumnReader.h b/velox/dwio/common/SelectiveRepeatedColumnReader.h index 31175e1261196..c5a60d8b5e6cb 100644 --- a/velox/dwio/common/SelectiveRepeatedColumnReader.h +++ b/velox/dwio/common/SelectiveRepeatedColumnReader.h @@ -24,10 +24,6 @@ namespace facebook::velox::dwio::common { // logic for dealing with mapping between enclosing and nested rows. class SelectiveRepeatedColumnReader : public SelectiveColumnReader { public: - bool useBulkPath() const override { - return false; - } - const std::vector& children() const override { return children_; } @@ -43,41 +39,35 @@ class SelectiveRepeatedColumnReader : public SelectiveColumnReader { std::shared_ptr type) : SelectiveColumnReader( requestedType, + std::move(type), params, - scanSpec, - std::move(type)) {} + scanSpec) {} /// Reads 'numLengths' next lengths into 'result'. If 'nulls' is /// non-null, each kNull bit signifies a null with a length of 0 to /// be inserted at the corresponding position in the result. 'nulls' /// is expected to be null flags for 'numRows' next rows at the /// level of this reader. - virtual void readLengths( - int32_t* FOLLY_NONNULL lengths, - int32_t numLengths, - const uint64_t* FOLLY_NULLABLE nulls) = 0; - - // Create row set for child columns based on the row set of parent column. - void makeNestedRowSet(RowSet rows, int32_t maxRow); - - // Compute the offsets and lengths based on the current filtered rows passed - // in. - void makeOffsetsAndSizes(RowSet rows, ArrayVectorBase&); - - // Creates a struct if '*result' is empty and 'type' is a row. - void prepareStructResult( - const TypePtr& type, - VectorPtr* FOLLY_NULLABLE result) { + virtual void + readLengths(int32_t* lengths, int32_t numLengths, const uint64_t* nulls) = 0; + + /// Create row set for child columns based on the row set of parent column. 
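makeNestedRowSet and makeOffsetsAndSizes both lean on the advanceNestedRows helper this patch introduces; its galloping scan is easy to see in isolation. A standalone restatement:

```
#include <cstdint>
#include <vector>

// Advance 'i' until rows[i] >= last. Probing 16 elements ahead first makes
// the common case (many nested rows falling inside one parent row) cost
// roughly 1/16th of a purely linear scan.
int32_t advanceNestedRows(
    const std::vector<int32_t>& rows,
    int32_t i,
    int32_t last) {
  const auto size = static_cast<int32_t>(rows.size());
  while (i + 16 < size && rows[i + 16] < last) {
    i += 16;
  }
  while (i < size && rows[i] < last) {
    ++i;
  }
  return i;
}
```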
+ void makeNestedRowSet(const RowSet& rows, int32_t maxRow); + + /// Compute the offsets and lengths based on the current filtered rows passed + /// in. + void makeOffsetsAndSizes(const RowSet& rows, ArrayVectorBase&); + + /// Creates a struct if '*result' is empty and 'type' is a row. + void prepareStructResult(const TypePtr& type, VectorPtr* result) { if (!*result && type->kind() == TypeKind::ROW) { - *result = BaseVector::create(type, 0, &memoryPool_); + *result = BaseVector::create(type, 0, memoryPool_); } } // Apply filter on parent level. Child filtering should be handled separately // in subclasses. - RowSet applyFilter(RowSet rows); - - void setResultNulls(BaseVector& result); + RowSet applyFilter(const RowSet& rows); BufferPtr allLengthsHolder_; vector_size_t* allLengths_; @@ -95,8 +85,8 @@ class SelectiveRepeatedColumnReader : public SelectiveColumnReader { class SelectiveListColumnReader : public SelectiveRepeatedColumnReader { public: SelectiveListColumnReader( - const std::shared_ptr& requestedType, - const std::shared_ptr& dataType, + const TypePtr& requestedType, + const std::shared_ptr& fileType, FormatParams& params, velox::common::ScanSpec& scanSpec); @@ -108,21 +98,20 @@ class SelectiveListColumnReader : public SelectiveRepeatedColumnReader { void read( vector_size_t offset, - RowSet rows, - const uint64_t* FOLLY_NULLABLE incomingNulls) override; + const RowSet& rows, + const uint64_t* incomingNulls) override; - void getValues(RowSet rows, VectorPtr* FOLLY_NULLABLE result) override; + void getValues(const RowSet& rows, VectorPtr* result) override; protected: std::unique_ptr child_; - const std::shared_ptr requestedType_; }; class SelectiveMapColumnReader : public SelectiveRepeatedColumnReader { public: SelectiveMapColumnReader( - const std::shared_ptr& requestedType, - const std::shared_ptr& dataType, + const TypePtr& requestedType, + const std::shared_ptr& fileType, FormatParams& params, velox::common::ScanSpec& scanSpec); @@ -135,14 +124,14 @@ class SelectiveMapColumnReader : public SelectiveRepeatedColumnReader { void read( vector_size_t offset, - RowSet rows, - const uint64_t* FOLLY_NULLABLE incomingNulls) override; + const RowSet& rows, + const uint64_t* incomingNulls) override; - void getValues(RowSet rows, VectorPtr* FOLLY_NULLABLE result) override; + void getValues(const RowSet& rows, VectorPtr* result) override; + protected: std::unique_ptr keyReader_; std::unique_ptr elementReader_; - const std::shared_ptr requestedType_; }; } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/SelectiveStructColumnReader.cpp b/velox/dwio/common/SelectiveStructColumnReader.cpp index 6702e58602363..cbe5388ce27f1 100644 --- a/velox/dwio/common/SelectiveStructColumnReader.cpp +++ b/velox/dwio/common/SelectiveStructColumnReader.cpp @@ -16,6 +16,7 @@ #include "velox/dwio/common/SelectiveStructColumnReader.h" +#include "velox/common/process/TraceContext.h" #include "velox/dwio/common/ColumnLoader.h" namespace facebook::velox::dwio::common { @@ -52,72 +53,127 @@ uint64_t SelectiveStructColumnReaderBase::skip(uint64_t numValues) { return numValues; } +void SelectiveStructColumnReaderBase::fillOutputRowsFromMutation( + vector_size_t size) { + if (mutation_->deletedRows) { + bits::forEachUnsetBit(mutation_->deletedRows, 0, size, [&](auto i) { + if ((mutation_->randomSkip == nullptr) || + mutation_->randomSkip->testOne()) { + addOutputRow(i); + } + }); + } else { + VELOX_CHECK_NOT_NULL(mutation_->randomSkip); + vector_size_t i = 0; + while (i < size) { + const auto 
skip = mutation_->randomSkip->nextSkip(); + const auto remaining = size - i; + if (skip >= remaining) { + mutation_->randomSkip->consume(remaining); + break; + } + i += skip; + addOutputRow(i++); + mutation_->randomSkip->consume(skip + 1); + } + } +} + +namespace { + +bool testFilterOnConstant(const velox::common::ScanSpec& spec) { + if (spec.isConstant() && !spec.constantValue()->isNullAt(0)) { + // Non-null constant is known value during split scheduling and filters on + // them should not be handled at execution level. + return true; + } + // Check filter on missing field. + return !spec.hasFilter() || spec.testNull(); +} + +} // namespace + void SelectiveStructColumnReaderBase::next( uint64_t numValues, VectorPtr& result, const Mutation* mutation) { + process::TraceContext trace("SelectiveStructColumnReaderBase::next"); + mutation_ = mutation; + hasDeletion_ = common::hasDeletion(mutation); if (children_.empty()) { - if (mutation && mutation->deletedRows) { - numValues -= bits::countBits(mutation->deletedRows, 0, numValues); + if (hasDeletion_) { + if (fillMutatedOutputRows_) { + fillOutputRowsFromMutation(numValues); + numValues = outputRows_.size(); + } else { + if (mutation->deletedRows) { + numValues -= bits::countBits(mutation->deletedRows, 0, numValues); + } + if (mutation->randomSkip) { + numValues *= mutation->randomSkip->sampleRate(); + } + } + } + for (const auto& childSpec : scanSpec_->children()) { + if (isChildConstant(*childSpec) && !testFilterOnConstant(*childSpec)) { + numValues = 0; + break; + } } - // no readers + // No readers // This can be either count(*) query or a query that select only // constant columns (partition keys or columns missing from an old file - // due to schema evolution) - result->resize(numValues); - + // due to schema evolution) or row number column. auto resultRowVector = std::dynamic_pointer_cast(result); - auto& childSpecs = scanSpec_->children(); - for (auto& childSpec : childSpecs) { - VELOX_CHECK(childSpec->isConstant()); - if (childSpec->projectOut()) { - auto channel = childSpec->channel(); + resultRowVector->unsafeResize(numValues); + + for (auto& childSpec : scanSpec_->children()) { + VELOX_CHECK(childSpec->isConstant() || childSpec->isExplicitRowNumber()); + if (childSpec->projectOut() && childSpec->isConstant()) { + const auto channel = childSpec->channel(); resultRowVector->childAt(channel) = BaseVector::wrapInConstant( numValues, 0, childSpec->constantValue()); } } return; } - auto oldSize = rows_.size(); + + const auto oldSize = rows_.size(); rows_.resize(numValues); if (numValues > oldSize) { std::iota(&rows_[oldSize], &rows_[rows_.size()], oldSize); } - mutation_ = mutation; - hasMutation_ = mutation && mutation->deletedRows; read(readOffset_, rows_, nullptr); getValues(outputRows(), &result); } void SelectiveStructColumnReaderBase::read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { numReads_ = scanSpec_->newRead(); prepareRead(offset, rows, incomingNulls); RowSet activeRows = rows; - if (hasMutation_) { + if (hasDeletion_) { // We handle the mutation after prepareRead so that output rows and format // specific initializations (e.g. RepDef in Parquet) are done properly. 
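The randomSkip branch of fillOutputRowsFromMutation above is easiest to follow as a standalone model, with a generic generator standing in for the sampler (names are illustrative): nextSkip() says how many rows to pass over before the next kept row, and consume() advances the generator by exactly the rows used, so the sampler state stays aligned with the scan position across batches.

```
#include <cstdint>
#include <vector>

template <typename NextSkip, typename Consume>
std::vector<int32_t> sampleRows(int32_t size, NextSkip nextSkip, Consume consume) {
  std::vector<int32_t> kept;
  int32_t i = 0;
  while (i < size) {
    const auto skip = nextSkip();
    const auto remaining = size - i;
    if (skip >= remaining) {
      consume(remaining); // The skip spills into the next batch.
      break;
    }
    i += skip;
    kept.push_back(i++); // Keep one row, then continue after it.
    consume(skip + 1);
  }
  return kept;
}
```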
- VELOX_DCHECK(!nullsInReadRange_, "Only top level can have mutation"); + VELOX_DCHECK_NULL(nullsInReadRange_, "Only top level can have mutation"); VELOX_DCHECK_EQ( rows.back(), rows.size() - 1, "Top level should have a dense row set"); - bits::forEachUnsetBit( - mutation_->deletedRows, 0, rows.back() + 1, [&](auto i) { - addOutputRow(i); - }); + fillOutputRowsFromMutation(rows.size()); if (outputRows_.empty()) { readOffset_ = offset + rows.back() + 1; return; } activeRows = outputRows_; } + const uint64_t* structNulls = nullsInReadRange_ ? nullsInReadRange_->as() : nullptr; - // a struct reader may have a null/non-null filter + // A struct reader may have a null/non-null filter if (scanSpec_->filter()) { - auto kind = scanSpec_->filter()->kind(); + const auto kind = scanSpec_->filter()->kind(); VELOX_CHECK( kind == velox::common::FilterKind::kIsNull || kind == velox::common::FilterKind::kIsNotNull); @@ -132,20 +188,31 @@ void SelectiveStructColumnReaderBase::read( activeRows = outputRows_; } - auto& childSpecs = scanSpec_->children(); + const auto& childSpecs = scanSpec_->children(); VELOX_CHECK(!childSpecs.empty()); for (size_t i = 0; i < childSpecs.size(); ++i) { - auto& childSpec = childSpecs[i]; + const auto& childSpec = childSpecs[i]; + VELOX_TRACE_HISTORY_PUSH("read %s", childSpec->fieldName().c_str()); if (isChildConstant(*childSpec)) { + if (!testFilterOnConstant(*childSpec)) { + activeRows = {}; + break; + } continue; } - auto fieldIndex = childSpec->subscript(); - auto reader = children_.at(fieldIndex); + + if (childSpec->isExplicitRowNumber()) { + continue; + } + + const auto fieldIndex = childSpec->subscript(); + auto* reader = children_.at(fieldIndex); if (reader->isTopLevel() && childSpec->projectOut() && !childSpec->hasFilter() && !childSpec->extractValues()) { // Will make a LazyVector. continue; } + advanceFieldReader(reader, offset); if (childSpec->hasFilter()) { { @@ -181,18 +248,22 @@ void SelectiveStructColumnReaderBase::read( void SelectiveStructColumnReaderBase::recordParentNullsInChildren( vector_size_t offset, - RowSet rows) { + const RowSet& rows) { if (formatData_->parentNullsInLeaves()) { return; } - auto& childSpecs = scanSpec_->children(); + const auto& childSpecs = scanSpec_->children(); for (auto i = 0; i < childSpecs.size(); ++i) { - auto& childSpec = childSpecs[i]; + const auto& childSpec = childSpecs[i]; if (isChildConstant(*childSpec)) { continue; } - auto fieldIndex = childSpec->subscript(); - auto reader = children_.at(fieldIndex); + if (childSpec->isExplicitRowNumber()) { + continue; + } + + const auto fieldIndex = childSpec->subscript(); + auto* reader = children_.at(fieldIndex); reader->addParentNulls( offset, nullsInReadRange_ ? 
nullsInReadRange_->as() : nullptr, @@ -284,24 +355,27 @@ void setConstantField( } } -void setNullField(vector_size_t size, VectorPtr& field) { +void setNullField( + vector_size_t size, + VectorPtr& field, + const TypePtr& type, + memory::MemoryPool* pool) { if (field && field->isConstantEncoding() && field.unique() && field->size() > 0 && field->isNullAt(0)) { field->resize(size); } else { - field = BaseVector::createNullConstant(field->type(), size, field->pool()); + field = BaseVector::createNullConstant(type, size, pool); } } } // namespace void SelectiveStructColumnReaderBase::getValues( - RowSet rows, + const RowSet& rows, VectorPtr* result) { VELOX_CHECK(!scanSpec_->children().empty()); - VELOX_CHECK( - *result != nullptr, - "SelectiveStructColumnReaderBase expects a non-null result"); + VELOX_CHECK_NOT_NULL( + *result, "SelectiveStructColumnReaderBase expects a non-null result"); VELOX_CHECK( result->get()->type()->isRow(), "Struct reader expects a result of type ROW."); @@ -320,43 +394,47 @@ void SelectiveStructColumnReaderBase::getValues( 0, std::move(children)); } + auto* resultRow = static_cast(result->get()); - resultRow->resize(rows.size()); - if (!rows.size()) { + resultRow->unsafeResize(rows.size()); + if (rows.empty()) { return; } - if (nullsInReadRange_) { - auto readerNulls = nullsInReadRange_->as(); - auto* nulls = resultRow->mutableNulls(rows.size())->asMutable(); - for (size_t i = 0; i < rows.size(); ++i) { - bits::setBit(nulls, i, bits::isBitSet(readerNulls, rows[i])); - } - } else { - resultRow->clearNulls(0, rows.size()); - } + + setComplexNulls(rows, *result); bool lazyPrepared = false; - for (auto& childSpec : scanSpec_->children()) { + for (const auto& childSpec : scanSpec_->children()) { + VELOX_TRACE_HISTORY_PUSH("getValues %s", childSpec->fieldName().c_str()); if (!childSpec->projectOut()) { continue; } - auto channel = childSpec->channel(); + + if (childSpec->isExplicitRowNumber()) { + // Row number data is generated after, skip data loading for it. + continue; + } + const auto channel = childSpec->channel(); auto& childResult = resultRow->childAt(channel); if (childSpec->isConstant()) { setConstantField(childSpec->constantValue(), rows.size(), childResult); continue; } - auto index = childSpec->subscript(); + + const auto index = childSpec->subscript(); // Set missing fields to be null constant, if we're in the top level struct // missing columns should already be a null constant from the check above. if (index == kConstantChildSpecSubscript) { - setNullField(rows.size(), childResult); + const auto& childType = rowType.childAt(channel); + setNullField(rows.size(), childResult, childType, resultRow->pool()); continue; } + if (childSpec->extractValues() || childSpec->hasFilter() || !children_[index]->isTopLevel()) { children_[index]->getValues(rows, &childResult); continue; } + // LazyVector result. 
if (!lazyPrepared) { if (rows.size() != outputRows_.size()) { @@ -364,21 +442,41 @@ void SelectiveStructColumnReaderBase::getValues( } lazyPrepared = true; } - auto loader = + auto lazyLoader = std::make_unique(this, children_[index], numReads_); if (childResult && childResult->isLazy() && childResult.unique()) { static_cast(*childResult) - .reset(std::move(loader), rows.size()); + .reset(std::move(lazyLoader), rows.size()); } else { childResult = std::make_shared( - &memoryPool_, + memoryPool_, resultRow->type()->childAt(channel), rows.size(), - std::move(loader), + std::move(lazyLoader), std::move(childResult)); } } resultRow->updateContainsLazyNotLoaded(); } +namespace detail { + +#if XSIMD_WITH_AVX2 + +xsimd::batch bitsToInt32s[256]; + +__attribute__((constructor)) void initBitsToInt32s() { + for (int i = 0; i < 256; ++i) { + int32_t data[8]; + for (int j = 0; j < 8; ++j) { + data[j] = bits::isBitSet(&i, j); + } + bitsToInt32s[i] = xsimd::load_unaligned(data); + } +} + +#endif + +} // namespace detail + } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/SelectiveStructColumnReader.h b/velox/dwio/common/SelectiveStructColumnReader.h index af5ea07ae288d..6f15b321391c1 100644 --- a/velox/dwio/common/SelectiveStructColumnReader.h +++ b/velox/dwio/common/SelectiveStructColumnReader.h @@ -20,6 +20,9 @@ namespace facebook::velox::dwio::common { +template +class SelectiveFlatMapColumnReaderHelper; + class SelectiveStructColumnReaderBase : public SelectiveColumnReader { public: void resetFilterCaches() override { @@ -37,10 +40,12 @@ class SelectiveStructColumnReaderBase : public SelectiveColumnReader { const dwio::common::StatsContext& context, FormatData::FilterRowGroupsResult&) const override; - void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) - override; + void read( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls) override; - void getValues(RowSet rows, VectorPtr* result) override; + void getValues(const RowSet& rows, VectorPtr* result) override; uint64_t numReads() const { return numReads_; @@ -95,38 +100,55 @@ class SelectiveStructColumnReaderBase : public SelectiveColumnReader { return debugString_; } + void setFillMutatedOutputRows(bool value) final { + fillMutatedOutputRows_ = value; + } + protected: + template + friend class SelectiveFlatMapColumnReaderHelper; + // The subscript of childSpecs will be set to this value if the column is // constant (either explicitly or because it's missing). static constexpr int32_t kConstantChildSpecSubscript = -1; SelectiveStructColumnReaderBase( - const std::shared_ptr& requestedType, - const std::shared_ptr& dataType, + const TypePtr& requestedType, + const std::shared_ptr& fileType, FormatParams& params, velox::common::ScanSpec& scanSpec, bool isRoot = false) - : SelectiveColumnReader(dataType->type(), params, scanSpec, dataType), - requestedType_(requestedType), + : SelectiveColumnReader(requestedType, fileType, params, scanSpec), debugString_( getExceptionContext().message(VeloxException::Type::kSystem)), isRoot_(isRoot) {} - // Records the number of nulls added by 'this' between the end - // position of each child reader and the end of the range of - // 'read(). This must be done also if a child is not read so that we - // know how much to skip when seeking forward within the row group. 
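The AVX2 table initialized above enables a small but effective trick in the flat-map reader: one byte of in-map bits indexes a precomputed batch of eight 0/1 int32 lanes, so eight per-row map sizes can be incremented with a single vector add. A portable sketch of the same precomputation and its use, with a plain array in place of the xsimd batch (illustrative only):

```
#include <array>
#include <cstdint>
#include <iostream>

// Portable equivalent of detail::bitsToInt32s: entry b expands the 8 bits of
// byte b into eight int32 values of 0 or 1.
using Lanes = std::array<int32_t, 8>;

std::array<Lanes, 256> makeBitsToInt32s() {
  std::array<Lanes, 256> table{};
  for (int b = 0; b < 256; ++b) {
    for (int j = 0; j < 8; ++j) {
      table[b][j] = (b >> j) & 1;
    }
  }
  return table;
}

int main() {
  const auto table = makeBitsToInt32s();
  // Increment eight per-row map sizes at once from one byte of in-map bits,
  // mirroring the vectorized loop in readInMapDense below.
  int32_t sizes[8] = {};
  const uint8_t inMapByte = 0b10110010;
  for (int j = 0; j < 8; ++j) {
    sizes[j] += table[inMapByte][j];
  }
  for (const auto s : sizes) {
    std::cout << s << ' '; // Prints: 0 1 0 0 1 1 0 1
  }
  std::cout << '\n';
  return 0;
}
```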
- void recordParentNullsInChildren(vector_size_t offset, RowSet rows); + /// Records the number of nulls added by 'this' between the end position of + /// each child reader and the end of the range of 'read(). This must be done + /// also if a child is not read so that we know how much to skip when seeking + /// forward within the row group. + void recordParentNullsInChildren(vector_size_t offset, const RowSet& rows); - bool hasMutation() const override { - return hasMutation_; + bool hasDeletion() const final { + return hasDeletion_; } // Returns true if we'll return a constant for that childSpec (i.e. we don't // need to read it). bool isChildConstant(const velox::common::ScanSpec& childSpec) const; - const std::shared_ptr requestedType_; + void fillOutputRowsFromMutation(vector_size_t size); + + // Context information obtained from ExceptionContext. Stored here + // so that LazyVector readers under this can add this to their + // ExceptionContext. Allows contextualizing reader errors to split + // and query. Set at construction, which takes place on first + // use. If no ExceptionContext is in effect, this is "". + const std::string debugString_; + + // Whether or not this is the root Struct that represents entire rows of the + // table. + const bool isRoot_; std::vector children_; @@ -143,21 +165,13 @@ class SelectiveStructColumnReaderBase : public SelectiveColumnReader { // After read() call mutation_ could go out of scope. Need to keep this // around for lazy columns. - bool hasMutation_ = false; + bool hasDeletion_ = false; - // Context information obtained from ExceptionContext. Stored here - // so that LazyVector readers under this can add this to their - // ExceptionContext. Allows contextualizing reader errors to split - // and query. Set at construction, which takes place on first - // use. If no ExceptionContext is in effect, this is "". - const std::string debugString_; - - // Whether or not this is the root Struct that represents entire rows of the - // table. - const bool isRoot_; + bool fillMutatedOutputRows_ = false; }; -struct SelectiveStructColumnReader : SelectiveStructColumnReaderBase { +class SelectiveStructColumnReader : public SelectiveStructColumnReaderBase { + public: using SelectiveStructColumnReaderBase::SelectiveStructColumnReaderBase; void addChild(std::unique_ptr child) { @@ -171,4 +185,369 @@ struct SelectiveStructColumnReader : SelectiveStructColumnReaderBase { std::vector> childrenOwned_; }; +namespace detail { + +template +struct FlatMapDirectCopyHelper { + ValueType* targetValues; + uint64_t* targetNulls; + const ValueType* sourceValues; + const uint64_t* sourceNulls; +}; + +} // namespace detail + +// Helper class to implement reading FLATMAP column into MAP type vector. 
+template +class SelectiveFlatMapColumnReaderHelper { + public: + SelectiveFlatMapColumnReaderHelper( + SelectiveStructColumnReaderBase& reader, + std::vector&& keyNodes) + : reader_(reader), keyNodes_(std::move(keyNodes)) { + reader_.children_.resize(keyNodes_.size()); + for (int i = 0; i < keyNodes_.size(); ++i) { + reader_.children_[i] = keyNodes_[i].reader.get(); + reader_.children_[i]->setIsFlatMapValue(true); + } + if (auto type = reader_.requestedType_->childAt(1); type->isRow()) { + childValues_ = BaseVector::create(type, 0, reader_.memoryPool_); + } + } + + void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls); + + void getValues(RowSet rows, VectorPtr* result); + + private: + MapVector& prepareResult(VectorPtr& result, vector_size_t size) { + if (result && result->encoding() == VectorEncoding::Simple::MAP && + result.unique()) { + result->resetDataDependentFlags(nullptr); + result->resize(size); + } else { + VLOG(1) << "Reallocating result MAP vector of size " << size; + result = + BaseVector::create(reader_.requestedType_, size, reader_.memoryPool_); + } + return *result->asUnchecked(); + } + + static void readInMapDense( + const uint64_t* inMap, + vector_size_t size, + uint64_t* columnBits, + vector_size_t* sizes); + + vector_size_t + calculateOffsets(RowSet rows, vector_size_t* offsets, vector_size_t* sizes); + + template + void copyValuesImpl( + vector_size_t* rawOffsets, + T* rawKeys, + detail::FlatMapDirectCopyHelper& directCopy, + T key, + const uint64_t* columnBits, + vector_size_t size); + + template + void copyValues( + RowSet rows, + FlatVector* flatKeys, + vector_size_t* rawOffsets, + BaseVector& values); + + SelectiveStructColumnReaderBase& reader_; + std::vector keyNodes_; + VectorPtr childValues_; + DecodedVector decodedChildValues_; + std::vector columnRowBits_; + int columnBitsWords_; + std::vector copyRanges_; +}; + +template +void SelectiveFlatMapColumnReaderHelper::read( + vector_size_t offset, + RowSet rows, + const uint64_t* incomingNulls) { + reader_.numReads_ = reader_.scanSpec_->newRead(); + reader_.prepareRead(offset, rows, incomingNulls); + VELOX_DCHECK(!reader_.hasDeletion()); + auto activeRows = rows; + auto* mapNulls = reader_.nullsInReadRange_ + ? reader_.nullsInReadRange_->as() + : nullptr; + if (reader_.scanSpec_->filter()) { + auto kind = reader_.scanSpec_->filter()->kind(); + VELOX_CHECK( + kind == velox::common::FilterKind::kIsNull || + kind == velox::common::FilterKind::kIsNotNull); + reader_.filterNulls( + rows, kind == velox::common::FilterKind::kIsNull, false); + if (reader_.outputRows_.empty()) { + for (auto* child : reader_.children_) { + child->addParentNulls(offset, mapNulls, rows); + } + return; + } + activeRows = reader_.outputRows_; + } + // Separate the loop to be cache friendly. + for (auto* child : reader_.children_) { + reader_.advanceFieldReader(child, offset); + } + for (auto* child : reader_.children_) { + child->read(offset, activeRows, mapNulls); + child->addParentNulls(offset, mapNulls, rows); + } + reader_.lazyVectorReadOffset_ = offset; + reader_.readOffset_ = offset + rows.back() + 1; +} + +namespace detail { +#if XSIMD_WITH_AVX2 +// Convert 8 bits to 8 int32s. Used to increase map sizes according to in-map +// bits. +extern xsimd::batch bitsToInt32s[256]; +#endif +} // namespace detail + +// Optimized function to copy contiguous range of `inMap' bits into +// `columnBits', and at same time increase values in `sizes' so that they will +// contain map sizes after we iterate over all inMap streams. 
+template +void SelectiveFlatMapColumnReaderHelper::readInMapDense( + const uint64_t* inMap, + vector_size_t size, + uint64_t* columnBits, + vector_size_t* sizes) { +#if XSIMD_WITH_AVX2 + bits::copyBits(inMap, 0, columnBits, 0, size); + auto* inMapBytes = reinterpret_cast(inMap); + int i = 0; + for (int end = size / 8; i < end; ++i) { + auto* data = sizes + i * 8; + (xsimd::load_unaligned(data) + detail::bitsToInt32s[inMapBytes[i]]) + .store_unaligned(data); + } + i *= 8; + for (; i < size; ++i) { + if (bits::isBitSet(inMap, i)) { + ++sizes[i]; + } + } +#else + for (vector_size_t i = 0; i < size; ++i) { + if (bits::isBitSet(inMap, i)) { + bits::setBit(columnBits, i); + ++sizes[i]; + } + } +#endif +} + +// Calculate the offsets and sizes of each map entry in the result. +template +vector_size_t +SelectiveFlatMapColumnReaderHelper::calculateOffsets( + RowSet rows, + vector_size_t* offsets, + vector_size_t* sizes) { + auto* nulls = reader_.nullsInReadRange_ + ? reader_.nullsInReadRange_->as() + : nullptr; + columnBitsWords_ = bits::nwords(rows.size()); + columnRowBits_.resize(columnBitsWords_ * reader_.children_.size()); + std::fill(columnRowBits_.begin(), columnRowBits_.end(), 0); + std::fill(sizes, sizes + rows.size(), 0); + const bool dense = rows.back() == rows.size() - 1; + for (int k = 0; k < reader_.children_.size(); ++k) { + auto* inMap = + static_cast(reader_.children_[k]->formatData()) + .inMap(); + if (!inMap) { + inMap = nulls; + } + auto* columnBits = columnRowBits_.data() + k * columnBitsWords_; + if (inMap) { + if (dense) { + readInMapDense(inMap, rows.size(), columnBits, sizes); + } else { + for (vector_size_t i = 0; i < rows.size(); ++i) { + if (bits::isBitSet(inMap, rows[i])) { + bits::setBit(columnBits, i); + ++sizes[i]; + } + } + } + } else { + bits::fillBits(columnBits, 0, rows.size(), true); + for (vector_size_t i = 0; i < rows.size(); ++i) { + ++sizes[i]; + } + } + } + vector_size_t numNestedRows = 0; + for (vector_size_t i = 0; i < rows.size(); ++i) { + if (!reader_.returnReaderNulls_ && nulls && + bits::isBitNull(nulls, rows[i])) { + reader_.anyNulls_ = true; + } + offsets[i] = numNestedRows; + numNestedRows += sizes[i]; + } + return numNestedRows; +} + +// When `kDirectCopy' is true, copy the values directly into the target vector. +// Otherwise store the copy ranges and they will be copied after calling this +// function. +template +template +void SelectiveFlatMapColumnReaderHelper::copyValuesImpl( + vector_size_t* rawOffsets, + T* rawKeys, + detail::FlatMapDirectCopyHelper& directCopy, + T key, + const uint64_t* columnBits, + vector_size_t size) { + bits::forEachSetBit(columnBits, 0, size, [&](vector_size_t i) { + auto j = rawOffsets[i]++; + rawKeys[j] = key; + if constexpr (!kDirectCopy) { + copyRanges_.push_back({ + .sourceIndex = i, + .targetIndex = j, + .count = 1, + }); + } else if constexpr (kIdentityMapping) { + directCopy.targetValues[j] = directCopy.sourceValues[i]; + // Nulls in identity mapping are handled more efficiently later in the + // code after calling this function. + } else { + directCopy.targetValues[j] = decodedChildValues_.valueAt(i); + if (decodedChildValues_.isNullAt(i)) { + bits::setNull(directCopy.targetNulls, j); + } + } + }); +} + +// Copy the values and nulls bits from source child values into the target +// values. When `kDirectCopy' is true, copy the values directly into the target +// vector, and if the source values are flat (almost always the case), we +// optimize the nulls copy by avoiding copying the bits where in-map is false. 
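calculateOffsets above ends by converting per-row map sizes into running offsets plus a total nested-row count; a compact standalone illustration of that bookkeeping, with plain vectors standing in for the reader's raw buffers (the copyValues implementation that the preceding comment introduces follows after this sketch):

```
#include <cassert>
#include <cstdint>
#include <vector>

// Mirrors the tail of calculateOffsets: writes the exclusive prefix sum of
// sizes into offsets and returns the total number of nested rows.
int32_t makeOffsets(
    const std::vector<int32_t>& sizes,
    std::vector<int32_t>& offsets) {
  offsets.resize(sizes.size());
  int32_t numNestedRows = 0;
  for (size_t i = 0; i < sizes.size(); ++i) {
    offsets[i] = numNestedRows;
    numNestedRows += sizes[i];
  }
  return numNestedRows;
}

int main() {
  // Three rows whose maps hold 2, 0 and 3 entries respectively.
  std::vector<int32_t> sizes{2, 0, 3};
  std::vector<int32_t> offsets;
  const auto total = makeOffsets(sizes, offsets);
  assert(total == 5);
  assert(offsets[0] == 0 && offsets[1] == 2 && offsets[2] == 2);
  return 0;
}
```

Note that copyValuesImpl advances rawOffsets[i] while appending entries, which is why getValues later restores the begin offsets with std::copy_backward and re-zeroes the first slot.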
+template +template +void SelectiveFlatMapColumnReaderHelper::copyValues( + RowSet rows, + FlatVector* flatKeys, + vector_size_t* rawOffsets, + BaseVector& values) { + // String values are not copied directly because currently we don't have + // them in production so no need to optimize. + constexpr bool kDirectCopy = + TypeKind::TINYINT <= kKind && kKind <= TypeKind::DOUBLE; + using ValueType = typename TypeTraits::NativeType; + T* rawKeys = flatKeys->mutableRawValues(); + [[maybe_unused]] size_t strKeySize; + [[maybe_unused]] char* rawStrKeyBuffer; + if constexpr (std::is_same_v) { + strKeySize = 0; + for (int k = 0; k < reader_.children_.size(); ++k) { + if (!keyNodes_[k].key.get().isInline()) { + strKeySize += keyNodes_[k].key.get().size(); + } + } + if (strKeySize > 0) { + auto buf = AlignedBuffer::allocate(strKeySize, reader_.memoryPool_); + rawStrKeyBuffer = buf->template asMutable(); + flatKeys->addStringBuffer(buf); + strKeySize = 0; + for (int k = 0; k < reader_.children_.size(); ++k) { + auto& s = keyNodes_[k].key.get(); + if (!s.isInline()) { + memcpy(&rawStrKeyBuffer[strKeySize], s.data(), s.size()); + strKeySize += s.size(); + } + } + strKeySize = 0; + } + } + detail::FlatMapDirectCopyHelper directCopy; + if constexpr (kDirectCopy) { + VELOX_CHECK(values.isFlatEncoding()); + auto* flat = values.asUnchecked>(); + directCopy.targetValues = flat->mutableRawValues(); + directCopy.targetNulls = flat->mutableRawNulls(); + bits::fillBits(directCopy.targetNulls, 0, flat->size(), bits::kNotNull); + } + for (int k = 0; k < reader_.children_.size(); ++k) { + T key; + if constexpr (std::is_same_v) { + key = keyNodes_[k].key.get(); + if (!key.isInline()) { + key = {&rawStrKeyBuffer[strKeySize], static_cast(key.size())}; + strKeySize += key.size(); + } + } else { + key = keyNodes_[k].key.get(); + } + reader_.children_[k]->getValues(rows, &childValues_); + if constexpr (kDirectCopy) { + decodedChildValues_.decode(*childValues_); + if (decodedChildValues_.isIdentityMapping()) { + directCopy.sourceValues = decodedChildValues_.data(); + directCopy.sourceNulls = decodedChildValues_.nulls(); + } + } + auto* columnBits = columnRowBits_.data() + k * columnBitsWords_; + if (decodedChildValues_.isIdentityMapping()) { + copyValuesImpl( + rawOffsets, rawKeys, directCopy, key, columnBits, rows.size()); + } else { + copyValuesImpl( + rawOffsets, rawKeys, directCopy, key, columnBits, rows.size()); + } + if constexpr (kDirectCopy) { + if (directCopy.sourceNulls && decodedChildValues_.isIdentityMapping()) { + bits::andWithNegatedBits( + columnBits, directCopy.sourceNulls, 0, rows.size()); + bits::forEachSetBit(columnBits, 0, rows.size(), [&](vector_size_t i) { + bits::setNull(directCopy.targetNulls, rawOffsets[i] - 1); + }); + } + } else { + values.copyRanges(childValues_.get(), copyRanges_); + copyRanges_.clear(); + } + } +} + +template +void SelectiveFlatMapColumnReaderHelper::getValues( + RowSet rows, + VectorPtr* result) { + auto& mapResult = prepareResult(*result, rows.size()); + auto* rawOffsets = mapResult.mutableOffsets(rows.size()) + ->template asMutable(); + auto* rawSizes = + mapResult.mutableSizes(rows.size())->template asMutable(); + auto numNestedRows = calculateOffsets(rows, rawOffsets, rawSizes); + auto& keys = mapResult.mapKeys(); + auto& values = mapResult.mapValues(); + BaseVector::prepareForReuse(keys, numNestedRows); + BaseVector::prepareForReuse(values, numNestedRows); + auto* flatKeys = keys->template asFlatVector(); + VELOX_DYNAMIC_TYPE_DISPATCH( + copyValues, values->typeKind(), 
rows, flatKeys, rawOffsets, *values); + VELOX_CHECK_EQ(rawOffsets[rows.size() - 1], numNestedRows); + std::copy_backward( + rawOffsets, rawOffsets + rows.size() - 1, rawOffsets + rows.size()); + rawOffsets[0] = 0; + reader_.setComplexNulls(rows, *result); +} + } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/SortingWriter.cpp b/velox/dwio/common/SortingWriter.cpp index 813f333e6319f..0243db048bbcd 100644 --- a/velox/dwio/common/SortingWriter.cpp +++ b/velox/dwio/common/SortingWriter.cpp @@ -20,28 +20,138 @@ namespace facebook::velox::dwio::common { SortingWriter::SortingWriter( std::unique_ptr writer, - std::unique_ptr sortBuffer) - : outputWriter_(std::move(writer)), sortBuffer_(std::move(sortBuffer)) {} + std::unique_ptr sortBuffer, + vector_size_t maxOutputRowsConfig, + uint64_t maxOutputBytesConfig) + : outputWriter_(std::move(writer)), + maxOutputRowsConfig_(maxOutputRowsConfig), + maxOutputBytesConfig_(maxOutputBytesConfig), + sortPool_(sortBuffer->pool()), + canReclaim_(sortBuffer->canSpill()), + sortBuffer_(std::move(sortBuffer)) { + VELOX_CHECK_GT(maxOutputRowsConfig_, 0); + VELOX_CHECK_GT(maxOutputBytesConfig_, 0); + if (sortPool_->parent()->reclaimer() != nullptr) { + sortPool_->setReclaimer(MemoryReclaimer::create(this)); + } + setState(State::kRunning); +} + +SortingWriter::~SortingWriter() { + sortPool_->release(); +} void SortingWriter::write(const VectorPtr& data) { + checkRunning(); sortBuffer_->addInput(data); } -void SortingWriter::flush() {} +void SortingWriter::flush() { + checkRunning(); + outputWriter_->flush(); +} void SortingWriter::close() { + setState(State::kClosed); + sortBuffer_->noMoreInput(); - RowVectorPtr output = sortBuffer_->getOutput(); + const auto maxOutputBatchRows = outputBatchRows(); + RowVectorPtr output = sortBuffer_->getOutput(maxOutputBatchRows); while (output != nullptr) { outputWriter_->write(output); - output = sortBuffer_->getOutput(); + output = sortBuffer_->getOutput(maxOutputBatchRows); } + + sortBuffer_.reset(); + sortPool_->release(); outputWriter_->close(); } void SortingWriter::abort() { + setState(State::kAborted); + sortBuffer_.reset(); + sortPool_->release(); outputWriter_->abort(); } +bool SortingWriter::canReclaim() const { + return canReclaim_; +} + +uint64_t SortingWriter::reclaim( + uint64_t targetBytes, + memory::MemoryReclaimer::Stats& stats) { + if (!canReclaim_) { + return 0; + } + + if (!isRunning()) { + LOG(WARNING) << "Can't reclaim from a non-running sort writer pool: " + << sortPool_->name() << ", state: " << state() + << ", used memory: " << succinctBytes(sortPool_->usedBytes()) + << ", reserved memory: " + << succinctBytes(sortPool_->reservedBytes()); + ++stats.numNonReclaimableAttempts; + return 0; + } + VELOX_CHECK_NOT_NULL(sortBuffer_); + + return memory::MemoryReclaimer::run( + [&]() { + int64_t reclaimedBytes{0}; + { + memory::ScopedReclaimedBytesRecorder recorder( + sortPool_, &reclaimedBytes); + sortBuffer_->spill(); + sortPool_->release(); + } + return reclaimedBytes; + }, + stats); +} + +vector_size_t SortingWriter::outputBatchRows() { + vector_size_t estimatedMaxOutputRows = + std::numeric_limits::max(); + if (sortBuffer_->estimateOutputRowSize().has_value() && + sortBuffer_->estimateOutputRowSize().value() != 0) { + const uint64_t maxOutputRows = + maxOutputBytesConfig_ / sortBuffer_->estimateOutputRowSize().value(); + if (UNLIKELY(maxOutputRows > std::numeric_limits::max())) { + return maxOutputRowsConfig_; + } + + estimatedMaxOutputRows = maxOutputRows; + } + return
std::min(estimatedMaxOutputRows, maxOutputRowsConfig_); +} + +std::unique_ptr SortingWriter::MemoryReclaimer::create( + SortingWriter* writer) { + return std::unique_ptr(new MemoryReclaimer(writer)); +} + +bool SortingWriter::MemoryReclaimer::reclaimableBytes( + const memory::MemoryPool& pool, + uint64_t& reclaimableBytes) const { + VELOX_CHECK_EQ(pool.name(), writer_->sortPool_->name()); + + reclaimableBytes = 0; + if (!writer_->canReclaim()) { + return false; + } + reclaimableBytes = pool.usedBytes(); + return true; +} + +uint64_t SortingWriter::MemoryReclaimer::reclaim( + memory::MemoryPool* pool, + uint64_t targetBytes, + uint64_t /*unused*/, + memory::MemoryReclaimer::Stats& stats) { + VELOX_CHECK_EQ(pool->name(), writer_->sortPool_->name()); + + return writer_->reclaim(targetBytes, stats); +} } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/SortingWriter.h b/velox/dwio/common/SortingWriter.h index f07b2d9e67f0a..d7b70f09032d2 100644 --- a/velox/dwio/common/SortingWriter.h +++ b/velox/dwio/common/SortingWriter.h @@ -17,6 +17,7 @@ #pragma once #include "velox/dwio/common/Writer.h" +#include "velox/exec/MemoryReclaimer.h" #include "velox/exec/SortBuffer.h" namespace facebook::velox::dwio::common { @@ -26,19 +27,60 @@ class SortingWriter : public Writer { public: SortingWriter( std::unique_ptr writer, - std::unique_ptr sortBuffer); + std::unique_ptr sortBuffer, + vector_size_t maxOutputRowsConfig, + uint64_t maxOutputBytesConfig); - virtual void write(const VectorPtr& data) override; + ~SortingWriter() override; + + void write(const VectorPtr& data) override; /// No action because we need to accumulate all data and sort before data can /// be flushed - virtual void flush() override; + void flush() override; + + void close() override; + + void abort() override; + + private: + class MemoryReclaimer : public exec::MemoryReclaimer { + public: + static std::unique_ptr create( + SortingWriter* writer); + + bool reclaimableBytes( + const memory::MemoryPool& pool, + uint64_t& reclaimableBytes) const override; - virtual void close() override; + uint64_t reclaim( + memory::MemoryPool* pool, + uint64_t targetBytes, + uint64_t maxWaitMs, + memory::MemoryReclaimer::Stats& stats) override; - virtual void abort() override; + private: + explicit MemoryReclaimer(SortingWriter* writer) + : exec::MemoryReclaimer(), + writer_(writer), + canReclaim_(writer_->sortBuffer_->canSpill()) {} + + SortingWriter* const writer_; + const bool canReclaim_; + }; + + bool canReclaim() const; + + uint64_t reclaim(uint64_t targetBytes, memory::MemoryReclaimer::Stats& stats); + + vector_size_t outputBatchRows(); const std::unique_ptr outputWriter_; + const vector_size_t maxOutputRowsConfig_; + const uint64_t maxOutputBytesConfig_; + memory::MemoryPool* const sortPool_; + const bool canReclaim_; + std::unique_ptr sortBuffer_; }; diff --git a/velox/dwio/common/Throttler.cpp b/velox/dwio/common/Throttler.cpp new file mode 100644 index 0000000000000..55dce7003166d --- /dev/null +++ b/velox/dwio/common/Throttler.cpp @@ -0,0 +1,232 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/dwio/common/Throttler.h" + +#include + +#include "velox/common/base/Counters.h" +#include "velox/common/base/StatsReporter.h" +#include "velox/common/base/SuccinctPrinter.h" + +namespace facebook::velox::dwio::common { +namespace { +// Builds key in local throttled cache to make it unique across storage +// clusters. +std::string localThrottleCacheKey( + const std::string& cluster, + const std::string& directory) { + return fmt::format("{}:{}", cluster, directory); +} +} // namespace + +Throttler::Config::Config( + bool _throttleEnabled, + uint64_t _minThrottleBackoffMs, + uint64_t _maxThrottleBackoffMs, + double _backoffScaleFactor, + uint32_t _minLocalThrottledSignals, + uint32_t _minGlobalThrottledSignals, + uint32_t _maxCacheEntries, + uint32_t _cacheTTLMs) + : throttleEnabled(_throttleEnabled), + minThrottleBackoffMs(_minThrottleBackoffMs), + maxThrottleBackoffMs(_maxThrottleBackoffMs), + backoffScaleFactor(_backoffScaleFactor), + minLocalThrottledSignals(_minLocalThrottledSignals), + minGlobalThrottledSignals(_minGlobalThrottledSignals), + maxCacheEntries(_maxCacheEntries), + cacheTTLMs(_cacheTTLMs) {} + +std::string Throttler::Config::toString() const { + return fmt::format( + "throttleEnabled:{} minThrottleBackoffMs:{} maxThrottleBackoffMs:{} backoffScaleFactor:{} minLocalThrottledSignals:{} minGlobalThrottledSignals:{} maxCacheEntries:{} cacheTTLMs:{}", + throttleEnabled, + succinctMillis(minThrottleBackoffMs), + succinctMillis(maxThrottleBackoffMs), + backoffScaleFactor, + minLocalThrottledSignals, + minGlobalThrottledSignals, + maxCacheEntries, + succinctMillis(cacheTTLMs)); +}; + +std::string Throttler::signalTypeName(SignalType type) { + switch (type) { + case SignalType::kNone: + return "None"; + case SignalType::kLocal: + return "Local"; + case SignalType::kGlobal: + return "Global"; + default: + return fmt::format("Unknown Signal Type: {}", static_cast(type)); + } +} + +std::ostream& operator<<(std::ostream& os, Throttler::SignalType type) { + os << Throttler::signalTypeName(type); + return os; +} + +void Throttler::init(const Config& config) { + std::unique_lock guard{instanceLock()}; + auto& instance = instanceRef(); + VELOX_CHECK_NULL(instance, "Throttler has already been set"); + instance = std::unique_ptr(new Throttler(config)); +} + +Throttler* Throttler::instance() { + std::shared_lock guard{instanceLock()}; + auto& instance = instanceRef(); + if (instance == nullptr) { + return nullptr; + } + return instance.get(); +} + +Throttler::Throttler(const Config& config) + : throttleEnabled_(config.throttleEnabled), + minThrottleBackoffDurationMs_(config.minThrottleBackoffMs), + maxThrottleBackoffDurationMs_(config.maxThrottleBackoffMs), + backoffScaleFactor_(config.backoffScaleFactor), + minLocalThrottledSignalsToBackoff_(config.minLocalThrottledSignals), + minGlobalThrottledSignalsToBackoff_(config.minGlobalThrottledSignals), + localThrottleCache_( + !throttleEnabled_ + ? 
nullptr + : new ThrottleSignalFactory{std::make_unique>( + config.maxCacheEntries, + config.cacheTTLMs), + std::unique_ptr{ + new ThrottleSignalGenerator{}}}), + globalThrottleCache_( + !throttleEnabled_ + ? nullptr + : new ThrottleSignalFactory{std::make_unique>( + config.maxCacheEntries, + config.cacheTTLMs), + std::unique_ptr{ + new ThrottleSignalGenerator{}}}) { + LOG(INFO) << "IO throttler config: " << config.toString(); +} + +uint64_t Throttler::throttleBackoff( + SignalType type, + const std::string& cluster, + const std::string& directory) { + if (!throttleEnabled() || type == SignalType::kNone) { + return kNoBackOffMs_; + } + + const uint64_t backOffDurationMs = + calculateBackoffDurationAndUpdateThrottleCache(type, cluster, directory); + if (backOffDurationMs == kNoBackOffMs_) { + return kNoBackOffMs_; + } + + updateThrottleStats(type, backOffDurationMs); + + std::this_thread::sleep_for( + std::chrono::milliseconds(backOffDurationMs)); // NOLINT + return backOffDurationMs; +} + +void Throttler::updateThrottleStats(SignalType type, uint64_t backoffDelayMs) { + stats_.backOffDelay.increment(backoffDelayMs); + RECORD_HISTOGRAM_METRIC_VALUE( + kMetricStorageThrottledDurationMs, backoffDelayMs); + if (type == SignalType::kLocal) { + ++stats_.localThrottled; + RECORD_METRIC_VALUE(kMetricStorageLocalThrottled); + } else { + ++stats_.globalThrottled; + RECORD_METRIC_VALUE(kMetricStorageGlobalThrottled); + } +} + +void Throttler::updateThrottleCacheLocked( + SignalType type, + const std::string& cluster, + const std::string& directory, + CachedThrottleSignalPtr& localSignal, + CachedThrottleSignalPtr& globalSignal) { + VELOX_CHECK(throttleEnabled()); + + if (type == SignalType::kLocal) { + if (localSignal.get() == nullptr) { + localThrottleCache_->generate(localThrottleCacheKey(cluster, directory)); + } else { + ++localSignal->count; + } + } else { + if (globalSignal.get() == nullptr) { + globalThrottleCache_->generate(cluster); + } else { + ++globalSignal->count; + } + } +} + +uint64_t Throttler::calculateBackoffDurationAndUpdateThrottleCache( + SignalType type, + const std::string& cluster, + const std::string& directory) { + std::lock_guard l(mu_); + // Gets maximum count of local and global throttle signals in cache. + auto localThrottleCachePtr = + localThrottleCache_->get(localThrottleCacheKey(cluster, directory)); + int64_t localThrottleCount = + (localThrottleCachePtr.get() != nullptr ? localThrottleCachePtr->count + : 0) + + (type == SignalType::kLocal ? 1 : 0) - minLocalThrottledSignalsToBackoff_; + auto globalThrottleCachePtr = globalThrottleCache_->get(cluster); + const int64_t globalThrottleCount = + (globalThrottleCachePtr.get() != nullptr ? globalThrottleCachePtr->count + : 0) + + (type == SignalType::kGlobal ? 1 : 0) - + minGlobalThrottledSignalsToBackoff_; + // Update throttling signal cache.
updateThrottleCacheLocked( + type, cluster, directory, localThrottleCachePtr, globalThrottleCachePtr); + + const int64_t throttleAttempts = + std::max(localThrottleCount, globalThrottleCount); + + // Calculates the delay with exponential backoff + if (throttleAttempts <= 0) { + return kNoBackOffMs_; + } + + const uint64_t backoffDelayMs = std::round( + minThrottleBackoffDurationMs_ * + pow(backoffScaleFactor_, throttleAttempts - 1)); + + // Adds some randomness so requests wake up at different timestamps + return std::min( + backoffDelayMs + + boost::random::uniform_int_distribution( + 1, std::max(1, (uint64_t)(backoffDelayMs * 0.1)))(rng_), + maxThrottleBackoffDurationMs_); +} + +std::unique_ptr +Throttler::ThrottleSignalGenerator::operator()( + const std::string& /*unused*/, + const void* /*unused*/) { + return std::unique_ptr(new ThrottleSignal{1}); +} +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/Throttler.h b/velox/dwio/common/Throttler.h new file mode 100644 index 0000000000000..eb5667a59287a --- /dev/null +++ b/velox/dwio/common/Throttler.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "velox/common/caching/CachedFactory.h" +#include "velox/common/caching/SimpleLRUCache.h" +#include "velox/common/io/IoStatistics.h" + +#pragma once + +namespace facebook::velox::dwio::common { + +/// A throttler that can be used to back off IO when the storage is overloaded. +class Throttler { + public: + /// The configuration of the throttler. + struct Config { + /// If true, enables throttling of IO. + bool throttleEnabled; + + /// The minimum backoff duration in milliseconds. + uint64_t minThrottleBackoffMs; + + /// The maximum backoff duration in milliseconds. + uint64_t maxThrottleBackoffMs; + + /// The backoff duration scale factor. + double backoffScaleFactor; + + /// The minimum number of received local throttled signals before starting + /// backoff. + uint32_t minLocalThrottledSignals; + + /// The minimum number of received global throttled signals before starting + /// backoff. + uint32_t minGlobalThrottledSignals; + + /// The maximum number of entries in the throttled signal cache. There is + /// one cache for each throttle signal type. For local throttle signal + /// cache, each cache entry corresponds to a unique file directory in a + /// storage system. For global throttle signal cache, each entry corresponds + /// to a unique storage system. + uint32_t maxCacheEntries; + + /// The TTL of the throttled signal cache entries in milliseconds. We only + /// track the recently throttled signals.
+ uint32_t cacheTTLMs; + + static constexpr bool kThrottleEnabledDefault{true}; + static constexpr uint64_t kMinThrottleBackoffMsDefault{200}; + static constexpr uint64_t kMaxThrottleBackoffMsDefault{30'000}; + static constexpr double kBackoffScaleFactorDefault{2.0}; + static constexpr uint32_t kMinLocalThrottledSignalsDefault{1'000}; + static constexpr uint32_t kMinGlobalThrottledSignalsDefault{100'000}; + static constexpr uint32_t kMaxCacheEntriesDefault{10'000}; + static constexpr uint32_t kCacheTTLMsDefault{3 * 60 * 1'000}; + + Config( + bool throttleEnabled = kThrottleEnabledDefault, + uint64_t minThrottleBackoffMs = kMinThrottleBackoffMsDefault, + uint64_t maxThrottleBackoffMs = kMaxThrottleBackoffMsDefault, + double backoffScaleFactor = kBackoffScaleFactorDefault, + uint32_t minLocalThrottledSignals = kMinLocalThrottledSignalsDefault, + uint32_t minGlobalThrottledSignals = kMinGlobalThrottledSignalsDefault, + uint32_t maxCacheEntries = kMaxCacheEntriesDefault, + uint32_t cacheTTLMs = kCacheTTLMsDefault); + + std::string toString() const; + }; + + /// The throttler stats. + struct Stats { + std::atomic_uint64_t localThrottled{0}; + std::atomic_uint64_t globalThrottled{0}; + /// Counts the backoff delay in milliseconds. + io::IoCounter backOffDelay; + }; + + static void init(const Config& config); + + static Throttler* instance(); + + /// The type of throttle signal. + enum class SignalType { + /// No throttled signal. + kNone, + /// A file directory throttled signal. + kLocal, + /// A cluster-wise throttled signal. + kGlobal, + }; + static std::string signalTypeName(SignalType type); + + /// Invoked to back off when a throttled signal is received for a particular + /// storage location. 'type' specifies the throttled signal type received from + /// the storage system. 'cluster' specifies the storage system. A query system + /// might access data from different storage systems. 'directory' specifies + /// the file directory within the storage system. The function returns the + /// actual throttled duration in milliseconds. It returns zero if not + /// throttled. + uint64_t throttleBackoff( + SignalType type, + const std::string& cluster, + const std::string& directory); + + const Stats& stats() const { + return stats_; + } + + static void testingReset() { + instanceRef().reset(); + } + + private: + static folly::SharedMutex& instanceLock() { + static folly::SharedMutex mu; + return mu; + } + + static std::unique_ptr& instanceRef() { + static std::unique_ptr instance; + return instance; + } + + explicit Throttler(const Config& config); + + bool throttleEnabled() const { + return throttleEnabled_; + } + + // Calculates the delay in milliseconds with exponential backoff for a storage + // location, using the signal counters in cache and flags in config, and + // updates the throttle signal caches. + uint64_t calculateBackoffDurationAndUpdateThrottleCache( + SignalType type, + const std::string& cluster, + const std::string& directory); + + struct ThrottleSignal { + uint64_t count{0}; + + explicit ThrottleSignal(uint64_t _count) : count(_count) {} + }; + + // Creates ThrottleSignal via the Generator interface the CachedFactory + // requires.
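With the defaults above (200 ms minimum backoff, 2.0 scale factor, 30 s cap), the delay computed in calculateBackoffDurationAndUpdateThrottleCache grows as 200, 400, 800, ... ms until it hits the cap, plus up to roughly 10% jitter. A standalone sketch of that arithmetic, with std::uniform_int_distribution standing in for the boost jitter draw (the generator class named in the preceding comment is defined right below):

```
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <random>

// Mirrors the backoff arithmetic in
// calculateBackoffDurationAndUpdateThrottleCache, using the documented
// default constants.
uint64_t backoffMs(int64_t throttleAttempts, std::mt19937& rng) {
  constexpr uint64_t kMinBackoffMs = 200;    // kMinThrottleBackoffMsDefault
  constexpr uint64_t kMaxBackoffMs = 30'000; // kMaxThrottleBackoffMsDefault
  constexpr double kScaleFactor = 2.0;       // kBackoffScaleFactorDefault
  if (throttleAttempts <= 0) {
    return 0; // Not enough throttled signals yet; no backoff.
  }
  const uint64_t delayMs = std::llround(
      kMinBackoffMs * std::pow(kScaleFactor, throttleAttempts - 1));
  // Up to ~10% extra so throttled requests do not all wake up together.
  std::uniform_int_distribution<uint64_t> jitter(
      1, std::max<uint64_t>(1, delayMs / 10));
  return std::min(delayMs + jitter(rng), kMaxBackoffMs);
}

int main() {
  std::mt19937 rng(7);
  for (int64_t attempts = 1; attempts <= 9; ++attempts) {
    std::cout << "attempts=" << attempts << " backoff=" << backoffMs(attempts, rng)
              << " ms\n";
  }
  return 0;
}
```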
+ class ThrottleSignalGenerator { + public: + ThrottleSignalGenerator() = default; + + std::unique_ptr operator()( + const std::string& /*unused*/, + const void* /*unused*/); + }; + + using CachedThrottleSignalPtr = CachedPtr; + + using ThrottleSignalFactory = facebook::velox:: + CachedFactory; + + void updateThrottleCacheLocked( + SignalType type, + const std::string& cluster, + const std::string& directory, + CachedThrottleSignalPtr& localSignal, + CachedThrottleSignalPtr& globalSignal); + + void updateThrottleStats(SignalType type, uint64_t backoffDelayMs); + + static const uint64_t kNoBackOffMs_{0}; + + const bool throttleEnabled_; + const uint64_t minThrottleBackoffDurationMs_; + const uint64_t maxThrottleBackoffDurationMs_; + const double backoffScaleFactor_; + const uint32_t minLocalThrottledSignalsToBackoff_; + const uint32_t minGlobalThrottledSignalsToBackoff_; + const std::unique_ptr localThrottleCache_; + const std::unique_ptr globalThrottleCache_; + + mutable std::mutex mu_; + + std::mt19937 rng_; + Stats stats_; +}; + +std::ostream& operator<<(std::ostream& os, Throttler::SignalType type); +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/TypeUtils.cpp b/velox/dwio/common/TypeUtils.cpp index b5d0d046ced5c..29e22046196fe 100644 --- a/velox/dwio/common/TypeUtils.cpp +++ b/velox/dwio/common/TypeUtils.cpp @@ -39,49 +39,53 @@ void checkChildrenSelected( } } -std::shared_ptr visit( +std::unique_ptr visit( const std::shared_ptr& typeWithId, const std::function& selector) { if (typeWithId->type()->isPrimitiveType()) { - return typeWithId; + return std::make_unique( + typeWithId->type(), + std::vector>(), + typeWithId->id(), + typeWithId->maxId(), + typeWithId->column()); } if (typeWithId->type()->isRow()) { std::vector names; - std::vector> typesWithId; + std::vector> selectedChildren; std::vector> types; auto& row = typeWithId->type()->asRow(); for (auto i = 0; i < typeWithId->size(); ++i) { auto& child = typeWithId->childAt(i); if (selector(child->id())) { names.push_back(row.nameOf(i)); - std::shared_ptr twid; - twid = visit(child, selector); - typesWithId.push_back(twid); - types.push_back(twid->type()); + auto newChild = visit(child, selector); + types.push_back(newChild->type()); + selectedChildren.push_back(std::move(newChild)); } } VELOX_USER_CHECK( !types.empty(), "selected nothing from row: " + row.toString()); - return std::make_shared( + return std::make_unique( ROW(std::move(names), std::move(types)), - std::move(typesWithId), + std::move(selectedChildren), typeWithId->id(), typeWithId->maxId(), typeWithId->column()); } else { checkChildrenSelected(typeWithId, selector); - std::vector> typesWithId; + std::vector> selectedChildren; std::vector> types; for (auto i = 0; i < typeWithId->size(); ++i) { auto& child = typeWithId->childAt(i); - std::shared_ptr twid = visit(child, selector); - typesWithId.push_back(twid); - types.push_back(twid->type()); + auto newChild = visit(child, selector); + types.push_back(newChild->type()); + selectedChildren.push_back(std::move(newChild)); } auto type = createType(typeWithId->type()->kind(), std::move(types)); - return std::make_shared( + return std::make_unique( type, - std::move(typesWithId), + std::move(selectedChildren), typeWithId->id(), typeWithId->maxId(), typeWithId->column()); @@ -108,6 +112,7 @@ std::unordered_set makeCompatibilityMap() { compat.insert(getKey(TypeKind::SMALLINT, TypeKind::INTEGER)); compat.insert(getKey(TypeKind::SMALLINT, TypeKind::BIGINT)); compat.insert(getKey(TypeKind::INTEGER, 
TypeKind::BIGINT)); + compat.insert(getKey(TypeKind::BIGINT, TypeKind::HUGEINT)); compat.insert(getKey(TypeKind::REAL, TypeKind::DOUBLE)); return compat; } @@ -134,7 +139,7 @@ void checkTypeCompatibility( } if (recurse) { - uint64_t childCount = std::min(from.size(), to.size()); + const uint64_t childCount = std::min(from.size(), to.size()); for (uint64_t i = 0; i < childCount; ++i) { checkTypeCompatibility( *from.childAt(i), diff --git a/velox/dwio/common/TypeWithId.cpp b/velox/dwio/common/TypeWithId.cpp index 6436aa25f55fb..03328024ef67f 100644 --- a/velox/dwio/common/TypeWithId.cpp +++ b/velox/dwio/common/TypeWithId.cpp @@ -15,6 +15,7 @@ */ #include "velox/dwio/common/TypeWithId.h" + #include "velox/dwio/common/exception/Exception.h" namespace facebook::velox::dwio::common { @@ -22,9 +23,21 @@ namespace facebook::velox::dwio::common { using velox::Type; using velox::TypeKind; +namespace { +std::vector> toShared( + std::vector> nodes) { + std::vector> result; + result.reserve(nodes.size()); + for (auto&& node : nodes) { + result.emplace_back(std::move(node)); + } + return result; +} +} // namespace + TypeWithId::TypeWithId( std::shared_ptr type, - std::vector>&& children, + std::vector>&& children, uint32_t id, uint32_t maxId, uint32_t column) @@ -33,18 +46,49 @@ TypeWithId::TypeWithId( id_{id}, maxId_{maxId}, column_{column}, - children_{std::move(children)} { + children_{toShared(std::move(children))} { for (auto& child : children_) { - const_cast(child->parent_) = this; + if (child) { + const_cast(child->parent_) = this; + } } } -std::shared_ptr TypeWithId::create( +std::unique_ptr TypeWithId::create( const std::shared_ptr& root, uint32_t next) { return create(root, next, 0); } +namespace { + +int countNodes(const TypePtr& type) { + int count = 1; + for (auto& child : *type) { + count += countNodes(child); + } + return count; +} + +} // namespace + +std::unique_ptr TypeWithId::create( + const RowTypePtr& type, + const velox::common::ScanSpec& spec) { + uint32_t next = 1; + std::vector> children(type->size()); + for (int i = 0, size = type->size(); i < size; ++i) { + auto* childSpec = spec.childByName(type->nameOf(i)); + if (childSpec && !childSpec->isConstant()) { + children[i] = create(type->childAt(i), next, i); + } else { + next += countNodes(type->childAt(i)); + } + } + return std::make_unique( + type, std::move(children), 0, next - 1, 0); +} + uint32_t TypeWithId::size() const { return children_.size(); } @@ -54,13 +98,13 @@ const std::shared_ptr& TypeWithId::childAt( return children_.at(idx); } -std::shared_ptr TypeWithId::create( +std::unique_ptr TypeWithId::create( const std::shared_ptr& type, uint32_t& next, uint32_t column) { DWIO_ENSURE_NOT_NULL(type); const uint32_t myId = next++; - std::vector> children{}; + std::vector> children; children.reserve(type->size()); auto offset = 0; for (const auto& child : *type) { @@ -70,8 +114,58 @@ std::shared_ptr TypeWithId::create( (myId == 0 && type->kind() == TypeKind::ROW) ? offset++ : column)); } const uint32_t maxId = next - 1; - return std::make_shared( + return std::make_unique( type, std::move(children), myId, maxId, column); } +std::string TypeWithId::fullName() const { + std::vector path; + auto* child = this; + while (auto* parent = child->parent_) { + switch (parent->type()->kind()) { + case TypeKind::ROW: { + auto& siblings = parent->children_; + bool found = false; + for (int i = 0; i < siblings.size(); ++i) { + if (siblings[i].get() == child) { + path.push_back('.' 
+ parent->type()->asRow().nameOf(i)); + found = true; + break; + } + } + if (!found) { + VELOX_FAIL( + "Child {} not found in parent {}", + child->type()->toString(), + parent->type()->toString()); + } + break; + } + case TypeKind::ARRAY: + break; + case TypeKind::MAP: + if (child == parent->children_.at(0).get()) { + path.push_back("."); + } else { + VELOX_CHECK(child == parent->children_.at(1).get()); + path.push_back("."); + } + break; + default: + VELOX_UNREACHABLE(); + } + child = parent; + } + std::string ans; + for (int i = path.size() - 1; i >= 0; --i) { + if (i == path.size() - 1) { + VELOX_CHECK_EQ(path[i][0], '.'); + ans += path[i].substr(1); + } else { + ans += path[i]; + } + } + return ans; +} + } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/TypeWithId.h b/velox/dwio/common/TypeWithId.h index 953ac87b2b8c3..a147cfe5066fc 100644 --- a/velox/dwio/common/TypeWithId.h +++ b/velox/dwio/common/TypeWithId.h @@ -18,23 +18,35 @@ #include #include +#include "velox/dwio/common/ScanSpec.h" #include "velox/type/Type.h" namespace facebook::velox::dwio::common { class TypeWithId : public velox::Tree> { public: + /// NOTE: This constructor will re-parent the children. TypeWithId( std::shared_ptr type, - std::vector>&& children, + std::vector>&& children, uint32_t id, uint32_t maxId, uint32_t column); - static std::shared_ptr create( + TypeWithId(const TypeWithId&) = delete; + TypeWithId& operator=(const TypeWithId&) = delete; + + static std::unique_ptr create( const std::shared_ptr& root, uint32_t next = 0); + /// Create TypeWithId node but leave all the unselected children as nullptr. + /// The ids are set correctly even when some of the previous nodes are not + /// selected. + static std::unique_ptr create( + const RowTypePtr& type, + const velox::common::ScanSpec& spec); + uint32_t size() const override; const std::shared_ptr& type() const { @@ -69,8 +81,10 @@ class TypeWithId : public velox::Tree> { return children_; } + std::string fullName() const; + private: - static std::shared_ptr create( + static std::unique_ptr create( const std::shared_ptr& type, uint32_t& next, uint32_t column); diff --git a/velox/dwio/common/UnitLoader.h b/velox/dwio/common/UnitLoader.h new file mode 100644 index 0000000000000..d3125dacc4be6 --- /dev/null +++ b/velox/dwio/common/UnitLoader.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace facebook::velox::dwio::common { + +class LoadUnit { + public: + virtual ~LoadUnit() = default; + + /// Perform the IO (read) + virtual void load() = 0; + + /// Unload the unit to free memory + virtual void unload() = 0; + + /// Number of rows in the unit + virtual uint64_t getNumRows() = 0; + + /// Number of bytes that the IO will read + virtual uint64_t getIoSize() = 0; +}; + +class UnitLoader { + public: + virtual ~UnitLoader() = default; + + /// Must block until the unit is loaded. 
This call could unload other units. + /// So the returned LoadUnit& is only guaranteed to remain loaded until the + /// next call. + virtual LoadUnit& getLoadedUnit(uint32_t unit) = 0; + + /// Reader reports progress by calling this method. The call must be done + /// **after** getLoadedUnit for unit. + virtual void + onRead(uint32_t unit, uint64_t rowOffsetInUnit, uint64_t rowCount) = 0; + + /// Reader reports a seek by calling this method. The call must be done **before** + /// getLoadedUnit for the new unit. + virtual void onSeek(uint32_t unit, uint64_t rowOffsetInUnit) = 0; +}; + +class UnitLoaderFactory { + public: + virtual ~UnitLoaderFactory() = default; + virtual std::unique_ptr create( + std::vector> loadUnits, + uint64_t rowsToSkip) = 0; +}; + +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/UnitLoaderTools.h b/velox/dwio/common/UnitLoaderTools.h new file mode 100644 index 0000000000000..2b25eb87e112a --- /dev/null +++ b/velox/dwio/common/UnitLoaderTools.h @@ -0,0 +1,194 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "folly/synchronization/CallOnce.h" +#include "velox/common/base/Exceptions.h" + +namespace facebook::velox::dwio::common::unit_loader_tools { + +// This class can create many callbacks that can be distributed to unit loader +// factories. Only when the last created callback is activated will this class +// emit the original callback. +// If the callback objects created are never explicitly called (because of an +// exception for example), the callback object will make the call in the +// destructor, guaranteeing the call. +class CallbackOnLastSignal { + class Callable { + public: + virtual ~Callable() {} + virtual void call() = 0; + }; + + class CallableFunction : public Callable { + public: + explicit CallableFunction(std::function cb) : cb_{std::move(cb)} {} + + void call() override { + if (cb_) { + // Could be null + cb_(); + } + } + + private: + std::function cb_; + }; + + // This class will ensure that the contained callback is only called once. + class CallOnce : public Callable { + public: + explicit CallOnce(std::shared_ptr cb) : cb_{std::move(cb)} {} + + CallOnce(const CallOnce& other) = delete; + CallOnce(CallOnce&& other) = delete; + CallOnce& operator=(const CallOnce& other) = delete; + CallOnce& operator=(CallOnce&& other) noexcept = delete; + + void call() override { + folly::call_once(called_, [&]() { cb_->call(); }); + } + + private: + std::shared_ptr cb_; + folly::once_flag called_; + }; + + // This class will ensure that only the call from the last caller will go + // through.
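The contract stated above can be exercised end to end once the class is assembled from the helper classes that follow; a usage sketch, assuming only the public constructor and getCallback interface shown in this header and that the header as added here is on the include path:

```
#include <iostream>

#include "velox/dwio/common/UnitLoaderTools.h"

using facebook::velox::dwio::common::unit_loader_tools::CallbackOnLastSignal;

int main() {
  int fired = 0;
  CallbackOnLastSignal signal([&fired] { ++fired; });

  // Hand one callback to each of three hypothetical unit loader factories.
  auto cb1 = signal.getCallback();
  auto cb2 = signal.getCallback();
  auto cb3 = signal.getCallback();

  cb1();
  cb2();
  std::cout << fired << '\n'; // 0: two of the three callbacks have run.
  cb3();
  std::cout << fired << '\n'; // 1: the last caller emits the original.
  return 0;
}
```

Dropping a callback without ever invoking it would also count, since the destructor path makes the call and the call-once wrappers deduplicate it.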
+ class CallOnCountZero : public Callable { + public: + CallOnCountZero( + std::shared_ptr callsLeft, + std::shared_ptr cb) + : callsLeft_{std::move(callsLeft)}, cb_{std::move(cb)} {} + + CallOnCountZero(const CallOnCountZero& other) = delete; + CallOnCountZero(CallOnCountZero&& other) = delete; + CallOnCountZero& operator=(const CallOnCountZero& other) = delete; + CallOnCountZero& operator=(CallOnCountZero&& other) noexcept = delete; + + void call() override { + if (*callsLeft_ > 0) { + --*(callsLeft_); + } + if (*callsLeft_ == 0) { + cb_->call(); + } + } + + private: + std::shared_ptr callsLeft_; + std::shared_ptr cb_; + }; + + // This class will ensure that the contained callback is called when the + // operator() is invoked, or when the object is destructed, whatever comes + // first. + class EnsureCall : public Callable { + public: + explicit EnsureCall(std::shared_ptr cb) + : cb_{std::make_shared(std::move(cb))} {} + + EnsureCall(const EnsureCall& other) = delete; + EnsureCall(EnsureCall&& other) = delete; + EnsureCall& operator=(const EnsureCall& other) = delete; + EnsureCall& operator=(EnsureCall&& other) noexcept = delete; + + ~EnsureCall() override { + cb_->call(); + } + + void call() override { + cb_->call(); + } + + private: + std::shared_ptr cb_; + }; + + class CountCaller { + public: + CountCaller( + std::shared_ptr cb, + std::shared_ptr callsLeft) + : cb_{std::move(cb)} { + ++(*callsLeft); + } + + void operator()() { + cb_->call(); + } + + private: + std::shared_ptr cb_; + }; + + public: + explicit CallbackOnLastSignal(std::function cb) + : callsLeft_{std::make_shared(0)}, + cb_{cb ? std::make_shared( + callsLeft_, + std::make_shared(std::make_shared( + std::make_shared(std::move(cb))))) + : nullptr} {} + + std::function getCallback() const { + if (!cb_) { + return nullptr; + } + return CountCaller{ + std::make_shared(std::make_shared(cb_)), + callsLeft_}; + } + + private: + std::shared_ptr callsLeft_; + std::shared_ptr cb_; +}; + +template +std::pair +howMuchToSkip(uint64_t rowsToSkip, NumRowsIter begin, NumRowsIter end) { + uint64_t rowsLeftToSkip = rowsToSkip; + uint32_t unitsToSkip = 0; + for (NumRowsIter it = begin; it != end; ++it) { + const auto rowsInUnit = *it; + if (rowsLeftToSkip < rowsInUnit) { + return {unitsToSkip, rowsLeftToSkip}; + } + rowsLeftToSkip -= rowsInUnit; + ++unitsToSkip; + } + + VELOX_CHECK_EQ( + rowsLeftToSkip, + 0, + "Can't skip more rows than all the rows in all the units"); + + return {unitsToSkip, rowsLeftToSkip}; +} + +} // namespace facebook::velox::dwio::common::unit_loader_tools diff --git a/velox/dwio/common/Writer.cpp b/velox/dwio/common/Writer.cpp new file mode 100644 index 0000000000000..52ff1848f0a09 --- /dev/null +++ b/velox/dwio/common/Writer.cpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
diff --git a/velox/dwio/common/Writer.cpp b/velox/dwio/common/Writer.cpp new file mode 100644 index 0000000000000..52ff1848f0a09 --- /dev/null +++ b/velox/dwio/common/Writer.cpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/dwio/common/Writer.h" + +namespace facebook::velox::dwio::common { + +void Writer::checkStateTransition(State oldState, State newState) { + switch (oldState) { + case State::kInit: + if (newState == State::kRunning) { + return; + } + break; + case State::kRunning: + if (newState == State::kAborted || newState == State::kClosed) { + return; + } + break; + case State::kAborted: + [[fallthrough]]; + case State::kClosed: + [[fallthrough]]; + default: + break; + } + VELOX_FAIL( + "Unexpected state transition from {} to {}", + Writer::stateString(oldState), + Writer::stateString(newState)); +} + +std::string Writer::stateString(State state) { + switch (state) { + case State::kInit: + return "INIT"; + case State::kRunning: + return "RUNNING"; + case State::kClosed: + return "CLOSED"; + case State::kAborted: + return "ABORTED"; + default: + VELOX_UNREACHABLE("BAD STATE: {}", static_cast<int>(state)); + } +} + +bool Writer::isRunning() const { + return state_ == State::kRunning; +} + +void Writer::checkRunning() const { + VELOX_CHECK_EQ( + static_cast<int>(state_), + static_cast<int>(State::kRunning), + "Writer is not running: {}", + Writer::stateString(state_)); +} + +void Writer::setState(State state) { + checkStateTransition(state_, state); + state_ = state; +} + +} // namespace facebook::velox::dwio::common
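The transition rules above form a small linear state machine: `kInit -> kRunning -> {kClosed, kAborted}`, with the last two terminal. A self-contained model of the same rules (a sketch, not the Velox class, which reports the failure through `VELOX_FAIL`):

```
#include <cstdio>
#include <stdexcept>

enum class State { kInit, kRunning, kAborted, kClosed };

// Mirrors Writer::checkStateTransition: only kInit -> kRunning and
// kRunning -> {kAborted, kClosed} are legal; everything else throws.
void checkStateTransition(State oldState, State newState) {
  switch (oldState) {
    case State::kInit:
      if (newState == State::kRunning) {
        return;
      }
      break;
    case State::kRunning:
      if (newState == State::kAborted || newState == State::kClosed) {
        return;
      }
      break;
    default:  // kAborted and kClosed are terminal; they accept nothing.
      break;
  }
  throw std::logic_error("unexpected state transition");
}

int main() {
  checkStateTransition(State::kInit, State::kRunning);    // OK
  checkStateTransition(State::kRunning, State::kClosed);  // OK
  try {
    checkStateTransition(State::kClosed, State::kRunning);
  } catch (const std::logic_error&) {
    std::puts("terminal state: transition rejected");
  }
  return 0;
}
```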
diff --git a/velox/dwio/common/Writer.h b/velox/dwio/common/Writer.h index 78e75a7c42429..011e9f156162b 100644 --- a/velox/dwio/common/Writer.h +++ b/velox/dwio/common/Writer.h @@ -25,41 +25,64 @@ namespace facebook::velox::dwio::common { -/** - * Abstract writer class. - * - * Writer object is used to write a single file. - * - * Writer objects are created through factories implementing - * WriterFactory interface. - */ +/// Abstract writer class. +/// +/// A Writer object is used to write a single file. +/// +/// Writer objects are created through factories implementing +/// the WriterFactory interface. class Writer { public: + /// Defines the states of a file writer. + enum class State { + kInit = 0, + kRunning = 1, + kAborted = 2, + kClosed = 3, + }; + static std::string stateString(State state); + virtual ~Writer() = default; - /** - * Appends 'data' to writer. Data might still be in memory and not - * yet written to the file. - */ + State state() const { + return state_; + } + + /// Appends 'data' to the writer. Data might still be in memory and not + /// yet written to the file. virtual void write(const VectorPtr& data) = 0; - /** - * Forces the writer to flush data to the file. - * Does not close the writer. - */ + /// Forces the writer to flush data to the file. + /// Does not close the writer. virtual void flush() = 0; - /** - * Invokes flush and closes the writer. - * Data can no longer be written. - */ + /// Invokes flush and closes the writer. + /// Data can no longer be written. virtual void close() = 0; - /** - * Aborts the writing by closing the writer and dropping everything. - * Data can no longer be written. - */ + /// Aborts the writing by closing the writer and dropping everything. + /// Data can no longer be written. virtual void abort() = 0; + + protected: + bool isRunning() const; + + void checkRunning() const; + + /// Invoked to set writer 'state_' to new 'state'. + void setState(State state); + + /// Validates the state transition from 'oldState' to 'newState'. + static void checkStateTransition(State oldState, State newState); + + State state_{State::kInit}; }; +FOLLY_ALWAYS_INLINE std::ostream& operator<<( + std::ostream& os, + Writer::State state) { + os << Writer::stateString(state); + return os; +} + } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/WriterFactory.cpp b/velox/dwio/common/WriterFactory.cpp index 47d255851b5a1..f88b36b3b3e44 100644 --- a/velox/dwio/common/WriterFactory.cpp +++ b/velox/dwio/common/WriterFactory.cpp @@ -17,7 +17,6 @@ #include "velox/dwio/common/WriterFactory.h" namespace facebook::velox::dwio::common { - namespace { using WriterFactoriesMap = @@ -31,7 +30,7 @@ WriterFactoriesMap& writerFactories() { } // namespace bool registerWriterFactory(std::shared_ptr<WriterFactory> factory) { - const bool ok = + [[maybe_unused]] const bool ok = writerFactories().insert({factory->fileFormat(), factory}).second; // TODO: enable the check after Prestissimo adds to register the dwrf writer. #if 0 @@ -49,10 +48,10 @@ bool unregisterWriterFactory(FileFormat format) { std::shared_ptr<WriterFactory> getWriterFactory(FileFormat format) { auto it = writerFactories().find(format); - VELOX_CHECK( - it != writerFactories().end(), - "WriterFactory is not registered for format {}", - toString(format)); + if (it == writerFactories().end()) { + VELOX_UNSUPPORTED( + "WriterFactory is not registered for format {}", toString(format)); + } return it->second; } diff --git a/velox/dwio/common/WriterFactory.h b/velox/dwio/common/WriterFactory.h index 97c644d95d663..c04d5688a6aaf 100644 --- a/velox/dwio/common/WriterFactory.h +++ b/velox/dwio/common/WriterFactory.h @@ -24,74 +24,62 @@ namespace facebook::velox::dwio::common { -/** - * Writer factory interface. - * - * Implement this interface to provide a factory of writers - * for a particular file format. Factory objects should be - * registered using registerWriteFactory method to become - * available for connectors. Only a single writer factory - * per file format is allowed. - */ +/// Writer factory interface. +/// +/// Implement this interface to provide a factory of writers +/// for a particular file format. Factory objects should be +/// registered using the registerWriterFactory method to become +/// available for connectors. Only a single writer factory +/// per file format is allowed. class WriterFactory { public: - /** - * Constructor. - * @param format File format this factory is designated to. - */ + /// Constructor. + /// @param format File format this factory is designated to. explicit WriterFactory(FileFormat format) : format_(format) {} virtual ~WriterFactory() = default; - /** - * Get the file format ths factory is designated to. - */ + /// Get the file format this factory is designated to. FileFormat fileFormat() const { return format_; } - /** - * Create a writer object. - * @param sink output sink - * @param options writer options - * @return writer object - */ + /// Create a writer object. + /// @param sink output sink + /// @param options writer options + /// @return writer object virtual std::unique_ptr<Writer> createWriter( std::unique_ptr<dwio::common::FileSink> sink, - const dwio::common::WriterOptions& options) = 0; + const std::shared_ptr<dwio::common::WriterOptions>& options) = 0; + + /// Creates a polymorphic writer options object. + virtual std::unique_ptr<dwio::common::WriterOptions> + createWriterOptions() = 0; private: const FileFormat format_; }; -/** - * Register a writer factory. Only a single factory can be registered - * for each file format. An attempt to register multiple factories for - * a single file format would cause a failure.
- * @return true - */ +/// Register a writer factory. Only a single factory can be registered +/// for each file format. An attempt to register multiple factories for +/// a single file format would cause a failure. +/// @return true bool registerWriterFactory(std::shared_ptr<WriterFactory> factory); -/** - * Unregister a writer factory for a specified file format. - * @return true for unregistered factory and false for a - * missing factory for the specified format. - */ +/// Unregister a writer factory for a specified file format. +/// @return true for an unregistered factory and false for a +/// missing factory for the specified format. bool unregisterWriterFactory(FileFormat format); -/** - * Get writer factory object for a specified file format. Results in - * a failure if there is no registered factory for this format. - * @return WriterFactory object - */ +/// Get the writer factory object for a specified file format. Results in +/// a failure if there is no registered factory for this format. +/// @return WriterFactory object std::shared_ptr<WriterFactory> getWriterFactory(FileFormat format); -/** - * Check if a writer factory object exists for a specified file format. - * Returns true if there is a registered factory for this format, false - * otherwise. - * @return true - */ +/// Check if a writer factory object exists for a specified file format. +/// Returns true if there is a registered factory for this format, false +/// otherwise. +/// @return true bool hasWriterFactory(FileFormat format); } // namespace facebook::velox::dwio::common
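For orientation, a hypothetical factory against the new interface could look like the sketch below. `MyWriterFactory` is made up, the `FileSink`/`WriterOptions` parameter types follow the hunks above, and a real implementation would return concrete objects instead of `nullptr`:

```
#include <memory>

#include "velox/dwio/common/WriterFactory.h"

namespace vdc = facebook::velox::dwio::common;

// Hypothetical factory occupying the DWRF format slot.
class MyWriterFactory : public vdc::WriterFactory {
 public:
  MyWriterFactory() : WriterFactory(vdc::FileFormat::DWRF) {}

  std::unique_ptr<vdc::Writer> createWriter(
      std::unique_ptr<vdc::FileSink> sink,
      const std::shared_ptr<vdc::WriterOptions>& options) override {
    return nullptr;  // A real factory constructs its Writer here.
  }

  std::unique_ptr<vdc::WriterOptions> createWriterOptions() override {
    return nullptr;  // A real factory returns its options subclass.
  }
};

void registerIt() {
  vdc::registerWriterFactory(std::make_shared<MyWriterFactory>());
  // With the change above, a missing registration now raises
  // VELOX_UNSUPPORTED instead of a generic check failure.
  auto factory = vdc::getWriterFactory(vdc::FileFormat::DWRF);
}
```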
diff --git a/velox/dwio/common/compression/CMakeLists.txt b/velox/dwio/common/compression/CMakeLists.txt index 20bceedbc576d..b3bc9f4687ca2 100644 --- a/velox/dwio/common/compression/CMakeLists.txt +++ b/velox/dwio/common/compression/CMakeLists.txt @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_dwio_common_compression Compression.cpp PagedInputStream.cpp - PagedOutputStream.cpp) +velox_add_library(velox_dwio_common_compression Compression.cpp + PagedInputStream.cpp PagedOutputStream.cpp) -target_link_libraries(velox_dwio_common_compression velox_dwio_common xsimd - gtest Folly::folly) +velox_link_libraries(velox_dwio_common_compression velox_dwio_common xsimd + Folly::folly) diff --git a/velox/dwio/common/compression/Compression.cpp b/velox/dwio/common/compression/Compression.cpp index 79f8bb2d0f07b..e0f2597cc699a 100644 --- a/velox/dwio/common/compression/Compression.cpp +++ b/velox/dwio/common/compression/Compression.cpp @@ -15,10 +15,9 @@ */ #include "velox/dwio/common/compression/Compression.h" - #include "velox/common/compression/LzoDecompressor.h" +#include "velox/dwio/common/IntCodecCommon.h" #include "velox/dwio/common/compression/PagedInputStream.h" -#include "velox/dwio/common/compression/PagedOutputStream.h" #include #include @@ -194,14 +193,159 @@ uint64_t ZlibDecompressor::decompress( return destLength - zstream_.avail_out; } -class LzoDecompressor : public Decompressor { +class LzoAndLz4DecompressorCommon : public Decompressor { public: - explicit LzoDecompressor( + explicit LzoAndLz4DecompressorCommon( uint64_t blockSize, + const CompressionKind& kind, + bool isHadoopFrameFormat, const std::string& streamDebugInfo) - : Decompressor{blockSize, streamDebugInfo} {} + : Decompressor{blockSize, streamDebugInfo}, + kind_(kind), + isHadoopFrameFormat_(isHadoopFrameFormat) {} uint64_t decompress( + const char* src, + uint64_t srcLength, + char* dest, + uint64_t destLength) override; + + virtual uint64_t decompressInternal( + const char* src, + uint64_t srcLength, + char* dest, + uint64_t destLength) = 0; + + protected: + CompressionKind kind_; + // When the compressor creates multiple compressed blocks, this will be + // 'true'; e.g., Parquet uses this, whereas dwrf/orc create a single + // compressed block. + bool isHadoopFrameFormat_; +}; + +uint64_t LzoAndLz4DecompressorCommon::decompress( + const char* src, + uint64_t srcLength, + char* dest, + uint64_t destLength) { + if (!isHadoopFrameFormat_) { + return decompressInternal(src, srcLength, dest, destLength); + } + + // For Parquet, the data could be in the Hadoop frame format; try to + // decompress that format. + uint32_t decompressedTotalSize = 0; + auto* inputPtr = src; + auto* outPtr = dest; + uint64_t compressedSize = srcLength; + auto uncompressedSize = destLength; + + while (compressedSize > 0) { + DWIO_ENSURE_GE( + compressedSize, + dwio::common::INT_BYTE_SIZE, + "{} decompression failed, input len is too small: {}", + kind_, + compressedSize); + + uint32_t decompressedBlockSize = + folly::Endian::big(folly::loadUnaligned<uint32_t>(inputPtr)); + inputPtr += dwio::common::INT_BYTE_SIZE; + compressedSize -= dwio::common::INT_BYTE_SIZE; + uint32_t remainingOutputSize = uncompressedSize - decompressedTotalSize; + + DWIO_ENSURE_GE( + remainingOutputSize, + decompressedBlockSize, + "{} decompression failed, remainingOutputSize is less than " + "decompressedBlockSize, remainingOutputSize: {}, " + "decompressedBlockSize: {}", + kind_, + remainingOutputSize, + decompressedBlockSize); + + if (compressedSize <= 0) { + break; + } + + do { + // Check that the input length will not go negative. + DWIO_ENSURE_GE( + compressedSize, + dwio::common::INT_BYTE_SIZE, + "{} decompression failed, input len is too small: {}", + kind_, + compressedSize); + // Read the length of the next lz4/lzo compressed block. + uint32_t compressedBlockSize = + folly::Endian::big(folly::loadUnaligned<uint32_t>(inputPtr)); + inputPtr += dwio::common::INT_BYTE_SIZE; + compressedSize -= dwio::common::INT_BYTE_SIZE; + + if (compressedBlockSize == 0) { + continue; + } + + DWIO_ENSURE_LE( + compressedBlockSize, + compressedSize, + "{} decompression failed, compressedBlockSize is greater than compressedSize, " + "compressedBlockSize: {}, compressedSize: {}", + kind_, + compressedBlockSize, + compressedSize); + + // Decompress this block. + remainingOutputSize = uncompressedSize - decompressedTotalSize; + uint64_t decompressedSize = decompressInternal( + inputPtr, + static_cast<uint64_t>(compressedBlockSize), + outPtr, + static_cast<uint64_t>(remainingOutputSize)); + + DWIO_ENSURE_LE( + decompressedSize, + remainingOutputSize, + "{} decompression failed, decompressedSize is not less than or equal to remainingOutputSize, " + "decompressedSize: {}, remainingOutputSize: {}", + ::facebook::velox::common::compressionKindToString(kind_), + decompressedSize, + remainingOutputSize); + + outPtr += decompressedSize; + inputPtr += compressedBlockSize; + compressedSize -= compressedBlockSize; + decompressedBlockSize -= decompressedSize; + decompressedTotalSize += decompressedSize; + } while (decompressedBlockSize > 0); + } + + DWIO_ENSURE_EQ( + decompressedTotalSize, + uncompressedSize, + "{} decompression failed, decompressedTotalSize is not equal to uncompressedSize, " + "decompressedTotalSize: {}, uncompressedSize: {}", + kind_, + decompressedTotalSize, + uncompressedSize); + + return decompressedTotalSize; +}
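The frame-format loop above assumes the Hadoop framing: an outer 4-byte big-endian uncompressed-block length, then one or more inner 4-byte big-endian compressed-chunk lengths, each prefixing the chunk's bytes, repeated until the input is consumed (`dwio::common::INT_BYTE_SIZE` is those 4 bytes). A minimal sketch of the length-prefix read, using the same folly calls as the loop; the helper name is made up:

```
#include <folly/lang/Bits.h>

#include <cstdint>

// Reads one 4-byte big-endian length prefix and advances the cursor,
// mirroring the folly::Endian::big(folly::loadUnaligned<uint32_t>(...))
// pattern in LzoAndLz4DecompressorCommon::decompress.
inline uint32_t readBigEndianLength(const char*& inputPtr, uint64_t& left) {
  const uint32_t value =
      folly::Endian::big(folly::loadUnaligned<uint32_t>(inputPtr));
  inputPtr += sizeof(uint32_t);
  left -= sizeof(uint32_t);
  return value;
}
```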
+ +class LzoDecompressor : public LzoAndLz4DecompressorCommon { public: + explicit LzoDecompressor( + uint64_t blockSize, + bool isHadoopFrameFormat, + const std::string& streamDebugInfo) + : LzoAndLz4DecompressorCommon{ + blockSize, + velox::common::CompressionKind_LZO, + isHadoopFrameFormat, + streamDebugInfo} {} + + uint64_t decompressInternal( const char* src, uint64_t srcLength, char* dest, @@ -211,21 +355,26 @@ class LzoDecompressor : public Decompressor { } }; -class Lz4Decompressor : public Decompressor { +class Lz4Decompressor : public LzoAndLz4DecompressorCommon { public: explicit Lz4Decompressor( uint64_t blockSize, + bool isHadoopFrameFormat, const std::string& streamDebugInfo) - : Decompressor{blockSize, streamDebugInfo} {} + : LzoAndLz4DecompressorCommon{ + blockSize, + velox::common::CompressionKind_LZ4, + isHadoopFrameFormat, + streamDebugInfo} {} - uint64_t decompress( + uint64_t decompressInternal( const char* src, uint64_t srcLength, char* dest, uint64_t destLength) override; }; -uint64_t Lz4Decompressor::decompress( +uint64_t Lz4Decompressor::decompressInternal( const char* src, uint64_t srcLength, char* dest, @@ -461,36 +610,25 @@ bool ZlibDecompressionStream::readOrSkip(const void** data, int32_t* size) { } // namespace -std::unique_ptr createCompressor( +std::unique_ptr<Compressor> createCompressor( CompressionKind kind, - CompressionBufferPool& bufferPool, - DataBufferHolder& bufferHolder, - uint8_t pageHeaderSize, - const CompressionOptions& options, - const Encrypter* encrypter) { - std::unique_ptr compressor; + const CompressionOptions& options) { switch (kind) { case CompressionKind::CompressionKind_NONE: - if (!encrypter) { - return std::make_unique(bufferHolder); - } - // compressor remain as nullptr - break; + return nullptr; case CompressionKind::CompressionKind_ZLIB: { - compressor = std::make_unique( - options.format.zlib.compressionLevel); XLOG_FIRST_N(INFO, 1) << fmt::format( "Initialized zlib compressor with compression level
{}", options.format.zlib.compressionLevel); - break; + return std::make_unique( + options.format.zlib.compressionLevel); } case CompressionKind::CompressionKind_ZSTD: { - compressor = std::make_unique( - options.format.zstd.compressionLevel); XLOG_FIRST_N(INFO, 1) << fmt::format( "Initialized zstd compressor with compression level {}", options.format.zstd.compressionLevel); - break; + return std::make_unique( + options.format.zstd.compressionLevel); } case CompressionKind::CompressionKind_SNAPPY: case CompressionKind::CompressionKind_LZO: @@ -499,13 +637,7 @@ std::unique_ptr createCompressor( VELOX_UNSUPPORTED( "Unsupported compression type: {}", compressionKindToString(kind)); } - return std::make_unique( - bufferPool, - bufferHolder, - options.compressionThreshold, - pageHeaderSize, - std::move(compressor), - encrypter); + return nullptr; } std::unique_ptr createDecompressor( @@ -565,12 +697,16 @@ std::unique_ptr createDecompressor( std::make_unique(blockSize, streamDebugInfo); break; case CompressionKind::CompressionKind_LZO: - decompressor = - std::make_unique(blockSize, streamDebugInfo); + decompressor = std::make_unique( + blockSize, + options.format.lz4_lzo.isHadoopFrameFormat, + streamDebugInfo); break; case CompressionKind::CompressionKind_LZ4: - decompressor = - std::make_unique(blockSize, streamDebugInfo); + decompressor = std::make_unique( + blockSize, + options.format.lz4_lzo.isHadoopFrameFormat, + streamDebugInfo); break; case CompressionKind::CompressionKind_ZSTD: decompressor = diff --git a/velox/dwio/common/compression/Compression.h b/velox/dwio/common/compression/Compression.h index 2209ac52f20fd..3d26b3af98a42 100644 --- a/velox/dwio/common/compression/Compression.h +++ b/velox/dwio/common/compression/Compression.h @@ -17,9 +17,7 @@ #pragma once #include "velox/common/compression/Compression.h" -#include "velox/dwio/common/OutputStream.h" #include "velox/dwio/common/SeekableInputStream.h" -#include "velox/dwio/common/compression/CompressionBufferPool.h" #include "velox/dwio/common/encryption/Encryption.h" namespace facebook::velox::dwio::common::compression { @@ -82,6 +80,10 @@ struct CompressionOptions { struct { int32_t compressionLevel; } zstd; + + struct { + bool isHadoopFrameFormat; + } lz4_lzo; } format; uint32_t compressionThreshold; @@ -111,18 +113,10 @@ std::unique_ptr createDecompressor( /** * Create a compressor for the given compression kind. 
* @param kind The compression type to implement - * @param bufferPool Pool for compression buffer - * @param bufferHolder Buffer holder that handles buffer allocation and - * collection - * @param pageHeaderSize Header size of compressed block * @param options The compression options to use */ -std::unique_ptr createCompressor( +std::unique_ptr<Compressor> createCompressor( facebook::velox::common::CompressionKind kind, - CompressionBufferPool& bufferPool, - DataBufferHolder& bufferHolder, - uint8_t pageHeaderSize, - const CompressionOptions& options, - const dwio::common::encryption::Encrypter* encrypter = nullptr); + const CompressionOptions& options); } // namespace facebook::velox::dwio::common::compression diff --git a/velox/dwio/common/compression/PagedInputStream.cpp b/velox/dwio/common/compression/PagedInputStream.cpp index 2cbed38f40a94..08357298cf57a 100644 --- a/velox/dwio/common/compression/PagedInputStream.cpp +++ b/velox/dwio/common/compression/PagedInputStream.cpp @@ -213,6 +213,7 @@ bool PagedInputStream::readOrSkip(const void** data, int32_t* size) { } void PagedInputStream::BackUp(int32_t count) { + VELOX_CHECK_GE(count, 0); if (pendingSkip_ > 0) { auto len = std::min(count, pendingSkip_); pendingSkip_ -= len; @@ -257,7 +258,8 @@ bool PagedInputStream::skipAllPending() { return true; } -bool PagedInputStream::Skip(int32_t count) { +bool PagedInputStream::SkipInt64(int64_t count) { + VELOX_CHECK_GE(count, 0); pendingSkip_ += count; // We never use the return value of this function so this is OK. return true; } @@ -269,6 +271,8 @@ void PagedInputStream::clearDecompressionState() { remainingLength_ = 0; inputBufferPtr_ = nullptr; inputBufferPtrEnd_ = nullptr; + lastHeaderOffset_ = input_->ByteCount(); + bytesReturnedAtLastHeaderOffset_ = bytesReturned_; } void PagedInputStream::seekToPosition( @@ -280,8 +284,7 @@ // to the beginning of the last view or last header, whichever is // later. If we are returning views into the decompression buffer, // we can backup to the beginning of the decompressed buffer - auto alreadyRead = - bytesReturned_ - bytesReturnedAtLastHeaderOffset_ + pendingSkip_; + auto alreadyRead = bytesReturned_ - bytesReturnedAtLastHeaderOffset_; // outsideOriginalWindow is true if we are returning views into // the input stream's buffer and we are seeking below the start of the last @@ -300,6 +303,7 @@ clearDecompressionState(); pendingSkip_ = uncompressedOffset; } else { + alreadyRead += pendingSkip_; if (uncompressedOffset < alreadyRead) { BackUp(alreadyRead - uncompressedOffset); } else {
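A note on the `Skip(int32_t)` to `SkipInt64(int64_t)` change above: with a 32-bit count, a skip past roughly 2 GiB of uncompressed bytes would overflow. A tiny illustration of the truncation the wider type avoids:

```
#include <cstdint>
#include <iostream>

int main() {
  const int64_t threeGiB = 3LL * 1024 * 1024 * 1024;
  // Truncating to 32 bits wraps the value (on the usual two's-complement
  // platforms), turning a large forward skip into a negative count.
  const int32_t truncated = static_cast<int32_t>(threeGiB);
  std::cout << threeGiB << " vs " << truncated << '\n';
  return 0;
}
```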
diff --git a/velox/dwio/common/compression/PagedInputStream.h b/velox/dwio/common/compression/PagedInputStream.h index 54faced08eaa9..b7ecd99fd4c94 100644 --- a/velox/dwio/common/compression/PagedInputStream.h +++ b/velox/dwio/common/compression/PagedInputStream.h @@ -54,7 +54,7 @@ class PagedInputStream : public dwio::common::SeekableInputStream { void BackUp(int32_t count) override; // NOTE: This always returns true. - bool Skip(int32_t count) override; + bool SkipInt64(int64_t count) override; google::protobuf::int64 ByteCount() const override { return bytesReturned_ + pendingSkip_; diff --git a/velox/dwio/common/compression/PagedOutputStream.cpp b/velox/dwio/common/compression/PagedOutputStream.cpp index 9b83e84deff37..18d993bf74f05 100644 --- a/velox/dwio/common/compression/PagedOutputStream.cpp +++ b/velox/dwio/common/compression/PagedOutputStream.cpp @@ -93,7 +93,9 @@ uint64_t PagedOutputStream::flush() { auto buffers = createPage(); const auto cleanup = folly::makeGuard([this]() { resetBuffers(); - // Reset input buffers. + // Reset input buffers. clear() forces the buffer to shrink. + // Not doing so leads to very high flush memory overhead. + buffer_.clear(); buffer_.resize(pageHeaderSize_); }); bufferHolder_.take(std::move(buffers)); diff --git a/velox/dwio/common/compression/PagedOutputStream.h b/velox/dwio/common/compression/PagedOutputStream.h index f15e22908690a..498bba3c781c3 100644 --- a/velox/dwio/common/compression/PagedOutputStream.h +++ b/velox/dwio/common/compression/PagedOutputStream.h @@ -16,7 +16,9 @@ #pragma once +#include "velox/dwio/common/OutputStream.h" #include "velox/dwio/common/compression/Compression.h" +#include "velox/dwio/common/compression/CompressionBufferPool.h" namespace facebook::velox::dwio::common::compression { diff --git a/velox/dwio/common/encryption/CMakeLists.txt b/velox/dwio/common/encryption/CMakeLists.txt index b6c55051e38f9..610ebb4684af7 100644 --- a/velox/dwio/common/encryption/CMakeLists.txt +++ b/velox/dwio/common/encryption/CMakeLists.txt @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_dwio_common_encryption Encryption.cpp) +velox_add_library(velox_dwio_common_encryption Encryption.cpp) -target_link_libraries(velox_dwio_common_encryption Folly::folly) +velox_link_libraries(velox_dwio_common_encryption Folly::folly) diff --git a/velox/dwio/common/encryption/TestProvider.h b/velox/dwio/common/encryption/TestProvider.h index f1bd5a029d55b..7ef41de8e5743 100644 --- a/velox/dwio/common/encryption/TestProvider.h +++ b/velox/dwio/common/encryption/TestProvider.h @@ -58,7 +58,7 @@ class TestEncryption { private: std::string key_; - mutable size_t count_; + mutable std::atomic<size_t> count_; }; class TestEncrypter : public TestEncryption, public Encrypter { diff --git a/velox/dwio/common/exception/CMakeLists.txt b/velox/dwio/common/exception/CMakeLists.txt index 325c2c1a94f16..43d2d8a067b21 100644 --- a/velox/dwio/common/exception/CMakeLists.txt +++ b/velox/dwio/common/exception/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_dwio_common_exception Exception.cpp Exceptions.cpp) +velox_add_library(velox_dwio_common_exception Exception.cpp Exceptions.cpp) -target_link_libraries(velox_dwio_common_exception velox_exception Folly::folly - glog::glog) +velox_link_libraries(velox_dwio_common_exception velox_exception Folly::folly + glog::glog)
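The Exceptions.cpp/Exceptions.h hunks below annotate `corrupt()` with `[[noreturn]]`, which tells the compiler the function never returns normally (it always throws), so callers get no "missing return" diagnostics after invoking it. A standalone sketch of the attribute's effect (hypothetical names):

```
#include <stdexcept>

// Always throws, so it can be marked [[noreturn]].
[[noreturn]] void fail(const char* msg) {
  throw std::runtime_error(msg);
}

int parseOrDie(bool ok) {
  if (ok) {
    return 1;
  }
  fail("corrupt data");
  // No return statement needed here: the compiler knows fail() never returns.
}
```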
diff --git a/velox/dwio/common/exception/Exceptions.cpp b/velox/dwio/common/exception/Exceptions.cpp index c3074064535ae..6aaadef3a31a4 100644 --- a/velox/dwio/common/exception/Exceptions.cpp +++ b/velox/dwio/common/exception/Exceptions.cpp @@ -44,7 +44,7 @@ void verify(bool c, std::string fmt...) { } } -void corrupt(std::string fmt...) { +[[noreturn]] void corrupt(std::string fmt...) { va_list ap; va_start(ap, fmt); auto s = error_string(fmt, ap); diff --git a/velox/dwio/common/exception/Exceptions.h b/velox/dwio/common/exception/Exceptions.h index 67a31ed90c479..d6963468ea40a 100644 --- a/velox/dwio/common/exception/Exceptions.h +++ b/velox/dwio/common/exception/Exceptions.h @@ -89,7 +89,7 @@ void verify_range(uint64_t v, uint64_t rangeMask); void verify(bool c, std::string fmt...); -void corrupt(std::string fmt...); +[[noreturn]] void corrupt(std::string fmt...); std::string error_string(std::string fmt, va_list ap); std::string format_error_string(std::string fmt...); diff --git a/velox/dwio/common/tests/BitConcatenationTest.cpp b/velox/dwio/common/tests/BitConcatenationTest.cpp index 8ace181321e1b..2529d03b471de 100644 --- a/velox/dwio/common/tests/BitConcatenationTest.cpp +++ b/velox/dwio/common/tests/BitConcatenationTest.cpp @@ -22,7 +22,8 @@ using namespace facebook::velox; using namespace facebook::velox::dwio::common; TEST(BitConcatenationTests, basic) { - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); + memory::MemoryManager::testingSetInstance({}); + auto pool = facebook::velox::memory::memoryManager()->addLeafPool(); BitConcatenation bits(*pool); BufferPtr result; diff --git a/velox/dwio/common/tests/BitPackDecoderBenchmark.cpp b/velox/dwio/common/tests/BitPackDecoderBenchmark.cpp index 62d847fd99ecf..19072dd2a55e9 100644 --- a/velox/dwio/common/tests/BitPackDecoderBenchmark.cpp +++ b/velox/dwio/common/tests/BitPackDecoderBenchmark.cpp @@ -22,13 +22,14 @@ #include "velox/dwio/common/tests/Lemire/bmipacking32.h" #endif -#include "velox/external/duckdb/duckdb-fastpforlib.hpp" +#include "velox/dwio/common/tests/Lemire/FastPFor/bitpackinghelpers.h" #include #include #include #include -#include "velox/external/duckdb/duckdb.hpp" + +#include // @manual using namespace folly; using namespace facebook::velox; @@ -37,7 +38,7 @@ using RowSet = folly::Range; static const uint64_t kNumValues = 1024768 * 8; -namespace duckdb { +namespace facebook::velox::parquet { class ByteBuffer { // on to the 10 thousandth impl public: @@ -64,7 +65,7 @@ class ByteBuffer { // on to the 10 thousandth impl template <class T> T get() { available(sizeof(T)); - T val = Load((data_ptr_t)ptr); + T val = duckdb::Load<T>((duckdb::data_ptr_t)ptr); return val; } @@ -103,7 +104,7 @@ class ParquetDecodeUtils { uint32_t count, uint8_t width) { if (width >= ParquetDecodeUtils::BITPACK_MASKS_SIZE) { - throw InvalidInputException( + throw duckdb::InvalidInputException( "The width (%d) of the bitpacked data exceeds the supported max width (%d), " "the file might be corrupted.", width, @@ -144,9 +145,9 @@ class ParquetDecodeUtils { return result; } }; -} // namespace duckdb +} // namespace facebook::velox::parquet -const uint64_t duckdb::ParquetDecodeUtils::BITPACK_MASKS[] = { +const uint64_t facebook::velox::parquet::ParquetDecodeUtils::BITPACK_MASKS[] = { 0, 1, 3, @@ -213,10 +214,11 @@ const uint64_t duckdb::ParquetDecodeUtils::BITPACK_MASKS[] = { 9223372036854775807, 18446744073709551615ULL}; -const uint64_t duckdb::ParquetDecodeUtils::BITPACK_MASKS_SIZE = - sizeof(ParquetDecodeUtils::BITPACK_MASKS) / sizeof(uint64_t); +const uint64_t + facebook::velox::parquet::ParquetDecodeUtils::BITPACK_MASKS_SIZE = + sizeof(ParquetDecodeUtils::BITPACK_MASKS) / sizeof(uint64_t); -const uint8_t duckdb::ParquetDecodeUtils::BITPACK_DLEN = 8; +const uint8_t facebook::velox::parquet::ParquetDecodeUtils::BITPACK_DLEN = 8; // Array of bit packed representations of randomInts_u32.
The array at index i // is packed i bits wide and the values come from the low bits of @@ -244,18 +246,18 @@ std::vector buffer_u64; template <typename T> void naiveDecodeBitsLE( - const uint64_t* FOLLY_NONNULL bits, + const uint64_t* bits, int32_t bitOffset, RowSet rows, int32_t rowBias, uint8_t bitWidth, const char* bufferEnd, - T* FOLLY_NONNULL result); + T* result); template <typename T> void legacyUnpackNaive(RowSet rows, uint8_t bitWidth, T* result) { auto data = bitPackedData[bitWidth].data(); - auto numBytes = bits::roundUp((rows.back() + 1) * bitWidth, 8) / 8; + auto numBytes = bits::divRoundUp((rows.back() + 1) * bitWidth, 8); auto end = reinterpret_cast<const char*>(data) + numBytes; naiveDecodeBitsLE(data, 0, rows, 0, bitWidth, end, result32.data()); } @@ -263,7 +265,7 @@ void legacyUnpackNaive(RowSet rows, uint8_t bitWidth, T* result) { template <typename T> void legacyUnpackFast(RowSet rows, uint8_t bitWidth, T* result) { auto data = bitPackedData[bitWidth].data(); - auto numBytes = bits::roundUp((rows.back() + 1) * bitWidth, 8) / 8; + auto numBytes = bits::divRoundUp((rows.back() + 1) * bitWidth, 8); auto end = reinterpret_cast<const char*>(data) + numBytes; facebook::velox::dwio::common::unpack( data, @@ -289,7 +291,7 @@ void fastpforlib(uint8_t bitWidth, T* result) { auto inputBuffer = reinterpret_cast(bitPackedData[bitWidth].data()); for (auto i = 0; i < numBatches; i++) { // Read 4 bytes and unpack 32 values - duckdb_fastpforlib::fastunpack( + velox::fastpforlib::fastunpack( inputBuffer + i * 4, result + i * 32, bitWidth); } } @@ -315,11 +317,11 @@ void arrowBitUnpack(uint8_t bitWidth, T* result) { template <typename T> void duckdbBitUnpack(uint8_t bitWidth, T* result) { - duckdb::ByteBuffer duckInputBuffer( + facebook::velox::parquet::ByteBuffer duckInputBuffer( reinterpret_cast(bitPackedData[bitWidth].data()), BYTES(kNumValues, bitWidth)); uint8_t bitpack_pos = 0; - duckdb::ParquetDecodeUtils::BitUnpack( + facebook::velox::parquet::ParquetDecodeUtils::BitUnpack( duckInputBuffer, bitpack_pos, result, kNumValues, bitWidth); } @@ -501,7 +503,7 @@ BENCHMARK_UNPACK_ODDROWS_CASE_32(31) void populateBitPacked() { bitPackedData.resize(33); for (auto bitWidth = 1; bitWidth <= 32; ++bitWidth) { - auto numWords = bits::roundUp(randomInts_u32.size() * bitWidth, 64) / 64; + auto numWords = bits::divRoundUp(randomInts_u32.size() * bitWidth, 64); bitPackedData[bitWidth].resize(numWords); auto source = reinterpret_cast(randomInts_u32.data()); auto destination = @@ -526,13 +528,13 @@ void populateBitPacked() { // Naive unpacking, original version of IntDecoder::unpack. template <typename T> void naiveDecodeBitsLE( - const uint64_t* FOLLY_NONNULL bits, + const uint64_t* bits, int32_t bitOffset, RowSet rows, int32_t rowBias, uint8_t bitWidth, const char* bufferEnd, - T* FOLLY_NONNULL result) { + T* result) { uint64_t mask = bits::lowMask(bitWidth); auto numRows = rows.size(); if (bitWidth > 56) { @@ -542,13 +544,12 @@ void naiveDecodeBitsLE( } return; } - auto FOLLY_NONNULL lastSafe = bufferEnd - sizeof(uint64_t); + auto lastSafe = bufferEnd - sizeof(uint64_t); int32_t numSafeRows = numRows; bool anyUnsafe = false; if (bufferEnd) { const char* endByte = reinterpret_cast<const char*>(bits) + - bits::roundUp(bitOffset + (rows.back() - rowBias + 1) * bitWidth, 8) / - 8; + bits::divRoundUp(bitOffset + (rows.back() - rowBias + 1) * bitWidth, 8); // redZone is the number of bytes at the end of the accessed range that // could overflow the buffer if accessed 64 bits wide.
int64_t redZone = @@ -592,7 +593,7 @@ void naiveDecodeBitsLE( } int32_t main(int32_t argc, char* argv[]) { - folly::init(&argc, &argv); + folly::Init init{&argc, &argv}; // Populate uint32 buffer diff --git a/velox/dwio/common/tests/BitPackDecoderTest.cpp b/velox/dwio/common/tests/BitPackDecoderTest.cpp index 945a11caba9f3..271fe8533784a 100644 --- a/velox/dwio/common/tests/BitPackDecoderTest.cpp +++ b/velox/dwio/common/tests/BitPackDecoderTest.cpp @@ -45,7 +45,7 @@ class BitPackDecoderTest : public testing::Test { void populateBitPackedData() { bitPackedData_.resize(33); for (auto bitWidth = 1; bitWidth <= 32; ++bitWidth) { - auto numWords = bits::roundUp(randomInts_.size() * bitWidth, 64) / 64; + auto numWords = bits::divRoundUp(randomInts_.size() * bitWidth, 64); bitPackedData_[bitWidth].resize(numWords); auto source = randomInts_.data(); auto destination = diff --git a/velox/dwio/common/tests/CMakeLists.txt b/velox/dwio/common/tests/CMakeLists.txt index 4487b18ce4411..f413dcdab12e9 100644 --- a/velox/dwio/common/tests/CMakeLists.txt +++ b/velox/dwio/common/tests/CMakeLists.txt @@ -22,57 +22,62 @@ add_executable( ColumnSelectorTests.cpp DataBufferTests.cpp DecoderUtilTest.cpp + ExecutorBarrierTest.cpp + OnDemandUnitLoaderTests.cpp LocalFileSinkTest.cpp + MemorySinkTest.cpp LoggedExceptionTest.cpp + MeasureTimeTests.cpp + ParallelForTest.cpp RangeTests.cpp ReadFileInputStreamTests.cpp + ReaderTest.cpp RetryTests.cpp TestBufferedInput.cpp - TypeTests.cpp) + ThrottlerTest.cpp + TypeTests.cpp + UnitLoaderToolsTests.cpp + WriterTest.cpp + OptionsTests.cpp) add_test(velox_dwio_common_test velox_dwio_common_test) target_link_libraries( velox_dwio_common_test velox_dwio_common_test_utils velox_temp_path + velox_vector_test_lib Boost::regex velox_link_libs Folly::folly ${TEST_LINK_LIBS} gflags::gflags - gtest - gtest_main - gmock + GTest::gtest + GTest::gtest_main + GTest::gmock glog::glog - fmt::fmt) + fmt::fmt + protobuf::libprotobuf) add_executable(velox_dwio_common_data_buffer_benchmark DataBufferBenchmark.cpp) target_link_libraries( - velox_dwio_common_data_buffer_benchmark velox_dwio_common velox_memory - velox_dwio_common_exception Folly::folly ${FOLLY_BENCHMARK}) + velox_dwio_common_data_buffer_benchmark + velox_dwio_common + velox_memory + velox_dwio_common_exception + Folly::folly + ${FOLLY_BENCHMARK}) add_executable(velox_dwio_common_int_decoder_benchmark IntDecoderBenchmark.cpp) target_link_libraries( - velox_dwio_common_int_decoder_benchmark velox_dwio_common_exception - velox_exception velox_dwio_dwrf_common Folly::folly ${FOLLY_BENCHMARK}) - -add_library(velox_e2e_filter_test_base E2EFilterTestBase.cpp) - -target_link_libraries( - velox_e2e_filter_test_base - velox_functions_prestosql - velox_parse_parser - velox_vector_test_lib - velox_link_libs + velox_dwio_common_int_decoder_benchmark + velox_dwio_common_exception + velox_exception + velox_dwio_dwrf_common Folly::folly - fmt::fmt - lz4::lz4 - lzo2::lzo2 - zstd::zstd - ZLIB::ZLIB - ${TEST_LINK_LIBS}) + ${FOLLY_BENCHMARK}) -if(VELOX_ENABLE_ARROW) +if(VELOX_ENABLE_ARROW AND VELOX_ENABLE_BENCHMARKS) + add_subdirectory(Lemire/FastPFor) add_executable(velox_dwio_common_bitpack_decoder_benchmark BitPackDecoderBenchmark.cpp) @@ -80,6 +85,11 @@ if(VELOX_ENABLE_ARROW) PRIVATE -Wno-deprecated-declarations) target_link_libraries( - velox_dwio_common_bitpack_decoder_benchmark velox_dwio_common arrow duckdb - Folly::folly ${FOLLY_BENCHMARK}) + velox_dwio_common_bitpack_decoder_benchmark + velox_dwio_common + arrow + velox_fastpforlib + 
duckdb_static + Folly::folly + ${FOLLY_BENCHMARK}) endif() diff --git a/velox/dwio/common/tests/ChainedBufferTests.cpp b/velox/dwio/common/tests/ChainedBufferTests.cpp index 46bb292681e3f..a83fbcf69fd56 100644 --- a/velox/dwio/common/tests/ChainedBufferTests.cpp +++ b/velox/dwio/common/tests/ChainedBufferTests.cpp @@ -16,6 +16,8 @@ #include #include +#include "velox/common/memory/Memory.h" +#include "velox/common/memory/MemoryPool.h" #include "velox/dwio/common/ChainedBuffer.h" using namespace ::testing; @@ -25,25 +27,33 @@ namespace velox { namespace dwio { namespace common { -TEST(ChainedBufferTests, testCreate) { - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); - ChainedBuffer buf{*pool, 128, 1024}; +class ChainedBufferTests : public Test { + protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + + std::shared_ptr pool_{ + memory::memoryManager()->addLeafPool()}; +}; + +TEST_F(ChainedBufferTests, testCreate) { + ChainedBuffer buf{*pool_, 128, 1024}; ASSERT_EQ(buf.capacity(), 128); ASSERT_EQ(buf.pages_.size(), 1); - ChainedBuffer buf2{*pool, 256, 1024}; + ChainedBuffer buf2{*pool_, 256, 1024}; ASSERT_EQ(buf2.capacity(), 256); ASSERT_EQ(buf2.pages_.size(), 1); - ChainedBuffer buf3{*pool, 257, 1024}; + ChainedBuffer buf3{*pool_, 257, 1024}; ASSERT_EQ(buf3.capacity(), 512); ASSERT_EQ(buf3.pages_.size(), 2); ASSERT_THROW( - (ChainedBuffer{*pool, 256, 257}), exception::LoggedException); + (ChainedBuffer{*pool_, 256, 257}), exception::LoggedException); } -TEST(ChainedBufferTests, testReserve) { - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); - ChainedBuffer buf{*pool, 16, 1024}; +TEST_F(ChainedBufferTests, testReserve) { + ChainedBuffer buf{*pool_, 16, 1024}; buf.reserve(16); buf.reserve(17); ASSERT_EQ(buf.capacity(), 32); @@ -59,9 +69,8 @@ TEST(ChainedBufferTests, testReserve) { ASSERT_EQ(buf.pages_.size(), 5); } -TEST(ChainedBufferTests, testAppend) { - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); - ChainedBuffer buf{*pool, 16, 64}; +TEST_F(ChainedBufferTests, testAppend) { + ChainedBuffer buf{*pool_, 16, 64}; for (size_t i = 0; i < 16; ++i) { buf.unsafeAppend(i); ASSERT_EQ(buf.capacity(), 16); @@ -84,29 +93,27 @@ TEST(ChainedBufferTests, testAppend) { ASSERT_EQ(buf[buf.size() - 1], 100); } -TEST(ChainedBufferTests, testClear) { - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); - ChainedBuffer buf{*pool, 128, 1024}; +TEST_F(ChainedBufferTests, testClear) { + ChainedBuffer buf{*pool_, 128, 1024}; buf.clear(); ASSERT_EQ(buf.capacity(), 128); ASSERT_EQ(buf.size(), 0); ASSERT_EQ(buf.pages_.size(), 1); - ChainedBuffer buf2{*pool, 1024, 1024}; + ChainedBuffer buf2{*pool_, 1024, 1024}; buf2.clear(); ASSERT_EQ(buf2.capacity(), 256); ASSERT_EQ(buf2.size(), 0); ASSERT_EQ(buf2.pages_.size(), 1); } -TEST(ChainedBufferTests, testApplyRange) { - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); +TEST_F(ChainedBufferTests, testApplyRange) { std::vector> result; auto fn = [&](auto ptr, auto begin, auto end) { result.push_back({begin, end, *ptr}); }; - ChainedBuffer buf{*pool, 64, 64}; + ChainedBuffer buf{*pool_, 64, 64}; for (size_t i = 0; i < 64 / 16; ++i) { for (size_t j = 0; j < 16; ++j) { buf.unsafeAppend(i); @@ -153,9 +160,8 @@ TEST(ChainedBufferTests, testApplyRange) { std::tuple{0, 16, 3})); } -TEST(ChainedBufferTests, testGetPage) { - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); - ChainedBuffer buf{*pool, 1024, 1024}; +TEST_F(ChainedBufferTests, 
testGetPage) { + ChainedBuffer buf{*pool_, 1024, 1024}; ASSERT_EQ( std::addressof(buf.getPageUnsafe(0)), std::addressof(buf.pages_.at(0))); ASSERT_EQ( @@ -166,7 +172,7 @@ TEST(ChainedBufferTests, testGetPage) { std::addressof(buf.getPageUnsafe(1023)), std::addressof(buf.pages_.at(3))); - ChainedBuffer buf2{*pool, 1024, 1024}; + ChainedBuffer buf2{*pool_, 1024, 1024}; ASSERT_EQ( std::addressof(buf2.getPageUnsafe(0)), std::addressof(buf2.pages_.at(0))); ASSERT_EQ( @@ -180,9 +186,8 @@ TEST(ChainedBufferTests, testGetPage) { std::addressof(buf2.pages_.at(7))); } -TEST(ChainedBufferTests, testGetPageIndex) { - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); - ChainedBuffer buf{*pool, 1024, 1024}; +TEST_F(ChainedBufferTests, testGetPageIndex) { + ChainedBuffer buf{*pool_, 1024, 1024}; ASSERT_EQ(buf.getPageIndex(0), 0); ASSERT_EQ(buf.getPageIndex(256), 0); ASSERT_EQ(buf.getPageIndex(1023), 0); @@ -190,7 +195,7 @@ TEST(ChainedBufferTests, testGetPageIndex) { ASSERT_EQ(buf.getPageIndex(4095), 3); ASSERT_EQ(buf.getPageIndex(4096), 4); - ChainedBuffer buf2{*pool, 1024, 1024}; + ChainedBuffer buf2{*pool_, 1024, 1024}; ASSERT_EQ(buf2.getPageIndex(0), 0); ASSERT_EQ(buf2.getPageIndex(255), 0); ASSERT_EQ(buf2.getPageIndex(256), 1); @@ -198,9 +203,8 @@ TEST(ChainedBufferTests, testGetPageIndex) { ASSERT_EQ(buf2.getPageIndex(4096), 16); } -TEST(ChainedBufferTests, testGetPageOffset) { - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); - ChainedBuffer buf{*pool, 1024, 1024}; +TEST_F(ChainedBufferTests, testGetPageOffset) { + ChainedBuffer buf{*pool_, 1024, 1024}; ASSERT_EQ(buf.getPageOffset(0), 0); ASSERT_EQ(buf.getPageOffset(256), 256); ASSERT_EQ(buf.getPageOffset(1023), 1023); @@ -208,7 +212,7 @@ TEST(ChainedBufferTests, testGetPageOffset) { ASSERT_EQ(buf.getPageOffset(4095), 1023); ASSERT_EQ(buf.getPageOffset(4096), 0); - ChainedBuffer buf2{*pool, 1024, 1024}; + ChainedBuffer buf2{*pool_, 1024, 1024}; ASSERT_EQ(buf2.getPageOffset(0), 0); ASSERT_EQ(buf2.getPageOffset(255), 255); ASSERT_EQ(buf2.getPageOffset(256), 0); @@ -216,14 +220,14 @@ TEST(ChainedBufferTests, testGetPageOffset) { ASSERT_EQ(buf2.getPageOffset(4096), 0); } -TEST(ChainedBufferTests, testBitCount) { +TEST_F(ChainedBufferTests, testBitCount) { ASSERT_EQ(ChainedBuffer::bitCount(0), 0); ASSERT_EQ(ChainedBuffer::bitCount(1), 1); ASSERT_EQ(ChainedBuffer::bitCount(4), 1); ASSERT_EQ(ChainedBuffer::bitCount(15), 4); } -TEST(ChainedBufferTests, testTrailingZeros) { +TEST_F(ChainedBufferTests, testTrailingZeros) { ASSERT_EQ(ChainedBuffer::trailingZeros(1), 0); ASSERT_EQ(ChainedBuffer::trailingZeros(12), 2); ASSERT_EQ(ChainedBuffer::trailingZeros(1u << 31), 31); @@ -231,7 +235,7 @@ TEST(ChainedBufferTests, testTrailingZeros) { ChainedBuffer::trailingZeros(0), exception::LoggedException); } -TEST(ChainedBufferTests, testPowerOf2) { +TEST_F(ChainedBufferTests, testPowerOf2) { ASSERT_EQ(ChainedBuffer::trailingZeros(1), 0); ASSERT_EQ(ChainedBuffer::trailingZeros(12), 2); ASSERT_EQ(ChainedBuffer::trailingZeros(1u << 31), 31); diff --git a/velox/dwio/common/tests/DataBufferBenchmark.cpp b/velox/dwio/common/tests/DataBufferBenchmark.cpp index 4890f184a7383..42397b3e837d2 100644 --- a/velox/dwio/common/tests/DataBufferBenchmark.cpp +++ b/velox/dwio/common/tests/DataBufferBenchmark.cpp @@ -24,7 +24,7 @@ using namespace facebook::velox::dwio; using namespace facebook::velox::dwio::common; BENCHMARK(DataBufferOps, iters) { - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); + auto pool = 
facebook::velox::memory::memoryManager()->addLeafPool(); constexpr size_t size = 1024 * 1024 * 16; for (size_t i = 0; i < iters; ++i) { DataBuffer buf{*pool}; @@ -39,7 +39,7 @@ BENCHMARK(DataBufferOps, iters) { } BENCHMARK(ChainedBufferOps, iters) { - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); + auto pool = facebook::velox::memory::memoryManager()->addLeafPool(); constexpr size_t size = 1024 * 1024 * 16; for (size_t i = 0; i < iters; ++i) { ChainedBuffer buf{*pool, size, size * 4}; @@ -53,7 +53,8 @@ BENCHMARK(ChainedBufferOps, iters) { } int main(int argc, char* argv[]) { - folly::init(&argc, &argv); + folly::Init init{&argc, &argv}; folly::runBenchmarks(); + facebook::velox::memory::MemoryManager::initialize({}); return 0; } diff --git a/velox/dwio/common/tests/DataBufferTests.cpp b/velox/dwio/common/tests/DataBufferTests.cpp index 36a59a7297586..b46cad8ed622e 100644 --- a/velox/dwio/common/tests/DataBufferTests.cpp +++ b/velox/dwio/common/tests/DataBufferTests.cpp @@ -28,10 +28,18 @@ using namespace facebook::velox::memory; using namespace testing; using MemoryPool = facebook::velox::memory::MemoryPool; -TEST(DataBuffer, ZeroOut) { +class DataBufferTest : public testing::Test { + protected: + static void SetUpTestCase() { + MemoryManager::testingSetInstance({}); + } + + const std::shared_ptr pool_ = memoryManager()->addLeafPool(); +}; + +TEST_F(DataBufferTest, ZeroOut) { const uint8_t VALUE = 13; - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); - DataBuffer buffer(*pool, 16); + DataBuffer buffer(*pool_, 16); for (auto i = 0; i < buffer.size(); i++) { auto data = buffer.data(); ASSERT_EQ(data[i], 0); @@ -57,10 +65,8 @@ TEST(DataBuffer, ZeroOut) { } } -TEST(DataBuffer, At) { - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); - - DataBuffer buffer{*pool}; +TEST_F(DataBufferTest, At) { + DataBuffer buffer{*pool_}; for (auto i = 0; i != 15; ++i) { buffer.append(i); } @@ -79,10 +85,8 @@ TEST(DataBuffer, At) { } } -TEST(DataBuffer, Reset) { - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); - - DataBuffer buffer{*pool}; +TEST_F(DataBufferTest, Reset) { + DataBuffer buffer{*pool_}; buffer.reserve(16); for (auto i = 0; i != 15; ++i) { buffer.append(i); @@ -135,10 +139,9 @@ TEST(DataBuffer, Reset) { } } -TEST(DataBuffer, Wrap) { - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); +TEST_F(DataBufferTest, Wrap) { auto size = 26; - auto buffer = velox::AlignedBuffer::allocate(size, pool.get()); + auto buffer = velox::AlignedBuffer::allocate(size, pool_.get()); auto raw = buffer->asMutable(); for (size_t i = 0; i < size; ++i) { raw[i] = 'a' + i; @@ -152,25 +155,24 @@ TEST(DataBuffer, Wrap) { } } -TEST(DataBuffer, Move) { - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); +TEST_F(DataBufferTest, Move) { { - DataBuffer buffer{*pool}; + DataBuffer buffer{*pool_}; buffer.reserve(16); for (auto i = 0; i != 15; ++i) { buffer.append(i); } ASSERT_EQ(15, buffer.size()); ASSERT_EQ(16, buffer.capacity()); - const auto usedBytes = pool->currentBytes(); + const auto usedBytes = pool_->usedBytes(); // Expect no double freeing from memory pool. 
DataBuffer newBuffer{std::move(buffer)}; ASSERT_EQ(15, newBuffer.size()); ASSERT_EQ(16, newBuffer.capacity()); - ASSERT_EQ(usedBytes, pool->currentBytes()); + ASSERT_EQ(usedBytes, pool_->usedBytes()); } - ASSERT_EQ(0, pool->currentBytes()); + ASSERT_EQ(0, pool_->usedBytes()); } } // namespace common } // namespace dwio diff --git a/velox/dwio/common/tests/DecoderUtilTest.cpp b/velox/dwio/common/tests/DecoderUtilTest.cpp index a3fc69db4f6de..f142d2baf4645 100644 --- a/velox/dwio/common/tests/DecoderUtilTest.cpp +++ b/velox/dwio/common/tests/DecoderUtilTest.cpp @@ -166,9 +166,8 @@ namespace facebook::velox::dwio::common { struct NoHook { void addValues( const int32_t* /*rows*/, - const void* /*values*/, - int32_t /*size*/, - uint8_t /*valueWidth*/) {} + const int32_t* /*values*/, + int32_t /*size*/) {} }; } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/tests/ExecutorBarrierTest.cpp b/velox/dwio/common/tests/ExecutorBarrierTest.cpp new file mode 100644 index 0000000000000..0eb528c0bef9c --- /dev/null +++ b/velox/dwio/common/tests/ExecutorBarrierTest.cpp @@ -0,0 +1,317 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "folly/executors/CPUThreadPoolExecutor.h" +#include "velox/dwio/common/ExecutorBarrier.h" + +using namespace ::testing; +using namespace ::facebook::velox::dwio::common; + +TEST(ExecutorBarrierTest, GetNumPriorities) { + const uint8_t kNumPriorities = 5; + auto executor = + std::make_shared(10, kNumPriorities); + auto barrier = std::make_shared(*executor); + EXPECT_EQ(barrier->getNumPriorities(), kNumPriorities); +} + +TEST(ExecutorBarrierTest, CanOwn) { + auto executor = std::make_shared(10); + { + auto barrier = std::make_shared(executor); + EXPECT_EQ(executor.use_count(), 2); + } + EXPECT_EQ(executor.use_count(), 1); +} + +TEST(ExecutorBarrierTest, CanAwaitMultipleTimes) { + auto executor = std::make_shared(10); + auto barrier = std::make_shared(*executor); + for (int time = 0, multipleTimes = 10; time < multipleTimes; ++time) { + barrier->waitAll(); + } +} + +TEST(ExecutorBarrierTest, AddCanBeReused) { + auto executor = std::make_shared(10); + auto barrier = std::make_shared(*executor); + + const int kCalls = 30; + std::atomic count{0}; + for (int i = 0; i < kCalls; ++i) { + barrier->add([&]() { ++count; }); + } + barrier->waitAll(); + EXPECT_EQ(count, kCalls); + + for (int i = 0; i < kCalls; ++i) { + barrier->add([&]() { ++count; }); + } + barrier->waitAll(); + EXPECT_EQ(count, (2 * kCalls)); +} + +TEST(ExecutorBarrierTest, AddWithPriorityCanBeReused) { + auto executor = std::make_shared(10); + auto barrier = std::make_shared(*executor); + + const int kCalls = 30; + const int8_t kPriority = 4; + std::atomic count{0}; + for (int i = 0; i < kCalls; ++i) { + barrier->addWithPriority([&]() { ++count; }, kPriority); + } + barrier->waitAll(); + EXPECT_EQ(count, kCalls); + + for (int i = 0; i < kCalls; ++i) { + barrier->addWithPriority([&]() { ++count; }, 
kPriority); + } + barrier->waitAll(); + EXPECT_EQ(count, (2 * kCalls)); +} + +TEST(ExecutorBarrierTest, AddCanBeReusedAfterException) { + auto executor = std::make_shared(10); + auto barrier = std::make_shared(*executor); + + const int kCalls = 30; + std::atomic count{0}; + for (int i = 0; i < kCalls; ++i) { + barrier->add([&count]() { + ++count; + throw std::runtime_error(""); + }); + } + EXPECT_THROW(barrier->waitAll(), std::runtime_error); + EXPECT_EQ(count, kCalls); + + for (int i = 0; i < kCalls; ++i) { + barrier->add([&]() { ++count; }); + } + barrier->waitAll(); + EXPECT_EQ(count, (2 * kCalls)); +} + +TEST(ExecutorBarrierTest, AddWithPriorityCanBeReusedAfterException) { + auto executor = std::make_shared(10); + auto barrier = std::make_shared(*executor); + + const int kCalls = 30; + const int8_t kPriority = 4; + std::atomic count{0}; + for (int i = 0; i < kCalls; ++i) { + barrier->addWithPriority( + [&count]() { + ++count; + throw std::runtime_error(""); + }, + kPriority); + } + EXPECT_THROW(barrier->waitAll(), std::runtime_error); + EXPECT_EQ(count, kCalls); + + for (int i = 0; i < kCalls; ++i) { + barrier->addWithPriority([&]() { ++count; }, kPriority); + } + barrier->waitAll(); + EXPECT_EQ(count, (2 * kCalls)); +} + +TEST(ExecutorBarrierTest, Add) { + auto executor = std::make_shared(10); + auto barrier = std::make_shared(*executor); + + const int kCalls = 30; + std::atomic count{0}; + for (int i = 0; i < kCalls; ++i) { + barrier->add([&]() { ++count; }); + } + barrier->waitAll(); + EXPECT_EQ(count, kCalls); +} + +TEST(ExecutorBarrierTest, AddWithPriority) { + auto executor = std::make_shared(10); + auto barrier = std::make_shared(*executor); + + const int kCalls = 30; + const int8_t kPriority = 4; + std::atomic count{0}; + for (int i = 0; i < kCalls; ++i) { + barrier->addWithPriority([&]() { ++count; }, kPriority); + } + barrier->waitAll(); + EXPECT_EQ(count, kCalls); +} + +TEST(ExecutorBarrierTest, AddCanIgnore) { + auto executor = std::make_shared(10); + auto barrier = std::make_shared(*executor); + + const int kCalls = 30; + for (int i = 0; i < kCalls; ++i) { + barrier->add([]() {}); + } + // Discard: barrier->waitAll(); +} + +TEST(ExecutorBarrierTest, AddWithPriorityCanIgnore) { + auto executor = std::make_shared(10); + auto barrier = std::make_shared(*executor); + + const int kCalls = 30; + for (int i = 0; i < kCalls; ++i) { + barrier->addWithPriority([]() {}, i); + } + // Discard: barrier->waitAll(); +} + +TEST(ExecutorBarrierTest, DestructorDoesntThrow) { + const int kCalls = 30; + std::atomic count{0}; + { + auto executor = std::make_shared(10); + auto barrier = std::make_shared(*executor); + + for (int i = 0; i < kCalls; ++i) { + barrier->add([shouldThrow = (i == 0), &count]() { + ++count; + if (shouldThrow) { + throw std::runtime_error(""); + } + }); + } + } // executor awaits but doesn't throw + EXPECT_EQ(count, kCalls); +} + +TEST(ExecutorBarrierTest, AddException) { + auto executor = std::make_shared(10); + auto barrier = std::make_shared(*executor); + + const int kCalls = 30; + std::atomic count{0}; + for (int i = 0; i < kCalls; ++i) { + barrier->add([shouldThrow = (i == 0), &count]() { + ++count; + if (shouldThrow) { + throw std::runtime_error(""); + } + }); + } + EXPECT_THROW(barrier->waitAll(), std::runtime_error); + EXPECT_EQ(count, kCalls); +} + +TEST(ExecutorBarrierTest, AddWithPriorityException) { + auto executor = std::make_shared(10); + auto barrier = std::make_shared(*executor); + + const int kCalls = 30; + const int8_t kPriority = 4; + std::atomic 
count{0}; + for (int i = 0; i < kCalls; ++i) { + barrier->addWithPriority( + [shouldThrow = (i == 0), &count]() { + ++count; + if (shouldThrow) { + throw std::runtime_error(""); + } + }, + kPriority); + } + EXPECT_THROW(barrier->waitAll(), std::runtime_error); + EXPECT_EQ(count, kCalls); +} + +TEST(ExecutorBarrierTest, AddNonStdException) { + auto executor = std::make_shared(10); + auto barrier = std::make_shared(*executor); + + const int kCalls = 30; + std::atomic count{0}; + for (int i = 0; i < kCalls; ++i) { + barrier->add([shouldThrow = (i == 0), &count]() { + ++count; + if (shouldThrow) { + // @lint-ignore CLANGTIDY facebook-hte-ThrowNonStdExceptionIssue + throw 1; + } + }); + } + EXPECT_THROW(barrier->waitAll(), int); + EXPECT_EQ(count, kCalls); +} + +TEST(ExecutorBarrierTest, AddWithPriorityNonStdException) { + auto executor = std::make_shared(10); + auto barrier = std::make_shared(*executor); + + const int kCalls = 30; + const int8_t kPriority = 4; + std::atomic count{0}; + for (int i = 0; i < kCalls; ++i) { + barrier->addWithPriority( + [shouldThrow = (i == 0), &count]() { + ++count; + if (shouldThrow) { + // @lint-ignore CLANGTIDY facebook-hte-ThrowNonStdExceptionIssue + throw 1; + } + }, + kPriority); + } + EXPECT_THROW(barrier->waitAll(), int); + EXPECT_EQ(count, kCalls); +} + +TEST(ExecutorBarrierTest, AddExceptions) { + auto executor = std::make_shared(10); + auto barrier = std::make_shared(*executor); + + const int kCalls = 30; + std::atomic count{0}; + for (int i = 0; i < kCalls; ++i) { + barrier->add([&]() { + ++count; + throw std::runtime_error(""); + }); + } + EXPECT_THROW(barrier->waitAll(), std::runtime_error); + EXPECT_EQ(count, kCalls); +} + +TEST(ExecutorBarrierTest, AddWithPriorityExceptions) { + auto executor = std::make_shared(10); + auto barrier = std::make_shared(*executor); + + const int kCalls = 30; + const int8_t kPriority = 4; + std::atomic count{0}; + for (int i = 0; i < kCalls; ++i) { + barrier->addWithPriority( + [&]() { + ++count; + throw std::runtime_error(""); + }, + kPriority); + } + EXPECT_THROW(barrier->waitAll(), std::runtime_error); + EXPECT_EQ(count, kCalls); +} diff --git a/velox/dwio/common/tests/IntDecoderBenchmark.cpp b/velox/dwio/common/tests/IntDecoderBenchmark.cpp index fab234c025b28..48e61768a2277 100644 --- a/velox/dwio/common/tests/IntDecoderBenchmark.cpp +++ b/velox/dwio/common/tests/IntDecoderBenchmark.cpp @@ -939,7 +939,7 @@ BENCHMARK_RELATIVE(decodeNew_64) { } int32_t main(int32_t argc, char* argv[]) { - folly::init(&argc, &argv); + folly::Init init{&argc, &argv}; // Populate uint16 buffer buffer_u16.resize(kNumElements); diff --git a/velox/dwio/common/tests/Lemire/FastPFor/CMakeLists.txt b/velox/dwio/common/tests/Lemire/FastPFor/CMakeLists.txt new file mode 100644 index 0000000000000..5f6ff7652e326 --- /dev/null +++ b/velox/dwio/common/tests/Lemire/FastPFor/CMakeLists.txt @@ -0,0 +1,17 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+add_library(velox_fastpforlib STATIC bitpacking.cpp) + +target_include_directories( + velox_fastpforlib PUBLIC $) diff --git a/velox/dwio/common/tests/Lemire/FastPFor/LICENSE b/velox/dwio/common/tests/Lemire/FastPFor/LICENSE new file mode 100644 index 0000000000000..8405e89a0b120 --- /dev/null +++ b/velox/dwio/common/tests/Lemire/FastPFor/LICENSE @@ -0,0 +1,191 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and +distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright +owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities +that control, are controlled by, or are under common control with that entity. +For the purposes of this definition, "control" means (i) the power, direct or +indirect, to cause the direction or management of such entity, whether by +contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising +permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including +but not limited to software source code, documentation source, and configuration +files. + +"Object" form shall mean any form resulting from mechanical transformation or +translation of a Source form, including but not limited to compiled object code, +generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made +available under the License, as indicated by a copyright notice that is included +in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that +is based on (or derived from) the Work and for which the editorial revisions, +annotations, elaborations, or other modifications represent, as a whole, an +original work of authorship. For the purposes of this License, Derivative Works +shall not include works that remain separable from, or merely link (or bind by +name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version +of the Work and any modifications or additions to that Work or Derivative Works +thereof, that is intentionally submitted to Licensor for inclusion in the Work +by the copyright owner or by an individual or Legal Entity authorized to submit +on behalf of the copyright owner. For the purposes of this definition, +"submitted" means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, and +issue tracking systems that are managed by, or on behalf of, the Licensor for +the purpose of discussing and improving the Work, but excluding communication +that is conspicuously marked or otherwise designated in writing by the copyright +owner as "Not a Contribution." 
+ +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently +incorporated within the Work. + +2. Grant of Copyright License. + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the Work and such +Derivative Works in Source or Object form. + +3. Grant of Patent License. + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable (except as stated in this section) patent license to make, have +made, use, offer to sell, sell, import, and otherwise transfer the Work, where +such license applies only to those patent claims licensable by such Contributor +that are necessarily infringed by their Contribution(s) alone or by combination +of their Contribution(s) with the Work to which such Contribution(s) was +submitted. If You institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work or a +Contribution incorporated within the Work constitutes direct or contributory +patent infringement, then any patent licenses granted to You under this License +for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. + +You may reproduce and distribute copies of the Work or Derivative Works thereof +in any medium, with or without modifications, and in Source or Object form, +provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of +this License; and +You must cause any modified files to carry prominent notices stating that You +changed the files; and +You must retain, in the Source form of any Derivative Works that You distribute, +all copyright, patent, trademark, and attribution notices from the Source form +of the Work, excluding those notices that do not pertain to any part of the +Derivative Works; and +If the Work includes a "NOTICE" text file as part of its distribution, then any +Derivative Works that You distribute must include a readable copy of the +attribution notices contained within such NOTICE file, excluding those notices +that do not pertain to any part of the Derivative Works, in at least one of the +following places: within a NOTICE text file distributed as part of the +Derivative Works; within the Source form or documentation, if provided along +with the Derivative Works; or, within a display generated by the Derivative +Works, if and wherever such third-party notices normally appear. The contents of +the NOTICE file are for informational purposes only and do not modify the +License. You may add Your own attribution notices within Derivative Works that +You distribute, alongside or as an addendum to the NOTICE text from the Work, +provided that such additional attribution notices cannot be construed as +modifying the License. 
+You may add Your own copyright statement to Your modifications and may provide +additional or different license terms and conditions for use, reproduction, or +distribution of Your modifications, or for any such Derivative Works as a whole, +provided Your use, reproduction, and distribution of the Work otherwise complies +with the conditions stated in this License. + +5. Submission of Contributions. + +Unless You explicitly state otherwise, any Contribution intentionally submitted +for inclusion in the Work by You to the Licensor shall be under the terms and +conditions of this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify the terms of +any separate license agreement you may have executed with Licensor regarding +such Contributions. + +6. Trademarks. + +This License does not grant permission to use the trade names, trademarks, +service marks, or product names of the Licensor, except as required for +reasonable and customary use in describing the origin of the Work and +reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. + +Unless required by applicable law or agreed to in writing, Licensor provides the +Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, +including, without limitation, any warranties or conditions of TITLE, +NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are +solely responsible for determining the appropriateness of using or +redistributing the Work and assume any risks associated with Your exercise of +permissions under this License. + +8. Limitation of Liability. + +In no event and under no legal theory, whether in tort (including negligence), +contract, or otherwise, unless required by applicable law (such as deliberate +and grossly negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, incidental, +or consequential damages of any character arising as a result of this License or +out of the use or inability to use the Work (including but not limited to +damages for loss of goodwill, work stoppage, computer failure or malfunction, or +any and all other commercial damages or losses), even if such Contributor has +been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. + +While redistributing the Work or Derivative Works thereof, You may choose to +offer, and charge a fee for, acceptance of support, warranty, indemnity, or +other liability obligations and/or rights consistent with this License. However, +in accepting such obligations, You may act only on Your own behalf and on Your +sole responsibility, not on behalf of any other Contributor, and only if You +agree to indemnify, defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason of your +accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work + +To apply the Apache License to your work, attach the following boilerplate +notice, with the fields enclosed by brackets "[]" replaced with your own +identifying information. (Don't include the brackets!) The text should be +enclosed in the appropriate comment syntax for the file format. 
We also
+recommend that a file or class name and description of purpose be included on
+the same "printed page" as the copyright notice for easier identification within
+third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
\ No newline at end of file
diff --git a/velox/dwio/common/tests/Lemire/FastPFor/bitpacking.cpp b/velox/dwio/common/tests/Lemire/FastPFor/bitpacking.cpp
new file mode 100644
index 0000000000000..d2c3dbaa1da33
--- /dev/null
+++ b/velox/dwio/common/tests/Lemire/FastPFor/bitpacking.cpp
@@ -0,0 +1,1320 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
+#include <cstdint>
+#include <type_traits>
+
+namespace velox::fastpforlib {
+namespace internal {
+
+// Used for uint8_t, uint16_t and uint32_t
+template <
+    uint8_t DELTA,
+    uint8_t SHR,
+    class TYPE,
+    uint8_t TYPE_SIZE = sizeof(TYPE) * 8>
+typename std::enable_if<(DELTA + SHR) < TYPE_SIZE>::type unpack_single_out(
+    const TYPE* __restrict in,
+    TYPE* __restrict out) {
+  *out = ((*in) >> SHR) % (1 << DELTA);
+}
+
+// Used for uint8_t, uint16_t and uint32_t
+template <
+    uint8_t DELTA,
+    uint8_t SHR,
+    class TYPE,
+    uint8_t TYPE_SIZE = sizeof(TYPE) * 8>
+typename std::enable_if<(DELTA + SHR) >= TYPE_SIZE>::type unpack_single_out(
+    const TYPE* __restrict& in,
+    TYPE* __restrict out) {
+  *out = (*in) >> SHR;
+  ++in;
+
+  static const TYPE NEXT_SHR = SHR + DELTA - TYPE_SIZE;
+  *out |= ((*in) % (1U << NEXT_SHR)) << (TYPE_SIZE - SHR);
+}
+
+template <uint8_t DELTA, uint8_t SHR>
+typename std::enable_if<(DELTA + SHR) < 32>::type unpack_single_out(
+    const uint32_t* __restrict in,
+    uint64_t* __restrict out) {
+  *out = ((static_cast<uint64_t>(*in)) >> SHR) % (1ULL << DELTA);
+}
+
+template <uint8_t DELTA, uint8_t SHR>
+typename std::enable_if<(DELTA + SHR) >= 32 && (DELTA + SHR) < 64>::type
+unpack_single_out(const uint32_t* __restrict& in, uint64_t* __restrict out) {
+  *out = static_cast<uint64_t>(*in) >> SHR;
+  ++in;
+  if (DELTA + SHR > 32) {
+    static const uint8_t NEXT_SHR = SHR + DELTA - 32;
+    *out |= static_cast<uint64_t>((*in) % (1U << NEXT_SHR)) << (32 - SHR);
+  }
+}
+
+template <uint8_t DELTA, uint8_t SHR>
+typename std::enable_if<(DELTA + SHR) >= 64>::type unpack_single_out(
+    const uint32_t* __restrict& in,
+    uint64_t* __restrict out) {
+  *out = static_cast<uint64_t>(*in) >> SHR;
+  ++in;
+
+  *out |= static_cast<uint64_t>(*in) << (32 - SHR);
+  ++in;
+
+  if (DELTA + SHR > 64) {
+    static const uint8_t NEXT_SHR = DELTA + SHR - 64;
+    *out |= static_cast<uint64_t>((*in) % (1U << NEXT_SHR)) << (64 - SHR);
+  }
+}
+
+// Used for uint8_t, uint16_t and uint32_t
+template <
+    class TYPE,
+    uint16_t DELTA,
+    uint16_t SHL,
+    TYPE MASK,
+    uint8_t TYPE_SIZE = sizeof(TYPE) * 8>
+typename std::enable_if<(DELTA + SHL) < TYPE_SIZE>::type pack_single_in(
+    const TYPE in,
+    TYPE* __restrict out) {
+  if (SHL == 0) {
+    *out = in & MASK;
+  } else {
+    *out |= (in & MASK) << SHL;
+  }
+}
+
+// Used for uint8_t, uint16_t and uint32_t
+template <
+    class TYPE,
+    uint16_t DELTA,
+    uint16_t SHL,
+    TYPE MASK,
+    uint8_t TYPE_SIZE = sizeof(TYPE) * 8>
+typename std::enable_if<(DELTA + SHL) >= TYPE_SIZE>::type pack_single_in(
+    const TYPE in,
+    TYPE* __restrict& out) {
+  *out |= in << SHL;
+  ++out;
+
+  if (DELTA + SHL > TYPE_SIZE) {
+    *out = (in & MASK) >> (TYPE_SIZE - SHL);
+  }
+}
+
+template <uint16_t DELTA, uint16_t SHL, uint64_t MASK>
+typename std::enable_if<(DELTA + SHL) < 32>::type pack_single_in64(
+    const uint64_t in,
+    uint32_t* __restrict out) {
+  if (SHL == 0) {
+    *out = static_cast<uint32_t>(in & MASK);
+  } else {
+    *out |= (in & MASK) << SHL;
+  }
+}
+
+template <uint16_t DELTA, uint16_t SHL, uint64_t MASK>
+typename std::enable_if<(DELTA + SHL) >= 32 && (DELTA + SHL) < 64>::type
+pack_single_in64(const uint64_t in, uint32_t* __restrict& out) {
+  if (SHL == 0) {
+    *out = static_cast<uint32_t>(in & MASK);
+  } else {
+    *out |= (in & MASK) << SHL;
+  }
+
+  ++out;
+
+  if (DELTA + SHL > 32) {
+    *out = static_cast<uint32_t>((in & MASK) >> (32 - SHL));
+  }
+}
+
+template <uint16_t DELTA, uint16_t SHL, uint64_t MASK>
+typename std::enable_if<(DELTA + SHL) >= 64>::type pack_single_in64(
+    const uint64_t in,
+    uint32_t* __restrict& out) {
+  *out |= in << SHL;
+  ++out;
+
+  *out = static_cast<uint32_t>((in & MASK) >> (32 - SHL));
+  ++out;
+
+  if (DELTA + SHL > 64) {
+    *out = (in & MASK) >> (64 - SHL);
+  }
+}
+
+template <uint16_t DELTA, uint16_t OINDEX = 0>
+struct Unroller8 {
+  static void Unpack(const uint8_t* __restrict& in, uint8_t* __restrict out) {
+    unpack_single_out<DELTA, (DELTA * OINDEX) % 8>(in, out + OINDEX);
+
+    Unroller8<DELTA, OINDEX + 1>::Unpack(in, out);
+  }
+
+  static void Pack(const uint8_t* __restrict in, uint8_t* __restrict out) {
+    pack_single_in<
+        uint8_t,
+        DELTA,
+        (DELTA * OINDEX) % 8,
+        static_cast<uint8_t>((1U << DELTA) - 1)>(in[OINDEX], out);
+
+    Unroller8<DELTA, OINDEX + 1>::Pack(in, out);
+  }
+};
+
+template <uint16_t DELTA>
+struct Unroller8<DELTA, 7> {
+  enum { SHIFT = (DELTA * 7) % 8 };
+
+  static void Unpack(const uint8_t* __restrict in, uint8_t* __restrict out) {
+    out[7] = (*in) >> SHIFT;
+  }
+
+  static void Pack(const uint8_t* __restrict in, uint8_t* __restrict out) {
+    *out |= (in[7] << SHIFT);
+  }
+};
+
+template <uint16_t DELTA, uint16_t OINDEX = 0>
+struct Unroller16 {
+  static void Unpack(const uint16_t* __restrict& in, uint16_t* __restrict out) {
+    unpack_single_out<DELTA, (DELTA * OINDEX) % 16>(in, out + OINDEX);
+
+    Unroller16<DELTA, OINDEX + 1>::Unpack(in, out);
+  }
+
+  static void Pack(const uint16_t* __restrict in, uint16_t* __restrict out) {
+    pack_single_in<
+        uint16_t,
+        DELTA,
+        (DELTA * OINDEX) % 16,
+        static_cast<uint16_t>((1U << DELTA) - 1)>(in[OINDEX], out);
+
+    Unroller16<DELTA, OINDEX + 1>::Pack(in, out);
+  }
+};
+
+template <uint16_t DELTA>
+struct Unroller16<DELTA, 15> {
+  enum { SHIFT = (DELTA * 15) % 16 };
+
+  static void Unpack(const uint16_t* __restrict in, uint16_t* __restrict out) {
+    out[15] = (*in) >> SHIFT;
+  }
+
+  static void Pack(const uint16_t* __restrict in, uint16_t* __restrict out) {
+    *out |= (in[15] << SHIFT);
+  }
+};
+
+template <uint16_t DELTA, uint16_t OINDEX = 0>
+struct Unroller {
+  static void Unpack(const uint32_t* __restrict& in, uint32_t* __restrict out) {
+    unpack_single_out<DELTA, (DELTA * OINDEX) % 32>(in, out + OINDEX);
+
+    Unroller<DELTA, OINDEX + 1>::Unpack(in, out);
+  }
+
+  static void Unpack(const uint32_t* __restrict& in, uint64_t* __restrict out) {
+    unpack_single_out<DELTA, (DELTA * OINDEX) % 32>(in, out + OINDEX);
+
+    Unroller<DELTA, OINDEX + 1>::Unpack(in, out);
+  }
+
+  static void Pack(const uint32_t* __restrict in, uint32_t* __restrict out) {
+    pack_single_in<uint32_t, DELTA, (DELTA * OINDEX) % 32, (1U << DELTA) - 1>(
+        in[OINDEX], out);
+
+    Unroller<DELTA, OINDEX + 1>::Pack(in, out);
+  }
+
+  static void Pack(const uint64_t* __restrict in, uint32_t* __restrict out) {
+    pack_single_in64<DELTA, (DELTA * OINDEX) % 32, (1ULL << DELTA) - 1>(
+        in[OINDEX], out);
+
+    Unroller<DELTA, OINDEX + 1>::Pack(in, out);
+  }
+};
+
+template <uint16_t DELTA>
+struct Unroller<DELTA, 31> {
+  enum { SHIFT = (DELTA * 31) % 32 };
+
+  static void Unpack(const uint32_t* __restrict in, uint32_t* __restrict out) {
+    out[31] = (*in) >> SHIFT;
+  }
+
+  static void Unpack(const uint32_t* __restrict in, uint64_t* __restrict out) {
+    out[31] = (*in) >> SHIFT;
+    if (DELTA > 32) {
+      ++in;
+      out[31] |= static_cast<uint64_t>(*in) << (32 - SHIFT);
+    }
+  }
+
+  static void Pack(const uint32_t* __restrict in, uint32_t* __restrict out) {
+    *out |= (in[31] << SHIFT);
+  }
+
+  static void Pack(const uint64_t* __restrict in, uint32_t* __restrict out) {
+    *out |= (in[31] << SHIFT);
+    if (DELTA > 32) {
+      ++out;
+      *out = static_cast<uint32_t>(in[31] >> (32 - SHIFT));
+    }
+  }
+};
+
+// Special cases
+void __fastunpack0(const uint8_t* __restrict, uint8_t* __restrict out) {
+  for (uint8_t i = 0; i < 8; ++i)
+    *(out++) = 0;
+}
+
+void __fastunpack0(const uint16_t* __restrict, uint16_t* __restrict out) {
+  for (uint16_t i = 0; i < 16; ++i)
+    *(out++) = 0;
+}
+
+void __fastunpack0(const uint32_t* __restrict, uint32_t* __restrict out) {
+  for (uint32_t i = 0; i < 32; ++i)
+    *(out++) = 0;
+}
+
+void __fastunpack0(const uint32_t* __restrict, uint64_t* __restrict out) {
+  for (uint32_t i = 0; i < 32; ++i)
+    *(out++) = 0;
+}
+
+void __fastpack0(const uint8_t* __restrict, uint8_t* __restrict) {}
+void __fastpack0(const uint16_t* __restrict, uint16_t* __restrict) {}
+void __fastpack0(const uint32_t* __restrict, uint32_t* __restrict) {}
+void __fastpack0(const uint64_t* __restrict, uint32_t* __restrict) {}
+
+// fastunpack for 8 bits
+void __fastunpack1(const uint8_t* __restrict in, uint8_t* __restrict out) {
+  Unroller8<1>::Unpack(in, out);
+}
+
+void __fastunpack2(const uint8_t* __restrict in, uint8_t* __restrict out) {
+  Unroller8<2>::Unpack(in, out);
+}
+
+void __fastunpack3(const uint8_t* __restrict in, uint8_t* __restrict out) {
+  Unroller8<3>::Unpack(in, out);
+}
+
+void __fastunpack4(const uint8_t* __restrict in, uint8_t* __restrict out) {
+  for (uint8_t outer = 0; outer < 4; ++outer) {
+    for (uint8_t inwordpointer = 0; inwordpointer < 8; inwordpointer += 4)
+      *(out++) = ((*in) >> inwordpointer) % (1U << 4);
+    ++in;
+  }
+}
+
+void __fastunpack5(const uint8_t* __restrict in, uint8_t* __restrict out) {
+  Unroller8<5>::Unpack(in, out);
+}
+
+void __fastunpack6(const uint8_t* __restrict in, uint8_t* __restrict out) {
+  Unroller8<6>::Unpack(in, out);
+}
+
+void __fastunpack7(const uint8_t* __restrict in, uint8_t* __restrict out) {
+  Unroller8<7>::Unpack(in, out);
+}
+
+void __fastunpack8(const uint8_t* __restrict in, uint8_t* __restrict out) {
+  for (int k = 0; k < 8; ++k)
+    out[k] = in[k];
+}
+
+// fastunpack for 16 bits
+void __fastunpack1(const uint16_t* __restrict in, uint16_t* __restrict out) {
+  Unroller16<1>::Unpack(in, out);
+}
+
+void __fastunpack2(const uint16_t* __restrict in, uint16_t* __restrict out) {
+  Unroller16<2>::Unpack(in, out);
+}
+
+void __fastunpack3(const uint16_t* __restrict in, uint16_t* __restrict out) {
+  Unroller16<3>::Unpack(in, out);
+}
+
+void __fastunpack4(const uint16_t* __restrict in, uint16_t* __restrict out) {
+  for (uint16_t outer = 0; outer < 4; ++outer) {
+    for (uint16_t inwordpointer = 0; inwordpointer < 16; inwordpointer += 4)
+      *(out++) = ((*in) >> inwordpointer) % (1U << 4);
+    ++in;
+  }
+}
+
+void __fastunpack5(const uint16_t* __restrict in, uint16_t* __restrict out) {
+  Unroller16<5>::Unpack(in, out);
+}
+
+void __fastunpack6(const uint16_t* __restrict in, uint16_t* __restrict out) {
+  Unroller16<6>::Unpack(in, out);
+}
+
+void __fastunpack7(const uint16_t* __restrict in, uint16_t* __restrict out) {
+  Unroller16<7>::Unpack(in, out);
+}
+
+void __fastunpack8(const uint16_t* __restrict in, uint16_t* __restrict out) {
+  for (uint16_t outer = 0; outer < 8; ++outer) {
+    for (uint16_t inwordpointer = 0; inwordpointer < 16; inwordpointer += 8)
+      *(out++) = ((*in) >> inwordpointer) % (1U << 8);
+    ++in;
+  }
+}
+
+void __fastunpack9(const uint16_t* __restrict in, uint16_t* __restrict out) {
+  Unroller16<9>::Unpack(in, out);
+}
+
+void __fastunpack10(const uint16_t* __restrict in, uint16_t* __restrict out) {
+  Unroller16<10>::Unpack(in,
out); +} + +void __fastunpack11(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<11>::Unpack(in, out); +} + +void __fastunpack12(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<12>::Unpack(in, out); +} + +void __fastunpack13(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<13>::Unpack(in, out); +} + +void __fastunpack14(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<14>::Unpack(in, out); +} + +void __fastunpack15(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<15>::Unpack(in, out); +} + +void __fastunpack16(const uint16_t* __restrict in, uint16_t* __restrict out) { + for (int k = 0; k < 16; ++k) + out[k] = in[k]; +} + +// fastunpack for 32 bits +void __fastunpack1(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<1>::Unpack(in, out); +} + +void __fastunpack2(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<2>::Unpack(in, out); +} + +void __fastunpack3(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<3>::Unpack(in, out); +} + +void __fastunpack4(const uint32_t* __restrict in, uint32_t* __restrict out) { + for (uint32_t outer = 0; outer < 4; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 4) + *(out++) = ((*in) >> inwordpointer) % (1U << 4); + ++in; + } +} + +void __fastunpack5(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<5>::Unpack(in, out); +} + +void __fastunpack6(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<6>::Unpack(in, out); +} + +void __fastunpack7(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<7>::Unpack(in, out); +} + +void __fastunpack8(const uint32_t* __restrict in, uint32_t* __restrict out) { + for (uint32_t outer = 0; outer < 8; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 8) + *(out++) = ((*in) >> inwordpointer) % (1U << 8); + ++in; + } +} + +void __fastunpack9(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<9>::Unpack(in, out); +} + +void __fastunpack10(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<10>::Unpack(in, out); +} + +void __fastunpack11(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<11>::Unpack(in, out); +} + +void __fastunpack12(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<12>::Unpack(in, out); +} + +void __fastunpack13(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<13>::Unpack(in, out); +} + +void __fastunpack14(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<14>::Unpack(in, out); +} + +void __fastunpack15(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<15>::Unpack(in, out); +} + +void __fastunpack16(const uint32_t* __restrict in, uint32_t* __restrict out) { + for (uint32_t outer = 0; outer < 16; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 16) + *(out++) = ((*in) >> inwordpointer) % (1U << 16); + ++in; + } +} + +void __fastunpack17(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<17>::Unpack(in, out); +} + +void __fastunpack18(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<18>::Unpack(in, out); +} + +void __fastunpack19(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<19>::Unpack(in, out); +} + +void __fastunpack20(const uint32_t* __restrict in, uint32_t* __restrict out) { + 
Unroller<20>::Unpack(in, out); +} + +void __fastunpack21(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<21>::Unpack(in, out); +} + +void __fastunpack22(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<22>::Unpack(in, out); +} + +void __fastunpack23(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<23>::Unpack(in, out); +} + +void __fastunpack24(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<24>::Unpack(in, out); +} + +void __fastunpack25(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<25>::Unpack(in, out); +} + +void __fastunpack26(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<26>::Unpack(in, out); +} + +void __fastunpack27(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<27>::Unpack(in, out); +} + +void __fastunpack28(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<28>::Unpack(in, out); +} + +void __fastunpack29(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<29>::Unpack(in, out); +} + +void __fastunpack30(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<30>::Unpack(in, out); +} + +void __fastunpack31(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<31>::Unpack(in, out); +} + +void __fastunpack32(const uint32_t* __restrict in, uint32_t* __restrict out) { + for (int k = 0; k < 32; ++k) + out[k] = in[k]; +} + +// fastupack for 64 bits +void __fastunpack1(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<1>::Unpack(in, out); +} + +void __fastunpack2(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<2>::Unpack(in, out); +} + +void __fastunpack3(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<3>::Unpack(in, out); +} + +void __fastunpack4(const uint32_t* __restrict in, uint64_t* __restrict out) { + for (uint32_t outer = 0; outer < 4; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 4) + *(out++) = ((*in) >> inwordpointer) % (1U << 4); + ++in; + } +} + +void __fastunpack5(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<5>::Unpack(in, out); +} + +void __fastunpack6(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<6>::Unpack(in, out); +} + +void __fastunpack7(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<7>::Unpack(in, out); +} + +void __fastunpack8(const uint32_t* __restrict in, uint64_t* __restrict out) { + for (uint32_t outer = 0; outer < 8; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 8) { + *(out++) = ((*in) >> inwordpointer) % (1U << 8); + } + ++in; + } +} + +void __fastunpack9(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<9>::Unpack(in, out); +} + +void __fastunpack10(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<10>::Unpack(in, out); +} + +void __fastunpack11(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<11>::Unpack(in, out); +} + +void __fastunpack12(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<12>::Unpack(in, out); +} + +void __fastunpack13(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<13>::Unpack(in, out); +} + +void __fastunpack14(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<14>::Unpack(in, out); +} + +void __fastunpack15(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<15>::Unpack(in, 
out); +} + +void __fastunpack16(const uint32_t* __restrict in, uint64_t* __restrict out) { + for (uint32_t outer = 0; outer < 16; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 16) + *(out++) = ((*in) >> inwordpointer) % (1U << 16); + ++in; + } +} + +void __fastunpack17(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<17>::Unpack(in, out); +} + +void __fastunpack18(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<18>::Unpack(in, out); +} + +void __fastunpack19(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<19>::Unpack(in, out); +} + +void __fastunpack20(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<20>::Unpack(in, out); +} + +void __fastunpack21(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<21>::Unpack(in, out); +} + +void __fastunpack22(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<22>::Unpack(in, out); +} + +void __fastunpack23(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<23>::Unpack(in, out); +} + +void __fastunpack24(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<24>::Unpack(in, out); +} + +void __fastunpack25(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<25>::Unpack(in, out); +} + +void __fastunpack26(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<26>::Unpack(in, out); +} + +void __fastunpack27(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<27>::Unpack(in, out); +} + +void __fastunpack28(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<28>::Unpack(in, out); +} + +void __fastunpack29(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<29>::Unpack(in, out); +} + +void __fastunpack30(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<30>::Unpack(in, out); +} + +void __fastunpack31(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<31>::Unpack(in, out); +} + +void __fastunpack32(const uint32_t* __restrict in, uint64_t* __restrict out) { + for (int k = 0; k < 32; ++k) + out[k] = in[k]; +} + +void __fastunpack33(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<33>::Unpack(in, out); +} + +void __fastunpack34(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<34>::Unpack(in, out); +} + +void __fastunpack35(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<35>::Unpack(in, out); +} + +void __fastunpack36(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<36>::Unpack(in, out); +} + +void __fastunpack37(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<37>::Unpack(in, out); +} + +void __fastunpack38(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<38>::Unpack(in, out); +} + +void __fastunpack39(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<39>::Unpack(in, out); +} + +void __fastunpack40(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<40>::Unpack(in, out); +} + +void __fastunpack41(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<41>::Unpack(in, out); +} + +void __fastunpack42(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<42>::Unpack(in, out); +} + +void __fastunpack43(const uint32_t* __restrict in, uint64_t* __restrict out) { + Unroller<43>::Unpack(in, out); +} + +void __fastunpack44(const uint32_t* __restrict in, uint64_t* 
__restrict out) {
+  Unroller<44>::Unpack(in, out);
+}
+
+void __fastunpack45(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<45>::Unpack(in, out);
+}
+
+void __fastunpack46(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<46>::Unpack(in, out);
+}
+
+void __fastunpack47(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<47>::Unpack(in, out);
+}
+
+void __fastunpack48(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<48>::Unpack(in, out);
+}
+
+void __fastunpack49(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<49>::Unpack(in, out);
+}
+
+void __fastunpack50(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<50>::Unpack(in, out);
+}
+
+void __fastunpack51(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<51>::Unpack(in, out);
+}
+
+void __fastunpack52(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<52>::Unpack(in, out);
+}
+
+void __fastunpack53(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<53>::Unpack(in, out);
+}
+
+void __fastunpack54(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<54>::Unpack(in, out);
+}
+
+void __fastunpack55(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<55>::Unpack(in, out);
+}
+
+void __fastunpack56(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<56>::Unpack(in, out);
+}
+
+void __fastunpack57(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<57>::Unpack(in, out);
+}
+
+void __fastunpack58(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<58>::Unpack(in, out);
+}
+
+void __fastunpack59(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<59>::Unpack(in, out);
+}
+
+void __fastunpack60(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<60>::Unpack(in, out);
+}
+
+void __fastunpack61(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<61>::Unpack(in, out);
+}
+
+void __fastunpack62(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<62>::Unpack(in, out);
+}
+
+void __fastunpack63(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  Unroller<63>::Unpack(in, out);
+}
+
+void __fastunpack64(const uint32_t* __restrict in, uint64_t* __restrict out) {
+  for (int k = 0; k < 32; ++k) {
+    out[k] = in[k * 2];
+    out[k] |= static_cast<uint64_t>(in[k * 2 + 1]) << 32;
+  }
+}
+
+// fastpack for 8 bits
+
+void __fastpack1(const uint8_t* __restrict in, uint8_t* __restrict out) {
+  Unroller8<1>::Pack(in, out);
+}
+
+void __fastpack2(const uint8_t* __restrict in, uint8_t* __restrict out) {
+  Unroller8<2>::Pack(in, out);
+}
+
+void __fastpack3(const uint8_t* __restrict in, uint8_t* __restrict out) {
+  Unroller8<3>::Pack(in, out);
+}
+
+void __fastpack4(const uint8_t* __restrict in, uint8_t* __restrict out) {
+  Unroller8<4>::Pack(in, out);
+}
+
+void __fastpack5(const uint8_t* __restrict in, uint8_t* __restrict out) {
+  Unroller8<5>::Pack(in, out);
+}
+
+void __fastpack6(const uint8_t* __restrict in, uint8_t* __restrict out) {
+  Unroller8<6>::Pack(in, out);
+}
+
+void __fastpack7(const uint8_t* __restrict in, uint8_t* __restrict out) {
+  Unroller8<7>::Pack(in, out);
+}
+
+void __fastpack8(const uint8_t* __restrict in, uint8_t* __restrict out) {
+  for (int k = 0; k < 8; ++k)
+    out[k] = in[k];
+}
+
+// fastpack for 16 bits
+
+void __fastpack1(const uint16_t* __restrict in, uint16_t* __restrict out) {
+  Unroller16<1>::Pack(in, out);
+} + +void __fastpack2(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<2>::Pack(in, out); +} + +void __fastpack3(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<3>::Pack(in, out); +} + +void __fastpack4(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<4>::Pack(in, out); +} + +void __fastpack5(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<5>::Pack(in, out); +} + +void __fastpack6(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<6>::Pack(in, out); +} + +void __fastpack7(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<7>::Pack(in, out); +} + +void __fastpack8(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<8>::Pack(in, out); +} + +void __fastpack9(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<9>::Pack(in, out); +} + +void __fastpack10(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<10>::Pack(in, out); +} + +void __fastpack11(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<11>::Pack(in, out); +} + +void __fastpack12(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<12>::Pack(in, out); +} + +void __fastpack13(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<13>::Pack(in, out); +} + +void __fastpack14(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<14>::Pack(in, out); +} + +void __fastpack15(const uint16_t* __restrict in, uint16_t* __restrict out) { + Unroller16<15>::Pack(in, out); +} + +void __fastpack16(const uint16_t* __restrict in, uint16_t* __restrict out) { + for (int k = 0; k < 16; ++k) + out[k] = in[k]; +} + +// fastpack for 32 bits + +void __fastpack1(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<1>::Pack(in, out); +} + +void __fastpack2(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<2>::Pack(in, out); +} + +void __fastpack3(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<3>::Pack(in, out); +} + +void __fastpack4(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<4>::Pack(in, out); +} + +void __fastpack5(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<5>::Pack(in, out); +} + +void __fastpack6(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<6>::Pack(in, out); +} + +void __fastpack7(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<7>::Pack(in, out); +} + +void __fastpack8(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<8>::Pack(in, out); +} + +void __fastpack9(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<9>::Pack(in, out); +} + +void __fastpack10(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<10>::Pack(in, out); +} + +void __fastpack11(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<11>::Pack(in, out); +} + +void __fastpack12(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<12>::Pack(in, out); +} + +void __fastpack13(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<13>::Pack(in, out); +} + +void __fastpack14(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<14>::Pack(in, out); +} + +void __fastpack15(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<15>::Pack(in, out); +} + +void __fastpack16(const uint32_t* __restrict in, uint32_t* __restrict out) { + 
Unroller<16>::Pack(in, out); +} + +void __fastpack17(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<17>::Pack(in, out); +} + +void __fastpack18(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<18>::Pack(in, out); +} + +void __fastpack19(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<19>::Pack(in, out); +} + +void __fastpack20(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<20>::Pack(in, out); +} + +void __fastpack21(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<21>::Pack(in, out); +} + +void __fastpack22(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<22>::Pack(in, out); +} + +void __fastpack23(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<23>::Pack(in, out); +} + +void __fastpack24(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<24>::Pack(in, out); +} + +void __fastpack25(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<25>::Pack(in, out); +} + +void __fastpack26(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<26>::Pack(in, out); +} + +void __fastpack27(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<27>::Pack(in, out); +} + +void __fastpack28(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<28>::Pack(in, out); +} + +void __fastpack29(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<29>::Pack(in, out); +} + +void __fastpack30(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<30>::Pack(in, out); +} + +void __fastpack31(const uint32_t* __restrict in, uint32_t* __restrict out) { + Unroller<31>::Pack(in, out); +} + +void __fastpack32(const uint32_t* __restrict in, uint32_t* __restrict out) { + for (int k = 0; k < 32; ++k) + out[k] = in[k]; +} + +// fastpack for 64 bits + +void __fastpack1(const uint64_t* __restrict in, uint32_t* __restrict out) { + Unroller<1>::Pack(in, out); +} + +void __fastpack2(const uint64_t* __restrict in, uint32_t* __restrict out) { + Unroller<2>::Pack(in, out); +} + +void __fastpack3(const uint64_t* __restrict in, uint32_t* __restrict out) { + Unroller<3>::Pack(in, out); +} + +void __fastpack4(const uint64_t* __restrict in, uint32_t* __restrict out) { + Unroller<4>::Pack(in, out); +} + +void __fastpack5(const uint64_t* __restrict in, uint32_t* __restrict out) { + Unroller<5>::Pack(in, out); +} + +void __fastpack6(const uint64_t* __restrict in, uint32_t* __restrict out) { + Unroller<6>::Pack(in, out); +} + +void __fastpack7(const uint64_t* __restrict in, uint32_t* __restrict out) { + Unroller<7>::Pack(in, out); +} + +void __fastpack8(const uint64_t* __restrict in, uint32_t* __restrict out) { + Unroller<8>::Pack(in, out); +} + +void __fastpack9(const uint64_t* __restrict in, uint32_t* __restrict out) { + Unroller<9>::Pack(in, out); +} + +void __fastpack10(const uint64_t* __restrict in, uint32_t* __restrict out) { + Unroller<10>::Pack(in, out); +} + +void __fastpack11(const uint64_t* __restrict in, uint32_t* __restrict out) { + Unroller<11>::Pack(in, out); +} + +void __fastpack12(const uint64_t* __restrict in, uint32_t* __restrict out) { + Unroller<12>::Pack(in, out); +} + +void __fastpack13(const uint64_t* __restrict in, uint32_t* __restrict out) { + Unroller<13>::Pack(in, out); +} + +void __fastpack14(const uint64_t* __restrict in, uint32_t* __restrict out) { + Unroller<14>::Pack(in, out); +} + +void __fastpack15(const uint64_t* __restrict in, uint32_t* __restrict 
out) {
+  Unroller<15>::Pack(in, out);
+}
+
+void __fastpack16(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<16>::Pack(in, out);
+}
+
+void __fastpack17(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<17>::Pack(in, out);
+}
+
+void __fastpack18(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<18>::Pack(in, out);
+}
+
+void __fastpack19(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<19>::Pack(in, out);
+}
+
+void __fastpack20(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<20>::Pack(in, out);
+}
+
+void __fastpack21(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<21>::Pack(in, out);
+}
+
+void __fastpack22(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<22>::Pack(in, out);
+}
+
+void __fastpack23(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<23>::Pack(in, out);
+}
+
+void __fastpack24(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<24>::Pack(in, out);
+}
+
+void __fastpack25(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<25>::Pack(in, out);
+}
+
+void __fastpack26(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<26>::Pack(in, out);
+}
+
+void __fastpack27(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<27>::Pack(in, out);
+}
+
+void __fastpack28(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<28>::Pack(in, out);
+}
+
+void __fastpack29(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<29>::Pack(in, out);
+}
+
+void __fastpack30(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<30>::Pack(in, out);
+}
+
+void __fastpack31(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<31>::Pack(in, out);
+}
+
+void __fastpack32(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  for (int k = 0; k < 32; ++k) {
+    out[k] = static_cast<uint32_t>(in[k]);
+  }
+}
+
+void __fastpack33(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<33>::Pack(in, out);
+}
+
+void __fastpack34(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<34>::Pack(in, out);
+}
+
+void __fastpack35(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<35>::Pack(in, out);
+}
+
+void __fastpack36(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<36>::Pack(in, out);
+}
+
+void __fastpack37(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<37>::Pack(in, out);
+}
+
+void __fastpack38(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<38>::Pack(in, out);
+}
+
+void __fastpack39(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<39>::Pack(in, out);
+}
+
+void __fastpack40(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<40>::Pack(in, out);
+}
+
+void __fastpack41(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<41>::Pack(in, out);
+}
+
+void __fastpack42(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<42>::Pack(in, out);
+}
+
+void __fastpack43(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<43>::Pack(in, out);
+}
+
+void __fastpack44(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<44>::Pack(in, out);
+}
+
+void __fastpack45(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<45>::Pack(in, out);
+}
+
+void __fastpack46(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<46>::Pack(in, out);
+}
+
+void __fastpack47(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<47>::Pack(in, out);
+}
+
+void __fastpack48(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<48>::Pack(in, out);
+}
+
+void __fastpack49(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<49>::Pack(in, out);
+}
+
+void __fastpack50(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<50>::Pack(in, out);
+}
+
+void __fastpack51(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<51>::Pack(in, out);
+}
+
+void __fastpack52(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<52>::Pack(in, out);
+}
+
+void __fastpack53(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<53>::Pack(in, out);
+}
+
+void __fastpack54(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<54>::Pack(in, out);
+}
+
+void __fastpack55(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<55>::Pack(in, out);
+}
+
+void __fastpack56(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<56>::Pack(in, out);
+}
+
+void __fastpack57(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<57>::Pack(in, out);
+}
+
+void __fastpack58(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<58>::Pack(in, out);
+}
+
+void __fastpack59(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<59>::Pack(in, out);
+}
+
+void __fastpack60(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<60>::Pack(in, out);
+}
+
+void __fastpack61(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<61>::Pack(in, out);
+}
+
+void __fastpack62(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<62>::Pack(in, out);
+}
+
+void __fastpack63(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  Unroller<63>::Pack(in, out);
+}
+
+void __fastpack64(const uint64_t* __restrict in, uint32_t* __restrict out) {
+  for (int i = 0; i < 32; ++i) {
+    out[2 * i] = static_cast<uint32_t>(in[i]);
+    out[2 * i + 1] = in[i] >> 32;
+  }
+}
+} // namespace internal
+} // namespace velox::fastpforlib
diff --git a/velox/dwio/common/tests/Lemire/FastPFor/bitpacking.h b/velox/dwio/common/tests/Lemire/FastPFor/bitpacking.h
new file mode 100644
index 0000000000000..e1675eb24712c
--- /dev/null
+++ b/velox/dwio/common/tests/Lemire/FastPFor/bitpacking.h
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
+#pragma once
+#include <cstdint>
+#include <cstring>
+
+namespace velox::fastpforlib {
+namespace internal {
+
+// Unpacks 8 uint8_t values
+void __fastunpack0(const uint8_t* __restrict in, uint8_t* __restrict out);
+void __fastunpack1(const uint8_t* __restrict in, uint8_t* __restrict out);
+void __fastunpack2(const uint8_t* __restrict in, uint8_t* __restrict out);
+void __fastunpack3(const uint8_t* __restrict in, uint8_t* __restrict out);
+void __fastunpack4(const uint8_t* __restrict in, uint8_t* __restrict out);
+void __fastunpack5(const uint8_t* __restrict in, uint8_t* __restrict out);
+void __fastunpack6(const uint8_t* __restrict in, uint8_t* __restrict out);
+void __fastunpack7(const uint8_t* __restrict in, uint8_t* __restrict out);
+void __fastunpack8(const uint8_t* __restrict in, uint8_t* __restrict out);
+
+// Unpacks 16 uint16_t values
+void __fastunpack0(const uint16_t* __restrict in, uint16_t* __restrict out);
+void __fastunpack1(const uint16_t* __restrict in, uint16_t* __restrict out);
+void __fastunpack2(const uint16_t* __restrict in, uint16_t* __restrict out);
+void __fastunpack3(const uint16_t* __restrict in, uint16_t* __restrict out);
+void __fastunpack4(const uint16_t* __restrict in, uint16_t* __restrict out);
+void __fastunpack5(const uint16_t* __restrict in, uint16_t* __restrict out);
+void __fastunpack6(const uint16_t* __restrict in, uint16_t* __restrict out);
+void __fastunpack7(const uint16_t* __restrict in, uint16_t* __restrict out);
+void __fastunpack8(const uint16_t* __restrict in, uint16_t* __restrict out);
+void __fastunpack9(const uint16_t* __restrict in, uint16_t* __restrict out);
+void __fastunpack10(const uint16_t* __restrict in, uint16_t* __restrict out);
+void __fastunpack11(const uint16_t* __restrict in, uint16_t* __restrict out);
+void __fastunpack12(const uint16_t* __restrict in, uint16_t* __restrict out);
+void __fastunpack13(const uint16_t* __restrict in, uint16_t* __restrict out);
+void __fastunpack14(const uint16_t* __restrict in, uint16_t* __restrict out);
+void __fastunpack15(const uint16_t* __restrict in, uint16_t* __restrict out);
+void __fastunpack16(const uint16_t* __restrict in, uint16_t* __restrict out);
+
+// Unpacks 32 uint32_t values
+void __fastunpack0(const uint32_t* __restrict in, uint32_t* __restrict out);
+void __fastunpack1(const uint32_t* __restrict in, uint32_t* __restrict out);
+void __fastunpack2(const uint32_t* __restrict in, uint32_t* __restrict out);
+void __fastunpack3(const uint32_t* __restrict in, uint32_t* __restrict out);
+void __fastunpack4(const uint32_t* __restrict in, uint32_t* __restrict out);
+void __fastunpack5(const uint32_t* __restrict in, uint32_t* __restrict out);
+void __fastunpack6(const uint32_t* __restrict in, uint32_t* __restrict out);
+void __fastunpack7(const uint32_t* __restrict in, uint32_t* __restrict out);
+void __fastunpack8(const uint32_t* __restrict in, uint32_t* __restrict out);
+void __fastunpack9(const uint32_t* __restrict in, uint32_t* __restrict out);
+void __fastunpack10(const uint32_t* __restrict in, uint32_t* __restrict out);
+void __fastunpack11(const uint32_t* __restrict in, uint32_t* __restrict out);
+void __fastunpack12(const uint32_t* __restrict in, uint32_t* __restrict out);
+void __fastunpack13(const uint32_t* __restrict in, uint32_t* __restrict out);
+void __fastunpack14(const uint32_t* __restrict in, uint32_t* __restrict out);
+void __fastunpack15(const uint32_t* __restrict in, uint32_t* __restrict out);
+void __fastunpack16(const uint32_t*
__restrict in, uint32_t* __restrict out); +void __fastunpack17(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastunpack18(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastunpack19(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastunpack20(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastunpack21(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastunpack22(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastunpack23(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastunpack24(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastunpack25(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastunpack26(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastunpack27(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastunpack28(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastunpack29(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastunpack30(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastunpack31(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastunpack32(const uint32_t* __restrict in, uint32_t* __restrict out); + +// Unpacks 32 uint64_t values +void __fastunpack0(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack1(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack2(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack3(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack4(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack5(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack6(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack7(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack8(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack9(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack10(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack11(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack12(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack13(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack14(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack15(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack16(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack17(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack18(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack19(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack20(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack21(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack22(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack23(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack24(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack25(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack26(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack27(const uint32_t* __restrict in, uint64_t* __restrict out); +void 
__fastunpack28(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack29(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack30(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack31(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack32(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack33(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack34(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack35(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack36(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack37(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack38(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack39(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack40(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack41(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack42(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack43(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack44(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack45(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack46(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack47(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack48(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack49(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack50(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack51(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack52(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack53(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack54(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack55(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack56(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack57(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack58(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack59(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack60(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack61(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack62(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack63(const uint32_t* __restrict in, uint64_t* __restrict out); +void __fastunpack64(const uint32_t* __restrict in, uint64_t* __restrict out); + +// Packs 8 int8_t values +void __fastpack0(const uint8_t* __restrict in, uint8_t* __restrict out); +void __fastpack1(const uint8_t* __restrict in, uint8_t* __restrict out); +void __fastpack2(const uint8_t* __restrict in, uint8_t* __restrict out); +void __fastpack3(const uint8_t* __restrict in, uint8_t* __restrict out); +void __fastpack4(const uint8_t* __restrict in, uint8_t* __restrict out); +void __fastpack5(const uint8_t* __restrict in, uint8_t* __restrict out); +void __fastpack6(const uint8_t* __restrict in, uint8_t* __restrict out); +void __fastpack7(const uint8_t* __restrict in, uint8_t* __restrict out); +void 
__fastpack8(const uint8_t* __restrict in, uint8_t* __restrict out); + +// Packs 16 int16_t values +void __fastpack0(const uint16_t* __restrict in, uint16_t* __restrict out); +void __fastpack1(const uint16_t* __restrict in, uint16_t* __restrict out); +void __fastpack2(const uint16_t* __restrict in, uint16_t* __restrict out); +void __fastpack3(const uint16_t* __restrict in, uint16_t* __restrict out); +void __fastpack4(const uint16_t* __restrict in, uint16_t* __restrict out); +void __fastpack5(const uint16_t* __restrict in, uint16_t* __restrict out); +void __fastpack6(const uint16_t* __restrict in, uint16_t* __restrict out); +void __fastpack7(const uint16_t* __restrict in, uint16_t* __restrict out); +void __fastpack8(const uint16_t* __restrict in, uint16_t* __restrict out); +void __fastpack9(const uint16_t* __restrict in, uint16_t* __restrict out); +void __fastpack10(const uint16_t* __restrict in, uint16_t* __restrict out); +void __fastpack11(const uint16_t* __restrict in, uint16_t* __restrict out); +void __fastpack12(const uint16_t* __restrict in, uint16_t* __restrict out); +void __fastpack13(const uint16_t* __restrict in, uint16_t* __restrict out); +void __fastpack14(const uint16_t* __restrict in, uint16_t* __restrict out); +void __fastpack15(const uint16_t* __restrict in, uint16_t* __restrict out); +void __fastpack16(const uint16_t* __restrict in, uint16_t* __restrict out); + +// Packs 32 int32_t values +void __fastpack0(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack1(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack2(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack3(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack4(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack5(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack6(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack7(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack8(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack9(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack10(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack11(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack12(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack13(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack14(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack15(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack16(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack17(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack18(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack19(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack20(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack21(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack22(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack23(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack24(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack25(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack26(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack27(const uint32_t* __restrict in, 
uint32_t* __restrict out); +void __fastpack28(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack29(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack30(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack31(const uint32_t* __restrict in, uint32_t* __restrict out); +void __fastpack32(const uint32_t* __restrict in, uint32_t* __restrict out); + +// Packs 32 int64_t values +void __fastpack0(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack1(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack2(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack3(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack4(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack5(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack6(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack7(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack8(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack9(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack10(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack11(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack12(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack13(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack14(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack15(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack16(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack17(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack18(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack19(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack20(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack21(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack22(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack23(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack24(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack25(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack26(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack27(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack28(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack29(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack30(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack31(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack32(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack33(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack34(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack35(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack36(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack37(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack38(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack39(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack40(const uint64_t* 
__restrict in, uint32_t* __restrict out); +void __fastpack41(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack42(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack43(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack44(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack45(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack46(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack47(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack48(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack49(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack50(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack51(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack52(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack53(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack54(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack55(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack56(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack57(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack58(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack59(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack60(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack61(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack62(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack63(const uint64_t* __restrict in, uint32_t* __restrict out); +void __fastpack64(const uint64_t* __restrict in, uint32_t* __restrict out); +} // namespace internal +} // namespace velox::fastpforlib diff --git a/velox/dwio/common/tests/Lemire/FastPFor/bitpackinghelpers.h b/velox/dwio/common/tests/Lemire/FastPFor/bitpackinghelpers.h new file mode 100644 index 0000000000000..f49f7bc563db7 --- /dev/null +++ b/velox/dwio/common/tests/Lemire/FastPFor/bitpackinghelpers.h @@ -0,0 +1,900 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +#pragma once +#include "bitpacking.h" + +#include <stdexcept> + +namespace velox::fastpforlib { + +namespace internal { + +// Note that this only packs 8 values +inline void fastunpack_quarter( + const uint8_t* __restrict in, + uint8_t* __restrict out, + const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler.
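+ // A hypothetical function-pointer table (sketch only, not part of the + // original file) would look roughly like: + //   using UnpackFn = void (*)(const uint8_t* __restrict, uint8_t* __restrict); + //   static constexpr UnpackFn kUnpackers[] = { + //       __fastunpack0, __fastunpack1, __fastunpack2, __fastunpack3, + //       __fastunpack4, __fastunpack5, __fastunpack6, __fastunpack7, + //       __fastunpack8}; + //   kUnpackers[bit](in, out); + // The switch keeps each call direct and inlinable; the table would force an + // indirect call.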
+ switch (bit) { + case 0: + internal::__fastunpack0(in, out); + break; + case 1: + internal::__fastunpack1(in, out); + break; + case 2: + internal::__fastunpack2(in, out); + break; + case 3: + internal::__fastunpack3(in, out); + break; + case 4: + internal::__fastunpack4(in, out); + break; + case 5: + internal::__fastunpack5(in, out); + break; + case 6: + internal::__fastunpack6(in, out); + break; + case 7: + internal::__fastunpack7(in, out); + break; + case 8: + internal::__fastunpack8(in, out); + break; + default: + throw std::logic_error("Invalid bit width for bitpacking"); + } +} + +// Note that this only packs 8 values +inline void fastpack_quarter( + const uint8_t* __restrict in, + uint8_t* __restrict out, + const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. + switch (bit) { + case 0: + internal::__fastpack0(in, out); + break; + case 1: + internal::__fastpack1(in, out); + break; + case 2: + internal::__fastpack2(in, out); + break; + case 3: + internal::__fastpack3(in, out); + break; + case 4: + internal::__fastpack4(in, out); + break; + case 5: + internal::__fastpack5(in, out); + break; + case 6: + internal::__fastpack6(in, out); + break; + case 7: + internal::__fastpack7(in, out); + break; + case 8: + internal::__fastpack8(in, out); + break; + default: + throw std::logic_error("Invalid bit width for bitpacking"); + } +} + +// Note that this only packs 16 values +inline void fastunpack_half( + const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. + switch (bit) { + case 0: + internal::__fastunpack0(in, out); + break; + case 1: + internal::__fastunpack1(in, out); + break; + case 2: + internal::__fastunpack2(in, out); + break; + case 3: + internal::__fastunpack3(in, out); + break; + case 4: + internal::__fastunpack4(in, out); + break; + case 5: + internal::__fastunpack5(in, out); + break; + case 6: + internal::__fastunpack6(in, out); + break; + case 7: + internal::__fastunpack7(in, out); + break; + case 8: + internal::__fastunpack8(in, out); + break; + case 9: + internal::__fastunpack9(in, out); + break; + case 10: + internal::__fastunpack10(in, out); + break; + case 11: + internal::__fastunpack11(in, out); + break; + case 12: + internal::__fastunpack12(in, out); + break; + case 13: + internal::__fastunpack13(in, out); + break; + case 14: + internal::__fastunpack14(in, out); + break; + case 15: + internal::__fastunpack15(in, out); + break; + case 16: + internal::__fastunpack16(in, out); + break; + default: + throw std::logic_error("Invalid bit width for bitpacking"); + } +} + +// Note that this only packs 16 values +inline void fastpack_half( + const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. 
+ switch (bit) { + case 0: + internal::__fastpack0(in, out); + break; + case 1: + internal::__fastpack1(in, out); + break; + case 2: + internal::__fastpack2(in, out); + break; + case 3: + internal::__fastpack3(in, out); + break; + case 4: + internal::__fastpack4(in, out); + break; + case 5: + internal::__fastpack5(in, out); + break; + case 6: + internal::__fastpack6(in, out); + break; + case 7: + internal::__fastpack7(in, out); + break; + case 8: + internal::__fastpack8(in, out); + break; + case 9: + internal::__fastpack9(in, out); + break; + case 10: + internal::__fastpack10(in, out); + break; + case 11: + internal::__fastpack11(in, out); + break; + case 12: + internal::__fastpack12(in, out); + break; + case 13: + internal::__fastpack13(in, out); + break; + case 14: + internal::__fastpack14(in, out); + break; + case 15: + internal::__fastpack15(in, out); + break; + case 16: + internal::__fastpack16(in, out); + break; + default: + throw std::logic_error("Invalid bit width for bitpacking"); + } +} +} // namespace internal + +inline void fastunpack( + const uint8_t* __restrict in, + uint8_t* __restrict out, + const uint32_t bit) { + for (uint8_t i = 0; i < 4; i++) { + internal::fastunpack_quarter(in + (i * bit), out + (i * 8), bit); + } +} + +inline void fastunpack( + const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint32_t bit) { + internal::fastunpack_half(in, out, bit); + internal::fastunpack_half(in + bit, out + 16, bit); +} + +inline void fastunpack( + const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. 
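+ // Size note: 32 values at width `bit` occupy exactly `bit` 32-bit input + // words (32 * bit bits in total), so block-wise callers advance `in` by + // `bit` and `out` by 32 per call, mirroring the quarter/half wrappers above.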
+ switch (bit) { + case 0: + internal::__fastunpack0(in, out); + break; + case 1: + internal::__fastunpack1(in, out); + break; + case 2: + internal::__fastunpack2(in, out); + break; + case 3: + internal::__fastunpack3(in, out); + break; + case 4: + internal::__fastunpack4(in, out); + break; + case 5: + internal::__fastunpack5(in, out); + break; + case 6: + internal::__fastunpack6(in, out); + break; + case 7: + internal::__fastunpack7(in, out); + break; + case 8: + internal::__fastunpack8(in, out); + break; + case 9: + internal::__fastunpack9(in, out); + break; + case 10: + internal::__fastunpack10(in, out); + break; + case 11: + internal::__fastunpack11(in, out); + break; + case 12: + internal::__fastunpack12(in, out); + break; + case 13: + internal::__fastunpack13(in, out); + break; + case 14: + internal::__fastunpack14(in, out); + break; + case 15: + internal::__fastunpack15(in, out); + break; + case 16: + internal::__fastunpack16(in, out); + break; + case 17: + internal::__fastunpack17(in, out); + break; + case 18: + internal::__fastunpack18(in, out); + break; + case 19: + internal::__fastunpack19(in, out); + break; + case 20: + internal::__fastunpack20(in, out); + break; + case 21: + internal::__fastunpack21(in, out); + break; + case 22: + internal::__fastunpack22(in, out); + break; + case 23: + internal::__fastunpack23(in, out); + break; + case 24: + internal::__fastunpack24(in, out); + break; + case 25: + internal::__fastunpack25(in, out); + break; + case 26: + internal::__fastunpack26(in, out); + break; + case 27: + internal::__fastunpack27(in, out); + break; + case 28: + internal::__fastunpack28(in, out); + break; + case 29: + internal::__fastunpack29(in, out); + break; + case 30: + internal::__fastunpack30(in, out); + break; + case 31: + internal::__fastunpack31(in, out); + break; + case 32: + internal::__fastunpack32(in, out); + break; + default: + throw std::logic_error("Invalid bit width for bitpacking"); + } +} + +inline void fastunpack( + const uint32_t* __restrict in, + uint64_t* __restrict out, + const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. 
+ switch (bit) { + case 0: + internal::__fastunpack0(in, out); + break; + case 1: + internal::__fastunpack1(in, out); + break; + case 2: + internal::__fastunpack2(in, out); + break; + case 3: + internal::__fastunpack3(in, out); + break; + case 4: + internal::__fastunpack4(in, out); + break; + case 5: + internal::__fastunpack5(in, out); + break; + case 6: + internal::__fastunpack6(in, out); + break; + case 7: + internal::__fastunpack7(in, out); + break; + case 8: + internal::__fastunpack8(in, out); + break; + case 9: + internal::__fastunpack9(in, out); + break; + case 10: + internal::__fastunpack10(in, out); + break; + case 11: + internal::__fastunpack11(in, out); + break; + case 12: + internal::__fastunpack12(in, out); + break; + case 13: + internal::__fastunpack13(in, out); + break; + case 14: + internal::__fastunpack14(in, out); + break; + case 15: + internal::__fastunpack15(in, out); + break; + case 16: + internal::__fastunpack16(in, out); + break; + case 17: + internal::__fastunpack17(in, out); + break; + case 18: + internal::__fastunpack18(in, out); + break; + case 19: + internal::__fastunpack19(in, out); + break; + case 20: + internal::__fastunpack20(in, out); + break; + case 21: + internal::__fastunpack21(in, out); + break; + case 22: + internal::__fastunpack22(in, out); + break; + case 23: + internal::__fastunpack23(in, out); + break; + case 24: + internal::__fastunpack24(in, out); + break; + case 25: + internal::__fastunpack25(in, out); + break; + case 26: + internal::__fastunpack26(in, out); + break; + case 27: + internal::__fastunpack27(in, out); + break; + case 28: + internal::__fastunpack28(in, out); + break; + case 29: + internal::__fastunpack29(in, out); + break; + case 30: + internal::__fastunpack30(in, out); + break; + case 31: + internal::__fastunpack31(in, out); + break; + case 32: + internal::__fastunpack32(in, out); + break; + case 33: + internal::__fastunpack33(in, out); + break; + case 34: + internal::__fastunpack34(in, out); + break; + case 35: + internal::__fastunpack35(in, out); + break; + case 36: + internal::__fastunpack36(in, out); + break; + case 37: + internal::__fastunpack37(in, out); + break; + case 38: + internal::__fastunpack38(in, out); + break; + case 39: + internal::__fastunpack39(in, out); + break; + case 40: + internal::__fastunpack40(in, out); + break; + case 41: + internal::__fastunpack41(in, out); + break; + case 42: + internal::__fastunpack42(in, out); + break; + case 43: + internal::__fastunpack43(in, out); + break; + case 44: + internal::__fastunpack44(in, out); + break; + case 45: + internal::__fastunpack45(in, out); + break; + case 46: + internal::__fastunpack46(in, out); + break; + case 47: + internal::__fastunpack47(in, out); + break; + case 48: + internal::__fastunpack48(in, out); + break; + case 49: + internal::__fastunpack49(in, out); + break; + case 50: + internal::__fastunpack50(in, out); + break; + case 51: + internal::__fastunpack51(in, out); + break; + case 52: + internal::__fastunpack52(in, out); + break; + case 53: + internal::__fastunpack53(in, out); + break; + case 54: + internal::__fastunpack54(in, out); + break; + case 55: + internal::__fastunpack55(in, out); + break; + case 56: + internal::__fastunpack56(in, out); + break; + case 57: + internal::__fastunpack57(in, out); + break; + case 58: + internal::__fastunpack58(in, out); + break; + case 59: + internal::__fastunpack59(in, out); + break; + case 60: + internal::__fastunpack60(in, out); + break; + case 61: + internal::__fastunpack61(in, out); + break; + case 62: + 
internal::__fastunpack62(in, out); + break; + case 63: + internal::__fastunpack63(in, out); + break; + case 64: + internal::__fastunpack64(in, out); + break; + default: + throw std::logic_error("Invalid bit width for bitpacking"); + } +} + +inline void fastpack( + const uint8_t* __restrict in, + uint8_t* __restrict out, + const uint32_t bit) { + for (uint8_t i = 0; i < 4; i++) { + internal::fastpack_quarter(in + (i * 8), out + (i * bit), bit); + } +} + +inline void fastpack( + const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint32_t bit) { + internal::fastpack_half(in, out, bit); + internal::fastpack_half(in + 16, out + bit, bit); +} + +inline void fastpack( + const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. + switch (bit) { + case 0: + internal::__fastpack0(in, out); + break; + case 1: + internal::__fastpack1(in, out); + break; + case 2: + internal::__fastpack2(in, out); + break; + case 3: + internal::__fastpack3(in, out); + break; + case 4: + internal::__fastpack4(in, out); + break; + case 5: + internal::__fastpack5(in, out); + break; + case 6: + internal::__fastpack6(in, out); + break; + case 7: + internal::__fastpack7(in, out); + break; + case 8: + internal::__fastpack8(in, out); + break; + case 9: + internal::__fastpack9(in, out); + break; + case 10: + internal::__fastpack10(in, out); + break; + case 11: + internal::__fastpack11(in, out); + break; + case 12: + internal::__fastpack12(in, out); + break; + case 13: + internal::__fastpack13(in, out); + break; + case 14: + internal::__fastpack14(in, out); + break; + case 15: + internal::__fastpack15(in, out); + break; + case 16: + internal::__fastpack16(in, out); + break; + case 17: + internal::__fastpack17(in, out); + break; + case 18: + internal::__fastpack18(in, out); + break; + case 19: + internal::__fastpack19(in, out); + break; + case 20: + internal::__fastpack20(in, out); + break; + case 21: + internal::__fastpack21(in, out); + break; + case 22: + internal::__fastpack22(in, out); + break; + case 23: + internal::__fastpack23(in, out); + break; + case 24: + internal::__fastpack24(in, out); + break; + case 25: + internal::__fastpack25(in, out); + break; + case 26: + internal::__fastpack26(in, out); + break; + case 27: + internal::__fastpack27(in, out); + break; + case 28: + internal::__fastpack28(in, out); + break; + case 29: + internal::__fastpack29(in, out); + break; + case 30: + internal::__fastpack30(in, out); + break; + case 31: + internal::__fastpack31(in, out); + break; + case 32: + internal::__fastpack32(in, out); + break; + default: + throw std::logic_error("Invalid bit width for bitpacking"); + } +} + +inline void fastpack( + const uint64_t* __restrict in, + uint32_t* __restrict out, + const uint32_t bit) { + switch (bit) { + case 0: + internal::__fastpack0(in, out); + break; + case 1: + internal::__fastpack1(in, out); + break; + case 2: + internal::__fastpack2(in, out); + break; + case 3: + internal::__fastpack3(in, out); + break; + case 4: + internal::__fastpack4(in, out); + break; + case 5: + internal::__fastpack5(in, out); + break; + case 6: + internal::__fastpack6(in, out); + break; + case 7: + internal::__fastpack7(in, out); + break; + case 8: + internal::__fastpack8(in, out); + break; + case 9: + internal::__fastpack9(in, out); + break; + case 10: + 
internal::__fastpack10(in, out); + break; + case 11: + internal::__fastpack11(in, out); + break; + case 12: + internal::__fastpack12(in, out); + break; + case 13: + internal::__fastpack13(in, out); + break; + case 14: + internal::__fastpack14(in, out); + break; + case 15: + internal::__fastpack15(in, out); + break; + case 16: + internal::__fastpack16(in, out); + break; + case 17: + internal::__fastpack17(in, out); + break; + case 18: + internal::__fastpack18(in, out); + break; + case 19: + internal::__fastpack19(in, out); + break; + case 20: + internal::__fastpack20(in, out); + break; + case 21: + internal::__fastpack21(in, out); + break; + case 22: + internal::__fastpack22(in, out); + break; + case 23: + internal::__fastpack23(in, out); + break; + case 24: + internal::__fastpack24(in, out); + break; + case 25: + internal::__fastpack25(in, out); + break; + case 26: + internal::__fastpack26(in, out); + break; + case 27: + internal::__fastpack27(in, out); + break; + case 28: + internal::__fastpack28(in, out); + break; + case 29: + internal::__fastpack29(in, out); + break; + case 30: + internal::__fastpack30(in, out); + break; + case 31: + internal::__fastpack31(in, out); + break; + case 32: + internal::__fastpack32(in, out); + break; + case 33: + internal::__fastpack33(in, out); + break; + case 34: + internal::__fastpack34(in, out); + break; + case 35: + internal::__fastpack35(in, out); + break; + case 36: + internal::__fastpack36(in, out); + break; + case 37: + internal::__fastpack37(in, out); + break; + case 38: + internal::__fastpack38(in, out); + break; + case 39: + internal::__fastpack39(in, out); + break; + case 40: + internal::__fastpack40(in, out); + break; + case 41: + internal::__fastpack41(in, out); + break; + case 42: + internal::__fastpack42(in, out); + break; + case 43: + internal::__fastpack43(in, out); + break; + case 44: + internal::__fastpack44(in, out); + break; + case 45: + internal::__fastpack45(in, out); + break; + case 46: + internal::__fastpack46(in, out); + break; + case 47: + internal::__fastpack47(in, out); + break; + case 48: + internal::__fastpack48(in, out); + break; + case 49: + internal::__fastpack49(in, out); + break; + case 50: + internal::__fastpack50(in, out); + break; + case 51: + internal::__fastpack51(in, out); + break; + case 52: + internal::__fastpack52(in, out); + break; + case 53: + internal::__fastpack53(in, out); + break; + case 54: + internal::__fastpack54(in, out); + break; + case 55: + internal::__fastpack55(in, out); + break; + case 56: + internal::__fastpack56(in, out); + break; + case 57: + internal::__fastpack57(in, out); + break; + case 58: + internal::__fastpack58(in, out); + break; + case 59: + internal::__fastpack59(in, out); + break; + case 60: + internal::__fastpack60(in, out); + break; + case 61: + internal::__fastpack61(in, out); + break; + case 62: + internal::__fastpack62(in, out); + break; + case 63: + internal::__fastpack63(in, out); + break; + case 64: + internal::__fastpack64(in, out); + break; + default: + throw std::logic_error("Invalid bit width for bitpacking"); + } +} +} // namespace velox::fastpforlib diff --git a/velox/dwio/common/tests/LocalFileSinkTest.cpp b/velox/dwio/common/tests/LocalFileSinkTest.cpp index 641dfe62b3bae..17ac1869474fb 100644 --- a/velox/dwio/common/tests/LocalFileSinkTest.cpp +++ b/velox/dwio/common/tests/LocalFileSinkTest.cpp @@ -26,26 +26,35 @@ using namespace facebook::velox::exec::test; namespace facebook::velox::dwio::common { -void runTest() { - auto root = TempDirectoryPath::create(); - auto 
filePath = fs::path(root->path) / "xxx/yyy/zzz/test_file.ext"; +class LocalFileSinkTest : public testing::Test { + protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } - ASSERT_FALSE(fs::exists(filePath.string())); + void runTest() { + auto root = TempDirectoryPath::create(); + auto filePath = fs::path(root->getPath()) / "xxx/yyy/zzz/test_file.ext"; - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); - auto localFileSink = FileSink::create( - fmt::format("file:{}", filePath.string()), {.pool = pool.get()}); - ASSERT_TRUE(localFileSink->isBuffered()); - localFileSink->close(); + ASSERT_FALSE(fs::exists(filePath.string())); - EXPECT_TRUE(fs::exists(filePath.string())); -} + auto localFileSink = FileSink::create( + fmt::format("file:{}", filePath.string()), {.pool = pool_.get()}); + ASSERT_TRUE(localFileSink->isBuffered()); + localFileSink->close(); + + EXPECT_TRUE(fs::exists(filePath.string())); + } + + std::shared_ptr<memory::MemoryPool> pool_{ + memory::memoryManager()->addLeafPool()}; +}; -TEST(LocalFileSinkTest, missingRegistration) { +TEST_F(LocalFileSinkTest, missingRegistration) { VELOX_ASSERT_THROW(runTest(), "FileSink is not registered for file:"); } -TEST(LocalFileSinkTest, create) { +TEST_F(LocalFileSinkTest, create) { LocalFileSink::registerFactory(); runTest(); } diff --git a/velox/dwio/common/tests/MeasureTimeTests.cpp b/velox/dwio/common/tests/MeasureTimeTests.cpp new file mode 100644 index 0000000000000..a3749df4bd6e8 --- /dev/null +++ b/velox/dwio/common/tests/MeasureTimeTests.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <gtest/gtest.h> + +#include "velox/dwio/common/MeasureTime.h" + +using namespace ::testing; +using namespace ::facebook::velox::dwio::common; + +TEST(MeasureTimeTests, DoesntCreateMeasureIfNoCallback) { + EXPECT_FALSE(measureTimeIfCallback(nullptr).has_value()); +} + +TEST(MeasureTimeTests, CreatesMeasureIfCallback) { + auto callback = + std::function<void(std::chrono::high_resolution_clock::duration)>( + [](const auto&) {}); + EXPECT_TRUE(measureTimeIfCallback(callback).has_value()); +} + +TEST(MeasureTimeTests, MeasuresTime) { + bool measured{false}; + { + auto callback = + std::function<void(std::chrono::high_resolution_clock::duration)>( + [&measured](const auto&) { measured = true; }); + auto measure = measureTimeIfCallback(callback); + EXPECT_TRUE(measure.has_value()); + EXPECT_FALSE(measured); + } + EXPECT_TRUE(measured); +}
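These MeasureTime tests pin down the contract: no callback, no measurement object; with a callback, the elapsed time is delivered when the returned measure leaves scope. A minimal usage sketch — the callback signature is the one restored above (an assumption inferred from the tests), and `timedWork` is illustrative, not part of the diff:

```
#include <chrono>
#include <functional>

#include "velox/dwio/common/MeasureTime.h"

using facebook::velox::dwio::common::measureTimeIfCallback;

void timedWork(
    const std::function<void(std::chrono::high_resolution_clock::duration)>&
        onTime) {
  // Returns std::nullopt when onTime is empty, so the untimed path is free.
  auto measure = measureTimeIfCallback(onTime);
  // ... work to be timed ...
} // `measure` is destroyed here; onTime receives the elapsed duration.
```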
diff --git a/velox/dwio/common/tests/MemorySinkTest.cpp b/velox/dwio/common/tests/MemorySinkTest.cpp new file mode 100644 index 0000000000000..4f348811d938e --- /dev/null +++ b/velox/dwio/common/tests/MemorySinkTest.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/dwio/common/FileSink.h" + +#include <gtest/gtest.h> + +namespace facebook::velox::dwio::common { + +class MemorySinkTest : public testing::Test { + protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + + std::shared_ptr<memory::MemoryPool> pool_{ + memory::memoryManager()->addLeafPool()}; +}; + +TEST_F(MemorySinkTest, create) { + std::string chars("abcdefghijklmnopqrst"); + std::vector<DataBuffer<char>> buffers; + + // Add 'abcdefghij' to first buffer + buffers.emplace_back(*pool_); + buffers.back().append(0, chars.data(), 10); + + // Add 'klmnopqrst' to second buffer + buffers.emplace_back(*pool_); + buffers.back().append(0, chars.data() + 10, 10); + + ASSERT_EQ(buffers.size(), 2); + + auto memorySink = std::make_unique<MemorySink>( + 1024, dwio::common::FileSink::Options{.pool = pool_.get()}); + + ASSERT_TRUE(memorySink->isBuffered()); + // Write data to MemorySink. + memorySink->write(buffers); + ASSERT_EQ(memorySink->size(), chars.length()); + ASSERT_EQ(memorySink->data(), chars); + memorySink->close(); +} +} // namespace facebook::velox::dwio::common
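Both sink tests lean on the same factory contract: `FileSink::create` dispatches on the URI scheme of the path, and a scheme is only usable after its factory has been registered. A sketch of that flow — the file path and function name here are illustrative:

```
#include "velox/dwio/common/FileSink.h"

using namespace facebook::velox::dwio::common;

void writeToLocalFile(facebook::velox::memory::MemoryPool* pool) {
  // Without this registration, the create() call below throws
  // "FileSink is not registered for file:".
  LocalFileSink::registerFactory();
  auto sink = FileSink::create("file:/tmp/example.bin", {.pool = pool});
  sink->close();
}
```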
diff --git a/velox/dwio/common/tests/OnDemandUnitLoaderTests.cpp b/velox/dwio/common/tests/OnDemandUnitLoaderTests.cpp new file mode 100644 index 0000000000000..50ac96c623d05 --- /dev/null +++ b/velox/dwio/common/tests/OnDemandUnitLoaderTests.cpp @@ -0,0 +1,236 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <gmock/gmock.h> +#include <gtest/gtest.h> + +#include "velox/dwio/common/OnDemandUnitLoader.h" +#include "velox/dwio/common/UnitLoaderTools.h" +#include "velox/dwio/common/tests/utils/UnitLoaderTestTools.h" + +using namespace ::testing; +using facebook::velox::dwio::common::LoadUnit; +using facebook::velox::dwio::common::OnDemandUnitLoaderFactory; +using facebook::velox::dwio::common::UnitLoader; +using facebook::velox::dwio::common::UnitLoaderFactory; +using facebook::velox::dwio::common::test::getUnitsLoadedWithFalse; +using facebook::velox::dwio::common::test::LoadUnitMock; +using facebook::velox::dwio::common::test::ReaderMock; + +TEST(OnDemandUnitLoaderTests, LoadsCorrectlyWithReader) { + size_t blockedOnIoCount = 0; + OnDemandUnitLoaderFactory factory([&](auto) { ++blockedOnIoCount; }); + ReaderMock readerMock{{10, 20, 30}, {0, 0, 0}, factory, 0}; + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, false, false})); + EXPECT_EQ(blockedOnIoCount, 0); + + EXPECT_TRUE(readerMock.read(3)); // Unit: 0, rows: 0-2, load(0) + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({true, false, false})); + EXPECT_EQ(blockedOnIoCount, 1); + + EXPECT_TRUE(readerMock.read(3)); // Unit: 0, rows: 3-5 + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({true, false, false})); + EXPECT_EQ(blockedOnIoCount, 1); + + EXPECT_TRUE(readerMock.read(4)); // Unit: 0, rows: 6-9 + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({true, false, false})); + EXPECT_EQ(blockedOnIoCount, 1); + + EXPECT_TRUE(readerMock.read(14)); // Unit: 1, rows: 0-13, unload(0), load(1) + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, true, false})); + EXPECT_EQ(blockedOnIoCount, 2); + + // will only read 5 rows, no more rows in unit 1 + EXPECT_TRUE(readerMock.read(10)); // Unit: 1, rows: 14-19 + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, true, false})); + EXPECT_EQ(blockedOnIoCount, 2); + + EXPECT_TRUE(readerMock.read(30)); // Unit: 2, rows: 0-29, unload(1), load(2) + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, false, true})); + EXPECT_EQ(blockedOnIoCount, 3); + + EXPECT_FALSE(readerMock.read(30)); // No more data + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, false, true})); + EXPECT_EQ(blockedOnIoCount, 3); +} + +TEST(OnDemandUnitLoaderTests, LoadsCorrectlyWithNoCallback) { + OnDemandUnitLoaderFactory factory(nullptr); + ReaderMock readerMock{{10, 20, 30}, {0, 0, 0}, factory, 0}; + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, false, false})); + + EXPECT_TRUE(readerMock.read(3)); // Unit: 0, rows: 0-2, load(0) + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({true, false, false})); + + EXPECT_TRUE(readerMock.read(3)); // Unit: 0, rows: 3-5 + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({true, false, false})); + + EXPECT_TRUE(readerMock.read(4)); // Unit: 0, rows: 6-9 + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({true, false, false})); + + EXPECT_TRUE(readerMock.read(14)); // Unit: 1, rows: 0-13, unload(0), load(1) + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, true, false})); + + // will only read 5 rows, no more rows in unit 1 + EXPECT_TRUE(readerMock.read(10)); // Unit: 1, rows: 14-19 + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, true, false})); + + EXPECT_TRUE(readerMock.read(30)); // Unit: 2, rows: 0-29, unload(1), load(2) + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, false, true})); + + EXPECT_FALSE(readerMock.read(30)); // No more data + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, false, true})); +} + +TEST(OnDemandUnitLoaderTests,
CanSeek) { + size_t blockedOnIoCount = 0; + OnDemandUnitLoaderFactory factory([&](auto) { ++blockedOnIoCount; }); + ReaderMock readerMock{{10, 20, 30}, {0, 0, 0}, factory, 0}; + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, false, false})); + EXPECT_EQ(blockedOnIoCount, 0); + + EXPECT_NO_THROW(readerMock.seek(10);); + + EXPECT_TRUE(readerMock.read(3)); // Unit: 1, rows: 0-2, load(1) + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, true, false})); + EXPECT_EQ(blockedOnIoCount, 1); + + EXPECT_NO_THROW(readerMock.seek(0);); + + EXPECT_TRUE(readerMock.read(3)); // Unit: 0, rows: 0-2, load(0), unload(1) + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({true, false, false})); + EXPECT_EQ(blockedOnIoCount, 2); + + EXPECT_NO_THROW(readerMock.seek(30);); + + EXPECT_TRUE(readerMock.read(3)); // Unit: 2, rows: 0-2, load(2), unload(0) + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, false, true})); + EXPECT_EQ(blockedOnIoCount, 3); + + EXPECT_NO_THROW(readerMock.seek(5);); + + EXPECT_TRUE(readerMock.read(5)); // Unit: 0, rows: 5-9, load(0), unload(2) + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({true, false, false})); + EXPECT_EQ(blockedOnIoCount, 4); +} + +TEST(OnDemandUnitLoaderTests, SeekOutOfRangeReaderError) { + size_t blockedOnIoCount = 0; + OnDemandUnitLoaderFactory factory([&](auto) { ++blockedOnIoCount; }); + ReaderMock readerMock{{10, 20, 30}, {0, 0, 0}, factory, 0}; + EXPECT_EQ(readerMock.unitsLoaded(), std::vector<bool>({false, false, false})); + EXPECT_EQ(blockedOnIoCount, 0); + readerMock.seek(59); + + readerMock.seek(60); + + EXPECT_THAT( + [&]() { readerMock.seek(61); }, + Throws(Property( + &facebook::velox::VeloxRuntimeError::message, + HasSubstr("Can't seek to possition 61 in file. Must be up to 60.")))); +} + +TEST(OnDemandUnitLoaderTests, SeekOutOfRange) { + OnDemandUnitLoaderFactory factory(nullptr); + std::vector<std::atomic_bool> unitsLoaded(getUnitsLoadedWithFalse(1)); + std::vector<std::unique_ptr<LoadUnit>> units; + units.push_back(std::make_unique<LoadUnitMock>(10, 0, unitsLoaded, 0)); + + auto unitLoader = factory.create(std::move(units), 0); + + unitLoader->onSeek(0, 10); + + EXPECT_THAT( + [&]() { unitLoader->onSeek(0, 11); }, + Throws(Property( + &facebook::velox::VeloxRuntimeError::message, + HasSubstr("Row out of range")))); +} + +TEST(OnDemandUnitLoaderTests, UnitOutOfRange) { + OnDemandUnitLoaderFactory factory(nullptr); + std::vector<std::atomic_bool> unitsLoaded(getUnitsLoadedWithFalse(1)); + std::vector<std::unique_ptr<LoadUnit>> units; + units.push_back(std::make_unique<LoadUnitMock>(10, 0, unitsLoaded, 0)); + + auto unitLoader = factory.create(std::move(units), 0); + unitLoader->getLoadedUnit(0); + EXPECT_THAT( + [&]() { unitLoader->getLoadedUnit(1); }, + Throws(Property( + &facebook::velox::VeloxRuntimeError::message, + HasSubstr("Unit out of range")))); +} + +TEST(OnDemandUnitLoaderTests, CanRequestUnitMultipleTimes) { + OnDemandUnitLoaderFactory factory(nullptr); + std::vector<std::atomic_bool> unitsLoaded(getUnitsLoadedWithFalse(1)); + std::vector<std::unique_ptr<LoadUnit>> units; + units.push_back(std::make_unique<LoadUnitMock>(10, 0, unitsLoaded, 0)); + + auto unitLoader = factory.create(std::move(units), 0); + unitLoader->getLoadedUnit(0); + unitLoader->getLoadedUnit(0); + unitLoader->getLoadedUnit(0); +} + +TEST(OnDemandUnitLoaderTests, InitialSkip) { + auto getFactoryWithSkip = [](uint64_t skipToRow) { + auto factory = std::make_unique<OnDemandUnitLoaderFactory>(nullptr); + std::vector<std::atomic_bool> unitsLoaded(getUnitsLoadedWithFalse(1)); + std::vector<std::unique_ptr<LoadUnit>> units; + units.push_back(std::make_unique<LoadUnitMock>(10, 0, unitsLoaded, 0)); + units.push_back(std::make_unique<LoadUnitMock>(20, 0, unitsLoaded, 1)); + units.push_back(std::make_unique<LoadUnitMock>(30, 0, unitsLoaded,
2)); + factory->create(std::move(units), skipToRow); + }; + + EXPECT_NO_THROW(getFactoryWithSkip(0)); + EXPECT_NO_THROW(getFactoryWithSkip(1)); + EXPECT_NO_THROW(getFactoryWithSkip(9)); + EXPECT_NO_THROW(getFactoryWithSkip(10)); + EXPECT_NO_THROW(getFactoryWithSkip(11)); + EXPECT_NO_THROW(getFactoryWithSkip(29)); + EXPECT_NO_THROW(getFactoryWithSkip(30)); + EXPECT_NO_THROW(getFactoryWithSkip(31)); + EXPECT_NO_THROW(getFactoryWithSkip(59)); + EXPECT_NO_THROW(getFactoryWithSkip(60)); + EXPECT_THAT( + [&]() { getFactoryWithSkip(61); }, + Throws(Property( + &facebook::velox::VeloxRuntimeError::message, + HasSubstr("Can only skip up to the past-the-end row of the file.")))); + EXPECT_THAT( + [&]() { getFactoryWithSkip(100); }, + Throws(Property( + &facebook::velox::VeloxRuntimeError::message, + HasSubstr("Can only skip up to the past-the-end row of the file.")))); +} + +TEST(OnDemandUnitLoaderTests, NoUnitButSkip) { + OnDemandUnitLoaderFactory factory(nullptr); + std::vector<std::unique_ptr<LoadUnit>> units; + + EXPECT_NO_THROW(factory.create(std::move(units), 0)); + + std::vector<std::unique_ptr<LoadUnit>> units2; + EXPECT_THAT( + [&]() { factory.create(std::move(units2), 1); }, + Throws(Property( + &facebook::velox::VeloxRuntimeError::message, + HasSubstr("Can only skip up to the past-the-end row of the file.")))); +} diff --git a/velox/dwio/common/tests/OptionsTests.cpp b/velox/dwio/common/tests/OptionsTests.cpp index f46834cb91457..4492a62420d0a 100644 --- a/velox/dwio/common/tests/OptionsTests.cpp +++ b/velox/dwio/common/tests/OptionsTests.cpp @@ -19,24 +19,37 @@ using namespace ::testing; using namespace facebook::velox::dwio::common; -TEST(OptionsTests, defaultAppendRowNumberColumnTest) { +TEST(OptionsTests, defaultRowNumberColumnInfoTest) { // appendRowNumberColumn flag should be false by default RowReaderOptions rowReaderOptions; - ASSERT_EQ(false, rowReaderOptions.getAppendRowNumberColumn()); + ASSERT_EQ(std::nullopt, rowReaderOptions.rowNumberColumnInfo()); } -TEST(OptionsTests, setAppendRowNumberColumnToTrueTest) { +TEST(OptionsTests, setRowNumberColumnInfoTest) { RowReaderOptions rowReaderOptions; - rowReaderOptions.setAppendRowNumberColumn(true); - ASSERT_EQ(true, rowReaderOptions.getAppendRowNumberColumn()); + RowNumberColumnInfo rowNumberColumnInfo; + rowNumberColumnInfo.insertPosition = 0; + rowNumberColumnInfo.name = "test"; + rowNumberColumnInfo.isExplicit = true; + rowReaderOptions.setRowNumberColumnInfo(rowNumberColumnInfo); + auto rowNumberColumn = rowReaderOptions.rowNumberColumnInfo().value(); + ASSERT_EQ(rowNumberColumnInfo.insertPosition, rowNumberColumn.insertPosition); + ASSERT_EQ(rowNumberColumnInfo.name, rowNumberColumn.name); + ASSERT_EQ(rowNumberColumnInfo.isExplicit, rowNumberColumn.isExplicit); } -TEST(OptionsTests, testAppendRowNumberColumnInCopy) { +TEST(OptionsTests, testRowNumberColumnInfoInCopy) { RowReaderOptions rowReaderOptions; RowReaderOptions rowReaderOptionsCopy{rowReaderOptions}; - ASSERT_EQ(false, rowReaderOptionsCopy.getAppendRowNumberColumn()); + ASSERT_EQ(std::nullopt, rowReaderOptionsCopy.rowNumberColumnInfo()); - rowReaderOptions.setAppendRowNumberColumn(true); + RowNumberColumnInfo rowNumberColumnInfo; + rowNumberColumnInfo.insertPosition = 0; + rowNumberColumnInfo.name = "test"; + rowReaderOptions.setRowNumberColumnInfo(rowNumberColumnInfo); RowReaderOptions rowReaderOptionsSecondCopy{rowReaderOptions}; + auto rowNumberColumn = + rowReaderOptionsSecondCopy.rowNumberColumnInfo().value(); + ASSERT_EQ(rowNumberColumnInfo.insertPosition, rowNumberColumn.insertPosition); + ASSERT_EQ(rowNumberColumnInfo.name, rowNumberColumn.name); }
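The options change replaces the old `appendRowNumberColumn` boolean with a structured `RowNumberColumnInfo`. A minimal sketch of the new call shape, using only the three fields the tests exercise (the header path and function name are assumptions):

```
#include "velox/dwio/common/Options.h"

using namespace facebook::velox::dwio::common;

RowReaderOptions makeOptionsWithRowNumber() {
  RowReaderOptions options;
  RowNumberColumnInfo info;
  info.insertPosition = 0;   // where the synthetic column lands in the output
  info.name = "row_number";  // any column name; the tests simply use "test"
  info.isExplicit = true;
  options.setRowNumberColumnInfo(info);
  // options.rowNumberColumnInfo() now returns an engaged std::optional.
  return options;
}
```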
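diff --git a/velox/dwio/common/tests/ParallelForTest.cpp b/velox/dwio/common/tests/ParallelForTest.cpp new file mode 100644 index 0000000000000..023dd6f5278f3 --- /dev/null +++ b/velox/dwio/common/tests/ParallelForTest.cpp @@ -0,0 +1,152 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/dwio/common/ParallelFor.h" +#include "folly/Executor.h" +#include "folly/executors/CPUThreadPoolExecutor.h" +#include "folly/executors/InlineExecutor.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "velox/common/base/VeloxException.h" + +using namespace ::testing; +using namespace ::facebook::velox::dwio::common; + +namespace { + +class CountingExecutor : public folly::Executor { + public: + explicit CountingExecutor(folly::Executor& executor) + : executor_(executor), count_(0) {} + + void add(folly::Func f) override { + executor_.add(std::move(f)); + ++count_; + } + + size_t getCount() const { + return count_; + } + + private: + folly::Executor& executor_; + size_t count_; +}; + +void testParallelFor( + folly::Executor* executor, + size_t from, + size_t to, + size_t parallelismFactor) { + std::optional<CountingExecutor> countedExecutor; + std::ostringstream oss; + oss << "ParallelFor(executor: " << executor << ", from: " << from + << ", to: " << to << ", parallelismFactor: " << parallelismFactor << ")"; + SCOPED_TRACE(oss.str()); + if (executor) { + countedExecutor.emplace(*executor); + executor = &countedExecutor.value(); + } + + std::unordered_map<size_t, std::atomic<size_t>> indexInvoked; + for (size_t i = from; i < to; ++i) { + indexInvoked[i] = 0UL; + } + + ParallelFor(executor, from, to, parallelismFactor) + .execute([&indexInvoked](size_t i) { + auto it = indexInvoked.find(i); + ASSERT_NE(it, indexInvoked.end()); + ++it->second; + }); + + // Parallel For should have thrown otherwise + ASSERT_LE(from, to); + + // The method was called for each index just once, and didn't call out of + // bounds indices. + EXPECT_EQ(indexInvoked.size(), (to - from)); + for (auto& [i, count] : indexInvoked) { + if (i < from || i >= to) { + EXPECT_EQ(indexInvoked[i], 0); + } else { + EXPECT_EQ(indexInvoked[i], 1); + } + } + + if (countedExecutor) { + const auto extraThreadsUsed = countedExecutor->getCount(); + const auto numTasks = to - from; + const auto expectedExtraThreads = std::min( + parallelismFactor > 0 ? parallelismFactor - 1 : 0, + numTasks > 0 ?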
numTasks - 1 : 0); + EXPECT_EQ(extraThreadsUsed, expectedExtraThreads); + } +} + +} // namespace + +TEST(ParallelForTest, E2E) { + auto inlineExecutor = folly::InlineExecutor::instance(); + for (size_t parallelism = 0; parallelism < 25; ++parallelism) { + for (size_t begin = 0; begin < 25; ++begin) { + for (size_t end = 0; end < 25; ++end) { + if (begin <= end) { + testParallelFor(&inlineExecutor, begin, end, parallelism); + } else { + EXPECT_THROW( + testParallelFor(&inlineExecutor, begin, end, parallelism), + facebook::velox::VeloxRuntimeError); + } + } + } + } +} + +TEST(ParallelForTest, E2EParallel) { + for (size_t parallelism = 1; parallelism < 2; ++parallelism) { + folly::CPUThreadPoolExecutor executor(parallelism); + for (size_t begin = 0; begin < 25; ++begin) { + for (size_t end = 0; end < 25; ++end) { + if (begin <= end) { + testParallelFor(&executor, begin, end, parallelism); + } else { + EXPECT_THROW( + testParallelFor(&executor, begin, end, parallelism), + facebook::velox::VeloxRuntimeError); + } + } + } + } +} + +TEST(ParallelForTest, CanOwnExecutor) { + auto executor = std::make_shared<folly::CPUThreadPoolExecutor>(2); + const size_t indexInvokedSize = 100; + std::unordered_map<size_t, std::atomic<size_t>> indexInvoked; + indexInvoked.reserve(indexInvokedSize); + for (size_t i = 0; i < indexInvokedSize; ++i) { + indexInvoked[i] = 0UL; + } + + ParallelFor pf(executor, 0, indexInvokedSize, 9); + pf.execute([&indexInvoked](size_t i) { ++indexInvoked[i]; }); + + EXPECT_EQ(indexInvoked.size(), indexInvokedSize); + for (size_t i = 0; i < indexInvokedSize; ++i) { + EXPECT_EQ(indexInvoked[i], 1); + } +}
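For orientation: `ParallelFor(executor, from, to, parallelismFactor)` runs a closure once per index in `[from, to)`, using at most `parallelismFactor - 1` extra executor threads beyond the calling thread, which is exactly what the counting assertions above verify. A hedged usage sketch (the function is illustrative):

```
#include <vector>

#include "folly/executors/CPUThreadPoolExecutor.h"
#include "velox/dwio/common/ParallelFor.h"

using facebook::velox::dwio::common::ParallelFor;

void fillSquares(std::vector<size_t>& out) {
  folly::CPUThreadPoolExecutor executor(4);
  // Each index is visited exactly once; writes target distinct slots, so no
  // extra synchronization is needed in this particular closure.
  ParallelFor(&executor, 0, out.size(), 4).execute([&out](size_t i) {
    out[i] = i * i;
  });
}
```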
diff --git a/velox/dwio/common/tests/RangeTests.cpp b/velox/dwio/common/tests/RangeTests.cpp index a603445f3602f..1488c95fe2e60 --- a/velox/dwio/common/tests/RangeTests.cpp +++ b/velox/dwio/common/tests/RangeTests.cpp @@ -16,16 +16,16 @@ #include <gmock/gmock.h> #include <gtest/gtest.h> +#include "velox/common/base/tests/GTestUtils.h" #include "velox/dwio/common/Range.h" using namespace ::testing; -using namespace facebook::velox::dwio::common; namespace facebook::velox::common { TEST(RangeTests, Add) { Ranges ranges; - ASSERT_THROW(ranges.add(2, 1), exception::LoggedException); + VELOX_ASSERT_THROW(ranges.add(2, 1), ""); ranges.add(2, 2); ranges.add(1, 3); ASSERT_THAT(ranges.ranges_, ElementsAre(std::tuple{1, 3})); diff --git a/velox/dwio/common/tests/ReadFileInputStreamTests.cpp b/velox/dwio/common/tests/ReadFileInputStreamTests.cpp index 4beab04458474..273b523549f77 100644 --- a/velox/dwio/common/tests/ReadFileInputStreamTests.cpp +++ b/velox/dwio/common/tests/ReadFileInputStreamTests.cpp @@ -35,8 +35,8 @@ class ReadFileInputStreamTest : public testing::Test { }; TEST_F(ReadFileInputStreamTest, LocalReadFile) { - auto tempFile = ::exec::test::TempFilePath::create(); - const auto& filename = tempFile->path; + auto tempFile = exec::test::TempFilePath::create(); + const auto& filename = tempFile->getPath(); remove(filename.c_str()); { LocalWriteFile writeFile(filename); diff --git a/velox/dwio/common/tests/ReaderTest.cpp b/velox/dwio/common/tests/ReaderTest.cpp new file mode 100644 index 0000000000000..cd4840f84ba7b --- /dev/null +++ b/velox/dwio/common/tests/ReaderTest.cpp @@ -0,0 +1,141 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/dwio/common/Reader.h" +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/vector/tests/utils/VectorTestBase.h" + +#include <gtest/gtest.h> + +namespace facebook::velox::dwio::common { +namespace { + +using namespace facebook::velox::common; + +class ReaderTest : public testing::Test, public test::VectorTestBase { + protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } +}; + +TEST_F(ReaderTest, getOrCreateChild) { + constexpr int kSize = 5; + auto input = makeRowVector( + {"c.0", "c.1"}, + { + makeFlatVector<int64_t>({1, 2, 3, 4, 5}), + makeFlatVector<int64_t>({2, 4, 6, 7, 8}), + }); + + common::ScanSpec spec(""); + spec.addField("c.0", 0); + // Create child from name. + spec.getOrCreateChild("c.1")->setFilter( + common::createBigintValues({2, 4, 6}, false)); + + auto actual = RowReader::projectColumns(input, spec, nullptr); + auto expected = makeRowVector({ + makeFlatVector<int64_t>({1, 2, 3}), + }); + test::assertEqualVectors(expected, actual); + + // Create child from subfield. + spec.getOrCreateChild(common::Subfield("c.1")) + ->setFilter(common::createBigintValues({2, 4, 6}, false)); + VELOX_ASSERT_USER_THROW( + RowReader::projectColumns(input, spec, nullptr), + "Field not found: c. Available fields are: c.0, c.1."); +} + +TEST_F(ReaderTest, projectColumnsFilterStruct) { + constexpr int kSize = 10; + auto input = makeRowVector({ + makeFlatVector<int64_t>(kSize, folly::identity), + makeRowVector({ + makeFlatVector<int64_t>(kSize, folly::identity), + }), + }); + common::ScanSpec spec(""); + spec.addField("c0", 0); + spec.getOrCreateChild(common::Subfield("c1.c0")) + ->setFilter(common::createBigintValues({2, 4, 6}, false)); + auto actual = RowReader::projectColumns(input, spec, nullptr); + auto expected = makeRowVector({ + makeFlatVector<int64_t>({2, 4, 6}), + }); + test::assertEqualVectors(expected, actual); +} + +TEST_F(ReaderTest, projectColumnsFilterArray) { + constexpr int kSize = 10; + auto input = makeRowVector({ + makeFlatVector<int64_t>(kSize, folly::identity), + makeArrayVector<int64_t>( + kSize, + [](auto) { return 1; }, + [](auto i) { return i; }, + [](auto i) { return i % 2 != 0; }), + }); + common::ScanSpec spec(""); + spec.addField("c0", 0); + auto* c1 = spec.getOrCreateChild(common::Subfield("c1")); + { + SCOPED_TRACE("IS NULL"); + c1->setFilter(std::make_unique<common::IsNull>()); + auto actual = RowReader::projectColumns(input, spec, nullptr); + auto expected = makeRowVector({ + makeFlatVector<int64_t>({1, 3, 5, 7, 9}), + }); + test::assertEqualVectors(expected, actual); + } + { + SCOPED_TRACE("IS NOT NULL"); + c1->setFilter(std::make_unique<common::IsNotNull>()); + auto actual = RowReader::projectColumns(input, spec, nullptr); + auto expected = makeRowVector({ + makeFlatVector<int64_t>({0, 2, 4, 6, 8}), + }); + test::assertEqualVectors(expected, actual); + } +} + +TEST_F(ReaderTest, projectColumnsMutation) { + constexpr int kSize = 10; + auto input = makeRowVector({makeFlatVector<int64_t>(kSize, folly::identity)}); + common::ScanSpec spec(""); + spec.addAllChildFields(*input->type()); + std::vector<uint64_t> deleted(bits::nwords(kSize)); + bits::setBit(deleted.data(), 2); + Mutation mutation; + mutation.deletedRows =
deleted.data(); + auto actual = RowReader::projectColumns(input, spec, &mutation); + auto expected = makeRowVector({ + makeFlatVector<int64_t>({0, 1, 3, 4, 5, 6, 7, 8, 9}), + }); + test::assertEqualVectors(expected, actual); + random::setSeed(42); + random::RandomSkipTracker randomSkip(0.5); + mutation.randomSkip = &randomSkip; + actual = RowReader::projectColumns(input, spec, &mutation); + expected = makeRowVector({ + makeFlatVector<int64_t>({0, 1, 3, 5, 6, 8}), + }); + test::assertEqualVectors(expected, actual); +} + +} // namespace +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/tests/RetryTests.cpp b/velox/dwio/common/tests/RetryTests.cpp index f02544c6fe839..dc6f5bdf2e425 100644 --- a/velox/dwio/common/tests/RetryTests.cpp +++ b/velox/dwio/common/tests/RetryTests.cpp @@ -21,14 +21,14 @@ using namespace std::chrono_literals; using namespace facebook::velox::dwio::common; using namespace facebook::velox::dwio::common::retrypolicy; -template <uint8_t ExceptionCount> class Raise { public: - Raise() : count_(0) {} + explicit Raise(uint8_t exceptionCount) + : exceptionCount_(exceptionCount), count_(0) {} template <typename T> T call(const T& value) { - if (count_++ < ExceptionCount) { + if (count_++ < exceptionCount_) { throw retriable_error(std::runtime_error("Bad!!!")); } @@ -42,11 +42,12 @@ class Raise { ~Raise() = default; private: + const uint8_t exceptionCount_; uint8_t count_; }; TEST(RetryModuleTests, retryUntilSuccessDefault) { - Raise<4> raise; + Raise raise(4); std::function<int()> retriable = [&raise]() { return raise.call(10); }; @@ -64,7 +65,7 @@ } TEST(RetryModuleTests, retryUntilSuccessBackoff) { - Raise<4> raise; + Raise raise(4); std::function<int()> retriable = [&raise]() { return raise.call(10); }; @@ -82,7 +83,7 @@ } TEST(RetryModuleTests, retryCapMaxDelay) { - Raise<4> raise; + Raise raise(4); std::function<int()> retriable = [&raise]() { return raise.call(10); }; @@ -100,7 +101,7 @@ } TEST(RetryModuleTests, failOnRetriesExceededDefault) { - Raise<6> raise; + Raise raise(6); std::function<int()> retriable = [&raise]() { return raise.call(10); }; @@ -113,7 +114,7 @@ } TEST(RetryModuleTests, failOnRetriesExceededBackoff) { - Raise<6> raise; + Raise raise(6); std::function<int()> retriable = [&raise]() { return raise.call(10); }; @@ -126,7 +127,7 @@ } TEST(RetryModuleTests, failOnRetriesExceededTotalBackoff) { - Raise<100> raise; + Raise raise(100); std::function<int()> retriable = [&raise]() { return raise.call(10); }; @@ -143,7 +144,7 @@ } TEST(RetryModuleTests, defineDifferentUnit) { - Raise<1> raise; + Raise raise(1); std::function<int()> retriable = [&raise]() { return raise.call(11); }; @@ -172,3 +173,15 @@ TEST(RetryModuleTests, testJitter) { ASSERT_GT(wait.count(), nextWait); } + +TEST(RetryModuleTests, exponentialBackOffCountExecutionTime) { + Raise raise(1); + auto retriable = [&raise]() { + std::this_thread::sleep_for(20ms); + return raise.call(10); + }; + ExponentialBackoffPolicyFactory policyFactory(1ms, 1ms, 5, 10ms, true); + ASSERT_THROW( + RetryModule::withRetry(retriable, policyFactory.getRetryPolicy()), + retries_exhausted); +}
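The next hunks rework TestBufferedInput.cpp. For orientation, here is the enqueue/load pattern those tests exercise, sketched under assumed signatures (the constructor and call shapes mirror the test code; the function itself is illustrative):

```
#include "velox/dwio/common/BufferedInput.h"

using namespace facebook::velox::dwio::common;

void prefetchTwoRanges(
    std::shared_ptr<facebook::velox::ReadFile> file,
    facebook::velox::memory::MemoryPool& pool) {
  BufferedInput input(file, pool);
  // Each enqueue returns a SeekableInputStream over that byte range.
  auto head = input.enqueue({0, 5});
  auto tail = input.enqueue({6, 5});
  // A single load() fetches every enqueued region; regions closer than the
  // configured merge distance are coalesced into one read.
  input.load(LogType::TEST);
}
```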
diff --git a/velox/dwio/common/tests/TestBufferedInput.cpp b/velox/dwio/common/tests/TestBufferedInput.cpp index 042986fbfc7e2..c0795e3c07601 --- a/velox/dwio/common/tests/TestBufferedInput.cpp +++ b/velox/dwio/common/tests/TestBufferedInput.cpp @@ -20,6 +20,7 @@ using namespace facebook::velox::dwio::common; using facebook::velox::common::Region; +using namespace facebook::velox::memory; using namespace ::testing; namespace { @@ -31,7 +32,7 @@ class ReadFileMock : public ::facebook::velox::ReadFile { MOCK_METHOD( std::string_view, pread, - (uint64_t offset, uint64_t length, void* FOLLY_NONNULL buf), + (uint64_t offset, uint64_t length, void* buf), (const, override)); MOCK_METHOD(bool, shouldCoalesce, (), (const, override)); @@ -40,7 +41,7 @@ class ReadFileMock : public ::facebook::velox::ReadFile { MOCK_METHOD(std::string, getName, (), (const, override)); MOCK_METHOD(uint64_t, getNaturalReadSize, (), (const, override)); MOCK_METHOD( - void, + uint64_t, preadv, (folly::Range<const Region*> regions, folly::Range<folly::IOBuf*> iobufs), (const, override)); @@ -73,26 +74,31 @@ void expectPreadvs( EXPECT_CALL(file, size()).WillRepeatedly(Return(content.size())); EXPECT_CALL(file, preadv(_, _)) .Times(1) - .WillOnce([content, reads]( - folly::Range<const Region*> regions, - folly::Range<folly::IOBuf*> iobufs) { - ASSERT_EQ(regions.size(), reads.size()); - for (size_t i = 0; i < reads.size(); ++i) { - const auto& region = regions[i]; - const auto& read = reads[i]; - auto& iobuf = iobufs[i]; - ASSERT_EQ(region.offset, read.offset); - ASSERT_EQ(region.length, read.length); - if (!read.label.empty()) { - EXPECT_EQ(read.label, region.label); - } - ASSERT_LE(region.offset + region.length, content.size()); - iobuf = folly::IOBuf( - folly::IOBuf::COPY_BUFFER, - content.data() + region.offset, - region.length); - } - }); + .WillOnce( + [content, reads]( + folly::Range<const Region*> regions, + folly::Range<folly::IOBuf*> iobufs) -> uint64_t { + EXPECT_EQ(regions.size(), reads.size()); + uint64_t length = 0; + for (size_t i = 0; i < reads.size(); ++i) { + const auto& region = regions[i]; + const auto& read = reads[i]; + auto& iobuf = iobufs[i]; + length += region.length; + EXPECT_EQ(region.offset, read.offset); + EXPECT_EQ(region.length, read.length); + if (!read.label.empty()) { + EXPECT_EQ(read.label, region.label); + } + EXPECT_LE(region.offset + region.length, content.size()); + iobuf = folly::IOBuf( + folly::IOBuf::COPY_BUFFER, + content.data() + region.offset, + region.length); + } + + return length; + }); } std::optional<std::string> getNext(SeekableInputStream& input) { @@ -106,21 +112,22 @@ std::optional<std::string> getNext(SeekableInputStream& input) { } } -} // namespace +class TestBufferedInput : public testing::Test { + protected: + static void SetUpTestCase() { + MemoryManager::testingSetInstance({}); + } -TEST(TestBufferedInput, AllowMoveConstructor) { - auto readFileMock = std::make_shared<ReadFileMock>(); - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); - BufferedInput a(readFileMock, *pool); - BufferedInput b(std::move(a)); -} + const std::shared_ptr<MemoryPool> pool_ = memoryManager()->addLeafPool(); +}; +} // namespace -TEST(TestBufferedInput, ZeroLengthStream) { +TEST_F(TestBufferedInput, ZeroLengthStream) { auto readFile = std::make_shared<facebook::velox::InMemoryReadFile>(std::string()); - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); - BufferedInput input(readFile, *pool); + BufferedInput input(readFile, *pool_); auto ret = input.enqueue({0, 0}); + EXPECT_EQ(input.nextFetchSize(), 0); EXPECT_NE(ret, nullptr); const void* buf = nullptr; int32_t size = 1; @@ -128,15 +135,14 @@ EXPECT_EQ(size, 0); } -TEST(TestBufferedInput, UseRead) { +TEST_F(TestBufferedInput, UseRead) { std::string content = "hello"; auto
readFileMock = std::make_shared<ReadFileMock>(); expectPreads(*readFileMock, content, {{0, 5}}); - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); // Use read BufferedInput input( readFileMock, - *pool, + *pool_, MetricsLog::voidLog(), nullptr, 10, @@ -144,6 +150,7 @@ auto ret = input.enqueue({0, 5}); ASSERT_NE(ret, nullptr); + EXPECT_EQ(input.nextFetchSize(), 5); input.load(LogType::TEST); auto next = getNext(*ret); @@ -151,15 +158,14 @@ EXPECT_EQ(next.value(), content); } -TEST(TestBufferedInput, UseVRead) { +TEST_F(TestBufferedInput, UseVRead) { std::string content = "hello"; auto readFileMock = std::make_shared<ReadFileMock>(); expectPreadvs(*readFileMock, content, {{0, 5}}); - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); // Use vread BufferedInput input( readFileMock, - *pool, + *pool_, MetricsLog::voidLog(), nullptr, 10, @@ -167,6 +173,7 @@ auto ret = input.enqueue({0, 5}); ASSERT_NE(ret, nullptr); + EXPECT_EQ(input.nextFetchSize(), 5); input.load(LogType::TEST); auto next = getNext(*ret); @@ -174,7 +181,7 @@ EXPECT_EQ(next.value(), content); } -TEST(TestBufferedInput, WillMerge) { +TEST_F(TestBufferedInput, WillMerge) { std::string content = "hello world"; auto readFileMock = std::make_shared<ReadFileMock>(); @@ -182,10 +189,9 @@ // Expect only one call. expectPreads(*readFileMock, content, {{0, 11}}); - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); BufferedInput input( readFileMock, - *pool, + *pool_, MetricsLog::voidLog(), nullptr, 10, // Will merge if distance <= 10 @@ -196,6 +202,7 @@ ASSERT_NE(ret1, nullptr); ASSERT_NE(ret2, nullptr); + EXPECT_EQ(input.nextFetchSize(), 10); input.load(LogType::TEST); auto next1 = getNext(*ret1); @@ -207,7 +214,7 @@ EXPECT_EQ(next2.value(), "world"); } -TEST(TestBufferedInput, WontMerge) { +TEST_F(TestBufferedInput, WontMerge) { std::string content = "hello world"; // two spaces auto readFileMock = std::make_shared<ReadFileMock>(); @@ -215,10 +222,9 @@ // Expect two calls expectPreads(*readFileMock, content, {{0, 5}, {7, 5}}); - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); BufferedInput input( readFileMock, - *pool, + *pool_, MetricsLog::voidLog(), nullptr, 1, // Will merge if distance <= 1 @@ -229,6 +235,7 @@ ASSERT_NE(ret1, nullptr); ASSERT_NE(ret2, nullptr); + EXPECT_EQ(input.nextFetchSize(), 10); input.load(LogType::TEST); auto next1 = getNext(*ret1); @@ -240,16 +247,15 @@ EXPECT_EQ(next2.value(), "world"); } -TEST(TestBufferedInput, ReadSorting) { +TEST_F(TestBufferedInput, ReadSorting) { std::string content = "aaabbbcccdddeeefffggghhhiiijjjkkklllmmmnnnooopppqqq"; std::vector<Region> regions = {{6, 3}, {24, 3}, {3, 3}, {0, 3}, {29, 3}}; auto readFileMock = std::make_shared<ReadFileMock>(); expectPreads(*readFileMock, content, {{0, 9}, {24, 3}, {29, 3}}); - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); BufferedInput input( readFileMock, - *pool, + *pool_, MetricsLog::voidLog(), nullptr, 1, // Will merge if distance <= 1 @@ -258,13 +264,16 @@ std::vector<std::pair<std::unique_ptr<SeekableInputStream>, std::string>> result; result.reserve(regions.size()); + int64_t bytesToRead = 0; for (auto& region : regions) { + bytesToRead += region.length; auto ret = input.enqueue(region);
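// enqueue() only records the region and hands back a stream placeholder; no bytes are read until load() runs, which is why nextFetchSize() can still report the total pending byte count first.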
ASSERT_NE(ret, nullptr); result.push_back( {std::move(ret), content.substr(region.offset, region.length)}); } + EXPECT_EQ(input.nextFetchSize(), bytesToRead); input.load(LogType::TEST); for (auto& r : result) { @@ -274,17 +283,16 @@ TEST(TestBufferedInput, ReadSorting) { } } -TEST(TestBufferedInput, VReadSorting) { +TEST_F(TestBufferedInput, VReadSorting) { std::string content = "aaabbbcccdddeeefffggghhhiiijjjkkklllmmmnnnooopppqqq"; std::vector regions = {{6, 3}, {24, 3}, {3, 3}, {0, 3}, {29, 3}}; auto readFileMock = std::make_shared(); expectPreadvs( *readFileMock, content, {{0, 3}, {3, 3}, {6, 3}, {24, 3}, {29, 3}}); - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); BufferedInput input( readFileMock, - *pool, + *pool_, MetricsLog::voidLog(), nullptr, 1, // Will merge if distance <= 1 @@ -293,13 +301,16 @@ TEST(TestBufferedInput, VReadSorting) { std::vector, std::string>> result; result.reserve(regions.size()); + int64_t bytesToRead = 0; for (auto& region : regions) { + bytesToRead += region.length; auto ret = input.enqueue(region); ASSERT_NE(ret, nullptr); result.push_back( {std::move(ret), content.substr(region.offset, region.length)}); } + EXPECT_EQ(input.nextFetchSize(), bytesToRead); input.load(LogType::TEST); for (auto& r : result) { @@ -309,7 +320,7 @@ TEST(TestBufferedInput, VReadSorting) { } } -TEST(TestBufferedInput, VReadSortingWithLabels) { +TEST_F(TestBufferedInput, VReadSortingWithLabels) { std::string content = "aaabbbcccdddeeefffggghhhiiijjjkkklllmmmnnnooopppqqq"; std::vector l = {"a", "b", "c", "d", "e"}; std::vector regions = { @@ -320,10 +331,9 @@ TEST(TestBufferedInput, VReadSortingWithLabels) { *readFileMock, content, {{0, 3, l[0]}, {3, 3, l[1]}, {6, 3, l[2]}, {24, 3, l[3]}, {29, 3, l[4]}}); - auto pool = facebook::velox::memory::addDefaultLeafMemoryPool(); BufferedInput input( readFileMock, - *pool, + *pool_, MetricsLog::voidLog(), nullptr, 1, // Will merge if distance <= 1 @@ -332,13 +342,16 @@ TEST(TestBufferedInput, VReadSortingWithLabels) { std::vector, std::string>> result; result.reserve(regions.size()); + int64_t bytesToRead = 0; for (auto& region : regions) { + bytesToRead += region.length; auto ret = input.enqueue(region); ASSERT_NE(ret, nullptr); result.push_back( {std::move(ret), content.substr(region.offset, region.length)}); } + EXPECT_EQ(input.nextFetchSize(), bytesToRead); input.load(LogType::TEST); for (auto& r : result) { diff --git a/velox/dwio/common/tests/ThrottlerTest.cpp b/velox/dwio/common/tests/ThrottlerTest.cpp new file mode 100644 index 0000000000000..6a3b62e1f5103 --- /dev/null +++ b/velox/dwio/common/tests/ThrottlerTest.cpp @@ -0,0 +1,558 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/dwio/common/Throttler.h" + +#include <gtest/gtest.h> + +#include "folly/Random.h" +#include "velox/common/base/tests/GTestUtils.h" + +namespace facebook::velox::dwio::common { +namespace { + +class ThrottlerTest : public testing::Test { + protected: + static Throttler::Config throttleConfig(uint32_t cacheTTLMs = 3'600 * 1'000) { + return Throttler::Config(true, 1, 4, 2.0, 10, 40, 4, cacheTTLMs); + } + + void SetUp() override { + Throttler::testingReset(); + } +}; + +TEST_F(ThrottlerTest, config) { + const auto config = throttleConfig(); + ASSERT_EQ( + config.toString(), + "throttleEnabled:true minThrottleBackoffMs:1ms maxThrottleBackoffMs:4ms backoffScaleFactor:2 minLocalThrottledSignals:10 minGlobalThrottledSignals:40 maxCacheEntries:4 cacheTTLMs:1h 0m 0s"); +} + +TEST_F(ThrottlerTest, signalType) { + ASSERT_EQ(Throttler::signalTypeName(Throttler::SignalType::kLocal), "Local"); + ASSERT_EQ( + Throttler::signalTypeName(Throttler::SignalType::kGlobal), "Global"); + ASSERT_EQ(Throttler::signalTypeName(Throttler::SignalType::kNone), "None"); + ASSERT_EQ( + Throttler::signalTypeName(static_cast<Throttler::SignalType>(100)), + "Unknown Signal Type: 100"); +} + +TEST_F(ThrottlerTest, init) { + ASSERT_EQ(Throttler::instance(), nullptr); + Throttler::init(throttleConfig()); + auto* instance = Throttler::instance(); + ASSERT_NE(instance, nullptr); + ASSERT_EQ(instance, Throttler::instance()); + VELOX_ASSERT_THROW( + Throttler::init(throttleConfig()), "Throttler has already been set"); + ASSERT_EQ(instance, Throttler::instance()); +} + +TEST_F(ThrottlerTest, throttleDisabled) { + Throttler::init(Throttler::Config(false)); + const std::string cluster{"throttleDisabled"}; + const std::string directory{"throttleDisabled"}; + auto* instance = Throttler::instance(); + for (int i = 1; i <= 100; ++i) { + ASSERT_EQ( + instance->throttleBackoff( + i % 2 ? Throttler::SignalType::kLocal + : Throttler::SignalType::kGlobal, + cluster, + directory), + 0); + } + const auto& stats = instance->stats(); + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 0); +} + +TEST_F(ThrottlerTest, noThrottlerSignal) { + Throttler::init(Throttler::Config(true, 100, 200, 2.0, 10, 1'000)); + const std::string cluster{"noThrottlerSignal"}; + const std::string directory{"noThrottlerSignal"}; + auto* instance = Throttler::instance(); + for (int i = 1; i <= 100; ++i) { + ASSERT_EQ( + instance->throttleBackoff( + Throttler::SignalType::kNone, cluster, directory), + 0); + } + const auto& stats = instance->stats(); + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 0); +} + +TEST_F(ThrottlerTest, throttle) { + const uint64_t minThrottleBackoffMs = 1'000; + const uint64_t maxThrottleBackoffMs = 2'000; + for (const bool global : {true, false}) { + SCOPED_TRACE(fmt::format("global {}", global)); + + Throttler::testingReset(); + Throttler::init(Throttler::Config( + true, + minThrottleBackoffMs, + maxThrottleBackoffMs, + 2.0, + global ? 1'0000 : 2, + global ? 2 : 1'0000)); + auto* instance = Throttler::instance(); + const auto& stats = instance->stats(); + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 0); + + const Throttler::SignalType type = + global ?
Throttler::SignalType::kGlobal : Throttler::SignalType::kLocal; + const std::string cluster{"throttle"}; + const std::string directory{"throttle"}; + ASSERT_EQ(instance->throttleBackoff(type, cluster, directory), 0); + ASSERT_EQ(instance->throttleBackoff(type, cluster, directory), 0); + + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 0); + + uint64_t measuredBackOffMs{0}; + uint64_t firstBackoffMs{0}; + { + MicrosecondTimer timer(&measuredBackOffMs); + firstBackoffMs = instance->throttleBackoff(type, cluster, directory); + } + ASSERT_LE(firstBackoffMs, maxThrottleBackoffMs); + ASSERT_GE(firstBackoffMs, minThrottleBackoffMs); + ASSERT_GE(measuredBackOffMs, firstBackoffMs); + + ASSERT_EQ(stats.localThrottled, global ? 0 : 1); + ASSERT_EQ(stats.globalThrottled, global ? 1 : 0); + ASSERT_EQ(stats.backOffDelay.count(), 1); + ASSERT_EQ(stats.backOffDelay.sum(), firstBackoffMs); + + measuredBackOffMs = 0; + uint64_t secondBackoffMs{0}; + { + MicrosecondTimer timer(&measuredBackOffMs); + secondBackoffMs = instance->throttleBackoff(type, cluster, directory); + } + ASSERT_LE(secondBackoffMs, maxThrottleBackoffMs); + ASSERT_GE(secondBackoffMs, minThrottleBackoffMs); + ASSERT_GE(measuredBackOffMs, secondBackoffMs); + ASSERT_LT(firstBackoffMs, secondBackoffMs); + + ASSERT_EQ(stats.localThrottled, global ? 0 : 2); + ASSERT_EQ(stats.globalThrottled, global ? 2 : 0); + ASSERT_EQ(stats.backOffDelay.count(), 2); + ASSERT_EQ(stats.backOffDelay.sum(), firstBackoffMs + secondBackoffMs); + } +} + +TEST_F(ThrottlerTest, expire) { + const uint64_t minThrottleBackoffMs = 1'00; + const uint64_t maxThrottleBackoffMs = 2'00; + for (const bool global : {true, false}) { + SCOPED_TRACE(fmt::format("global {}", global)); + Throttler::testingReset(); + Throttler::init(Throttler::Config( + true, + minThrottleBackoffMs, + maxThrottleBackoffMs, + 2.0, + global ? 1'0000 : 2, + global ? 2 : 1'0000, + 1'000, + 1'000)); + auto* instance = Throttler::instance(); + const auto& stats = instance->stats(); + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 0); + + const Throttler::SignalType type = + global ? 
Throttler::SignalType::kGlobal : Throttler::SignalType::kLocal; + const std::string cluster{"expire"}; + const std::string directory{"expire"}; + ASSERT_EQ(instance->throttleBackoff(type, cluster, directory), 0); + ASSERT_EQ(instance->throttleBackoff(type, cluster, directory), 0); + + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 0); + + std::this_thread::sleep_for(std::chrono::seconds(2)); // NOLINT + + ASSERT_EQ(instance->throttleBackoff(type, cluster, directory), 0); + ASSERT_EQ(instance->throttleBackoff(type, cluster, directory), 0); + + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 0); + } +} + +TEST_F(ThrottlerTest, differentLocals) { + const uint64_t minThrottleBackoffMs = 1'000; + const uint64_t maxThrottleBackoffMs = 2'000; + Throttler::init(Throttler::Config( + true, minThrottleBackoffMs, maxThrottleBackoffMs, 2.0, 2, 1'0000)); + auto* instance = Throttler::instance(); + const auto& stats = instance->stats(); + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 0); + + const std::string cluster1{"differentLocals1"}; + const std::string directory1{"differentLocals1"}; + ASSERT_EQ( + instance->throttleBackoff( + Throttler::SignalType::kLocal, cluster1, directory1), + 0); + ASSERT_EQ( + instance->throttleBackoff( + Throttler::SignalType::kLocal, cluster1, directory1), + 0); + + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 0); + + const std::string directory2{"differentLocals2"}; + ASSERT_EQ( + instance->throttleBackoff( + Throttler::SignalType::kLocal, cluster1, directory2), + 0); + ASSERT_EQ( + instance->throttleBackoff( + Throttler::SignalType::kLocal, cluster1, directory2), + 0); + + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 0); + + const auto path1firstBackoffMs = instance->throttleBackoff( + Throttler::SignalType::kLocal, cluster1, directory1); + ASSERT_GT(path1firstBackoffMs, 0); + ASSERT_LT(path1firstBackoffMs, maxThrottleBackoffMs); + + ASSERT_EQ(stats.localThrottled, 1); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 1); + ASSERT_EQ(stats.backOffDelay.sum(), path1firstBackoffMs); + + const auto path2firstBackoffMs = instance->throttleBackoff( + Throttler::SignalType::kLocal, cluster1, directory2); + ASSERT_GT(path2firstBackoffMs, 0); + ASSERT_LT(path2firstBackoffMs, maxThrottleBackoffMs); + + ASSERT_EQ(stats.localThrottled, 2); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 2); + ASSERT_EQ( + stats.backOffDelay.sum(), path1firstBackoffMs + path2firstBackoffMs); + + const auto path1SecondBackoffMs = instance->throttleBackoff( + Throttler::SignalType::kLocal, cluster1, directory1); + ASSERT_EQ(path1SecondBackoffMs, maxThrottleBackoffMs); + + ASSERT_EQ(stats.localThrottled, 3); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 3); + ASSERT_EQ( + stats.backOffDelay.sum(), + path1firstBackoffMs + path2firstBackoffMs + path1SecondBackoffMs); + + const auto path2SecondBackoffMs = instance->throttleBackoff( + Throttler::SignalType::kLocal, cluster1, directory2); + ASSERT_EQ(path2SecondBackoffMs, maxThrottleBackoffMs); + + ASSERT_EQ(stats.localThrottled, 4); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 4); + ASSERT_EQ( 
+ stats.backOffDelay.sum(), + path1firstBackoffMs + path2firstBackoffMs + path1SecondBackoffMs + + path2SecondBackoffMs); +} + +TEST_F(ThrottlerTest, differentGlobals) { + const uint64_t minThrottleBackoffMs = 1'000; + const uint64_t maxThrottleBackoffMs = 2'000; + Throttler::init(Throttler::Config( + true, minThrottleBackoffMs, maxThrottleBackoffMs, 2.0, 1'0000, 2)); + auto* instance = Throttler::instance(); + const auto& stats = instance->stats(); + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 0); + + const std::string cluster1{"differentGlobals1"}; + const std::string directory1{"differentGlobals1"}; + ASSERT_EQ( + instance->throttleBackoff( + Throttler::SignalType::kGlobal, cluster1, directory1), + 0); + ASSERT_EQ( + instance->throttleBackoff( + Throttler::SignalType::kGlobal, cluster1, directory1), + 0); + + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 0); + + const std::string cluster2{"differentGlobals2"}; + const std::string directory2{"differentGlobals1"}; + ASSERT_EQ( + instance->throttleBackoff( + Throttler::SignalType::kGlobal, cluster2, directory2), + 0); + ASSERT_EQ( + instance->throttleBackoff( + Throttler::SignalType::kGlobal, cluster2, directory2), + 0); + + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 0); + + const auto path1firstBackoffMs = instance->throttleBackoff( + Throttler::SignalType::kGlobal, cluster1, directory1); + ASSERT_GT(path1firstBackoffMs, 0); + ASSERT_LT(path1firstBackoffMs, maxThrottleBackoffMs); + + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 1); + ASSERT_EQ(stats.backOffDelay.count(), 1); + ASSERT_EQ(stats.backOffDelay.sum(), path1firstBackoffMs); + + const auto path2firstBackoffMs = instance->throttleBackoff( + Throttler::SignalType::kGlobal, cluster2, directory2); + ASSERT_GT(path2firstBackoffMs, 0); + ASSERT_LT(path2firstBackoffMs, maxThrottleBackoffMs); + + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 2); + ASSERT_EQ(stats.backOffDelay.count(), 2); + ASSERT_EQ( + stats.backOffDelay.sum(), path1firstBackoffMs + path2firstBackoffMs); + + const auto path1SecondBackoffMs = instance->throttleBackoff( + Throttler::SignalType::kGlobal, cluster1, directory1); + ASSERT_EQ(path1SecondBackoffMs, maxThrottleBackoffMs); + + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 3); + ASSERT_EQ(stats.backOffDelay.count(), 3); + ASSERT_EQ( + stats.backOffDelay.sum(), + path1firstBackoffMs + path2firstBackoffMs + path1SecondBackoffMs); + + const auto path2SecondBackoffMs = instance->throttleBackoff( + Throttler::SignalType::kGlobal, cluster2, directory2); + ASSERT_EQ(path2SecondBackoffMs, maxThrottleBackoffMs); + + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 4); + ASSERT_EQ(stats.backOffDelay.count(), 4); + ASSERT_EQ( + stats.backOffDelay.sum(), + path1firstBackoffMs + path2firstBackoffMs + path1SecondBackoffMs + + path2SecondBackoffMs); +} + +TEST_F(ThrottlerTest, maxOfGlobalAndLocal) { + const uint64_t minThrottleBackoffMs = 1'000; + const uint64_t maxThrottleBackoffMs = 2'000; + for (const bool localFirst : {false, true}) { + SCOPED_TRACE(fmt::format("localFirst: {}", localFirst)); + Throttler::testingReset(); + Throttler::init(Throttler::Config( + true, minThrottleBackoffMs, maxThrottleBackoffMs, 2.0, 2, 2)); + auto* instance = Throttler::instance(); + const 
auto& stats = instance->stats(); + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 0); + + const std::string cluster1{"maxOfGlobalAndLocal1"}; + const std::string directory1{"maxOfGlobalAndLocal1"}; + ASSERT_EQ( + instance->throttleBackoff( + localFirst ? Throttler::SignalType::kLocal + : Throttler::SignalType::kGlobal, + cluster1, + directory1), + 0); + ASSERT_EQ( + instance->throttleBackoff( + localFirst ? Throttler::SignalType::kLocal + : Throttler::SignalType::kGlobal, + cluster1, + directory1), + 0); + + ASSERT_EQ(stats.localThrottled, 0); + ASSERT_EQ(stats.globalThrottled, 0); + ASSERT_EQ(stats.backOffDelay.count(), 0); + + auto backoffMs = instance->throttleBackoff( + localFirst ? Throttler::SignalType::kLocal + : Throttler::SignalType::kGlobal, + cluster1, + directory1); + ASSERT_GT(backoffMs, 0); + ASSERT_LT(backoffMs, maxThrottleBackoffMs); + + backoffMs = instance->throttleBackoff( + localFirst ? Throttler::SignalType::kGlobal + : Throttler::SignalType::kLocal, + cluster1, + directory1); + ASSERT_GT(backoffMs, 0); + ASSERT_LT(backoffMs, maxThrottleBackoffMs); + + backoffMs = instance->throttleBackoff( + localFirst ? Throttler::SignalType::kGlobal + : Throttler::SignalType::kLocal, + cluster1, + directory1); + ASSERT_GT(backoffMs, 0); + ASSERT_LT(backoffMs, maxThrottleBackoffMs); + + backoffMs = instance->throttleBackoff( + localFirst ? Throttler::SignalType::kLocal + : Throttler::SignalType::kGlobal, + cluster1, + directory1); + ASSERT_EQ(backoffMs, maxThrottleBackoffMs); + + const std::string cluster2{"maxOfGlobalAndLocal2"}; + const std::string directory2{"maxOfGlobalAndLocal1"}; + ASSERT_EQ( + instance->throttleBackoff( + localFirst ? Throttler::SignalType::kGlobal + : Throttler::SignalType::kLocal, + cluster2, + directory2), + 0); + ASSERT_EQ( + instance->throttleBackoff( + localFirst ? Throttler::SignalType::kGlobal + : Throttler::SignalType::kLocal, + cluster2, + directory2), + 0); + + backoffMs = instance->throttleBackoff( + localFirst ? 
Throttler::SignalType::kGlobal + : Throttler::SignalType::kLocal, + cluster2, + directory2); + ASSERT_GT(backoffMs, 0); + ASSERT_LT(backoffMs, maxThrottleBackoffMs); + + const std::string directory3{"maxOfGlobalAndLocal3"}; + backoffMs = instance->throttleBackoff( + Throttler::SignalType::kGlobal, cluster1, directory3); + if (localFirst) { + ASSERT_GT(backoffMs, 0); + ASSERT_LT(backoffMs, maxThrottleBackoffMs); + } else { + ASSERT_EQ(backoffMs, maxThrottleBackoffMs); + } + + if (localFirst) { + ASSERT_EQ(stats.localThrottled, 2); + ASSERT_EQ(stats.globalThrottled, 4); + } else { + ASSERT_EQ(stats.localThrottled, 3); + ASSERT_EQ(stats.globalThrottled, 3); + } + ASSERT_EQ(stats.backOffDelay.count(), 6); + } +} + +TEST_F(ThrottlerTest, fuzz) { + const uint64_t minThrottleBackoffMs = 1; + const uint64_t maxThrottleBackoffMs = 8; + const double backoffScaleFactor = 2.0; + const uint32_t minLocalThrottledSignals = 10; + const uint32_t minGlobalThrottledSignals = 20; + const uint32_t maxCacheEntries = 64; + const uint32_t cacheTTLMs = 10; + Throttler::testingReset(); + Throttler::init(Throttler::Config( + true, + minThrottleBackoffMs, + maxThrottleBackoffMs, + backoffScaleFactor, + minLocalThrottledSignals, + minGlobalThrottledSignals, + maxCacheEntries, + cacheTTLMs)); + auto* instance = Throttler::instance(); + + const auto seed = getCurrentTimeMs(); + LOG(INFO) << "Random seed: " << seed; + + const int numDirectories = 4096; + std::vector<std::string> directories; + directories.reserve(numDirectories); + for (int i = 0; i < numDirectories; ++i) { + directories.emplace_back(fmt::format("fuzz-{}", i)); + } + const int numClusters = 128; + std::vector<std::string> clusters; + clusters.reserve(numClusters); + for (int i = 0; i < numClusters; ++i) { + clusters.emplace_back(fmt::format("fuzz-{}", i)); + } + + std::atomic_bool stopped{false}; + + const int numThreads = 64; + std::vector<std::thread> threads; + threads.reserve(numThreads); + for (int i = 0; i < numThreads; ++i) { + threads.emplace_back([&]() { + folly::Random::DefaultGenerator rng(seed); + while (!stopped) { + const Throttler::SignalType type = folly::Random::oneIn(3) + ? Throttler::SignalType::kGlobal + : Throttler::SignalType::kLocal; + const int directoryIndex = folly::Random::rand32(rng) % numDirectories; + const int clusterIndex = directoryIndex % numClusters; + instance->throttleBackoff( + type, clusters[clusterIndex], directories[directoryIndex]); + } + }); + } + + // Test for 5 seconds.
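+ // Each of the 64 workers hammers throttleBackoff() with pseudo-random + // signal types and cluster/directory pairs; the main thread flips + // 'stopped' after the window below and joins them, so the test passes + // as long as nothing crashes or deadlocks under contention.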
+ std::this_thread::sleep_for(std::chrono::seconds(5)); // NOLINT + stopped = true; + for (auto& thread : threads) { + thread.join(); + } +} + +} // namespace +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/tests/TypeTests.cpp b/velox/dwio/common/tests/TypeTests.cpp index f4ad3f635259f..d33673daca3ab 100644 --- a/velox/dwio/common/tests/TypeTests.cpp +++ b/velox/dwio/common/tests/TypeTests.cpp @@ -29,6 +29,28 @@ using namespace facebook::velox::dwio::common::typeutils; using facebook::velox::type::fbhive::HiveTypeParser; using facebook::velox::type::fbhive::HiveTypeSerializer; +void assertEqualTypeWithId( + std::shared_ptr<const TypeWithId>& actual, + std::shared_ptr<const TypeWithId>& expected) { + EXPECT_EQ(actual->size(), expected->size()); + for (auto idx = 0; idx < actual->size(); idx++) { + auto actualTypeChild = actual->childAt(idx); + auto expectedTypeChild = expected->childAt(idx); + EXPECT_TRUE(actualTypeChild->type()->kindEquals(expectedTypeChild->type())); + EXPECT_EQ(actualTypeChild->id(), expectedTypeChild->id()); + EXPECT_EQ(actualTypeChild->column(), expectedTypeChild->column()); + assertEqualTypeWithId(actualTypeChild, expectedTypeChild); + } +} + +void assertValidTypeWithId( + const std::shared_ptr<const TypeWithId>& typeWithId) { + for (auto idx = 0; idx < typeWithId->size(); idx++) { + EXPECT_EQ(typeWithId->childAt(idx)->parent(), typeWithId.get()); + assertValidTypeWithId(typeWithId->childAt(idx)); + } +} + TEST(TestType, selectedType) { auto type = HiveTypeParser().parse( "struct," "col3:map,col4:float," "col5:int,col6:bigint,col7:string>", HiveTypeSerializer::serialize(type).c_str()); - auto typeWithId = TypeWithId::create(type); + std::shared_ptr<const TypeWithId> typeWithId = TypeWithId::create(type); EXPECT_EQ(0, typeWithId->id()); EXPECT_EQ(11, typeWithId->maxId()); + auto copySelector = [](size_t index) { return true; }; + + // The following two lines verify that the original type tree's children are + // not re-parented by the buildSelectedType method when copying. If it were + // re-parented, this test would crash with SIGSEGV. The return type is + // deliberately ignored so the copied type will be deallocated upon return.
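+ // Concretely: after the call, childAt(1) of the original tree must still + // report the original root (a ROW type) as its parent, which is exactly + // what the EXPECT_EQ on parent()->type()->kind() below checks.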
+ buildSelectedType(typeWithId, copySelector); + EXPECT_EQ(typeWithId->childAt(1)->parent()->type()->kind(), TypeKind::ROW); + + auto cutType = buildSelectedType(typeWithId, copySelector); + assertEqualTypeWithId(cutType, typeWithId); + assertValidTypeWithId(typeWithId); + assertValidTypeWithId(cutType); + std::vector selected(12); selected[0] = true; selected[2] = true; auto selector = [&selected](size_t index) { return selected[index]; }; - auto cutType = buildSelectedType(typeWithId, selector); + cutType = buildSelectedType(typeWithId, selector); + assertValidTypeWithId(typeWithId); + assertValidTypeWithId(cutType); EXPECT_STREQ( "struct", HiveTypeSerializer::serialize(cutType->type()).c_str()); @@ -57,6 +95,8 @@ TEST(TestType, selectedType) { selected.assign(12, true); cutType = buildSelectedType(typeWithId, selector); + assertValidTypeWithId(typeWithId); + assertValidTypeWithId(cutType); EXPECT_STREQ( "struct," "col3:map,col4:float," @@ -69,6 +109,8 @@ TEST(TestType, selectedType) { selected[0] = true; selected[8] = true; cutType = buildSelectedType(typeWithId, selector); + assertValidTypeWithId(typeWithId); + assertValidTypeWithId(cutType); EXPECT_STREQ( "struct", HiveTypeSerializer::serialize(cutType->type()).c_str()); @@ -91,6 +133,8 @@ TEST(TestType, selectedType) { selected[3] = true; selected[4] = true; cutType = buildSelectedType(typeWithId, selector); + assertValidTypeWithId(typeWithId); + assertValidTypeWithId(cutType); EXPECT_STREQ( "struct>", HiveTypeSerializer::serialize(cutType->type()).c_str()); @@ -106,6 +150,8 @@ TEST(TestType, selectedType) { selected[6] = true; selected[7] = true; cutType = buildSelectedType(typeWithId, selector); + assertValidTypeWithId(typeWithId); + assertValidTypeWithId(cutType); EXPECT_STREQ( "struct>", HiveTypeSerializer::serialize(cutType->type()).c_str()); @@ -135,6 +181,9 @@ TEST(TestType, selectedType) { selected[1] = true; selected[11] = true; cutType = buildSelectedType(typeWithId, selector); + assertValidTypeWithId(typeWithId); + assertValidTypeWithId(cutType); + EXPECT_STREQ( "struct", HiveTypeSerializer::serialize(cutType->type()).c_str()); diff --git a/velox/dwio/common/tests/UnitLoaderToolsTests.cpp b/velox/dwio/common/tests/UnitLoaderToolsTests.cpp new file mode 100644 index 0000000000000..36707554ad850 --- /dev/null +++ b/velox/dwio/common/tests/UnitLoaderToolsTests.cpp @@ -0,0 +1,194 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <gmock/gmock.h> +#include <gtest/gtest.h> + +#include "velox/dwio/common/UnitLoaderTools.h" + +using namespace ::testing; +using namespace ::facebook::velox::dwio::common; +using namespace ::facebook::velox::dwio::common::unit_loader_tools; + +TEST(UnitLoaderToolsTests, NoCallbacksCreated) { + std::atomic_size_t callCount = 0; + { + CallbackOnLastSignal callback([&callCount]() { ++callCount; }); + EXPECT_EQ(callCount, 0); + } + EXPECT_EQ(callCount, 1); +} + +TEST(UnitLoaderToolsTests, SupportsNullCallbacks) { + CallbackOnLastSignal callback(nullptr); + auto cb = callback.getCallback(); + EXPECT_TRUE(cb == nullptr); +} + +TEST(UnitLoaderToolsTests, NoExplicitCalls) { + std::atomic_size_t callCount = 0; + { + CallbackOnLastSignal callback([&callCount]() { ++callCount; }); + EXPECT_EQ(callCount, 0); + { + auto c1 = callback.getCallback(); + auto c4 = callback.getCallback(); + EXPECT_EQ(callCount, 0); + + auto c2 = std::move(c1); + auto c3(c2); + EXPECT_EQ(callCount, 0); + + auto c5 = std::move(c4); + auto c6(c5); + EXPECT_EQ(callCount, 0); + } + EXPECT_EQ(callCount, 1); + } + EXPECT_EQ(callCount, 1); +} + +TEST(UnitLoaderToolsTests, NoExplicitCallsFactoryDeletedFirst) { + std::atomic_size_t callCount = 0; + { + std::function<void()> c1, c2; + { + CallbackOnLastSignal callback([&callCount]() { ++callCount; }); + EXPECT_EQ(callCount, 0); + + c1 = callback.getCallback(); + c2 = callback.getCallback(); + EXPECT_EQ(callCount, 0); + } + EXPECT_EQ(callCount, 0); + } + EXPECT_EQ(callCount, 1); +} + +TEST(UnitLoaderToolsTests, ExplicitCalls) { + std::atomic_size_t callCount = 0; + { + CallbackOnLastSignal callback([&callCount]() { ++callCount; }); + EXPECT_EQ(callCount, 0); + { + auto c1 = callback.getCallback(); + auto c4 = callback.getCallback(); + EXPECT_EQ(callCount, 0); + + c1(); + auto c2 = std::move(c1); + c2(); + auto c3(c2); + c3(); + EXPECT_EQ(callCount, 0); + + c4(); + EXPECT_EQ(callCount, 1); + auto c5 = std::move(c4); + c5(); + auto c6(c2); + c6(); + EXPECT_EQ(callCount, 1); + } + EXPECT_EQ(callCount, 1); + } + EXPECT_EQ(callCount, 1); +} + +TEST(UnitLoaderToolsTests, WillOnlyCallbackOnce) { + std::atomic_size_t callCount = 0; + { + CallbackOnLastSignal callback([&callCount]() { ++callCount; }); + EXPECT_EQ(callCount, 0); + { + auto c1 = callback.getCallback(); + auto c4 = callback.getCallback(); + EXPECT_EQ(callCount, 0); + + c1(); + auto c2 = std::move(c1); + c2(); + auto c3(c2); + c3(); + EXPECT_EQ(callCount, 0); + + c4(); + EXPECT_EQ(callCount, 1); + auto c5 = std::move(c4); + c5(); + auto c6(c2); + c6(); + EXPECT_EQ(callCount, 1); + + // This won't emit a new call + auto c7 = callback.getCallback(); + c7(); + EXPECT_EQ(callCount, 1); + } + EXPECT_EQ(callCount, 1); + } + EXPECT_EQ(callCount, 1); +} + +TEST(UnitLoaderToolsTests, HowMuchToSkip) { + // Helpers + auto testSkip = [](uint64_t rowsToSkip, std::vector<uint64_t> rowCount) { + return howMuchToSkip(rowsToSkip, rowCount.cbegin(), rowCount.cend()); + }; + + auto result = [](uint32_t unitsToSkip, uint64_t rowsToSkip) { + return std::make_pair(unitsToSkip, rowsToSkip); + }; + + static const char* kErrorMessage = + "Can't skip more rows than all the rows in all the units"; + + // Test cases + EXPECT_EQ(testSkip(0, {}), result(0, 0)); + EXPECT_THAT( + [&]() { testSkip(1, {}); }, + Throws(Property( + &facebook::velox::VeloxRuntimeError::message, + HasSubstr(kErrorMessage)))); + + EXPECT_EQ(testSkip(0, {0}), result(1, 0)); + EXPECT_THAT( + [&]() { testSkip(1, {0}); }, + Throws(Property( + &facebook::velox::VeloxRuntimeError::message, + HasSubstr(kErrorMessage)))); + +
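+ // Reading the result pairs: with rowCount {2, 1, 2} below, skipping 4 rows + // consumes the 2-row and 1-row units (3 rows) and leaves 1 row to skip + // inside unit index 2, hence result(2, 1).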
EXPECT_EQ(testSkip(0, {1}), result(0, 0)); + EXPECT_EQ(testSkip(1, {1}), result(1, 0)); + EXPECT_THAT( + [&]() { testSkip(2, {1}); }, + Throws(Property( + &facebook::velox::VeloxRuntimeError::message, + HasSubstr(kErrorMessage)))); + + std::vector rowCount = {2, 1, 2}; + EXPECT_EQ(testSkip(0, rowCount), result(0, 0)); + EXPECT_EQ(testSkip(1, rowCount), result(0, 1)); + EXPECT_EQ(testSkip(2, rowCount), result(1, 0)); + EXPECT_EQ(testSkip(3, rowCount), result(2, 0)); + EXPECT_EQ(testSkip(4, rowCount), result(2, 1)); + EXPECT_EQ(testSkip(5, rowCount), result(3, 0)); + EXPECT_THAT( + [&]() { testSkip(6, rowCount); }, + Throws(Property( + &facebook::velox::VeloxRuntimeError::message, + HasSubstr(kErrorMessage)))); +} diff --git a/velox/dwio/common/tests/WriterTest.cpp b/velox/dwio/common/tests/WriterTest.cpp new file mode 100644 index 0000000000000..b35f1ba09fbf0 --- /dev/null +++ b/velox/dwio/common/tests/WriterTest.cpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/dwio/common/Writer.h" +#include + +using namespace ::testing; + +namespace facebook::velox::dwio::common { +namespace { +TEST(WriterTest, stateString) { + ASSERT_EQ(Writer::stateString(Writer::State::kInit), "INIT"); + ASSERT_EQ(Writer::stateString(Writer::State::kRunning), "RUNNING"); + ASSERT_EQ(Writer::stateString(Writer::State::kClosed), "CLOSED"); + ASSERT_EQ(Writer::stateString(Writer::State::kAborted), "ABORTED"); +} +} // namespace +} // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/tests/utils/BatchMaker.cpp b/velox/dwio/common/tests/utils/BatchMaker.cpp index a458a5031c017..603f363ca820f 100644 --- a/velox/dwio/common/tests/utils/BatchMaker.cpp +++ b/velox/dwio/common/tests/utils/BatchMaker.cpp @@ -48,17 +48,14 @@ VectorPtr createScalar( BufferPtr values = AlignedBuffer::allocate(size, &pool); auto valuesPtr = values->asMutableRange(); - BufferPtr nulls = AlignedBuffer::allocate(bits::nbytes(size), &pool); + BufferPtr nulls = allocateNulls(size, &pool); auto* nullsPtr = nulls->asMutable(); - size_t nullCount = 0; for (size_t i = 0; i < size; ++i) { auto notNull = isNotNull(gen, i, isNullAt); bits::setNull(nullsPtr, i, !notNull); if (notNull) { valuesPtr[i] = val(); - } else { - nullCount++; } } @@ -68,7 +65,7 @@ VectorPtr createScalar( template VectorPtr BatchMaker::createVector( - const std::shared_ptr& /* unused */, + const TypePtr& /* unused */, size_t /* unused */, memory::MemoryPool& /* unused */, std::mt19937& /* unused */, @@ -78,7 +75,7 @@ VectorPtr BatchMaker::createVector( template <> VectorPtr BatchMaker::createVector( - const std::shared_ptr& /* unused */, + const TypePtr& /* unused */, size_t size, MemoryPool& pool, std::mt19937& gen, @@ -93,7 +90,7 @@ VectorPtr BatchMaker::createVector( template <> VectorPtr BatchMaker::createVector( - const std::shared_ptr& /* unused */, + const TypePtr& /* unused */, size_t size, MemoryPool& pool, std::mt19937& gen, @@ -108,7 +105,7 @@ 
VectorPtr BatchMaker::createVector( template <> VectorPtr BatchMaker::createVector( - const std::shared_ptr& /* unused */, + const TypePtr& /* unused */, size_t size, MemoryPool& pool, std::mt19937& gen, @@ -123,7 +120,7 @@ VectorPtr BatchMaker::createVector( template <> VectorPtr BatchMaker::createVector( - const std::shared_ptr& /* unused */, + const TypePtr& type, size_t size, MemoryPool& pool, std::mt19937& gen, @@ -133,12 +130,13 @@ VectorPtr BatchMaker::createVector( gen, [&gen]() { return static_cast(Random::rand32(gen)); }, pool, - isNullAt); + isNullAt, + type); } template <> VectorPtr BatchMaker::createVector( - const std::shared_ptr& /* unused */, + const TypePtr& /*type*/, size_t size, MemoryPool& pool, std::mt19937& gen, @@ -149,7 +147,7 @@ VectorPtr BatchMaker::createVector( template <> VectorPtr BatchMaker::createVector( - const std::shared_ptr& /* unused */, + const TypePtr& /* unused */, size_t size, MemoryPool& pool, std::mt19937& gen, @@ -164,7 +162,7 @@ VectorPtr BatchMaker::createVector( template <> VectorPtr BatchMaker::createVector( - const std::shared_ptr& /* unused */, + const TypePtr& /* unused */, size_t size, MemoryPool& pool, std::mt19937& gen, @@ -179,16 +177,24 @@ VectorPtr BatchMaker::createVector( template <> VectorPtr BatchMaker::createVector( - const std::shared_ptr& type, + const TypePtr& type, size_t size, MemoryPool& pool, std::mt19937& gen, std::function isNullAt) { + int bitsToMove = 0; + // Generate proper bits of random value for LongDecimalType tests. + if (type->isLongDecimal()) { + auto [precision, scale] = getDecimalPrecisionScale(*type); + // Round up if the bit number is not the multiples of 8 (1 byte). + bitsToMove = 128 - ceil(log2(std::pow(10, precision)) / 8) * 8; + } return createScalar( size, gen, - [&gen]() { - return HugeInt::build(Random::rand32(gen), Random::rand32(gen)); + [&gen, bitsToMove]() { + return HugeInt::build(Random::rand64(gen), Random::rand64(gen)) >> + bitsToMove; }, pool, isNullAt, @@ -241,7 +247,7 @@ VectorPtr createBinary( template <> VectorPtr BatchMaker::createVector( - const std::shared_ptr& /* unused */, + const TypePtr& /* unused */, size_t size, MemoryPool& pool, std::mt19937& gen, @@ -251,7 +257,7 @@ VectorPtr BatchMaker::createVector( template <> VectorPtr BatchMaker::createVector( - const std::shared_ptr& /* unused */, + const TypePtr& /* unused */, size_t size, MemoryPool& pool, std::mt19937& gen, @@ -261,7 +267,7 @@ VectorPtr BatchMaker::createVector( template <> VectorPtr BatchMaker::createVector( - const std::shared_ptr& /* unused */, + const TypePtr& /* unused */, size_t size, MemoryPool& pool, std::mt19937& gen, @@ -281,7 +287,7 @@ VectorPtr BatchMaker::createVector( template <> VectorPtr BatchMaker::createVector( - const std::shared_ptr& type, + const TypePtr& type, size_t size, MemoryPool& pool, std::mt19937& gen, @@ -289,7 +295,7 @@ VectorPtr BatchMaker::createVector( template <> VectorPtr BatchMaker::createVector( - const std::shared_ptr& type, + const TypePtr& type, size_t size, MemoryPool& pool, std::mt19937& gen, @@ -297,14 +303,14 @@ VectorPtr BatchMaker::createVector( template <> VectorPtr BatchMaker::createVector( - const std::shared_ptr& type, + const TypePtr& type, size_t size, MemoryPool& pool, std::mt19937& gen, std::function isNullAt); VectorPtr createRows( - const std::shared_ptr& type, + const TypePtr& type, size_t size, bool allowNulls, MemoryPool& pool, @@ -314,7 +320,7 @@ VectorPtr createRows( size_t nullCount = 0; if (allowNulls) { - nulls = 
AlignedBuffer::allocate(bits::nbytes(size), &pool); + nulls = allocateNulls(size, &pool); auto* nullsPtr = nulls->asMutable(); for (size_t i = 0; i < size; ++i) { auto notNull = isNotNull(gen, i, isNullAt); @@ -345,7 +351,7 @@ VectorPtr createRows( template <> VectorPtr BatchMaker::createVector( - const std::shared_ptr& type, + const TypePtr& type, size_t size, MemoryPool& pool, std::mt19937& gen, @@ -355,18 +361,18 @@ VectorPtr BatchMaker::createVector( template <> VectorPtr BatchMaker::createVector( - const std::shared_ptr& type, + const TypePtr& type, size_t size, MemoryPool& pool, std::mt19937& gen, std::function isNullAt) { - BufferPtr offsets = AlignedBuffer::allocate(size, &pool); + BufferPtr offsets = allocateOffsets(size, &pool); auto* offsetsPtr = offsets->asMutable(); - BufferPtr lengths = AlignedBuffer::allocate(size, &pool); + BufferPtr lengths = allocateSizes(size, &pool); auto* lengthsPtr = lengths->asMutable(); - BufferPtr nulls = AlignedBuffer::allocate(bits::nbytes(size), &pool); + BufferPtr nulls = allocateNulls(size, &pool); auto* nullsPtr = nulls->asMutable(); size_t nullCount = 0; @@ -556,18 +562,18 @@ VectorPtr createMapKeys( template <> VectorPtr BatchMaker::createVector( - const std::shared_ptr& type, + const TypePtr& type, size_t size, MemoryPool& pool, std::mt19937& gen, std::function isNullAt) { - BufferPtr offsets = AlignedBuffer::allocate(size, &pool); + BufferPtr offsets = allocateOffsets(size, &pool); auto* offsetsPtr = offsets->asMutable(); - BufferPtr lengths = AlignedBuffer::allocate(size, &pool); + BufferPtr lengths = allocateSizes(size, &pool); auto* lengthsPtr = lengths->asMutable(); - BufferPtr nulls = AlignedBuffer::allocate(bits::nbytes(size), &pool); + BufferPtr nulls = allocateNulls(size, &pool); auto* nullsPtr = nulls->asMutable(); size_t nullCount = 0; @@ -605,7 +611,7 @@ VectorPtr BatchMaker::createVector( } VectorPtr BatchMaker::createBatch( - const std::shared_ptr& type, + const TypePtr& type, uint64_t capacity, MemoryPool& memoryPool, std::mt19937& gen, @@ -617,7 +623,7 @@ VectorPtr BatchMaker::createBatch( } VectorPtr BatchMaker::createBatch( - const std::shared_ptr& type, + const TypePtr& type, uint64_t capacity, MemoryPool& memoryPool, std::function isNullAt, diff --git a/velox/dwio/common/tests/utils/BatchMaker.h b/velox/dwio/common/tests/utils/BatchMaker.h index 10588f30702d0..4a15cbbc4fc41 100644 --- a/velox/dwio/common/tests/utils/BatchMaker.h +++ b/velox/dwio/common/tests/utils/BatchMaker.h @@ -31,14 +31,14 @@ void propagateNullsRecursive(BaseVector& vector); struct BatchMaker { static VectorPtr createBatch( - const std::shared_ptr& type, + const TypePtr& type, uint64_t capacity, memory::MemoryPool& memoryPool, std::mt19937& gen, std::function isNullAt = nullptr); static VectorPtr createBatch( - const std::shared_ptr& type, + const TypePtr& type, uint64_t capacity, memory::MemoryPool& memoryPool, std::function isNullAt = nullptr, @@ -46,7 +46,7 @@ struct BatchMaker { template static VectorPtr createVector( - const std::shared_ptr& type, + const TypePtr& type, size_t size, memory::MemoryPool& pool, std::mt19937& gen, @@ -54,7 +54,7 @@ struct BatchMaker { template static VectorPtr createVector( - const std::shared_ptr& type, + const TypePtr& type, size_t size, memory::MemoryPool& pool, std::function isNullAt = nullptr, diff --git a/velox/dwio/common/tests/utils/CMakeLists.txt b/velox/dwio/common/tests/utils/CMakeLists.txt index cc34ffe08ae9f..7b3a5722165df 100644 --- a/velox/dwio/common/tests/utils/CMakeLists.txt +++ 
b/velox/dwio/common/tests/utils/CMakeLists.txt @@ -12,19 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_library(velox_dwio_common_test_utils BatchMaker.cpp DataFiles.cpp - FilterGenerator.cpp DataSetBuilder.cpp) +add_library( + velox_dwio_common_test_utils + BatchMaker.cpp + DataFiles.cpp + DataSetBuilder.cpp + FilterGenerator.cpp + UnitLoaderTestTools.cpp + E2EFilterTestBase.cpp) target_link_libraries( velox_dwio_common_test_utils Folly::folly + fmt::fmt + glog::glog + gflags::gflags + GTest::gtest velox_dwio_common velox_dwio_common_exception velox_exception + velox_functions_prestosql velox_memory + velox_parse_parser velox_type velox_type_fbhive - velox_vector) + velox_vector + velox_vector_test_lib) # older versions of GCC need it to allow std::filesystem if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9) diff --git a/velox/dwio/common/tests/utils/DataSetBuilder.cpp b/velox/dwio/common/tests/utils/DataSetBuilder.cpp index f300d602d19e9..07b6b245003ff 100644 --- a/velox/dwio/common/tests/utils/DataSetBuilder.cpp +++ b/velox/dwio/common/tests/utils/DataSetBuilder.cpp @@ -37,7 +37,8 @@ RowTypePtr DataSetBuilder::makeRowType( DataSetBuilder& DataSetBuilder::makeDataset( RowTypePtr rowType, const size_t batchCount, - const size_t numRows) { + const size_t numRows, + const bool withRecursiveNulls) { if (batches_) { batches_->clear(); } else { @@ -45,8 +46,18 @@ DataSetBuilder& DataSetBuilder::makeDataset( } for (size_t i = 0; i < batchCount; ++i) { - batches_->push_back(std::static_pointer_cast( - BatchMaker::createBatch(rowType, numRows, pool_, nullptr, i))); + if (withRecursiveNulls) { + batches_->push_back(std::static_pointer_cast( + BatchMaker::createBatch(rowType, numRows, pool_, nullptr, i))); + } else { + batches_->push_back( + std::static_pointer_cast(BatchMaker::createBatch( + rowType, + numRows, + pool_, + [](vector_size_t /*index*/) { return false; }, + i))); + } } return *this; @@ -106,7 +117,7 @@ DataSetBuilder& DataSetBuilder::withAllNullsForField( for (RowVectorPtr batch : *batches_) { auto fieldValues = getChildBySubfield(batch.get(), field); SelectivityVector rows(fieldValues->size()); - fieldValues->addNulls(nullptr, rows); + fieldValues->addNulls(rows); } return *this; @@ -122,7 +133,7 @@ DataSetBuilder& DataSetBuilder::withNullsForField( if (nullsPercent == 0) { fieldValues->clearNulls(rows); } else if (nullsPercent >= 100) { - fieldValues->addNulls(nullptr, rows); + fieldValues->addNulls(rows); } else { std::vector nonNullRows = getSomeNonNullRowNumbers(fieldValues, 23); diff --git a/velox/dwio/common/tests/utils/DataSetBuilder.h b/velox/dwio/common/tests/utils/DataSetBuilder.h index d60dc158f7928..4893c28336f62 100644 --- a/velox/dwio/common/tests/utils/DataSetBuilder.h +++ b/velox/dwio/common/tests/utils/DataSetBuilder.h @@ -43,7 +43,8 @@ class DataSetBuilder { DataSetBuilder& makeDataset( RowTypePtr rowType, const size_t batchCount, - const size_t numRows); + const size_t numRows, + const bool withRecursiveNulls = true); // Adds high values to 'batches_' so that these values occur only in some row // groups. Tests skipping row groups based on row group stats. 
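// Note: withRecursiveNulls=false (see makeDataset above) swaps in an isNullAt callback that always returns false, so BatchMaker produces batches without generated nulls; the default keeps BatchMaker's usual null generation.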
@@ -115,6 +116,54 @@ class DataSetBuilder { return *this; } + template + DataSetBuilder& withIntRleForField(const common::Subfield& field) { + constexpr int kMinRun = 5; + constexpr int kMaxRun = 101; + int remaining = 0; + T value; + auto vec = *batches_; + for (auto& batch : vec) { + auto numbers = dwio::common::getChildBySubfield(batch.get(), field) + ->as>(); + for (auto row = 0; row < numbers->size(); ++row) { + if (numbers->isNullAt(row)) { + continue; + } + if (remaining == 0) { + value = numbers->valueAt(row); + remaining = + kMinRun + folly::Random::rand32(rng_) % (kMaxRun - kMinRun); + } + numbers->set(row, value); + --remaining; + } + } + return *this; + } + + template + DataSetBuilder& withIntMainlyConstantForField(const common::Subfield& field) { + for (auto& batch : *batches_) { + std::optional value; + auto* numbers = dwio::common::getChildBySubfield(batch.get(), field) + ->as>(); + for (auto row = 0; row < numbers->size(); ++row) { + if (numbers->isNullAt(row)) { + continue; + } + if (folly::Random::randDouble01(rng_) < 0.95) { + if (!value.has_value()) { + value = numbers->valueAt(row); + } else { + numbers->set(row, *value); + } + } + } + } + return *this; + } + template DataSetBuilder& withQuantizedFloatForField( const common::Subfield& field, diff --git a/velox/dwio/common/tests/E2EFilterTestBase.cpp b/velox/dwio/common/tests/utils/E2EFilterTestBase.cpp similarity index 95% rename from velox/dwio/common/tests/E2EFilterTestBase.cpp rename to velox/dwio/common/tests/utils/E2EFilterTestBase.cpp index fd04f523e3315..287d245c84085 100644 --- a/velox/dwio/common/tests/E2EFilterTestBase.cpp +++ b/velox/dwio/common/tests/utils/E2EFilterTestBase.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "velox/dwio/common/tests/E2EFilterTestBase.h" +#include "velox/dwio/common/tests/utils/E2EFilterTestBase.h" #include "velox/dwio/common/tests/utils/DataSetBuilder.h" #include "velox/expression/Expr.h" @@ -30,7 +30,6 @@ // Set FLAGS_logtostderr = true to log messages to stderr instead of logfiles // Set FLAGS_timing_repeats = n to run timing filter tests n times DEFINE_int32(timing_repeats, 0, "Count of repeats for timing filter tests"); -DEFINE_bool(use_random_seed, false, ""); namespace facebook::velox::dwio::common { @@ -47,12 +46,14 @@ using velox::common::Subfield; std::vector E2EFilterTestBase::makeDataset( std::function customize, - bool forRowGroupSkip) { + bool forRowGroupSkip, + bool withRecursiveNulls) { if (!dataSetBuilder_) { dataSetBuilder_ = std::make_unique(*leafPool_, 0); } - dataSetBuilder_->makeDataset(rowType_, batchCount_, batchSize_); + dataSetBuilder_->makeDataset( + rowType_, batchCount_, batchSize_, withRecursiveNulls); if (forRowGroupSkip) { dataSetBuilder_->withRowGroupSpecificData(kRowsInGroup); @@ -94,9 +95,8 @@ void E2EFilterTestBase::readWithoutFilter( uint64_t& time) { dwio::common::ReaderOptions readerOpts{leafPool_.get()}; dwio::common::RowReaderOptions rowReaderOpts; - std::string_view data(sinkPtr_->data(), sinkPtr_->size()); auto input = std::make_unique( - std::make_shared(data), readerOpts.getMemoryPool()); + std::make_shared(sinkData_), readerOpts.memoryPool()); auto reader = makeReader(readerOpts, std::move(input)); // The spec must stay live over the lifetime of the reader. 
@@ -111,7 +111,9 @@ void E2EFilterTestBase::readWithoutFilter( bool hasData; { MicrosecondTimer timer(&time); - hasData = rowReader->next(1000, resultBatch); + auto rowsScanned = rowReader->next(1000, resultBatch); + VLOG(1) << "rowsScanned=" << rowsScanned; + hasData = rowsScanned > 0; } if (!hasData) { break; @@ -146,13 +148,16 @@ void E2EFilterTestBase::readWithFilter( bool skipCheck) { dwio::common::ReaderOptions readerOpts{leafPool_.get()}; dwio::common::RowReaderOptions rowReaderOpts; - std::string_view data(sinkPtr_->data(), sinkPtr_->size()); auto input = std::make_unique( - std::make_shared(data), readerOpts.getMemoryPool()); + std::make_shared(sinkData_), readerOpts.memoryPool()); auto reader = makeReader(readerOpts, std::move(input)); // The spec must stay live over the lifetime of the reader. setUpRowReaderOptions(rowReaderOpts, spec); + VLOG(1) << "spec: " << spec->toString(); + if (!mutationSpec.deletedRows.empty()) { + VLOG(1) << "numDeletedRows=" << mutationSpec.deletedRows.size(); + } OwnershipChecker ownershipChecker; auto rowReader = reader->createRowReader(rowReaderOpts); runtimeStats_ = dwio::common::RuntimeStatistics(); @@ -190,6 +195,7 @@ void E2EFilterTestBase::readWithFilter( if (haveDelete) { mutation.deletedRows = isDeleted.data(); } + VLOG(1) << "readSize=" << readSize; auto rowsScanned = rowReader->next(readSize, resultBatch, &mutation); ASSERT_EQ(rowsScanned, readSize); if (resultBatch->size() == 0) { @@ -404,24 +410,21 @@ void E2EFilterTestBase::testScenario( std::function customize, bool wrapInStruct, const std::vector& filterable, - int32_t numCombinations) { + int32_t numCombinations, + bool withRecursiveNulls) { rowType_ = DataSetBuilder::makeRowType(columns, wrapInStruct); + filterGenerator_ = std::make_unique(rowType_, seed_); - uint32_t seed = 1; - if (FLAGS_use_random_seed) { - seed = folly::Random::secureRand32(); - LOG(INFO) << "Random seed: " << seed; - } - filterGenerator_ = std::make_unique(rowType_, seed); - - auto batches = makeDataset(customize, false); + auto batches = makeDataset(customize, false, withRecursiveNulls); writeToMemory(rowType_, batches, false); testNoRowGroupSkip(batches, filterable, numCombinations); testPruningWithFilter(batches, filterable); - batches = makeDataset(customize, true); - writeToMemory(rowType_, batches, true); - testRowGroupSkip(batches, filterable); + if (testRowGroupSkip_) { + batches = makeDataset(customize, true, withRecursiveNulls); + writeToMemory(rowType_, batches, true); + testRowGroupSkip(batches, filterable); + } } void E2EFilterTestBase::testMetadataFilterImpl( @@ -453,9 +456,8 @@ void E2EFilterTestBase::testMetadataFilterImpl( specC->setChannel(0); ReaderOptions readerOpts{leafPool_.get()}; RowReaderOptions rowReaderOpts; - std::string_view data(sinkPtr_->data(), sinkPtr_->size()); auto input = std::make_unique( - std::make_shared(data), readerOpts.getMemoryPool()); + std::make_shared(sinkData_), readerOpts.memoryPool()); auto reader = makeReader(readerOpts, std::move(input)); setUpRowReaderOptions(rowReaderOpts, spec); rowReaderOpts.setMetadataFilter(metadataFilter); @@ -501,8 +503,8 @@ void E2EFilterTestBase::testMetadataFilter() { test::VectorMaker vectorMaker(leafPool_.get()); functions::prestosql::registerAllScalarFunctions(); parse::registerTypeResolver(); - core::QueryCtx queryCtx; - exec::SimpleExpressionEvaluator evaluator(&queryCtx, leafPool_.get()); + auto queryCtx = core::QueryCtx::create(); + exec::SimpleExpressionEvaluator evaluator(queryCtx.get(), leafPool_.get()); // a: bigint, b: 
struct std::vector batches; @@ -650,9 +652,8 @@ void E2EFilterTestBase::testSubfieldsPruning() { ->setFilter(common::createBigintValues({1}, false)); ReaderOptions readerOpts{leafPool_.get()}; RowReaderOptions rowReaderOpts; - std::string_view data(sinkPtr_->data(), sinkPtr_->size()); auto input = std::make_unique( - std::make_shared(data), readerOpts.getMemoryPool()); + std::make_shared(sinkData_), readerOpts.memoryPool()); auto reader = makeReader(readerOpts, std::move(input)); setUpRowReaderOptions(rowReaderOpts, spec); auto rowReader = reader->createRowReader(rowReaderOpts); @@ -715,9 +716,8 @@ void E2EFilterTestBase::testMutationCornerCases() { auto& rowType = batches[0]->type(); writeToMemory(rowType, batches, false); ReaderOptions readerOpts{leafPool_.get()}; - std::string_view data(sinkPtr_->data(), sinkPtr_->size()); auto input = std::make_unique( - std::make_shared(data), readerOpts.getMemoryPool()); + std::make_shared(sinkData_), readerOpts.memoryPool()); auto reader = makeReader(readerOpts, std::move(input)); // 1. Interleave batches with and without deletions. diff --git a/velox/dwio/common/tests/E2EFilterTestBase.h b/velox/dwio/common/tests/utils/E2EFilterTestBase.h similarity index 82% rename from velox/dwio/common/tests/E2EFilterTestBase.h rename to velox/dwio/common/tests/utils/E2EFilterTestBase.h index 4d361c8f1fb70..d0659280b4f6b 100644 --- a/velox/dwio/common/tests/E2EFilterTestBase.h +++ b/velox/dwio/common/tests/utils/E2EFilterTestBase.h @@ -39,22 +39,42 @@ class TestingHook : public ValueHook { public: explicit TestingHook(FlatVector* result) : result_(result) {} - void addValue(vector_size_t row, const void* value) override { - result_->set(row, *reinterpret_cast(value)); + void addValue(vector_size_t row, int64_t value) override { + if constexpr (std::is_integral_v) { + result_->set(row, value); + } else { + VELOX_FAIL(); + } + } + + void addValue(vector_size_t row, float value) override { + if constexpr (std::is_same_v) { + result_->set(row, value); + } else { + VELOX_FAIL(); + } + } + + void addValue(vector_size_t row, double value) override { + if constexpr (std::is_same_v) { + result_->set(row, value); + } else { + VELOX_FAIL(); + } + } + + void addValue(vector_size_t row, folly::StringPiece value) override { + if constexpr (std::is_same_v) { + result_->set(row, StringView(value)); + } else { + VELOX_FAIL(); + } } private: FlatVector* result_; }; -template <> -inline void TestingHook::addValue( - vector_size_t row, - const void* value) { - result_->set( - row, StringView(*reinterpret_cast(value))); -} - // Utility for checking that a subsequent batch of output does not // overwrite internals of a possibly retained previous batch. class OwnershipChecker { @@ -78,19 +98,36 @@ class E2EFilterTestBase : public testing::Test { protected: static constexpr int32_t kRowsInGroup = 10'000; + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + + static bool useRandomSeed() { + // Check environment variable because `buck test` does not allow pass in + // command line arguments. + const char* env = getenv("VELOX_TEST_USE_RANDOM_SEED"); + return !env ? 
false : folly::to(env); + } + void SetUp() override { - rootPool_ = memory::defaultMemoryManager().addRootPool("E2EFilterTestBase"); + rootPool_ = memory::memoryManager()->addRootPool("E2EFilterTestBase"); leafPool_ = rootPool_->addLeafChild("E2EFilterTestBase"); + if (useRandomSeed()) { + seed_ = folly::Random::secureRand32(); + LOG(INFO) << "Random seed: " << seed_; + } } static bool typeKindSupportsValueHook(TypeKind kind) { return kind != TypeKind::TIMESTAMP && kind != TypeKind::ARRAY && - kind != TypeKind::ROW && kind != TypeKind::MAP; + kind != TypeKind::ROW && kind != TypeKind::MAP && + kind != TypeKind::HUGEINT; } std::vector makeDataset( std::function customize, - bool forRowGroupSkip); + bool forRowGroupSkip, + bool withRecursiveNulls); void makeAllNulls(const std::string& fieldName); @@ -124,6 +161,16 @@ class E2EFilterTestBase : public testing::Test { keepNulls); } + template + void makeIntRle(const std::string& fieldName) { + dataSetBuilder_->withIntRleForField(Subfield(fieldName)); + } + + template + void makeIntMainlyConstant(const std::string& fieldName) { + dataSetBuilder_->withIntMainlyConstantForField(Subfield(fieldName)); + } + template void makeQuantizedFloat( const std::string& fieldName, @@ -180,6 +227,7 @@ class E2EFilterTestBase : public testing::Test { dwio::common::RowReaderOptions& opts, const std::shared_ptr& spec) { opts.setScanSpec(spec); + opts.setTimestampPrecision(TimestampPrecision::kNanoseconds); } void readWithoutFilter( @@ -210,7 +258,7 @@ class E2EFilterTestBase : public testing::Test { for (int32_t i = 0; i < 5 && i < batch->size(); ++i) { rows.push_back(i); } - for (int32_t i = 5; i < 5 && i < batch->size(); i += 2) { + for (int32_t i = 5; i < batch->size(); i += 2) { rows.push_back(i); } auto result = std::static_pointer_cast>( @@ -271,7 +319,8 @@ class E2EFilterTestBase : public testing::Test { std::function customize, bool wrapInStruct, const std::vector& filterable, - int32_t numCombinations); + int32_t numCombinations, + bool withRecursiveNulls = true); private: void testMetadataFilterImpl( @@ -310,7 +359,7 @@ class E2EFilterTestBase : public testing::Test { std::shared_ptr rootPool_; std::shared_ptr leafPool_; std::shared_ptr rowType_; - dwio::common::MemorySink* sinkPtr_; + std::string sinkData_; bool useVInts_ = true; dwio::common::RuntimeStatistics runtimeStats_; // Number of calls to flush policy between starting new stripes. @@ -319,6 +368,8 @@ class E2EFilterTestBase : public testing::Test { std::vector readSizes_; int32_t batchCount_ = kBatchCount; int32_t batchSize_ = kBatchSize; + bool testRowGroupSkip_ = true; + uint32_t seed_ = 1; }; } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/tests/utils/FilterGenerator.cpp b/velox/dwio/common/tests/utils/FilterGenerator.cpp index cae952986c528..78ed8c790b5b7 100644 --- a/velox/dwio/common/tests/utils/FilterGenerator.cpp +++ b/velox/dwio/common/tests/utils/FilterGenerator.cpp @@ -46,7 +46,7 @@ VectorPtr getChildBySubfield( const RowTypePtr& rootType) { const Type* type = rootType ? 
rootType.get() : rowVector->type().get(); auto& path = subfield.path(); - VELOX_CHECK(!path.empty()) + VELOX_CHECK(!path.empty()); auto* rowType = &type->asRow(); auto* field = dynamic_cast(path[0].get()); VELOX_CHECK(field); @@ -158,6 +158,19 @@ std::unique_ptr ColumnStats::makeRangeFilter( filterSpec.selectPct > 25); } +template <> +std::unique_ptr ColumnStats::makeRangeFilter( + const FilterSpec& filterSpec) { + if (values_.empty()) { + return std::make_unique(); + } + int128_t lower = valueAtPct(filterSpec.startPct); + int128_t upper = valueAtPct(filterSpec.startPct + filterSpec.selectPct); + + return std::make_unique( + lower, upper, filterSpec.allowNulls_); +} + template <> std::unique_ptr ColumnStats::makeRangeFilter( const FilterSpec& filterSpec) { @@ -165,6 +178,40 @@ std::unique_ptr ColumnStats::makeRangeFilter( return std::make_unique(); } + int32_t lowerIndex; + int32_t upperIndex; + StringView lower = valueAtPct(filterSpec.startPct, &lowerIndex); + StringView upper = + valueAtPct(filterSpec.startPct + filterSpec.selectPct, &upperIndex); + + // When the filter rate is 0%, we should not allow the value at the boundary. + if (filterSpec.selectPct == 0) { + return std::make_unique( + std::string(lower), + false, + true, + std::string(upper), + false, + true, + filterSpec.allowNulls_); + } + return std::make_unique( + std::string(lower), + false, + false, + std::string(upper), + false, + false, + filterSpec.allowNulls_); +} + +template <> +std::unique_ptr ColumnStats::makeRandomFilter( + const FilterSpec& filterSpec) { + if (values_.empty()) { + return std::make_unique(); + } + // used to determine if we can test a values filter reasonably int32_t lowerIndex; int32_t upperIndex; @@ -452,6 +499,9 @@ SubfieldFilters FilterGenerator::makeSubfieldFilters( case TypeKind::BIGINT: stats = makeStats(vector->type(), rowType_); break; + case TypeKind::HUGEINT: + stats = makeStats(vector->type(), rowType_); + break; case TypeKind::VARCHAR: stats = makeStats(vector->type(), rowType_); break; diff --git a/velox/dwio/common/tests/utils/FilterGenerator.h b/velox/dwio/common/tests/utils/FilterGenerator.h index bfc35b151d224..c0fdad28fa528 100644 --- a/velox/dwio/common/tests/utils/FilterGenerator.h +++ b/velox/dwio/common/tests/utils/FilterGenerator.h @@ -153,9 +153,16 @@ class ColumnStats : public AbstractColumnStats { case FilterKind::kIsNotNull: filter = std::make_unique(); break; - default: + case FilterKind::kBytesRange: filter = makeRangeFilter(filterSpec); break; + default: + if (type_->kind() == TypeKind::VARCHAR) { + filter = makeRandomFilter(filterSpec); + } else { + filter = makeRangeFilter(filterSpec); + } + break; } size_t numHits = 0; @@ -334,6 +341,10 @@ class ColumnStats : public AbstractColumnStats { getIntegerValue(max), getIntegerValue(max), false); } + std::unique_ptr makeRandomFilter(const FilterSpec& filterSpec) { + VELOX_FAIL("This method is only used in specific types."); + } + // The sample size is 65536. 
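A note on the constant that follows: because the sample size is a power of two, masking with `kUniquesMask` is the branch-free equivalent of `index % 65536`. A minimal standalone sketch of that identity (illustrative code, not part of Velox):

```cpp
#include <cassert>
#include <cstddef>

// For a power-of-two size N, (index & (N - 1)) == index % N.
constexpr size_t kUniquesMask = 0xffff; // 2^16 - 1, i.e. 65536 sample slots

size_t sampleSlot(size_t index) {
  return index & kUniquesMask;
}

int main() {
  assert(sampleSlot(65536) == 0);  // wraps around like % 65536
  assert(sampleSlot(65537) == 1);
  assert(sampleSlot(123) == 123);  // small indexes map to themselves
  return 0;
}
```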
static constexpr size_t kUniquesMask = 0xffff; std::vector values_; @@ -415,6 +426,10 @@ class ComplexColumnStats : public AbstractColumnStats { VELOX_FAIL("N/A in ComplexType"); } + std::unique_ptr makeRandomFilter(const FilterSpec&) { + VELOX_FAIL("N/A in ComplexType"); + } + std::unique_ptr makeRowGroupSkipRangeFilter( const std::vector& batches, const Subfield& subfield) { @@ -434,6 +449,14 @@ template <> std::unique_ptr ColumnStats::makeRangeFilter( const FilterSpec& filterSpec); +template <> +std::unique_ptr ColumnStats::makeRangeFilter( + const FilterSpec& filterSpec); + +template <> +std::unique_ptr ColumnStats::makeRandomFilter( + const FilterSpec& filterSpec); + template <> std::unique_ptr ColumnStats::makeRangeFilter( const FilterSpec& filterSpec); diff --git a/velox/dwio/common/tests/utils/MapBuilder.h b/velox/dwio/common/tests/utils/MapBuilder.h index 300645a46d71d..b31e7d0e5d873 100644 --- a/velox/dwio/common/tests/utils/MapBuilder.h +++ b/velox/dwio/common/tests/utils/MapBuilder.h @@ -16,6 +16,7 @@ #pragma once +#include "velox/type/SimpleFunctionApi.h" #include "velox/type/Type.h" #include "velox/vector/ComplexVector.h" @@ -55,7 +56,6 @@ class MapBuilder { BufferPtr valueNulls = allocateNulls(items, &pool); auto* valueNullsPtr = valueNulls->asMutable(); - size_t valueNullCount = 0; auto i = 0; auto offset = 0; @@ -73,7 +73,6 @@ class MapBuilder { valuesPtr[offset] = *pair.second; bits::clearNull(valueNullsPtr, offset); } else { - valueNullCount++; bits::setNull(valueNullsPtr, offset); } ++offset; diff --git a/velox/dwio/common/tests/utils/UnitLoaderTestTools.cpp b/velox/dwio/common/tests/utils/UnitLoaderTestTools.cpp new file mode 100644 index 0000000000000..e2ec87ae605cd --- /dev/null +++ b/velox/dwio/common/tests/utils/UnitLoaderTestTools.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/dwio/common/tests/utils/UnitLoaderTestTools.h" +#include "velox/dwio/common/UnitLoaderTools.h" + +using facebook::velox::dwio::common::LoadUnit; + +namespace facebook::velox::dwio::common::test { + +ReaderMock::ReaderMock( + std::vector rowsPerUnit, + std::vector ioSizes, + UnitLoaderFactory& factory, + uint64_t rowsToSkip) + : rowsPerUnit_{std::move(rowsPerUnit)}, + ioSizes_{std::move(ioSizes)}, + unitsLoaded_(std::vector(rowsPerUnit_.size())), + loader_{factory.create(getUnits(), rowsToSkip)}, + currentUnit_{0}, + currentRowInUnit_{0} { + VELOX_CHECK(rowsPerUnit_.size() == ioSizes_.size()); + auto [currentUnit, currentRowInUnit] = unit_loader_tools::howMuchToSkip( + rowsToSkip, rowsPerUnit_.cbegin(), rowsPerUnit_.cend()); + currentUnit_ = currentUnit; + currentRowInUnit_ = currentRowInUnit; +} + +bool ReaderMock::read(uint64_t maxRows) { + if (!loadUnit()) { + return false; + } + const auto rowsToRead = + std::min(maxRows, rowsPerUnit_[currentUnit_] - currentRowInUnit_); + loader_->onRead(currentUnit_, currentRowInUnit_, rowsToRead); + currentRowInUnit_ += rowsToRead; + return true; +} + +void ReaderMock::seek(uint64_t rowNumber) { + uint64_t totalRows = 0; + uint64_t rowsLeft = rowNumber; + for (size_t unit = 0; unit < rowsPerUnit_.size(); ++unit) { + const uint64_t rowCount = rowsPerUnit_[unit]; + if (rowsLeft < rowCount) { + currentUnit_ = unit; + currentRowInUnit_ = rowsLeft; + loader_->onSeek(currentUnit_, currentRowInUnit_); + return; + } + rowsLeft -= rowCount; + totalRows += rowCount; + } + VELOX_CHECK_EQ( + rowsLeft, + 0, + "Can't seek to position {} in file. Must be up to {}.", + rowNumber, + totalRows); +} + +bool ReaderMock::loadUnit() { + VELOX_CHECK(currentRowInUnit_ <= rowsPerUnit_[currentUnit_]); + if (currentRowInUnit_ == rowsPerUnit_[currentUnit_]) { + currentRowInUnit_ = 0; + ++currentUnit_; + if (currentUnit_ >= rowsPerUnit_.size()) { + return false; + } + } + auto& unit = loader_->getLoadedUnit(currentUnit_); + auto& unitMock = dynamic_cast(unit); + VELOX_CHECK(unitMock.isLoaded()); + return true; +} + +std::vector> ReaderMock::getUnits() { + std::vector> units; + for (size_t i = 0; i < rowsPerUnit_.size(); ++i) { + units.emplace_back(std::make_unique( + rowsPerUnit_[i], ioSizes_[i], unitsLoaded_, i)); + } + return units; +} + +std::vector getUnitsLoadedWithFalse(size_t count) { + std::vector unitsLoaded(count); + for (auto& unit : unitsLoaded) { + unit = false; + } + return unitsLoaded; +} + +} // namespace facebook::velox::dwio::common::test diff --git a/velox/dwio/common/tests/utils/UnitLoaderTestTools.h b/velox/dwio/common/tests/utils/UnitLoaderTestTools.h new file mode 100644 index 0000000000000..9eae97f575c25 --- /dev/null +++ b/velox/dwio/common/tests/utils/UnitLoaderTestTools.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#pragma once + +#include +#include +#include +#include + +#include "velox/common/base/Exceptions.h" +#include "velox/dwio/common/UnitLoader.h" + +namespace facebook::velox::dwio::common::test { + +class LoadUnitMock : public LoadUnit { + public: + LoadUnitMock( + uint64_t rowCount, + uint64_t ioSize, + std::vector& unitsLoaded, + size_t unitId) + : rowCount_{rowCount}, + ioSize_{ioSize}, + unitsLoaded_{unitsLoaded}, + unitId_{unitId} {} + + ~LoadUnitMock() override = default; + + void load() override { + VELOX_CHECK(!isLoaded()); + unitsLoaded_[unitId_] = true; + } + + void unload() override { + VELOX_CHECK(isLoaded()); + unitsLoaded_[unitId_] = false; + } + + uint64_t getNumRows() override { + return rowCount_; + } + + uint64_t getIoSize() override { + return ioSize_; + } + + bool isLoaded() const { + return unitsLoaded_[unitId_]; + } + + private: + uint64_t rowCount_; + uint64_t ioSize_; + std::vector& unitsLoaded_; + size_t unitId_; +}; + +class ReaderMock { + public: + ReaderMock( + std::vector rowsPerUnit, + std::vector ioSizes, + UnitLoaderFactory& factory, + uint64_t rowsToSkip); + + bool read(uint64_t maxRows); + + void seek(uint64_t rowNumber); + + std::vector unitsLoaded() const { + return {unitsLoaded_.begin(), unitsLoaded_.end()}; + } + + private: + bool loadUnit(); + + std::vector> getUnits(); + + std::vector rowsPerUnit_; + std::vector ioSizes_; + std::vector unitsLoaded_; + std::unique_ptr loader_; + size_t currentUnit_; + size_t currentRowInUnit_; + std::optional lastUnitLoaded_; +}; + +std::vector getUnitsLoadedWithFalse(size_t count); + +} // namespace facebook::velox::dwio::common::test diff --git a/velox/dwio/dwrf/RegisterDwrfReader.h b/velox/dwio/dwrf/RegisterDwrfReader.h new file mode 100644 index 0000000000000..ae15ac80965a3 --- /dev/null +++ b/velox/dwio/dwrf/RegisterDwrfReader.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace facebook::velox::dwrf { + +void registerDwrfReaderFactory(); + +void unregisterDwrfReaderFactory(); + +} // namespace facebook::velox::dwrf diff --git a/velox/dwio/dwrf/RegisterDwrfWriter.h b/velox/dwio/dwrf/RegisterDwrfWriter.h new file mode 100644 index 0000000000000..6888da424887f --- /dev/null +++ b/velox/dwio/dwrf/RegisterDwrfWriter.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +namespace facebook::velox::dwrf { + +void registerDwrfWriterFactory(); + +void unregisterDwrfWriterFactory(); + +} // namespace facebook::velox::dwrf diff --git a/velox/dwio/dwrf/common/ByteRLE.cpp b/velox/dwio/dwrf/common/ByteRLE.cpp index ba4b972b4d597..c07f96f87c7c8 100644 --- a/velox/dwio/dwrf/common/ByteRLE.cpp +++ b/velox/dwio/dwrf/common/ByteRLE.cpp @@ -340,56 +340,42 @@ std::unique_ptr createBooleanRleEncoder( } void ByteRleDecoder::nextBuffer() { + VELOX_DCHECK_EQ(pendingSkip_, 0); + int32_t bufferLength; const void* bufferPointer; - DWIO_ENSURE( - inputStream->Next(&bufferPointer, &bufferLength), - "bad read in nextBuffer ", + const auto ret = inputStream_->Next(&bufferPointer, &bufferLength); + VELOX_CHECK( + ret, + "bad read in nextBuffer {}, {}", encodingKey_.toString(), - ", ", - inputStream->getName()); - bufferStart = static_cast(bufferPointer); - bufferEnd = bufferStart + bufferLength; + inputStream_->getName()); + bufferStart_ = static_cast(bufferPointer); + bufferEnd_ = bufferStart_ + bufferLength; } void ByteRleDecoder::seekToRowGroup( dwio::common::PositionProvider& positionProvider) { - // move the input stream - inputStream->seekToPosition(positionProvider); - // force a re-read from the stream - bufferEnd = bufferStart; - // force reading a new header - remainingValues = 0; - // skip ahead the given number of records - ByteRleDecoder::skip(positionProvider.next()); + // Move the input stream + inputStream_->seekToPosition(positionProvider); + // Force a re-read from the stream + bufferEnd_ = bufferStart_; + // Force reading a new header + remainingValues_ = 0; + // Skip ahead the given number of records + pendingSkip_ = positionProvider.next(); } void ByteRleDecoder::skipBytes(size_t count) { - if (bufferStart < bufferEnd) { - size_t skipSize = std::min( + if (bufferStart_ < bufferEnd_) { + const size_t skipSize = std::min( static_cast(count), - static_cast(bufferEnd - bufferStart)); - bufferStart += skipSize; + static_cast(bufferEnd_ - bufferStart_)); + bufferStart_ += skipSize; count -= skipSize; } if (count > 0) { - inputStream->Skip(count); - } -} - -void ByteRleDecoder::skip(uint64_t numValues) { - while (numValues > 0) { - if (remainingValues == 0) { - readHeader(); - } - size_t count = std::min(static_cast(numValues), remainingValues); - remainingValues -= count; - numValues -= count; - // for literals we need to skip over count bytes, which may involve - // reading from the underlying stream - if (!repeating) { - skipBytes(count); - } + inputStream_->Skip(count); } } @@ -397,30 +383,33 @@ void ByteRleDecoder::next( char* data, uint64_t numValues, const uint64_t* nulls) { + skipPending(); + uint64_t position = 0; // skip over null values while (nulls && position < numValues && bits::isBitNull(nulls, position)) { - position += 1; + ++position; } + while (position < numValues) { - // if we are out of values, read more - if (remainingValues == 0) { + // If we are out of values, read more. + if (remainingValues_ == 0) { readHeader(); } - // how many do we read out of this block? - size_t count = - std::min(static_cast(numValues - position), remainingValues); - uint64_t consumed = 0; - if (repeating) { + // How many do we read out of this block? 
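For orientation, the loop above and `readHeader()` further down implement the standard ORC byte-RLE framing: a negative header byte announces that many literal bytes, and a non-negative one announces `header + RLE_MINIMUM_REPEAT` copies of the byte that follows. A self-contained sketch of a decoder for that framing, assuming well-formed input held entirely in memory and ignoring nulls (the repeat minimum is 3 in ORC):

```cpp
#include <cstddef>
#include <vector>

constexpr int kMinRepeat = 3; // ORC's RLE_MINIMUM_REPEAT

// Decodes one byte-RLE buffer: [header][payload]... repeated to the end.
std::vector<char> decodeByteRle(const signed char* in, size_t size) {
  std::vector<char> out;
  size_t pos = 0;
  while (pos < size) {
    const signed char header = in[pos++];
    if (header < 0) {
      // Literal run: -header bytes follow verbatim.
      for (int i = 0; i < -header && pos < size; ++i) {
        out.push_back(in[pos++]);
      }
    } else {
      // Repeat run: (header + kMinRepeat) copies of the next byte.
      out.insert(out.end(), header + kMinRepeat, in[pos++]);
    }
  }
  return out;
}
```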
+ const size_t count = + std::min(static_cast(numValues - position), remainingValues_); + uint64_t consumed{0}; + if (repeating_) { if (nulls) { for (uint64_t i = 0; i < count; ++i) { if (!bits::isBitNull(nulls, position + i)) { - data[position + i] = value; - consumed += 1; + data[position + i] = value_; + ++consumed; } } } else { - memset(data + position, value, count); + ::memset(data + position, value_, count); consumed = count; } } else { @@ -428,30 +417,31 @@ void ByteRleDecoder::next( for (uint64_t i = 0; i < count; ++i) { if (!bits::isBitNull(nulls, position + i)) { data[position + i] = readByte(); - consumed += 1; + ++consumed; } } } else { uint64_t i = 0; while (i < count) { - if (bufferStart == bufferEnd) { + if (bufferStart_ == bufferEnd_) { nextBuffer(); } - uint64_t copyBytes = std::min( + const uint64_t copyBytes = std::min( static_cast(count - i), - static_cast(bufferEnd - bufferStart)); - std::copy(bufferStart, bufferStart + copyBytes, data + position + i); - bufferStart += copyBytes; + static_cast(bufferEnd_ - bufferStart_)); + std::copy( + bufferStart_, bufferStart_ + copyBytes, data + position + i); + bufferStart_ += copyBytes; i += copyBytes; } consumed = count; } } - remainingValues -= consumed; + remainingValues_ -= consumed; position += count; // skip over any null values while (nulls && position < numValues && bits::isBitNull(nulls, position)) { - position += 1; + ++position; } } } @@ -465,38 +455,34 @@ std::unique_ptr createByteRleDecoder( void BooleanRleDecoder::seekToRowGroup( dwio::common::PositionProvider& positionProvider) { ByteRleDecoder::seekToRowGroup(positionProvider); - uint64_t consumed = positionProvider.next(); - DWIO_ENSURE_LE( + const uint64_t consumed = positionProvider.next(); + VELOX_CHECK_LE( consumed, 8, "bad position ", encodingKey_.toString(), ", ", - inputStream->getName()); - if (consumed != 0) { - remainingBits = 8 - consumed; - ByteRleDecoder::next( - reinterpret_cast(&reversedLastByte), 1, nullptr); - bits::reverseBits(&reversedLastByte, 1); - } else { - remainingBits = 0; - } + inputStream_->getName()); + pendingSkip_ = 8 * pendingSkip_ + consumed; + remainingBits_ = 0; } -void BooleanRleDecoder::skip(uint64_t numValues) { - if (numValues <= remainingBits) { - remainingBits -= numValues; +void BooleanRleDecoder::skipPending() { + auto numValues = pendingSkip_; + pendingSkip_ = 0; + if (numValues <= remainingBits_) { + remainingBits_ -= numValues; } else { - numValues -= remainingBits; - remainingBits = 0; - uint64_t bytesSkipped = numValues / 8; - ByteRleDecoder::skip(bytesSkipped); + numValues -= remainingBits_; + remainingBits_ = 0; + pendingSkip_ = numValues / 8; + ByteRleDecoder::skipPending(); uint64_t bitsToSkip = numValues % 8; if (bitsToSkip) { ByteRleDecoder::next( - reinterpret_cast(&reversedLastByte), 1, nullptr); - bits::reverseBits(&reversedLastByte, 1); - remainingBits = 8 - bitsToSkip; + reinterpret_cast(&reversedLastByte_), 1, nullptr); + bits::reverseBits(&reversedLastByte_, 1); + remainingBits_ = 8 - bitsToSkip; } } } @@ -505,6 +491,8 @@ void BooleanRleDecoder::next( char* data, uint64_t numValues, const uint64_t* nulls) { + skipPending(); + uint64_t nonNulls = numValues; if (nulls) { nonNulls = bits::countNonNulls(nulls, 0, numValues); @@ -512,62 +500,63 @@ void BooleanRleDecoder::next( const uint32_t outputBytes = (numValues + 7) / 8; if (nonNulls == 0) { - memset(data, 0, outputBytes); + ::memset(data, 0, outputBytes); return; } - if (remainingBits >= nonNulls) { + if (remainingBits_ >= nonNulls) { // The remaining 
bits from last round is enough for this round, and we don't // need to read new data. Since remainingBits should be less than or equal // to 8, therefore nonNulls must be less than 8. - data[0] = reversedLastByte >> (8 - remainingBits) & 0xff >> (8 - nonNulls); - remainingBits -= nonNulls; + data[0] = + reversedLastByte_ >> (8 - remainingBits_) & 0xff >> (8 - nonNulls); + remainingBits_ -= nonNulls; } else { - // Put the remaining bits, if any, into previousByte - uint8_t previousByte = 0; - if (remainingBits > 0) { - previousByte = reversedLastByte >> (8 - remainingBits); + // Put the remaining bits, if any, into previousByte. + uint8_t previousByte{0}; + if (remainingBits_ > 0) { + previousByte = reversedLastByte_ >> (8 - remainingBits_); } // We need to read in (nonNulls - remainingBits) values and it must be a // positive number if nonNulls is positive - const uint64_t bytesRead = ((nonNulls - remainingBits) + 7) / 8; + const uint64_t bytesRead = bits::divRoundUp(nonNulls - remainingBits_, 8); ByteRleDecoder::next(data, bytesRead, nullptr); bits::reverseBits(reinterpret_cast(data), bytesRead); - reversedLastByte = data[bytesRead - 1]; + reversedLastByte_ = data[bytesRead - 1]; // Now shift the data in place - if (remainingBits > 0) { + if (remainingBits_ > 0) { uint64_t nonNullDWords = nonNulls / 64; // Shift 64 bits a time when there're enough data. Note that the data // buffer was created 64-bits aligned so there won't be performance // degradation shifting it in 64-bit unit. - for (uint64_t i = 0; i < nonNullDWords; i++) { + for (uint64_t i = 0; i < nonNullDWords; ++i) { uint64_t tmp = reinterpret_cast(data)[i]; reinterpret_cast(data)[i] = - previousByte | tmp << remainingBits; // previousByte is LSB - previousByte = (tmp >> (64 - remainingBits)) & 0xff; + previousByte | tmp << remainingBits_; // previousByte is LSB + previousByte = (tmp >> (64 - remainingBits_)) & 0xff; } // Shift 8 bits a time for the remaining bits const uint64_t nonNullOutputBytes = (nonNulls + 7) / 8; - for (int32_t i = nonNullDWords * 8; i < nonNullOutputBytes; i++) { + for (int32_t i = nonNullDWords * 8; i < nonNullOutputBytes; ++i) { uint8_t tmp = data[i]; // already reversed - data[i] = previousByte | tmp << remainingBits; // previousByte is LSB - previousByte = tmp >> (8 - remainingBits); + data[i] = previousByte | tmp << remainingBits_; // previousByte is LSB + previousByte = tmp >> (8 - remainingBits_); } } - remainingBits = bytesRead * 8 + remainingBits - nonNulls; + remainingBits_ = bytesRead * 8 + remainingBits_ - nonNulls; } - // unpack data for nulls + // Unpack data for nulls. if (numValues > nonNulls) { bits::scatterBits(nonNulls, numValues, data, nulls, data); } - // clear the most significant bits in the last byte which will be processed in - // the next round + // Clear the most significant bits in the last byte which will be processed in + // the next round. 
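The masking statement that follows keeps only the bits of the final byte that belong to this batch. The same tail-byte mask in isolation (illustrative helper, LSB-first bit order as produced by the decoder):

```cpp
#include <cassert>
#include <cstdint>

// Zero the high bits of the last output byte; they belong to the next round.
uint8_t maskTailByte(uint8_t lastByte, uint32_t numValues, uint32_t outputBytes) {
  return lastByte & (0xff >> (outputBytes * 8 - numValues));
}

int main() {
  // 10 values span 2 bytes; the second byte keeps only its low 2 bits.
  assert(maskTailByte(0xff, 10, 2) == 0x03);
  return 0;
}
```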
data[outputBytes - 1] &= 0xff >> (outputBytes * 8 - numValues); } diff --git a/velox/dwio/dwrf/common/ByteRLE.h b/velox/dwio/dwrf/common/ByteRLE.h index 11aa3765c06fd..ea7ff790e843f 100644 --- a/velox/dwio/dwrf/common/ByteRLE.h +++ b/velox/dwio/dwrf/common/ByteRLE.h @@ -16,6 +16,7 @@ #pragma once +#include #include #include "velox/common/base/BitUtil.h" #include "velox/common/base/Nulls.h" @@ -100,25 +101,29 @@ class ByteRleDecoder { ByteRleDecoder( std::unique_ptr input, EncodingKey ek) - : inputStream{std::move(input)}, - remainingValues{0}, - value{0}, - bufferStart{nullptr}, - bufferEnd{nullptr}, - repeating{false}, - encodingKey_{ek} {} + : inputStream_{std::move(input)}, + encodingKey_{ek}, + remainingValues_{0}, + value_{0}, + bufferStart_{nullptr}, + bufferEnd_{nullptr}, + repeating_{false} {} virtual ~ByteRleDecoder() = default; /** - * Seek to a specific row group. + * Seek to a specific row group. Should not read the underlying input stream + * to avoid decoding same data multiple times. */ virtual void seekToRowGroup(dwio::common::PositionProvider& positionProvider); /** - * Seek over a given number of values. + * Seek over a given number of values. Does not decode the underlying input + * stream. */ - virtual void skip(uint64_t numValues); + void skip(uint64_t numValues) { + pendingSkip_ += numValues; + } /** * Read a number of values into the batch. @@ -133,35 +138,18 @@ class ByteRleDecoder { * Load the RowIndex values for the stream this is reading. */ virtual size_t loadIndices(size_t startIndex) { - return inputStream->positionSize() + startIndex + 1; + return inputStream_->positionSize() + startIndex + 1; } void skipBytes(size_t bytes); - template - inline void skip(int32_t numValues, int32_t current, const uint64_t* nulls) { - if (hasNulls) { - numValues = bits::countNonNulls(nulls, current, current + numValues); - } - while (numValues > 0) { - if (remainingValues == 0) { - readHeader(); - } - uint64_t count = std::min(numValues, remainingValues); - remainingValues -= count; - numValues -= count; - if (!repeating) { - skipBytes(count); - } - } - } - template void readWithVisitor(const uint64_t* nulls, Visitor visitor) { + skipPending(); int32_t current = visitor.start(); skip(current, 0, nulls); - int32_t toSkip; - bool atEnd = false; + int32_t toSkip{0}; + bool atEnd{false}; const bool allowNulls = hasNulls && visitor.allowNulls(); for (;;) { if (hasNulls && allowNulls && bits::isBitNull(nulls, current)) { @@ -177,16 +165,16 @@ class ByteRleDecoder { } } // We are at a non-null value on a row to visit. 
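A pattern worth calling out in this header: `skip()` no longer decodes anything, it merely accrues `pendingSkip_`, and every read path (`next()`, `readWithVisitor()`) first calls `skipPending()` to settle the debt. A minimal model of that deferral (hypothetical class, not the Velox decoder):

```cpp
#include <cstdint>

class LazySkipDecoder {
 public:
  // O(1): record the skip; no run headers are decoded, no I/O happens.
  void skip(uint64_t numValues) {
    pendingSkip_ += numValues;
  }

  uint64_t next() {
    skipPending();      // pay for all accumulated skips exactly once
    return position_++; // stand-in for real decoding
  }

 private:
  void skipPending() {
    // A real decoder walks run headers here (see ByteRleDecoder::skipPending).
    position_ += pendingSkip_;
    pendingSkip_ = 0;
  }

  uint64_t pendingSkip_{0};
  uint64_t position_{0};
};
```

This is also what lets `seekToRowGroup()` record a position instead of skipping eagerly: consecutive seeks and skips collapse into a single decode pass, which is exactly the repeated decoding the new doc comment warns against.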
- if (!remainingValues) { + if (!remainingValues_) { readHeader(); } - if (repeating) { - toSkip = visitor.process(value, atEnd); + if (repeating_) { + toSkip = visitor.process(value_, atEnd); } else { - value = readByte(); - toSkip = visitor.process(value, atEnd); + value_ = readByte(); + toSkip = visitor.process(value_, atEnd); } - --remainingValues; + --remainingValues_; } ++current; if (toSkip) { @@ -203,31 +191,60 @@ class ByteRleDecoder { void nextBuffer(); inline signed char readByte() { - if (bufferStart == bufferEnd) { + if (bufferStart_ == bufferEnd_) { nextBuffer(); } - return *(bufferStart++); + return *(bufferStart_++); } inline void readHeader() { - signed char ch = readByte(); + const signed char ch = readByte(); if (ch < 0) { - remainingValues = static_cast(-ch); - repeating = false; + remainingValues_ = static_cast(-ch); + repeating_ = false; } else { - remainingValues = static_cast(ch) + RLE_MINIMUM_REPEAT; - repeating = true; - value = readByte(); + remainingValues_ = static_cast(ch) + RLE_MINIMUM_REPEAT; + repeating_ = true; + value_ = readByte(); } } - std::unique_ptr inputStream; - size_t remainingValues; - char value; - const char* bufferStart; - const char* bufferEnd; - bool repeating; - EncodingKey encodingKey_; + virtual void skipPending() { + auto numValues = pendingSkip_; + pendingSkip_ = 0; + while (numValues > 0) { + if (remainingValues_ == 0) { + readHeader(); + } + const auto count = std::min(numValues, remainingValues_); + remainingValues_ -= count; + numValues -= count; + if (!repeating_) { + skipBytes(count); + } + } + } + + const std::unique_ptr inputStream_; + const EncodingKey encodingKey_; + size_t remainingValues_; + char value_; + const char* bufferStart_; + const char* bufferEnd_; + bool repeating_; + int64_t pendingSkip_{0}; + + private: + template + inline void skip(int32_t numValues, int32_t current, const uint64_t* nulls) { + if constexpr (kHasNulls) { + numValues = bits::countNonNulls(nulls, current, current + numValues); + } + pendingSkip_ += numValues; + if (pendingSkip_ > 0) { + skipPending(); + } + } }; /** @@ -250,23 +267,25 @@ std::unique_ptr createBooleanRleEncoder( */ std::unique_ptr createByteRleDecoder( std::unique_ptr input, - const EncodingKey& ek); + const EncodingKey& encodingKey); class BooleanRleDecoder : public ByteRleDecoder { public: BooleanRleDecoder( std::unique_ptr input, - const EncodingKey& ek) - : ByteRleDecoder{std::move(input), ek}, - remainingBits{0}, - reversedLastByte{0} {} + const EncodingKey& encodingKey) + : ByteRleDecoder{std::move(input), encodingKey}, + remainingBits_{0}, + reversedLastByte_{0} {} ~BooleanRleDecoder() override = default; void seekToRowGroup( dwio::common::PositionProvider& positionProvider) override; - void skip(uint64_t numValues) override; + void skip(uint64_t numValues) { + pendingSkip_ += numValues; + } void next(char* data, uint64_t numValues, const uint64_t* nulls) override; @@ -274,22 +293,9 @@ class BooleanRleDecoder : public ByteRleDecoder { return ByteRleDecoder::loadIndices(startIndex) + 1; } - // Advances 'dataPosition' by 'numValue' non-nulls, where 'current' - // is the position in 'nulls'. 
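The helper being moved below relies on one convention: in Velox null buffers a set bit means non-null, so advancing over `numValues` rows advances the data position by the number of set bits in that range. A naive reference version of that count (the real `bits::countNonNulls` works a 64-bit word at a time with popcount):

```cpp
#include <cstdint>

int32_t countNonNullsNaive(const uint64_t* nulls, int32_t begin, int32_t end) {
  int32_t count = 0;
  for (int32_t i = begin; i < end; ++i) {
    if (nulls[i / 64] & (uint64_t{1} << (i % 64))) {
      ++count; // set bit == non-null row that carries a payload value
    }
  }
  return count;
}
```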
- template - void skip( - int32_t numValues, - int32_t current, - const uint64_t* nulls, - int32_t& dataPosition) { - if (hasNulls) { - numValues = bits::countNonNulls(nulls, current, current + numValues); - } - dataPosition += numValues; - } - template void readWithVisitor(const uint64_t* nulls, Visitor visitor) { + skipPending(); int32_t end = visitor.rowAt(visitor.numRows() - 1) + 1; int32_t totalNulls = 0; // Reads all the non-null bits between 0 and last row into 'bits', @@ -298,9 +304,9 @@ class BooleanRleDecoder : public ByteRleDecoder { if (hasNulls) { totalNulls = bits::countNulls(nulls, 0, end); } - bits.resize(bits::nwords(end - totalNulls)); + bits_.resize(bits::nwords(end - totalNulls)); if (end > totalNulls) { - next(reinterpret_cast(bits.data()), end - totalNulls, nullptr); + next(reinterpret_cast(bits_.data()), end - totalNulls, nullptr); } int32_t dataPosition = 0; int32_t current = visitor.start(); @@ -326,7 +332,7 @@ class BooleanRleDecoder : public ByteRleDecoder { } } toSkip = - visitor.process(bits::isBitSet(bits.data(), dataPosition), atEnd); + visitor.process(bits::isBitSet(bits_.data(), dataPosition), atEnd); ++dataPosition; skip: ++current; @@ -340,11 +346,28 @@ class BooleanRleDecoder : public ByteRleDecoder { } } + private: + // Advances 'dataPosition' by 'numValue' non-nulls, where 'current' + // is the position in 'nulls'. + template + static void skip( + int32_t numValues, + int32_t current, + const uint64_t* nulls, + int32_t& dataPosition) { + if (hasNulls) { + numValues = bits::countNonNulls(nulls, current, current + numValues); + } + dataPosition += numValues; + } + + void skipPending() override; + protected: - size_t remainingBits; - uint8_t reversedLastByte; - char buffer; - std::vector bits; + size_t remainingBits_; + uint8_t reversedLastByte_; + char buffer_; + std::vector bits_; }; /** diff --git a/velox/dwio/dwrf/common/CMakeLists.txt b/velox/dwio/dwrf/common/CMakeLists.txt index 0856576e6845e..427e2a294386b 100644 --- a/velox/dwio/dwrf/common/CMakeLists.txt +++ b/velox/dwio/dwrf/common/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-add_library( +velox_add_library( velox_dwio_dwrf_common ByteRLE.cpp Common.cpp @@ -28,12 +28,15 @@ add_library( wrap/dwrf-proto-wrapper.cpp wrap/orc-proto-wrapper.cpp) -add_dependencies(velox_dwio_dwrf_common velox_dwio_dwrf_proto) +if(NOT VELOX_MONO_LIBRARY) + add_dependencies(velox_dwio_dwrf_common velox_dwio_dwrf_proto) +endif() -target_link_libraries( +velox_link_libraries( velox_dwio_dwrf_common velox_common_base velox_common_compression + velox_common_config velox_dwio_common velox_dwio_common_compression velox_dwio_dwrf_proto diff --git a/velox/dwio/dwrf/common/Common.cpp b/velox/dwio/dwrf/common/Common.cpp index 786abac66ee66..5cf0f8929e100 100644 --- a/velox/dwio/dwrf/common/Common.cpp +++ b/velox/dwio/dwrf/common/Common.cpp @@ -83,26 +83,4 @@ DwrfStreamIdentifier EncodingKey::forKind(const proto::Stream_Kind kind) const { return DwrfStreamIdentifier(node_, sequence_, 0, kind); } -namespace { -using common::CompressionKind; - -CompressionKind orcCompressionToCompressionKind( - proto::orc::CompressionKind compression) { - switch (compression) { - case proto::orc::CompressionKind::NONE: - return CompressionKind::CompressionKind_NONE; - case proto::orc::CompressionKind::ZLIB: - return CompressionKind::CompressionKind_ZLIB; - case proto::orc::CompressionKind::SNAPPY: - return CompressionKind::CompressionKind_SNAPPY; - case proto::orc::CompressionKind::LZO: - return CompressionKind::CompressionKind_LZO; - case proto::orc::CompressionKind::LZ4: - return CompressionKind::CompressionKind_LZ4; - case proto::orc::CompressionKind::ZSTD: - return CompressionKind::CompressionKind_ZSTD; - } - return CompressionKind::CompressionKind_NONE; -} -} // namespace } // namespace facebook::velox::dwrf diff --git a/velox/dwio/dwrf/common/Common.h b/velox/dwio/dwrf/common/Common.h index f03bff24a11a2..de52dbc5ca42d 100644 --- a/velox/dwio/dwrf/common/Common.h +++ b/velox/dwio/dwrf/common/Common.h @@ -129,7 +129,7 @@ class EncodingKey { } bool valid() const { - return node_ != dwio::common::MAX_UINT32 && sequence_ >= 0; + return node_ != dwio::common::MAX_UINT32; } std::string toString() const { diff --git a/velox/dwio/dwrf/common/Compression.h b/velox/dwio/dwrf/common/Compression.h index ce69430b52e06..1444137ae5c8d 100644 --- a/velox/dwio/dwrf/common/Compression.h +++ b/velox/dwio/dwrf/common/Compression.h @@ -17,8 +17,11 @@ #pragma once #include "velox/common/compression/Compression.h" +#include "velox/dwio/common/OutputStream.h" #include "velox/dwio/common/SeekableInputStream.h" #include "velox/dwio/common/compression/Compression.h" +#include "velox/dwio/common/compression/CompressionBufferPool.h" +#include "velox/dwio/common/compression/PagedOutputStream.h" #include "velox/dwio/dwrf/common/Common.h" #include "velox/dwio/dwrf/common/Config.h" #include "velox/dwio/dwrf/common/Decryption.h" @@ -30,7 +33,7 @@ using namespace dwio::common::compression; constexpr uint8_t PAGE_HEADER_SIZE = 3; -static const CompressionOptions getDwrfOrcCompressionOptions( +inline CompressionOptions getDwrfOrcCompressionOptions( velox::common::CompressionKind kind, uint32_t compressionThreshold, int32_t zlibCompressionLevel, @@ -38,7 +41,8 @@ static const CompressionOptions getDwrfOrcCompressionOptions( CompressionOptions options; options.compressionThreshold = compressionThreshold; - if (kind == velox::common::CompressionKind_ZLIB) { + if (kind == velox::common::CompressionKind_ZLIB || + kind == velox::common::CompressionKind_GZIP) { options.format.zlib.windowBits = Compressor::DWRF_ORC_ZLIB_WINDOW_BITS; 
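One detail of the Compression.h hunk above worth spelling out: these helpers are defined in a header, and the patch turns `static const T f()` into `inline T f()`. With `static`, every including translation unit gets its own private copy of the function; `inline` keeps external linkage and lets the linker fold the identical definitions into one. The top-level `const` on a by-value return was meaningless and is dropped. A hypothetical header illustrating the linkage difference:

```cpp
// options.h (hypothetical), included from many .cpp files.

// static int defaultLevel() { ... }  // each .cpp would get its own copy
inline int defaultLevel() {
  static int cached = 6; // with inline, one shared `cached` program-wide
  return cached;
}
```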
options.format.zlib.compressionLevel = zlibCompressionLevel; } else if (kind == velox::common::CompressionKind_ZSTD) { @@ -55,7 +59,7 @@ static const CompressionOptions getDwrfOrcCompressionOptions( * collection * @param config The compression options to use */ -static std::unique_ptr createCompressor( +inline std::unique_ptr createCompressor( common::CompressionKind kind, CompressionBufferPool& bufferPool, dwio::common::DataBufferHolder& bufferHolder, @@ -66,19 +70,32 @@ static std::unique_ptr createCompressor( config.get(Config::COMPRESSION_THRESHOLD), config.get(Config::ZLIB_COMPRESSION_LEVEL), config.get(Config::ZSTD_COMPRESSION_LEVEL)); - - return createCompressor( - kind, + auto compressor = createCompressor(kind, dwrfOrcCompressionOptions); + if (!compressor) { + if (!encrypter && kind == common::CompressionKind::CompressionKind_NONE) { + return std::make_unique(bufferHolder); + } + } + return std::make_unique( bufferPool, bufferHolder, + dwrfOrcCompressionOptions.compressionThreshold, PAGE_HEADER_SIZE, - dwrfOrcCompressionOptions, + std::move(compressor), encrypter); } -static const CompressionOptions getDwrfOrcDecompressionOptions() { +inline CompressionOptions getDwrfOrcDecompressionOptions( + common::CompressionKind kind) { CompressionOptions options; - options.format.zlib.windowBits = Compressor::DWRF_ORC_ZLIB_WINDOW_BITS; + if (kind == common::CompressionKind_ZLIB || + kind == common::CompressionKind_GZIP) { + options.format.zlib.windowBits = Compressor::DWRF_ORC_ZLIB_WINDOW_BITS; + } else if ( + kind == common::CompressionKind_LZ4 || + kind == common::CompressionKind_LZO) { + options.format.lz4_lzo.isHadoopFrameFormat = false; + } return options; } @@ -89,14 +106,14 @@ static const CompressionOptions getDwrfOrcDecompressionOptions() { * @param bufferSize The maximum size of the buffer * @param pool The memory pool */ -static std::unique_ptr createDecompressor( +inline std::unique_ptr createDecompressor( facebook::velox::common::CompressionKind kind, std::unique_ptr input, uint64_t bufferSize, memory::MemoryPool& pool, const std::string& streamDebugInfo, const dwio::common::encryption::Decrypter* decryptr = nullptr) { - const CompressionOptions& options = getDwrfOrcDecompressionOptions(); + const CompressionOptions& options = getDwrfOrcDecompressionOptions(kind); return createDecompressor( kind, std::move(input), diff --git a/velox/dwio/dwrf/common/Config.cpp b/velox/dwio/dwrf/common/Config.cpp index cda606466b184..8ce1ad5f21550 100644 --- a/velox/dwio/dwrf/common/Config.cpp +++ b/velox/dwio/dwrf/common/Config.cpp @@ -188,10 +188,22 @@ Config::Entry Config::MAX_DICTIONARY_SIZE( "hive.exec.orc.max.dictionary.size", 80L * 1024L * 1024L); +Config::Entry Config::INTEGER_DICTIONARY_ENCODING_ENABLED( + "hive.exec.orc.integer.dictionary.encoding.enabled", + true); + +Config::Entry Config::STRING_DICTIONARY_ENCODING_ENABLED( + "hive.exec.orc.string.dictionary.encoding.enabled", + true); + Config::Entry Config::STRIPE_SIZE( "hive.exec.orc.stripe.size", 256L * 1024L * 1024L); +Config::Entry Config::LINEAR_STRIPE_SIZE_HEURISTICS( + "hive.exec.orc.linear.stripe.size.heuristics", + true); + Config::Entry Config::FORCE_LOW_MEMORY_MODE( "hive.exec.orc.low.memory", false); diff --git a/velox/dwio/dwrf/common/Config.h b/velox/dwio/dwrf/common/Config.h index 8c040f0bfdcdb..7b001700bb690 100644 --- a/velox/dwio/dwrf/common/Config.h +++ b/velox/dwio/dwrf/common/Config.h @@ -24,10 +24,10 @@ namespace facebook::velox::dwrf { -class Config : public common::ConfigBase { +class Config : public 
config::ConfigBase { public: template - using Entry = common::ConfigBase::Entry; + using Entry = config::ConfigBase::Entry; static Entry WRITER_VERSION; static Entry COMPRESSION; @@ -61,7 +61,10 @@ class Config : public common::ConfigBase { MAP_FLAT_COLS_STRUCT_KEYS; static Entry MAP_FLAT_MAX_KEYS; static Entry MAX_DICTIONARY_SIZE; + static Entry INTEGER_DICTIONARY_ENCODING_ENABLED; + static Entry STRING_DICTIONARY_ENCODING_ENABLED; static Entry STRIPE_SIZE; + static Entry LINEAR_STRIPE_SIZE_HEURISTICS; /// With this config, we don't even try the more memory intensive encodings on /// writer start up. static Entry FORCE_LOW_MEMORY_MODE; @@ -77,9 +80,17 @@ class Config : public common::ConfigBase { static std::shared_ptr fromMap( const std::map& map) { - auto ret = std::make_shared(); - ret->configs_.insert(map.cbegin(), map.cend()); - return ret; + auto config = std::make_shared(); + for (const auto& pair : map) { + config->set(pair.first, pair.second); + } + return config; + } + + Config() : ConfigBase({}, true) {} + + std::map toSerdeParams() { + return std::map{configs_.cbegin(), configs_.cend()}; } }; diff --git a/velox/dwio/dwrf/common/DecoderUtil.h b/velox/dwio/dwrf/common/DecoderUtil.h index 9c269a62f7102..f1b100773d68f 100644 --- a/velox/dwio/dwrf/common/DecoderUtil.h +++ b/velox/dwio/dwrf/common/DecoderUtil.h @@ -16,6 +16,7 @@ #pragma once +#include "velox/common/base/Exceptions.h" #include "velox/dwio/common/DirectDecoder.h" #include "velox/dwio/common/IntDecoder.h" #include "velox/dwio/dwrf/common/RLEv1.h" @@ -42,7 +43,7 @@ std::unique_ptr> createRleDecoder( case RleVersion_2: return std::make_unique>(std::move(input), pool); default: - DWIO_ENSURE(false, "not supported"); + VELOX_UNSUPPORTED("Not supported: {}", static_cast(version)); return {}; } } diff --git a/velox/dwio/dwrf/common/FileMetadata.cpp b/velox/dwio/dwrf/common/FileMetadata.cpp index 07a687b63ace3..ccb9f6faa7f50 100644 --- a/velox/dwio/dwrf/common/FileMetadata.cpp +++ b/velox/dwio/dwrf/common/FileMetadata.cpp @@ -102,9 +102,9 @@ TypeKind TypeWrapper::kind() const { } case proto::orc::Type_Kind_CHAR: case proto::orc::Type_Kind_TIMESTAMP_INSTANT: - DWIO_RAISE( + VELOX_FAIL(fmt::format( "{} not supported yet.", - proto::orc::Type_Kind_Name(orcPtr()->kind())); + proto::orc::Type_Kind_Name(orcPtr()->kind()))); default: VELOX_FAIL("Unknown type kind: {}", Type_Kind_Name(orcPtr()->kind())); } diff --git a/velox/dwio/dwrf/common/FileMetadata.h b/velox/dwio/dwrf/common/FileMetadata.h index 2ea21628a595a..bee7c0acfabb1 100644 --- a/velox/dwio/dwrf/common/FileMetadata.h +++ b/velox/dwio/dwrf/common/FileMetadata.h @@ -31,13 +31,6 @@ enum class DwrfFormat : uint8_t { }; class ProtoWrapperBase { - protected: - ProtoWrapperBase(DwrfFormat format, const void* impl) - : format_{format}, impl_{impl} {} - - DwrfFormat format_; - const void* impl_; - public: DwrfFormat format() const { return format_; @@ -46,6 +39,13 @@ class ProtoWrapperBase { inline const void* rawProtoPtr() const { return impl_; } + + protected: + ProtoWrapperBase(DwrfFormat format, const void* impl) + : format_{format}, impl_{impl} {} + + const DwrfFormat format_; + const void* const impl_; }; /*** @@ -336,6 +336,334 @@ class UserMetadataItemWrapper : public ProtoWrapperBase { } }; +class IntegerStatisticsWrapper : public ProtoWrapperBase { + public: + explicit IntegerStatisticsWrapper( + const proto::IntegerStatistics* intStatistics) + : ProtoWrapperBase(DwrfFormat::kDwrf, intStatistics) {} + + explicit IntegerStatisticsWrapper( + const 
proto::orc::IntegerStatistics* intStatistics) + : ProtoWrapperBase(DwrfFormat::kOrc, intStatistics) {} + + bool hasMinimum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_minimum() + : orcPtr()->has_minimum(); + } + + int64_t minimum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->minimum() + : orcPtr()->minimum(); + } + + bool hasMaximum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_maximum() + : orcPtr()->has_maximum(); + } + + int64_t maximum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->maximum() + : orcPtr()->maximum(); + } + + bool hasSum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_sum() + : orcPtr()->has_sum(); + } + + int64_t sum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->sum() : orcPtr()->sum(); + } + + private: + // private helper with no format checking + inline const proto::IntegerStatistics* dwrfPtr() const { + return reinterpret_cast(rawProtoPtr()); + } + inline const proto::orc::IntegerStatistics* orcPtr() const { + return reinterpret_cast( + rawProtoPtr()); + } +}; + +class DoubleStatisticsWrapper : public ProtoWrapperBase { + public: + explicit DoubleStatisticsWrapper( + const proto::DoubleStatistics* doubleStatistics) + : ProtoWrapperBase(DwrfFormat::kDwrf, doubleStatistics) {} + + explicit DoubleStatisticsWrapper( + const proto::orc::DoubleStatistics* doubleStatistics) + : ProtoWrapperBase(DwrfFormat::kOrc, doubleStatistics) {} + + bool hasMinimum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_minimum() + : orcPtr()->has_minimum(); + } + + double minimum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->minimum() + : orcPtr()->minimum(); + } + + bool hasMaximum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_maximum() + : orcPtr()->has_maximum(); + } + + double maximum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->maximum() + : orcPtr()->maximum(); + } + + bool hasSum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_sum() + : orcPtr()->has_sum(); + } + + double sum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->sum() : orcPtr()->sum(); + } + + private: + // private helper with no format checking + inline const proto::DoubleStatistics* dwrfPtr() const { + return reinterpret_cast(rawProtoPtr()); + } + inline const proto::orc::DoubleStatistics* orcPtr() const { + return reinterpret_cast(rawProtoPtr()); + } +}; + +class StringStatisticsWrapper : public ProtoWrapperBase { + public: + explicit StringStatisticsWrapper( + const proto::StringStatistics* stringStatistics) + : ProtoWrapperBase(DwrfFormat::kDwrf, stringStatistics) {} + + explicit StringStatisticsWrapper( + const proto::orc::StringStatistics* stringStatistics) + : ProtoWrapperBase(DwrfFormat::kOrc, stringStatistics) {} + + bool hasMinimum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_minimum() + : orcPtr()->has_minimum(); + } + + const std::string& minimum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->minimum() + : orcPtr()->minimum(); + } + + bool hasMaximum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_maximum() + : orcPtr()->has_maximum(); + } + + const std::string& maximum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->maximum() + : orcPtr()->maximum(); + } + + bool hasSum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_sum() + : orcPtr()->has_sum(); + } + + int64_t sum() const { + return format_ == DwrfFormat::kDwrf ? 
dwrfPtr()->sum() : orcPtr()->sum(); + } + + private: + // private helper with no format checking + inline const proto::StringStatistics* dwrfPtr() const { + return reinterpret_cast(rawProtoPtr()); + } + inline const proto::orc::StringStatistics* orcPtr() const { + return reinterpret_cast(rawProtoPtr()); + } +}; + +class BucketStatisticsWrapper : public ProtoWrapperBase { + public: + explicit BucketStatisticsWrapper( + const proto::BucketStatistics* bucketStatistics) + : ProtoWrapperBase(DwrfFormat::kDwrf, bucketStatistics) {} + + explicit BucketStatisticsWrapper( + const proto::orc::BucketStatistics* bucketStatistics) + : ProtoWrapperBase(DwrfFormat::kOrc, bucketStatistics) {} + + int countSize() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->count_size() + : orcPtr()->count_size(); + } + + uint64_t count(int index) const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->count(index) + : orcPtr()->count(index); + } + + private: + // private helper with no format checking + inline const proto::BucketStatistics* dwrfPtr() const { + return reinterpret_cast(rawProtoPtr()); + } + inline const proto::orc::BucketStatistics* orcPtr() const { + return reinterpret_cast(rawProtoPtr()); + } +}; + +class BinaryStatisticsWrapper : public ProtoWrapperBase { + public: + explicit BinaryStatisticsWrapper( + const proto::BinaryStatistics* binaryStatistics) + : ProtoWrapperBase(DwrfFormat::kDwrf, binaryStatistics) {} + + explicit BinaryStatisticsWrapper( + const proto::orc::BinaryStatistics* binaryStatistics) + : ProtoWrapperBase(DwrfFormat::kOrc, binaryStatistics) {} + + bool hasSum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_sum() + : orcPtr()->has_sum(); + } + + int64_t sum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->sum() : orcPtr()->sum(); + } + + private: + // private helper with no format checking + inline const proto::BinaryStatistics* dwrfPtr() const { + return reinterpret_cast(rawProtoPtr()); + } + inline const proto::orc::BinaryStatistics* orcPtr() const { + return reinterpret_cast(rawProtoPtr()); + } +}; + +class ColumnStatisticsWrapper : public ProtoWrapperBase { + public: + explicit ColumnStatisticsWrapper( + const proto::ColumnStatistics* columnStatistics) + : ProtoWrapperBase(DwrfFormat::kDwrf, columnStatistics) {} + + explicit ColumnStatisticsWrapper( + const proto::orc::ColumnStatistics* columnStatistics) + : ProtoWrapperBase(DwrfFormat::kOrc, columnStatistics) {} + + bool hasNumberOfValues() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_numberofvalues() + : orcPtr()->has_numberofvalues(); + } + + uint64_t numberOfValues() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->numberofvalues() + : orcPtr()->numberofvalues(); + } + + bool hasHasNull() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_hasnull() + : orcPtr()->has_hasnull(); + } + + bool hasNull() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->hasnull() + : orcPtr()->hasnull(); + } + + bool hasRawSize() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_rawsize() : false; + } + + uint64_t rawSize() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->rawsize() : 0; + } + + bool hasSize() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_size() : false; + } + + uint64_t size() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->size() : 0; + } + + bool hasIntStatistics() const { + return format_ == DwrfFormat::kDwrf ? 
dwrfPtr()->has_intstatistics() + : orcPtr()->has_intstatistics(); + } + + IntegerStatisticsWrapper intStatistics() const { + return format_ == DwrfFormat::kDwrf + ? IntegerStatisticsWrapper(&dwrfPtr()->intstatistics()) + : IntegerStatisticsWrapper(&orcPtr()->intstatistics()); + } + + bool hasDoubleStatistics() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_doublestatistics() + : orcPtr()->has_doublestatistics(); + } + + DoubleStatisticsWrapper doubleStatistics() const { + return format_ == DwrfFormat::kDwrf + ? DoubleStatisticsWrapper(&dwrfPtr()->doublestatistics()) + : DoubleStatisticsWrapper(&orcPtr()->doublestatistics()); + } + + bool hasStringStatistics() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_stringstatistics() + : orcPtr()->has_stringstatistics(); + } + + StringStatisticsWrapper stringStatistics() const { + return format_ == DwrfFormat::kDwrf + ? StringStatisticsWrapper(&dwrfPtr()->stringstatistics()) + : StringStatisticsWrapper(&orcPtr()->stringstatistics()); + } + + bool hasBucketStatistics() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_bucketstatistics() + : orcPtr()->has_bucketstatistics(); + } + + BucketStatisticsWrapper bucketStatistics() const { + return format_ == DwrfFormat::kDwrf + ? BucketStatisticsWrapper(&dwrfPtr()->bucketstatistics()) + : BucketStatisticsWrapper(&orcPtr()->bucketstatistics()); + } + + bool hasBinaryStatistics() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_binarystatistics() + : orcPtr()->has_binarystatistics(); + } + + BinaryStatisticsWrapper binaryStatistics() const { + return format_ == DwrfFormat::kDwrf + ? BinaryStatisticsWrapper(&dwrfPtr()->binarystatistics()) + : BinaryStatisticsWrapper(&orcPtr()->binarystatistics()); + } + + bool hasMapStatistics() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_mapstatistics() + : false; + } + + const ::facebook::velox::dwrf::proto::MapStatistics& mapStatistics() const { + VELOX_CHECK_EQ(format_, DwrfFormat::kDwrf); + return dwrfPtr()->mapstatistics(); + } + + private: + // private helper with no format checking + inline const proto::ColumnStatistics* dwrfPtr() const { + return reinterpret_cast(rawProtoPtr()); + } + inline const proto::orc::ColumnStatistics* orcPtr() const { + return reinterpret_cast(rawProtoPtr()); + } +}; + class FooterWrapper : public ProtoWrapperBase { public: explicit FooterWrapper(const proto::Footer* footer) @@ -424,9 +752,9 @@ class FooterWrapper : public ProtoWrapperBase { return dwrfPtr()->stripecacheoffsets(); } - // TODO: ORC has not supported column statistics yet int statisticsSize() const { - return format_ == DwrfFormat::kDwrf ? dwrfPtr()->statistics_size() : 0; + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->statistics_size() + : orcPtr()->statistics_size(); } const ::google::protobuf::RepeatedPtrField< @@ -436,12 +764,18 @@ class FooterWrapper : public ProtoWrapperBase { return dwrfPtr()->statistics(); } - const ::facebook::velox::dwrf::proto::ColumnStatistics& statistics( + const ::facebook::velox::dwrf::proto::ColumnStatistics& dwrfStatistics( int index) const { VELOX_CHECK_EQ(format_, DwrfFormat::kDwrf); return dwrfPtr()->statistics(index); } + ColumnStatisticsWrapper statistics(int index) const { + return format_ == DwrfFormat::kDwrf + ? ColumnStatisticsWrapper(&dwrfPtr()->statistics(index)) + : ColumnStatisticsWrapper(&orcPtr()->statistics(index)); + } + // TODO: ORC has not supported encryption yet bool hasEncryption() const { return format_ == DwrfFormat::kDwrf ? 
dwrfPtr()->has_encryption() : false; @@ -495,3 +829,10 @@ class FooterWrapper : public ProtoWrapperBase { }; } // namespace facebook::velox::dwrf + +template <> +struct fmt::formatter : formatter { + auto format(facebook::velox::dwrf::DwrfFormat s, format_context& ctx) const { + return formatter::format(static_cast(s), ctx); + } +}; diff --git a/velox/dwio/dwrf/common/FloatingPointDecoder.h b/velox/dwio/dwrf/common/FloatingPointDecoder.h index ccc0e1ba53841..7261371cb5ce2 100644 --- a/velox/dwio/dwrf/common/FloatingPointDecoder.h +++ b/velox/dwio/dwrf/common/FloatingPointDecoder.h @@ -24,20 +24,20 @@ namespace facebook::velox::dwrf { struct DropValues; -template +template class FloatingPointDecoder { public: FloatingPointDecoder( std::unique_ptr&& input) : input_(std::move(input)) {} - TData readValue() { - if (bufferEnd_ - bufferStart_ >= sizeof(TData)) { - TData value = *reinterpret_cast(bufferStart_); - bufferStart_ += sizeof(TData); + TFile readValue() { + if (bufferEnd_ - bufferStart_ >= sizeof(TFile)) { + TFile value = *reinterpret_cast(bufferStart_); + bufferStart_ += sizeof(TFile); return value; } - TData temp; + TFile temp; readBytes(sizeof(temp), input_.get(), &temp, bufferStart_, bufferEnd_); return temp; } @@ -49,7 +49,7 @@ class FloatingPointDecoder { void skip(uint64_t numValues) { skipBytes( - numValues * sizeof(TData), input_.get(), bufferStart_, bufferEnd_); + numValues * sizeof(TFile), input_.get(), bufferStart_, bufferEnd_); } template @@ -58,12 +58,12 @@ class FloatingPointDecoder { numValues = bits::countNonNulls(nulls, current, current + numValues); } skipBytes( - numValues * sizeof(TData), input_.get(), bufferStart_, bufferEnd_); + numValues * sizeof(TFile), input_.get(), bufferStart_, bufferEnd_); } template void readWithVisitor(const uint64_t* nulls, Visitor visitor) { - if (std::is_same_v && + if (std::is_same_v && dwio::common::useFastPath(visitor)) { fastPath(nulls, visitor); return; @@ -141,7 +141,7 @@ class FloatingPointDecoder { return; } } - dwio::common::fixedWidthScan( + dwio::common::fixedWidthScan( innerVector ? folly::Range(*innerVector) : folly::Range(rows, outerVector->size()), outerVector->data(), @@ -155,7 +155,7 @@ class FloatingPointDecoder { visitor.hook()); skip(tailSkip, 0, nullptr); } else { - dwio::common::fixedWidthScan( + dwio::common::fixedWidthScan( folly::Range(rows, numRows), hasHook ? 
velox::iota(numRows, visitor.innerNonNullRows()) : nullptr, visitor.rawValues(numRows), diff --git a/velox/dwio/dwrf/common/RLEv1.cpp b/velox/dwio/dwrf/common/RLEv1.cpp index 3ef54f85a8136..1b903a1cc3920 100644 --- a/velox/dwio/dwrf/common/RLEv1.cpp +++ b/velox/dwio/dwrf/common/RLEv1.cpp @@ -22,42 +22,45 @@ namespace facebook::velox::dwrf { template void RleEncoderV1::writeValues() { - if (numLiterals != 0) { - if (repeat) { - IntEncoder::writeByte( - static_cast(numLiterals - RLE_MINIMUM_REPEAT)); - IntEncoder::writeByte(static_cast(delta)); - if (!IntEncoder::useVInts_) { - IntEncoder::writeLongLE(literals[0]); + if (numLiterals_ == 0) { + return; + } + + if (repeat_) { + IntEncoder::writeByte( + static_cast(numLiterals_ - RLE_MINIMUM_REPEAT)); + IntEncoder::writeByte(static_cast(delta_)); + if (!IntEncoder::useVInts_) { + IntEncoder::writeLongLE(literals_[0]); + } else { + if constexpr (isSigned) { + IntEncoder::writeVslong(literals_[0]); } else { - if constexpr (isSigned) { - IntEncoder::writeVslong(literals[0]); - } else { - IntEncoder::writeVulong(literals[0]); - } + IntEncoder::writeVulong(literals_[0]); + } + } + } else { + IntEncoder::writeByte(static_cast(-numLiterals_)); + if (!IntEncoder::useVInts_) { + for (int32_t i = 0; i < numLiterals_; ++i) { + IntEncoder::writeLongLE(literals_[i]); } } else { - IntEncoder::writeByte(static_cast(-numLiterals)); - if (!IntEncoder::useVInts_) { - for (int32_t i = 0; i < numLiterals; ++i) { - IntEncoder::writeLongLE(literals[i]); + if constexpr (isSigned) { + for (int32_t i = 0; i < numLiterals_; ++i) { + IntEncoder::writeVslong(literals_[i]); } } else { - if constexpr (isSigned) { - for (int32_t i = 0; i < numLiterals; ++i) { - IntEncoder::writeVslong(literals[i]); - } - } else { - for (int32_t i = 0; i < numLiterals; ++i) { - IntEncoder::writeVulong(literals[i]); - } + for (int32_t i = 0; i < numLiterals_; ++i) { + IntEncoder::writeVulong(literals_[i]); } } } - repeat = false; - numLiterals = 0; - tailRunLength = 0; } + + repeat_ = false; + numLiterals_ = 0; + tailRunLength_ = 0; } template void RleEncoderV1::writeValues(); @@ -66,15 +69,15 @@ template void RleEncoderV1::writeValues(); template void RleDecoderV1::seekToRowGroup( dwio::common::PositionProvider& location) { - // move the input stream - dwio::common::IntDecoder::inputStream->seekToPosition(location); - // force a re-read from the stream - dwio::common::IntDecoder::bufferEnd = - dwio::common::IntDecoder::bufferStart; - // force reading a new header - remainingValues = 0; - // skip ahead the given number of records - skip(location.next()); + // Move the input stream. + dwio::common::IntDecoder::inputStream_->seekToPosition(location); + // Force a re-read from the stream. + dwio::common::IntDecoder::bufferEnd_ = + dwio::common::IntDecoder::bufferStart_; + // Force reading a new header. + remainingValues_ = 0; + // Skip ahead the given number of records. 
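As with ByteRLE, the seek above now ends by recording the residual row count from the position provider (the assignment that follows) instead of skipping eagerly. The provider itself is just an ordered cursor over a row-group index entry; a rough stand-in for `dwio::common::PositionProvider` (illustrative only, the real API differs):

```cpp
#include <cstdint>
#include <vector>

class PositionProviderSketch {
 public:
  explicit PositionProviderSketch(std::vector<uint64_t> positions)
      : positions_{std::move(positions)} {}

  // Each decoder in the stream pipeline consumes its own entries in order.
  uint64_t next() {
    return positions_[index_++];
  }

 private:
  std::vector<uint64_t> positions_;
  size_t index_{0};
};

// Usage: the input stream consumes its offsets first; the RLE decoder then
// reads one final entry, the number of values to skip within the current run.
```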
+ this->pendingSkip_ = location.next(); } template void RleDecoderV1::seekToRowGroup( @@ -83,30 +86,34 @@ template void RleDecoderV1::seekToRowGroup( dwio::common::PositionProvider& location); template -void RleDecoderV1::skip(uint64_t numValues) { +void RleDecoderV1::skipPending() { + uint64_t numValues = this->pendingSkip_; + this->pendingSkip_ = 0; while (numValues > 0) { - if (remainingValues == 0) { + if (remainingValues_ == 0) { readHeader(); } - uint64_t count = std::min(numValues, remainingValues); - remainingValues -= count; + const uint64_t count = std::min(numValues, remainingValues_); + remainingValues_ -= count; numValues -= count; - if (repeating) { - value += delta * static_cast(count); + if (repeating_) { + value_ += delta_ * static_cast(count); } else { dwio::common::IntDecoder::skipLongs(count); } } } -template void RleDecoderV1::skip(uint64_t numValues); -template void RleDecoderV1::skip(uint64_t numValues); +template void RleDecoderV1::skipPending(); +template void RleDecoderV1::skipPending(); template void RleDecoderV1::next( - int64_t* const data, + int64_t* data, const uint64_t numValues, const uint64_t* const nulls) { + skipPending(); + uint64_t position = 0; // skipNulls() if (nulls) { @@ -117,83 +124,85 @@ void RleDecoderV1::next( } while (position < numValues) { // If we are out of values, read more. - if (remainingValues == 0) { + if (remainingValues_ == 0) { readHeader(); } // How many do we read out of this block? - uint64_t count = std::min(numValues - position, remainingValues); + const uint64_t count = std::min(numValues - position, remainingValues_); uint64_t consumed = 0; - if (repeating) { + if (repeating_) { if (nulls) { for (uint64_t i = 0; i < count; ++i) { if (!bits::isBitNull(nulls, position + i)) { - data[position + i] = value + static_cast(consumed) * delta; - consumed += 1; + data[position + i] = + value_ + static_cast(consumed) * delta_; + ++consumed; } } } else { for (uint64_t i = 0; i < count; ++i) { - data[position + i] = value + static_cast(i) * delta; + data[position + i] = value_ + static_cast(i) * delta_; } consumed = count; } - value += static_cast(consumed) * delta; + value_ += static_cast(consumed) * delta_; } else { - int64_t* datap = data + position; - int64_t* const datapEnd = datap + count; + int64_t* next = data + position; + int64_t* const end = next + count; if (nulls) { - int32_t idx = position; - if (!dwio::common::IntDecoder::useVInts) { - while (datap != datapEnd) { - if (LIKELY(!bits::isBitNull(nulls, idx++))) { - *(datap++) = dwio::common::IntDecoder::readLongLE(); + int32_t index = position; + if (!dwio::common::IntDecoder::useVInts_) { + while (next != end) { + if (LIKELY(!bits::isBitNull(nulls, index++))) { + *(next++) = dwio::common::IntDecoder::readLongLE(); ++consumed; } else { - *(datap++) = 0; + *(next++) = 0; } } } else if constexpr (isSigned) { - while (datap != datapEnd) { - if (LIKELY(!bits::isBitNull(nulls, idx++))) { - *(datap++) = dwio::common::IntDecoder::readVsLong(); + while (next != end) { + if (LIKELY(!bits::isBitNull(nulls, index++))) { + *(next++) = dwio::common::IntDecoder::readVsLong(); ++consumed; } else { - *(datap++) = 0; + *(next++) = 0; } } } else { - while (datap != datapEnd) { - if (LIKELY(!bits::isBitNull(nulls, idx++))) { - *(datap++) = static_cast( + while (next != end) { + if (LIKELY(!bits::isBitNull(nulls, index++))) { + *(next++) = static_cast( dwio::common::IntDecoder::readVuLong()); ++consumed; } else { - *(datap++) = 0; + *(next++) = 0; } } } } else { - if 
(!dwio::common::IntDecoder::useVInts) { - while (datap != datapEnd) { - *(datap++) = dwio::common::IntDecoder::readLongLE(); + if (!dwio::common::IntDecoder::useVInts_) { + while (next != end) { + *(next++) = dwio::common::IntDecoder::readLongLE(); } } else if constexpr (isSigned) { - while (datap != datapEnd) { - *(datap++) = dwio::common::IntDecoder::readVsLong(); + while (next != end) { + *(next++) = dwio::common::IntDecoder::readVsLong(); } } else { - while (datap != datapEnd) { - *(datap++) = static_cast( + while (next != end) { + *(next++) = static_cast( dwio::common::IntDecoder::readVuLong()); } } consumed = count; } } - remainingValues -= consumed; + + remainingValues_ -= consumed; position += count; - // skipNulls() + // skipNulls(). if (nulls) { // Skip over null values. while (position < numValues && bits::isBitNull(nulls, position)) { @@ -206,36 +215,38 @@ void RleDecoderV1::next( template void RleDecoderV1::next( int64_t* const data, const uint64_t numValues, - const uint64_t* const nulls); + const uint64_t* nulls); template void RleDecoderV1::next( int64_t* const data, const uint64_t numValues, - const uint64_t* const nulls); + const uint64_t* nulls); template -void RleDecoderV1::nextLengths( - int32_t* const data, - const int32_t numValues) { +void RleDecoderV1::nextLengths(int32_t* data, int32_t numValues) { + skipPending(); + uint32_t position = 0; while (position < numValues) { // If we are out of values, read more. - if (remainingValues == 0) { + if (remainingValues_ == 0) { readHeader(); } + // How many do we read out of this block? - int32_t count = std::min(numValues - position, remainingValues); + const int32_t count = + std::min(numValues - position, remainingValues_); uint64_t consumed = 0; - if (repeating) { + if (repeating_) { for (uint32_t i = 0; i < count; ++i) { - data[position + i] = value + i * delta; + data[position + i] = value_ + i * delta_; } consumed = count; - value += static_cast(consumed) * delta; + value_ += static_cast(consumed) * delta_; } else { dwio::common::IntDecoder::bulkRead(count, data + position); consumed = count; } - remainingValues -= consumed; + remainingValues_ -= consumed; position += count; } } diff --git a/velox/dwio/dwrf/common/RLEv1.h b/velox/dwio/dwrf/common/RLEv1.h index 2a0340a31edde..082b57c10e08f 100644 --- a/velox/dwio/dwrf/common/RLEv1.h +++ b/velox/dwio/dwrf/common/RLEv1.h @@ -17,7 +17,6 @@ #pragma once #include "velox/common/base/GTestMacros.h" -#include "velox/common/base/Nulls.h" #include "velox/dwio/common/Adaptor.h" #include "velox/dwio/common/DecoderUtil.h" #include "velox/dwio/common/IntDecoder.h" @@ -38,15 +37,15 @@ class RleEncoderV1 : public IntEncoder { bool useVInts, uint32_t numBytes) : IntEncoder{std::move(outStream), useVInts, numBytes}, - numLiterals{0}, - delta{0}, - repeat{false}, - tailRunLength{0}, - isOverflow{false} {} - - // For 64 bit Integers, only signed type is supported. writeVuLong only - // supports int64_t and it needs to support uint64_t before this method - // can support uint64_t overload. + numLiterals_{0}, + delta_{0}, + repeat_{false}, + tailRunLength_{0}, + overflow_{false} {} + + /// For 64 bit Integers, only signed type is supported. writeVuLong only + /// supports int64_t and it needs to support uint64_t before this method + /// can support uint64_t overload. 
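All of the decode branches in `next()` above share one pattern: with a null bitmap present, only non-null slots consume an encoded value, null slots are zero-filled, and `consumed` tracks how many stream values were actually used so `remainingValues_` can be decremented correctly. A compact sketch of that pattern, assuming the usual Velox convention that a zero bit in the nulls bitmap marks a null row (`isNull` stands in for `bits::isBitNull`, and `readValue` for the `readLongLE`/`readVsLong`/`readVuLong` calls):
```
#include <cstdint>
#include <functional>

// Stand-in for bits::isBitNull: bit i of the bitmap is 0 when row i is null.
inline bool isNull(const uint64_t* nulls, uint64_t i) {
  return (nulls[i / 64] & (uint64_t{1} << (i % 64))) == 0;
}

// Decode 'count' rows starting at 'position'; 'readValue' pulls the next
// encoded value from the stream. Returns how many encoded values were
// consumed, which is what the caller subtracts from remainingValues_.
uint64_t readWithNulls(
    int64_t* data,
    uint64_t position,
    uint64_t count,
    const uint64_t* nulls,
    const std::function<int64_t()>& readValue) {
  uint64_t consumed = 0;
  for (uint64_t i = 0; i < count; ++i) {
    if (nulls != nullptr && isNull(nulls, position + i)) {
      data[position + i] = 0; // Null slots are zero-filled, nothing consumed.
    } else {
      data[position + i] = readValue();
      ++consumed;
    }
  }
  return consumed;
}
```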
uint64_t add( const int64_t* data, const common::Ranges& ranges, @@ -82,7 +81,7 @@ class RleEncoderV1 : public IntEncoder { return addImpl(data, ranges, nulls); } - void writeValue(const int64_t value) override { + void writeValue(int64_t value) override { write(value); } @@ -94,65 +93,59 @@ class RleEncoderV1 : public IntEncoder { void recordPosition(PositionRecorder& recorder, int32_t strideIndex = -1) const override { IntEncoder::recordPosition(recorder, strideIndex); - recorder.add(static_cast(numLiterals), strideIndex); + recorder.add(static_cast(numLiterals_), strideIndex); } private: constexpr static int32_t MAX_DELTA = 127; constexpr static int32_t MIN_DELTA = -128; - std::array literals; - int32_t numLiterals; - int64_t delta; - bool repeat; - int32_t tailRunLength; - bool isOverflow; - template void write(T value) { - if (numLiterals == 0) { + if (numLiterals_ == 0) { // Starting new sequence of run or literals. - literals[numLiterals++] = value; - tailRunLength = 1; + literals_[numLiterals_++] = value; + tailRunLength_ = 1; return; } - if (repeat) { + if (repeat_) { if (isRunRepeating(value)) { - numLiterals += 1; - if (numLiterals == RLE_MAXIMUM_REPEAT) { + ++numLiterals_; + if (numLiterals_ == RLE_MAXIMUM_REPEAT) { writeValues(); } } else { writeValues(); - literals[numLiterals++] = value; - tailRunLength = 1; + literals_[numLiterals_++] = value; + tailRunLength_ = 1; } return; } - if (tailRunLength == 1) { + if (tailRunLength_ == 1) { computeDeltaAndTailRunLength(value); } else if (isRunStarting(value)) { - tailRunLength += 1; + ++tailRunLength_; } else { computeDeltaAndTailRunLength(value); } - if (tailRunLength == RLE_MINIMUM_REPEAT) { - if (numLiterals + 1 == RLE_MINIMUM_REPEAT) { - numLiterals += 1; + + if (tailRunLength_ == RLE_MINIMUM_REPEAT) { + if (numLiterals_ + 1 == RLE_MINIMUM_REPEAT) { + ++numLiterals_; } else { - numLiterals -= (RLE_MINIMUM_REPEAT - 1); - int64_t base = literals[numLiterals]; + numLiterals_ -= (RLE_MINIMUM_REPEAT - 1); + const int64_t base = literals_[numLiterals_]; writeValues(); - literals[0] = base; - numLiterals = RLE_MINIMUM_REPEAT; + literals_[0] = base; + numLiterals_ = RLE_MINIMUM_REPEAT; } - // set repeat, so that next call call to write can be special cased. - repeat = true; + // Set repeat, so that next call to write can be special cased. 
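The promotion step above is the subtle part of `write()`: once the tail of the literal buffer forms a minimum-length run, the two buffered tail values plus the incoming one become a new repeat run, and any literals before them are flushed. A toy re-implementation that makes the state transitions concrete; it assumes `RLE_MINIMUM_REPEAT == 3` and elides overflow handling, byte output, and the branch where the run consumes the entire literal buffer:
```
#include <array>
#include <cstdint>
#include <iostream>

// Toy model of the RLEv1 encoder state machine, for illustration only.
struct ToyRleV1State {
  std::array<int64_t, 512> literals{};
  int32_t numLiterals = 0;
  int64_t delta = 0;
  int32_t tailRunLength = 0;
  bool repeat = false;

  void write(int64_t value) {
    if (numLiterals == 0) {
      literals[numLiterals++] = value;
      tailRunLength = 1;
      return;
    }
    if (repeat) {
      ++numLiterals; // Real code checks isRunRepeating() and may flush; elided.
      return;
    }
    if (value == literals[numLiterals - 1] + delta) {
      ++tailRunLength; // The tail keeps following the current delta.
    } else {
      delta = value - literals[numLiterals - 1];
      tailRunLength = (delta >= -128 && delta <= 127) ? 2 : 1;
    }
    if (tailRunLength == 3) { // RLE_MINIMUM_REPEAT reached: promote to a run.
      numLiterals -= 2; // The two tail literals join the run with this value.
      const int64_t base = literals[numLiterals];
      // (The real code flushes the remaining pending literals here via
      // writeValues() before starting the run.)
      literals[0] = base;
      numLiterals = 3;
      repeat = true;
    } else {
      literals[numLiterals++] = value;
    }
  }
};

int main() {
  ToyRleV1State s;
  for (int64_t v : {9, 10, 20, 21, 22}) { // 20, 21, 22 forms a delta-1 run.
    s.write(v);
  }
  std::cout << "repeat=" << s.repeat << " base=" << s.literals[0]
            << " numLiterals=" << s.numLiterals << " delta=" << s.delta
            << '\n'; // repeat=1 base=20 numLiterals=3 delta=1
}
```
Feeding 9, 10, 20, 21, 22 leaves the encoder in run mode with base 20, delta 1, and a run length of 3, while 9 and 10 are flushed as a literal run.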
+ repeat_ = true; } else { - literals[numLiterals++] = value; - if (numLiterals == RLE_MAX_LITERAL_SIZE) { + literals_[numLiterals_++] = value; + if (numLiterals_ == RLE_MAX_LITERAL_SIZE) { writeValues(); } } @@ -166,48 +159,55 @@ class RleEncoderV1 : public IntEncoder { template FOLLY_ALWAYS_INLINE bool isRunRepeating(const Integer& value) { - if constexpr (sizeof(Integer) == sizeof(delta)) { + if constexpr (sizeof(Integer) == sizeof(delta_)) { int64_t nextRunValue; - isOverflow = __builtin_add_overflow( - literals[0], delta * numLiterals, &nextRunValue); - return value == nextRunValue && !isOverflow; + overflow_ = __builtin_add_overflow( + literals_[0], delta_ * numLiterals_, &nextRunValue); + return value == nextRunValue && !overflow_; } else { - return value == literals[0] + delta * numLiterals; + return value == literals_[0] + delta_ * numLiterals_; } } template FOLLY_ALWAYS_INLINE bool isRunStarting(const Integer& value) { - if constexpr (sizeof(Integer) == sizeof(delta)) { + if constexpr (sizeof(Integer) == sizeof(delta_)) { int64_t nextRunValue; - isOverflow = __builtin_add_overflow( - literals[numLiterals - 1], delta, &nextRunValue); - return value == nextRunValue && !isOverflow; + overflow_ = __builtin_add_overflow( + literals_[numLiterals_ - 1], delta_, &nextRunValue); + return value == nextRunValue && !overflow_; } else { - return value == literals[numLiterals - 1] + delta; + return value == literals_[numLiterals_ - 1] + delta_; } } template FOLLY_ALWAYS_INLINE void computeDeltaAndTailRunLength(const Integer& value) { - if constexpr (sizeof(Integer) == sizeof(delta)) { - isOverflow = - __builtin_sub_overflow(value, literals[numLiterals - 1], &delta); - if (UNLIKELY(isOverflow)) { - tailRunLength = 1; + if constexpr (sizeof(Integer) == sizeof(delta_)) { + overflow_ = + __builtin_sub_overflow(value, literals_[numLiterals_ - 1], &delta_); + if (UNLIKELY(overflow_)) { + tailRunLength_ = 1; return; } } else { - delta = value - literals[numLiterals - 1]; + delta_ = value - literals_[numLiterals_ - 1]; } - if (delta < MIN_DELTA || delta > MAX_DELTA) { - tailRunLength = 1; + if (delta_ < MIN_DELTA || delta_ > MAX_DELTA) { + tailRunLength_ = 1; } else { - tailRunLength = 2; + tailRunLength_ = 2; } } + std::array literals_; + int32_t numLiterals_; + int64_t delta_; + bool repeat_; + int32_t tailRunLength_; + bool overflow_; + VELOX_FRIEND_TEST(RleEncoderV1Test, encodeMinAndMax); VELOX_FRIEND_TEST(RleEncoderV1Test, encodeMinAndMaxint32); }; @@ -246,48 +246,28 @@ class RleDecoderV1 : public dwio::common::IntDecoder { uint32_t numBytes) : dwio::common::IntDecoder< isSigned>{std::move(input), useVInts, numBytes}, - remainingValues(0), - value(0), - delta(0), - repeating(false) {} + remainingValues_(0), + value_(0), + delta_(0), + repeating_(false) {} void seekToRowGroup( dwio::common::PositionProvider& positionProvider) override; - void skip(uint64_t numValues) override; - void next(int64_t* data, uint64_t numValues, const uint64_t* nulls) override; void nextLengths(int32_t* data, int32_t numValues) override; - template - inline void skip(int32_t numValues, int32_t current, const uint64_t* nulls) { - if (hasNulls) { - numValues = bits::countNonNulls(nulls, current, current + numValues); - } - while (numValues > 0) { - if (remainingValues == 0) { - readHeader(); - } - uint64_t count = std::min(numValues, remainingValues); - remainingValues -= count; - numValues -= count; - if (repeating) { - value += delta * static_cast(count); - } else { - dwio::common::IntDecoder::skipLongsFast(count); - } 
- } - } - template void readWithVisitor(const uint64_t* nulls, Visitor visitor) { + skipPending(); if (dwio::common::useFastPath(visitor)) { fastPath(nulls, visitor); return; } + int32_t current = visitor.start(); - skip(current, 0, nulls); + this->template skip(current, 0, nulls); int32_t toSkip; bool atEnd = false; const bool allowNulls = hasNulls && visitor.allowNulls(); @@ -298,7 +278,7 @@ class RleDecoderV1 : public dwio::common::IntDecoder { if (hasNulls && !allowNulls) { toSkip = visitor.checkAndSkipNulls(nulls, current, atEnd); if (!Visitor::dense) { - skip(toSkip, current, nullptr); + this->template skip(toSkip, current, nullptr); } if (atEnd) { return; @@ -306,22 +286,22 @@ class RleDecoderV1 : public dwio::common::IntDecoder { } // We are at a non-null value on a row to visit. - if (!remainingValues) { + if (remainingValues_ == 0) { readHeader(); } - if (repeating) { - toSkip = visitor.process(value, atEnd); - value += delta; + if (repeating_) { + toSkip = visitor.process(value_, atEnd); + value_ += delta_; } else { - value = + value_ = dwio::common::IntDecoder::template readInt(); - toSkip = visitor.process(value, atEnd); + toSkip = visitor.process(value_, atEnd); } - --remainingValues; + --remainingValues_; } ++current; - if (toSkip) { - skip(toSkip, current, nulls); + if (toSkip > 0) { + this->template skip(toSkip, current, nulls); current += toSkip; } if (atEnd) { @@ -369,13 +349,13 @@ class RleDecoderV1 : public dwio::common::IntDecoder { visitor.setHasNulls(); } if (innerVector->empty()) { - skip(tailSkip, 0, nullptr); + this->template skip(tailSkip, 0, nullptr); visitor.setAllNull(hasFilter ? 0 : numRows); return; } bulkScan( *innerVector, outerVector->data(), visitor); - skip(tailSkip, 0, nullptr); + this->template skip(tailSkip, 0, nullptr); } } else { bulkScan(rowsAsRange, nullptr, visitor); @@ -410,31 +390,34 @@ class RleDecoderV1 : public dwio::common::IntDecoder { numValues); } - // Returns 1. how many of 'rows' are in the current run 2. the - // distance in rows from the current row to the first row after the - // last in rows that falls in the current run. + /// Returns 1. how many of 'rows' are in the current run; 2. the distance in + /// rows from the current row to the first row after the last in rows that + /// falls in the current run. 
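To make that contract concrete: with a run of 10 values starting at row 100 and rows == {101, 104, 109, 250}, the first three rows fall inside the run, the last in-run row is 109, and the pair comes out as {3, 10}. A standalone sketch of the non-dense branch, mirroring the `lower_bound` logic rather than the exact Velox signature:
```
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Sketch of the non-dense branch of findNumInRun(), assuming 'rows' is
// sorted ascending and rows[rowIndex] is at or past 'currentRow'.
std::pair<int32_t, int32_t> numInRun(
    const std::vector<int32_t>& rows,
    int32_t rowIndex,
    int32_t currentRow,
    int64_t remainingValues) {
  const int64_t endOfRun = currentRow + remainingValues;
  if (rows[rowIndex] >= endOfRun) {
    return {0, 0}; // The next row of interest is past this run.
  }
  if (rows.back() < endOfRun) {
    // Every remaining row is inside the run.
    return {static_cast<int32_t>(rows.size()) - rowIndex,
            rows.back() - currentRow + 1};
  }
  // Find the first row at or beyond the end of the run.
  const auto bound =
      std::lower_bound(rows.begin() + rowIndex, rows.end(), endOfRun);
  return {static_cast<int32_t>(bound - (rows.begin() + rowIndex)),
          *(bound - 1) - currentRow + 1};
}

int main() {
  // Run of 10 values starting at row 100; rows 101, 104, 109 fall inside it.
  assert((numInRun({101, 104, 109, 250}, 0, 100, 10) ==
          std::pair<int32_t, int32_t>{3, 10}));
}
```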
template std::pair findNumInRun( const int32_t* rows, int32_t rowIndex, int32_t numRows, int32_t currentRow) { - DCHECK_LT(rowIndex, numRows); + VELOX_DCHECK_LT(rowIndex, numRows); if (dense) { - auto left = std::min(remainingValues, numRows - rowIndex); + const auto left = std::min(remainingValues_, numRows - rowIndex); return std::make_pair(left, left); } - if (rows[rowIndex] - currentRow >= remainingValues) { + + if (rows[rowIndex] - currentRow >= remainingValues_) { return std::make_pair(0, 0); } - if (rows[numRows - 1] - currentRow < remainingValues) { + + if (rows[numRows - 1] - currentRow < remainingValues_) { return std::pair(numRows - rowIndex, rows[numRows - 1] - currentRow + 1); } - auto range = folly::Range( + + const auto range = folly::Range( rows + rowIndex, - std::min(remainingValues, numRows - rowIndex)); - auto endOfRun = currentRow + remainingValues; - auto bound = std::lower_bound(range.begin(), range.end(), endOfRun); + std::min(remainingValues_, numRows - rowIndex)); + const auto endOfRun = currentRow + remainingValues_; + const auto bound = std::lower_bound(range.begin(), range.end(), endOfRun); return std::make_pair(bound - range.begin(), bound[-1] - currentRow + 1); } @@ -453,23 +436,23 @@ class RleDecoderV1 : public dwio::common::IntDecoder { auto filterHits = hasFilter ? visitor.outputRows(numRows) : nullptr; int32_t numValues = 0; for (;;) { - if (remainingValues) { + if (remainingValues_ > 0) { auto [numInRun, numAdvanced] = findNumInRun(rows, rowIndex, numRows, currentRow); if (!numInRun) { // We are not at end and the next row of interest is after this run. VELOX_CHECK(!numAdvanced, "Would advance past end of RLEv1 run"); - } else if (repeating) { + } else if (repeating_) { visitor.template processRle( - value, - delta, + value_, + delta_, numInRun, currentRow, scatterRows, filterHits, values, numValues); - value += numAdvanced * delta; + value_ += numAdvanced * delta_; } else { processRun( rows, @@ -482,16 +465,16 @@ class RleDecoderV1 : public dwio::common::IntDecoder { numValues, visitor); } - remainingValues -= numAdvanced; + remainingValues_ -= numAdvanced; currentRow += numAdvanced; rowIndex += numInRun; if (visitor.atEnd()) { visitor.setNumValues(hasFilter ? 
numValues : numAllRows); return; } - if (remainingValues) { - currentRow += remainingValues; - skip(remainingValues, -1, nullptr); + if (remainingValues_ > 0) { + currentRow += remainingValues_; + this->template skip(remainingValues_, -1, nullptr); } } readHeader(); @@ -499,22 +482,24 @@ class RleDecoderV1 : public dwio::common::IntDecoder { } inline void readHeader() { - signed char ch = dwio::common::IntDecoder::readByte(); + const signed char ch = dwio::common::IntDecoder::readByte(); if (ch < 0) { - remainingValues = static_cast(-ch); - repeating = false; + remainingValues_ = static_cast(-ch); + repeating_ = false; } else { - remainingValues = static_cast(ch) + RLE_MINIMUM_REPEAT; - repeating = true; - delta = dwio::common::IntDecoder::readByte(); - value = dwio::common::IntDecoder::template readInt(); + remainingValues_ = static_cast(ch) + RLE_MINIMUM_REPEAT; + repeating_ = true; + delta_ = dwio::common::IntDecoder::readByte(); + value_ = dwio::common::IntDecoder::template readInt(); } } - uint64_t remainingValues; - int64_t value; - int64_t delta; - bool repeating; + void skipPending() override; + + uint64_t remainingValues_; + int64_t value_; + int64_t delta_; + bool repeating_; }; } // namespace facebook::velox::dwrf diff --git a/velox/dwio/dwrf/common/RLEv2.cpp b/velox/dwio/dwrf/common/RLEv2.cpp index fc6470480d7f2..a61c763eaa63b 100644 --- a/velox/dwio/dwrf/common/RLEv2.cpp +++ b/velox/dwio/dwrf/common/RLEv2.cpp @@ -112,7 +112,7 @@ int64_t RleDecoderV2::readLongBE(uint64_t bsz) { int64_t ret = 0, val; uint64_t n = bsz; while (n > 0) { - n--; + --n; val = readByte(); ret |= (val << (n * 8)); } @@ -124,28 +124,26 @@ RleDecoderV2::RleDecoderV2( std::unique_ptr input, MemoryPool& pool) : dwio::common::IntDecoder{std::move(input), false, 0}, - firstByte(0), - runLength(0), - runRead(0), - deltaBase(0), - byteSize(0), - firstValue(0), - prevValue(0), - bitSize(0), - bitsLeft(0), - curByte(0), - patchBitSize(0), - unpackedIdx(0), - patchIdx(0), - base(0), - curGap(0), - curPatch(0), - patchMask(0), - actualGap(0), - unpacked(pool, 0), - unpackedPatch(pool, 0) { - // PASS -} + firstByte_(0), + runLength_(0), + runRead_(0), + deltaBase_(0), + byteSize_(0), + firstValue_(0), + prevValue_(0), + bitSize_(0), + bitsLeft_(0), + curByte_(0), + patchBitSize_(0), + unpackedIdx_(0), + patchIdx_(0), + base_(0), + curGap_(0), + curPatch_(0), + patchMask_(0), + actualGap_(0), + unpacked_(pool, 0), + unpackedPatch_(pool, 0) {} template RleDecoderV2::RleDecoderV2( std::unique_ptr input, @@ -158,13 +156,14 @@ template void RleDecoderV2::seekToRowGroup( dwio::common::PositionProvider& location) { // move the input stream - dwio::common::IntDecoder::inputStream->seekToPosition(location); + dwio::common::IntDecoder::inputStream_->seekToPosition(location); // clear state - dwio::common::IntDecoder::bufferEnd = - dwio::common::IntDecoder::bufferStart = 0; - runRead = runLength = 0; + dwio::common::IntDecoder::bufferStart_ = nullptr; + dwio::common::IntDecoder::bufferEnd_ = nullptr; + runRead_ = 0; + runLength_ = 0; // skip ahead the given number of records - skip(location.next()); + this->pendingSkip_ = location.next(); } template void RleDecoderV2::seekToRowGroup( @@ -173,27 +172,37 @@ template void RleDecoderV2::seekToRowGroup( dwio::common::PositionProvider& location); template -void RleDecoderV2::skip(uint64_t numValues) { +void RleDecoderV2::skipPending() { // simple for now, until perf tests indicate something encoding specific is // needed - const uint64_t N = 64; + constexpr int64_t N = 64; int64_t 
dummy[N]; - + auto numValues = this->pendingSkip_; + this->pendingSkip_ = 0; while (numValues) { uint64_t nRead = std::min(N, numValues); - next(dummy, nRead, nullptr); + doNext(dummy, nRead, nullptr); numValues -= nRead; } } -template void RleDecoderV2::skip(uint64_t numValues); -template void RleDecoderV2::skip(uint64_t numValues); +template void RleDecoderV2::skipPending(); +template void RleDecoderV2::skipPending(); template void RleDecoderV2::next( int64_t* const data, const uint64_t numValues, const uint64_t* const nulls) { + skipPending(); + doNext(data, numValues, nulls); +} + +template +void RleDecoderV2::doNext( + int64_t* const data, + const uint64_t numValues, + const uint64_t* const nulls) { uint64_t nRead = 0; while (nRead < numValues) { @@ -204,13 +213,13 @@ void RleDecoderV2::next( } } - if (runRead == runLength) { + if (runRead_ == runLength_) { resetRun(); } uint64_t offset = nRead, length = numValues - nRead; - switch (type) { + switch (type_) { case SHORT_REPEAT: nRead += nextShortRepeats(data, offset, length, nulls); break; @@ -224,57 +233,58 @@ void RleDecoderV2::next( nRead += nextDelta(data, offset, length, nulls); break; default: - DWIO_RAISE("unknown encoding"); + VELOX_FAIL("unknown encoding: {}", static_cast(type_)); } } } -template void RleDecoderV2::next( +template void RleDecoderV2::doNext( int64_t* const data, const uint64_t numValues, const uint64_t* const nulls); -template void RleDecoderV2::next( +template void RleDecoderV2::doNext( int64_t* const data, const uint64_t numValues, const uint64_t* const nulls); template uint64_t RleDecoderV2::nextShortRepeats( - int64_t* const data, + int64_t* data, uint64_t offset, uint64_t numValues, - const uint64_t* const nulls) { - if (runRead == runLength) { + const uint64_t* nulls) { + if (runRead_ == runLength_) { // extract the number of fixed bytes - byteSize = (firstByte >> 3) & 0x07; - byteSize += 1; + byteSize_ = (firstByte_ >> 3) & 0x07; + byteSize_ += 1; - runLength = firstByte & 0x07; + runLength_ = firstByte_ & 0x07; // run lengths values are stored only after MIN_REPEAT value is met - runLength += RLE_MINIMUM_REPEAT; - runRead = 0; + runLength_ += RLE_MINIMUM_REPEAT; + runRead_ = 0; // read the repeated value which is store using fixed bytes - firstValue = readLongBE(byteSize); + firstValue_ = readLongBE(byteSize_); if (isSigned) { - firstValue = ZigZag::decode(static_cast(firstValue)); + firstValue_ = + ZigZag::decode(static_cast(firstValue_)); } } - uint64_t nRead = std::min(runLength - runRead, numValues); + uint64_t nRead = std::min(runLength_ - runRead_, numValues); if (nulls) { for (uint64_t pos = offset; pos < offset + nRead; ++pos) { if (!bits::isBitNull(nulls, pos)) { - data[pos] = firstValue; - ++runRead; + data[pos] = firstValue_; + ++runRead_; } } } else { for (uint64_t pos = offset; pos < offset + nRead; ++pos) { - data[pos] = firstValue; - ++runRead; + data[pos] = firstValue_; + ++runRead_; } } @@ -282,38 +292,38 @@ uint64_t RleDecoderV2::nextShortRepeats( } template uint64_t RleDecoderV2::nextShortRepeats( - int64_t* const data, + int64_t* data, uint64_t offset, uint64_t numValues, - const uint64_t* const nulls); + const uint64_t* nulls); template uint64_t RleDecoderV2::nextShortRepeats( - int64_t* const data, + int64_t* data, uint64_t offset, uint64_t numValues, - const uint64_t* const nulls); + const uint64_t* nulls); template uint64_t RleDecoderV2::nextDirect( - int64_t* const data, + int64_t* data, uint64_t offset, uint64_t numValues, - const uint64_t* const nulls) { - if (runRead == 
runLength) { + const uint64_t* nulls) { + if (runRead_ == runLength_) { // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; - bitSize = decodeBitWidth(fbo); + unsigned char fbo = (firstByte_ >> 1) & 0x1f; + bitSize_ = decodeBitWidth(fbo); // extract the run length - runLength = static_cast(firstByte & 0x01) << 8; - runLength |= readByte(); + runLength_ = static_cast(firstByte_ & 0x01) << 8; + runLength_ |= readByte(); // runs are one off - runLength += 1; - runRead = 0; + runLength_ += 1; + runRead_ = 0; } - uint64_t nRead = std::min(runLength - runRead, numValues); + uint64_t nRead = std::min(runLength_ - runRead_, numValues); - runRead += readLongs(data, offset, nRead, bitSize, nulls); + runRead_ += readLongs(data, offset, nRead, bitSize_, nulls); if (isSigned) { if (nulls) { @@ -350,27 +360,27 @@ uint64_t RleDecoderV2::nextPatched( uint64_t offset, uint64_t numValues, const uint64_t* const nulls) { - if (runRead == runLength) { + if (runRead_ == runLength_) { // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; - bitSize = decodeBitWidth(fbo); + unsigned char fbo = (firstByte_ >> 1) & 0x1f; + bitSize_ = decodeBitWidth(fbo); // extract the run length - runLength = static_cast(firstByte & 0x01) << 8; - runLength |= readByte(); + runLength_ = static_cast(firstByte_ & 0x01) << 8; + runLength_ |= readByte(); // runs are one off - runLength += 1; - runRead = 0; + runLength_ += 1; + runRead_ = 0; // extract the number of bytes occupied by base uint64_t thirdByte = readByte(); - byteSize = (thirdByte >> 5) & 0x07; + byteSize_ = (thirdByte >> 5) & 0x07; // base width is one off - byteSize += 1; + byteSize_ += 1; // extract patch width uint32_t pwo = thirdByte & 0x1f; - patchBitSize = decodeBitWidth(pwo); + patchBitSize_ = decodeBitWidth(pwo); // read fourth byte and extract patch gap width uint64_t fourthByte = readByte(); @@ -380,79 +390,79 @@ uint64_t RleDecoderV2::nextPatched( // extract the length of the patch list size_t pl = fourthByte & 0x1f; - DWIO_ENSURE_NE( + VELOX_CHECK_NE( pl, 0, "Corrupt PATCHED_BASE encoded data (pl==0)! ", - dwio::common::IntDecoder::inputStream->getName()); + dwio::common::IntDecoder::inputStream_->getName()); // read the next base width number of bytes to extract base value - base = readLongBE(byteSize); - int64_t mask = (static_cast(1) << ((byteSize * 8) - 1)); + base_ = readLongBE(byteSize_); + int64_t mask = (static_cast(1) << ((byteSize_ * 8) - 1)); // if mask of base value is 1 then base is negative value else positive - if ((base & mask) != 0) { - base = base & ~mask; - base = -base; + if ((base_ & mask) != 0) { + base_ = base_ & ~mask; + base_ = -base_; } // TODO: something more efficient than resize - unpacked.resize(runLength); - unpackedIdx = 0; - readLongs(unpacked.data(), 0, runLength, bitSize); + unpacked_.resize(runLength_); + unpackedIdx_ = 0; + readLongs(unpacked_.data(), 0, runLength_, bitSize_); // any remaining bits are thrown out resetReadLongs(); // TODO: something more efficient than resize - unpackedPatch.resize(pl); - patchIdx = 0; + unpackedPatch_.resize(pl); + patchIdx_ = 0; // TODO: Skip corrupt? // if ((patchBitSize + pgw) > 64 && !skipCorrupt) { - DWIO_ENSURE_LE( - (patchBitSize + pgw), + VELOX_CHECK_LE( + (patchBitSize_ + pgw), 64, "Corrupt PATCHED_BASE encoded data (patchBitSize + pgw > 64)! 
", - dwio::common::IntDecoder::inputStream->getName()); - uint32_t cfb = getClosestFixedBits(patchBitSize + pgw); - readLongs(unpackedPatch.data(), 0, pl, cfb); + dwio::common::IntDecoder::inputStream_->getName()); + uint32_t cfb = getClosestFixedBits(patchBitSize_ + pgw); + readLongs(unpackedPatch_.data(), 0, pl, cfb); // any remaining bits are thrown out resetReadLongs(); // apply the patch directly when decoding the packed data - patchMask = ((static_cast(1) << patchBitSize) - 1); + patchMask_ = ((static_cast(1) << patchBitSize_) - 1); adjustGapAndPatch(); } - uint64_t nRead = std::min(runLength - runRead, numValues); + uint64_t nRead = std::min(runLength_ - runRead_, numValues); for (uint64_t pos = offset; pos < offset + nRead; ++pos) { // skip null positions if (nulls && bits::isBitNull(nulls, pos)) { continue; } - if (static_cast(unpackedIdx) != actualGap) { + if (static_cast(unpackedIdx_) != actualGap_) { // no patching required. add base to unpacked value to get final value - data[pos] = base + unpacked[unpackedIdx]; + data[pos] = base_ + unpacked_[unpackedIdx_]; } else { // extract the patch value - int64_t patchedVal = unpacked[unpackedIdx] | (curPatch << bitSize); + int64_t patchedVal = unpacked_[unpackedIdx_] | (curPatch_ << bitSize_); // add base to patched value - data[pos] = base + patchedVal; + data[pos] = base_ + patchedVal; // increment the patch to point to next entry in patch list - ++patchIdx; + ++patchIdx_; - if (patchIdx < unpackedPatch.size()) { + if (patchIdx_ < unpackedPatch_.size()) { adjustGapAndPatch(); // next gap is relative to the current gap - actualGap += unpackedIdx; + actualGap_ += unpackedIdx_; } } - ++runRead; - ++unpackedIdx; + ++runRead_; + ++unpackedIdx_; } return nRead; @@ -476,37 +486,37 @@ uint64_t RleDecoderV2::nextDelta( uint64_t offset, uint64_t numValues, const uint64_t* const nulls) { - if (runRead == runLength) { + if (runRead_ == runLength_) { // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; + unsigned char fbo = (firstByte_ >> 1) & 0x1f; if (fbo != 0) { - bitSize = decodeBitWidth(fbo); + bitSize_ = decodeBitWidth(fbo); } else { - bitSize = 0; + bitSize_ = 0; } // extract the run length - runLength = static_cast(firstByte & 0x01) << 8; - runLength |= readByte(); - ++runLength; // account for first value - runRead = deltaBase = 0; + runLength_ = static_cast(firstByte_ & 0x01) << 8; + runLength_ |= readByte(); + ++runLength_; // account for first value + runRead_ = deltaBase_ = 0; // read the first value stored as vint if constexpr (isSigned) { - firstValue = dwio::common::IntDecoder::readVsLong(); + firstValue_ = dwio::common::IntDecoder::readVsLong(); } else { - firstValue = static_cast( + firstValue_ = static_cast( dwio::common::IntDecoder::readVuLong()); } - prevValue = firstValue; + prevValue_ = firstValue_; // read the fixed delta value stored as vint (deltas can be negative even // if all number are positive) - deltaBase = dwio::common::IntDecoder::readVsLong(); + deltaBase_ = dwio::common::IntDecoder::readVsLong(); } - uint64_t nRead = std::min(runLength - runRead, numValues); + uint64_t nRead = std::min(runLength_ - runRead_, numValues); uint64_t pos = offset; for (; pos < offset + nRead; ++pos) { @@ -515,20 +525,20 @@ uint64_t RleDecoderV2::nextDelta( break; } } - if (runRead == 0 && pos < offset + nRead) { - data[pos++] = firstValue; - ++runRead; + if (runRead_ == 0 && pos < offset + nRead) { + data[pos++] = firstValue_; + ++runRead_; } - if (bitSize == 0) { + if (bitSize_ == 0) { // add fixed 
deltas to adjacent values for (; pos < offset + nRead; ++pos) { // skip null positions if (nulls && bits::isBitNull(nulls, pos)) { continue; } - prevValue = data[pos] = prevValue + deltaBase; - ++runRead; + prevValue_ = data[pos] = prevValue_ + deltaBase_; + ++runRead_; } } else { for (; pos < offset + nRead; ++pos) { @@ -537,25 +547,25 @@ uint64_t RleDecoderV2::nextDelta( break; } } - if (runRead < 2 && pos < offset + nRead) { + if (runRead_ < 2 && pos < offset + nRead) { // add delta base and first value - prevValue = data[pos++] = firstValue + deltaBase; - ++runRead; + prevValue_ = data[pos++] = firstValue_ + deltaBase_; + ++runRead_; } // write the unpacked values, add it to previous value and store final // value to result buffer. if the delta base value is negative then it // is a decreasing sequence else an increasing sequence uint64_t remaining = (offset + nRead) - pos; - runRead += readLongs(data, pos, remaining, bitSize, nulls); + runRead_ += readLongs(data, pos, remaining, bitSize_, nulls); - if (deltaBase < 0) { + if (deltaBase_ < 0) { for (; pos < offset + nRead; ++pos) { // skip null positions if (nulls && bits::isBitNull(nulls, pos)) { continue; } - prevValue = data[pos] = prevValue - data[pos]; + prevValue_ = data[pos] = prevValue_ - data[pos]; } } else { for (; pos < offset + nRead; ++pos) { @@ -563,7 +573,7 @@ uint64_t RleDecoderV2::nextDelta( if (nulls && bits::isBitNull(nulls, pos)) { continue; } - prevValue = data[pos] = prevValue + data[pos]; + prevValue_ = data[pos] = prevValue_ + data[pos]; } } } @@ -584,13 +594,13 @@ template uint64_t RleDecoderV2::nextDelta( template int64_t RleDecoderV2::readValue() { - if (runRead == runLength) { + if (runRead_ == runLength_) { resetRun(); } uint64_t nRead = 0; int64_t value = 0; - switch (type) { + switch (type_) { case SHORT_REPEAT: nRead = nextShortRepeats(&value, 0, 1, nullptr); break; @@ -604,9 +614,9 @@ int64_t RleDecoderV2::readValue() { nRead = nextDelta(&value, 0, 1, nullptr); break; default: - DWIO_RAISE("unknown encoding"); + VELOX_FAIL("unknown encoding: {}", static_cast(type_)); } - VELOX_CHECK(nRead == (uint64_t)1); + VELOX_CHECK_EQ(nRead, (uint64_t)1); return value; } diff --git a/velox/dwio/dwrf/common/RLEv2.h b/velox/dwio/dwrf/common/RLEv2.h index e15bc3e143faf..9feef3c748258 100644 --- a/velox/dwio/dwrf/common/RLEv2.h +++ b/velox/dwio/dwrf/common/RLEv2.h @@ -16,7 +16,6 @@ #pragma once -#include "velox/common/base/Nulls.h" #include "velox/common/memory/Memory.h" #include "velox/dwio/common/Adaptor.h" #include "velox/dwio/common/DataBuffer.h" @@ -46,10 +45,7 @@ class RleDecoderV2 : public dwio::common::IntDecoder { */ void seekToRowGroup(dwio::common::PositionProvider&) override; - /** - * Seek over a given number of values. - */ - void skip(uint64_t numValues) override; + void skipPending() override; /** * Read a number of values into the batch. 
@@ -57,23 +53,17 @@ class RleDecoderV2 : public dwio::common::IntDecoder { void next(int64_t* data, uint64_t numValues, const uint64_t* nulls) override; void nextLengths(int32_t* const data, const int32_t numValues) override { + skipPending(); for (int i = 0; i < numValues; ++i) { data[i] = readValue(); } } - template - inline void skip(int32_t numValues, int32_t current, const uint64_t* nulls) { - if constexpr (hasNulls) { - numValues = bits::countNonNulls(nulls, current, current + numValues); - } - skip(numValues); - } - template void readWithVisitor(const uint64_t* nulls, Visitor visitor) { + skipPending(); int32_t current = visitor.start(); - skip(current, 0, nulls); + this->template skip(current, 0, nulls); int32_t toSkip; bool atEnd = false; @@ -86,7 +76,7 @@ class RleDecoderV2 : public dwio::common::IntDecoder { if (hasNulls && !allowNulls) { toSkip = visitor.checkAndSkipNulls(nulls, current, atEnd); if (!Visitor::dense) { - skip(toSkip, current, nullptr); + this->template skip(toSkip, current, nullptr); } if (atEnd) { return; @@ -100,7 +90,7 @@ class RleDecoderV2 : public dwio::common::IntDecoder { ++current; if (toSkip) { - skip(toSkip, current, nulls); + this->template skip(toSkip, current, nulls); current += toSkip; } if (atEnd) { @@ -112,52 +102,54 @@ class RleDecoderV2 : public dwio::common::IntDecoder { private: // Used by PATCHED_BASE void adjustGapAndPatch() { - curGap = static_cast(unpackedPatch[patchIdx]) >> patchBitSize; - curPatch = unpackedPatch[patchIdx] & patchMask; - actualGap = 0; + curGap_ = static_cast(unpackedPatch_[patchIdx_]) >> patchBitSize_; + curPatch_ = unpackedPatch_[patchIdx_] & patchMask_; + actualGap_ = 0; // special case: gap is >255 then patch value will be 0. // if gap is <=255 then patch value cannot be 0 - while (curGap == 255 && curPatch == 0) { - actualGap += 255; - ++patchIdx; - curGap = static_cast(unpackedPatch[patchIdx]) >> patchBitSize; - curPatch = unpackedPatch[patchIdx] & patchMask; + while (curGap_ == 255 && curPatch_ == 0) { + actualGap_ += 255; + ++patchIdx_; + curGap_ = + static_cast(unpackedPatch_[patchIdx_]) >> patchBitSize_; + curPatch_ = unpackedPatch_[patchIdx_] & patchMask_; } // add the left over gap - actualGap += curGap; + actualGap_ += curGap_; } void resetReadLongs() { - bitsLeft = 0; - curByte = 0; + bitsLeft_ = 0; + curByte_ = 0; } void resetRun() { resetReadLongs(); - bitSize = 0; - firstByte = readByte(); - type = static_cast((firstByte >> 6) & 0x03); + bitSize_ = 0; + firstByte_ = readByte(); + type_ = static_cast((firstByte_ >> 6) & 0x03); } unsigned char readByte() { - if (dwio::common::IntDecoder::bufferStart == - dwio::common::IntDecoder::bufferEnd) { + if (dwio::common::IntDecoder::bufferStart_ == + dwio::common::IntDecoder::bufferEnd_) { int32_t bufferLength; const void* bufferPointer; - DWIO_ENSURE( - dwio::common::IntDecoder::inputStream->Next( - &bufferPointer, &bufferLength), + const bool ret = dwio::common::IntDecoder::inputStream_->Next( + &bufferPointer, &bufferLength); + VELOX_CHECK( + ret, "bad read in RleDecoderV2::readByte, ", - dwio::common::IntDecoder::inputStream->getName()); - dwio::common::IntDecoder::bufferStart = + dwio::common::IntDecoder::inputStream_->getName()); + dwio::common::IntDecoder::bufferStart_ = static_cast(bufferPointer); - dwio::common::IntDecoder::bufferEnd = - dwio::common::IntDecoder::bufferStart + bufferLength; + dwio::common::IntDecoder::bufferEnd_ = + dwio::common::IntDecoder::bufferStart_ + bufferLength; } unsigned char result = static_cast( - 
*dwio::common::IntDecoder::bufferStart++); + *dwio::common::IntDecoder::bufferStart_++); return result; } @@ -178,19 +170,19 @@ class RleDecoderV2 : public dwio::common::IntDecoder { } uint64_t result = 0; uint64_t bitsLeftToRead = fb; - while (bitsLeftToRead > bitsLeft) { - result <<= bitsLeft; - result |= curByte & ((1 << bitsLeft) - 1); - bitsLeftToRead -= bitsLeft; - curByte = readByte(); - bitsLeft = 8; + while (bitsLeftToRead > bitsLeft_) { + result <<= bitsLeft_; + result |= curByte_ & ((1 << bitsLeft_) - 1); + bitsLeftToRead -= bitsLeft_; + curByte_ = readByte(); + bitsLeft_ = 8; } // handle the left over bits if (bitsLeftToRead > 0) { result <<= bitsLeftToRead; - bitsLeft -= static_cast(bitsLeftToRead); - result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1); + bitsLeft_ -= static_cast(bitsLeftToRead); + result |= (curByte_ >> bitsLeft_) & ((1 << bitsLeftToRead) - 1); } data[i] = static_cast(result); ++ret; @@ -222,27 +214,32 @@ class RleDecoderV2 : public dwio::common::IntDecoder { int64_t readValue(); - unsigned char firstByte; - uint64_t runLength; - uint64_t runRead; - int64_t deltaBase; // Used by DELTA - uint64_t byteSize; // Used by SHORT_REPEAT and PATCHED_BASE - int64_t firstValue; // Used by SHORT_REPEAT and DELTA - int64_t prevValue; // Used by DELTA - uint32_t bitSize; // Used by DIRECT, PATCHED_BASE and DELTA - uint32_t bitsLeft; // Used by anything that uses readLongs - uint32_t curByte; // Used by anything that uses readLongs - uint32_t patchBitSize; // Used by PATCHED_BASE - uint64_t unpackedIdx; // Used by PATCHED_BASE - uint64_t patchIdx; // Used by PATCHED_BASE - int64_t base; // Used by PATCHED_BASE - uint64_t curGap; // Used by PATCHED_BASE - int64_t curPatch; // Used by PATCHED_BASE - int64_t patchMask; // Used by PATCHED_BASE - int64_t actualGap; // Used by PATCHED_BASE - EncodingType type; - dwio::common::DataBuffer unpacked; // Used by PATCHED_BASE - dwio::common::DataBuffer unpackedPatch; // Used by PATCHED_BASE + void doNext( + int64_t* const data, + const uint64_t numValues, + const uint64_t* const nulls); + + unsigned char firstByte_; + uint64_t runLength_; + uint64_t runRead_; + int64_t deltaBase_; // Used by DELTA + uint64_t byteSize_; // Used by SHORT_REPEAT and PATCHED_BASE + int64_t firstValue_; // Used by SHORT_REPEAT and DELTA + int64_t prevValue_; // Used by DELTA + uint32_t bitSize_; // Used by DIRECT, PATCHED_BASE and DELTA + uint32_t bitsLeft_; // Used by anything that uses readLongs + uint32_t curByte_; // Used by anything that uses readLongs + uint32_t patchBitSize_; // Used by PATCHED_BASE + uint64_t unpackedIdx_; // Used by PATCHED_BASE + uint64_t patchIdx_; // Used by PATCHED_BASE + int64_t base_; // Used by PATCHED_BASE + uint64_t curGap_; // Used by PATCHED_BASE + int64_t curPatch_; // Used by PATCHED_BASE + int64_t patchMask_; // Used by PATCHED_BASE + int64_t actualGap_; // Used by PATCHED_BASE + EncodingType type_; + dwio::common::DataBuffer unpacked_; // Used by PATCHED_BASE + dwio::common::DataBuffer unpackedPatch_; // Used by PATCHED_BASE }; } // namespace facebook::velox::dwrf diff --git a/velox/dwio/dwrf/common/Statistics.cpp b/velox/dwio/dwrf/common/Statistics.cpp index af4bddcc8692e..57d8cea80c6fd 100644 --- a/velox/dwio/dwrf/common/Statistics.cpp +++ b/velox/dwio/dwrf/common/Statistics.cpp @@ -21,74 +21,77 @@ namespace facebook::velox::dwrf { using namespace dwio::common; std::unique_ptr buildColumnStatisticsFromProto( - const proto::ColumnStatistics& s, + const ColumnStatisticsWrapper& stats, const StatsContext& 
statsContext) { ColumnStatistics colStats( - s.has_numberofvalues() ? std::optional(s.numberofvalues()) : std::nullopt, - s.has_hasnull() ? std::optional(s.hasnull()) : std::nullopt, - s.has_rawsize() ? std::optional(s.rawsize()) : std::nullopt, - s.has_size() ? std::optional(s.size()) : std::nullopt); + stats.hasNumberOfValues() ? std::optional(stats.numberOfValues()) + : std::nullopt, + stats.hasHasNull() ? std::optional(stats.hasNull()) : std::nullopt, + stats.hasRawSize() ? std::optional(stats.rawSize()) : std::nullopt, + stats.hasSize() ? std::optional(stats.size()) : std::nullopt); // detailed stats is only defined when has non-null value - if (!s.has_numberofvalues() || s.numberofvalues() > 0) { - if (s.has_intstatistics()) { - const auto& intStats = s.intstatistics(); + if (!stats.hasNumberOfValues() || stats.numberOfValues() > 0) { + if (stats.hasIntStatistics()) { + const auto& intStats = stats.intStatistics(); return std::make_unique( colStats, - intStats.has_minimum() ? std::optional(intStats.minimum()) - : std::nullopt, - intStats.has_maximum() ? std::optional(intStats.maximum()) - : std::nullopt, - intStats.has_sum() ? std::optional(intStats.sum()) : std::nullopt); - } else if (s.has_doublestatistics()) { - const auto& dStats = s.doublestatistics(); + intStats.hasMinimum() ? std::optional(intStats.minimum()) + : std::nullopt, + intStats.hasMaximum() ? std::optional(intStats.maximum()) + : std::nullopt, + intStats.hasSum() ? std::optional(intStats.sum()) : std::nullopt); + } else if (stats.hasDoubleStatistics()) { + const auto& doubleStats = stats.doubleStatistics(); // Comparing against NaN doesn't make sense, and to prevent downstream // from incorrectly using it, need to make sure min/max/sum doens't have // NaN. - auto hasNan = (dStats.has_minimum() && std::isnan(dStats.minimum())) || - (dStats.has_maximum() && std::isnan(dStats.maximum())) || - (dStats.has_sum() && std::isnan(dStats.sum())); + const auto hasNan = + (doubleStats.hasMinimum() && std::isnan(doubleStats.minimum())) || + (doubleStats.hasMaximum() && std::isnan(doubleStats.maximum())) || + (doubleStats.hasSum() && std::isnan(doubleStats.sum())); if (!hasNan) { return std::make_unique( colStats, - dStats.has_minimum() ? std::optional(dStats.minimum()) - : std::nullopt, - dStats.has_maximum() ? std::optional(dStats.maximum()) - : std::nullopt, - dStats.has_sum() ? std::optional(dStats.sum()) : std::nullopt); + doubleStats.hasMinimum() ? std::optional(doubleStats.minimum()) + : std::nullopt, + doubleStats.hasMaximum() ? std::optional(doubleStats.maximum()) + : std::nullopt, + doubleStats.hasSum() ? std::optional(doubleStats.sum()) + : std::nullopt); } - } else if (s.has_stringstatistics()) { + } else if (stats.hasStringStatistics()) { // DWRF_5_0 is the first version that string stats are saved as UTF8 // bytes, hence only process string stats for version >= DWRF_5_0 if (statsContext.writerVersion >= WriterVersion::DWRF_5_0 || statsContext.writerName == kPrestoWriter || statsContext.writerName == kDwioWriter) { - const auto& strStats = s.stringstatistics(); + const auto& strStats = stats.stringStatistics(); return std::make_unique( colStats, - strStats.has_minimum() ? std::optional(strStats.minimum()) - : std::nullopt, - strStats.has_maximum() ? std::optional(strStats.maximum()) - : std::nullopt, + strStats.hasMinimum() ? std::optional(strStats.minimum()) + : std::nullopt, + strStats.hasMaximum() ? std::optional(strStats.maximum()) + : std::nullopt, // In proto, length(sum) is defined as sint. 
We need to make sure // length is not negative - (strStats.has_sum() && strStats.sum() >= 0) + (strStats.hasSum() && strStats.sum() >= 0) ? std::optional(strStats.sum()) : std::nullopt); } - } else if (s.has_bucketstatistics()) { - const auto& bucketStats = s.bucketstatistics(); + } else if (stats.hasBucketStatistics()) { + const auto& bucketStats = stats.bucketStatistics(); // Need to make sure there is at least one bucket. True count is saved in // bucket 0 - if (bucketStats.count_size() > 0) { + if (bucketStats.countSize() > 0) { return std::make_unique( colStats, bucketStats.count(0)); } - } else if (s.has_binarystatistics()) { - const auto& binStats = s.binarystatistics(); + } else if (stats.hasBinaryStatistics()) { + const auto& binStats = stats.binaryStatistics(); // In proto, length(sum) is defined as sint. We need to make sure length // is not negative - if (binStats.has_sum() && binStats.sum() >= 0) { + if (binStats.hasSum() && binStats.sum() >= 0) { return std::make_unique( colStats, static_cast(binStats.sum())); } diff --git a/velox/dwio/dwrf/common/Statistics.h b/velox/dwio/dwrf/common/Statistics.h index 0f26682fe1db0..864fb08db3721 100644 --- a/velox/dwio/dwrf/common/Statistics.h +++ b/velox/dwio/dwrf/common/Statistics.h @@ -18,6 +18,7 @@ #include "velox/dwio/common/Statistics.h" #include "velox/dwio/dwrf/common/Common.h" +#include "velox/dwio/dwrf/common/FileMetadata.h" #include "velox/dwio/dwrf/common/wrap/dwrf-proto-wrapper.h" namespace facebook::velox::dwrf { @@ -39,7 +40,7 @@ struct StatsContext : public dwio::common::StatsContext { }; std::unique_ptr buildColumnStatisticsFromProto( - const proto::ColumnStatistics& stats, + const ColumnStatisticsWrapper& stats, const StatsContext& statsContext); } // namespace facebook::velox::dwrf diff --git a/velox/dwio/dwrf/proto/CMakeLists.txt b/velox/dwio/dwrf/proto/CMakeLists.txt index 02ff2c8d7fc5c..299587a1f4158 100644 --- a/velox/dwio/dwrf/proto/CMakeLists.txt +++ b/velox/dwio/dwrf/proto/CMakeLists.txt @@ -30,20 +30,27 @@ endforeach() set(PROTO_OUTPUT_FILES ${PROTO_HDRS} ${PROTO_SRCS}) set_source_files_properties(${PROTO_OUTPUT_FILES} PROPERTIES GENERATED TRUE) +# Ensure that the option --proto_path is not given an empty argument +foreach(PROTO_PATH ${CMAKE_SOURCE_DIR} ${Protobuf_INCLUDE_DIRS}) + list(APPEND PROTO_PATH_ARGS --proto_path=${PROTO_PATH}) +endforeach() + add_custom_command( OUTPUT ${PROTO_OUTPUT_FILES} - COMMAND - ${Protobuf_PROTOC_EXECUTABLE} --proto_path ${CMAKE_SOURCE_DIR}/ --proto_path - ${Protobuf_INCLUDE_DIRS} --cpp_out ${CMAKE_BINARY_DIR} ${PROTO_FILES_FULL} - DEPENDS ${Protobuf_PROTOC_EXECUTABLE} + COMMAND protobuf::protoc ${PROTO_PATH_ARGS} --cpp_out ${CMAKE_BINARY_DIR} + ${PROTO_FILES_FULL} + DEPENDS protobuf::protoc COMMENT "Running PROTO compiler" VERBATIM) add_custom_target(dwio_proto ALL DEPENDS ${PROTO_OUTPUT_FILES}) -add_library(velox_dwio_dwrf_proto ${PROTO_HDRS} ${PROTO_SRCS}) +if(VELOX_MONO_LIBRARY) + add_dependencies(velox dwio_proto) +endif() +velox_add_library(velox_dwio_dwrf_proto ${PROTO_HDRS} ${PROTO_SRCS}) # Access generated proto file with. 
# # #include "velox/dwio/dwrf/proto/dwrf_proto.pb.h" -target_link_libraries(velox_dwio_dwrf_proto ${Protobuf_LIBRARIES}) -target_include_directories(velox_dwio_dwrf_proto PUBLIC ${PROJECT_BINARY_DIR}) +velox_link_libraries(velox_dwio_dwrf_proto protobuf::libprotobuf) +velox_include_directories(velox_dwio_dwrf_proto PUBLIC ${PROJECT_BINARY_DIR}) diff --git a/velox/dwio/dwrf/reader/BinaryStreamReader.cpp b/velox/dwio/dwrf/reader/BinaryStreamReader.cpp index d7f5fafa398bc..9d8f5a5bf05f4 100644 --- a/velox/dwio/dwrf/reader/BinaryStreamReader.cpp +++ b/velox/dwio/dwrf/reader/BinaryStreamReader.cpp @@ -32,13 +32,17 @@ BinaryStripeStreams::BinaryStripeStreams( const dwio::common::ColumnSelector& selector, const uint32_t stripeIndex) : preload_(true), // TODO: is preload required ? - stripeInfo_{stripeReader.loadStripe(stripeIndex, preload_)}, + stripeReadState_{std::make_shared( + stripeReader.readerBaseShared(), + stripeReader.fetchStripe(stripeIndex, preload_))}, stripeStreams_{ - stripeReader, - selector, + stripeReadState_, + &selector, + nullptr, options_, - stripeInfo_.offset(), - static_cast(stripeInfo_.numberOfRows()), + stripeReadState_->stripeMetadata->stripeInfo.offset(), + static_cast( + stripeReadState_->stripeMetadata->stripeInfo.numberOfRows()), UnsupportedStrideIndexProvider(), stripeIndex} { if (!preload_) { @@ -81,16 +85,16 @@ std::vector BinaryStripeStreams::getStreamIdentifiers( BinaryStreamReader::BinaryStreamReader( const std::shared_ptr& reader, const std::vector& columnIds) - : stripeReaderBase_{reader}, - columnSelector_{reader->getSchema(), columnIds}, - stripeIndex_{0}, - numStripes{folly::to(reader->getFooter().stripesSize())} { - DWIO_ENSURE(!reader->getFooter().hasEncryption(), "encryption not supported"); - DWIO_ENSURE(!columnIds.empty(), "Atleast one column expected to be read"); + : columnSelector_{reader->schema(), columnIds}, + numStripes_{folly::to(reader->footer().stripesSize())}, + stripeReaderBase_{reader}, + stripeIndex_{0} { + VELOX_CHECK(!reader->footer().hasEncryption(), "encryption not supported"); + VELOX_CHECK(!columnIds.empty(), "At least one column expected to be read"); } std::unique_ptr BinaryStreamReader::next() { - if (stripeIndex_ >= numStripes) { + if (stripeIndex_ >= numStripes_) { return nullptr; } return std::make_unique( @@ -101,12 +105,12 @@ std::unordered_map BinaryStreamReader::getStatistics() const { std::unordered_map stats; auto footerStatsSize = - stripeReaderBase_.getReader().getFooter().statisticsSize(); - auto typesSize = stripeReaderBase_.getReader().getFooter().typesSize(); + stripeReaderBase_.getReader().footer().statisticsSize(); + auto typesSize = stripeReaderBase_.getReader().footer().typesSize(); if (footerStatsSize == 0) { - DWIO_ENSURE_EQ( - numStripes, + VELOX_CHECK_EQ( + numStripes_, 0, "Corrupted file detected, Footer stats are missing, but stripes are present"); for (auto node = 0; node < typesSize; node++) { @@ -115,14 +119,14 @@ BinaryStreamReader::getStatistics() const { } } } else { - DWIO_ENSURE_EQ( + VELOX_CHECK_EQ( footerStatsSize, typesSize, "different number of nodes and statistics"); // Node 0 is always selected by ColumnSelector, though this can be // disabled for the current use cases. 
for (auto node = 0; node < footerStatsSize; node++) { if (columnSelector_.shouldReadNode(node)) { stats[node] = - stripeReaderBase_.getReader().getFooter().statistics(node); + stripeReaderBase_.getReader().footer().dwrfStatistics(node); } } } @@ -130,7 +134,7 @@ BinaryStreamReader::getStatistics() const { } uint32_t BinaryStreamReader::getStrideLen() const { - return stripeReaderBase_.getReader().getFooter().rowIndexStride(); + return stripeReaderBase_.getReader().footer().rowIndexStride(); } } // namespace facebook::velox::dwrf::detail diff --git a/velox/dwio/dwrf/reader/BinaryStreamReader.h b/velox/dwio/dwrf/reader/BinaryStreamReader.h index fb6cca972ab67..d418bfa48c1fb 100644 --- a/velox/dwio/dwrf/reader/BinaryStreamReader.h +++ b/velox/dwio/dwrf/reader/BinaryStreamReader.h @@ -52,12 +52,12 @@ class BinaryStripeStreams { } const StripeInformationWrapper& getStripeInfo() const { - return stripeInfo_; + return stripeReadState_->stripeMetadata->stripeInfo; } private: bool preload_; - StripeInformationWrapper stripeInfo_; + std::shared_ptr stripeReadState_; dwio::common::RowReaderOptions options_; StripeStreamsImpl stripeStreams_; folly::F14FastMap> encodingKeys_; @@ -67,7 +67,7 @@ class BinaryStripeStreams { class BinaryStreamReader { public: - explicit BinaryStreamReader( + BinaryStreamReader( const std::shared_ptr& reader, const std::vector& columnIds); @@ -81,13 +81,15 @@ class BinaryStreamReader { return stripeIndex_; } + uint32_t numStripes() const { + return numStripes_; + } + private: - StripeReaderBase stripeReaderBase_; const dwio::common::ColumnSelector columnSelector_; + const uint32_t numStripes_; + StripeReaderBase stripeReaderBase_; uint32_t stripeIndex_; - - public: - const uint32_t numStripes; }; } // namespace facebook::velox::dwrf::detail diff --git a/velox/dwio/dwrf/reader/CMakeLists.txt b/velox/dwio/dwrf/reader/CMakeLists.txt index 360d88bc1cf9a..cae8fcc24bbe1 100644 --- a/velox/dwio/dwrf/reader/CMakeLists.txt +++ b/velox/dwio/dwrf/reader/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
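Returning briefly to the statistics change a few hunks up: the new `ColumnStatisticsWrapper` keeps the protobuf-style `hasX()`/`x()` pairing, and `buildColumnStatisticsFromProto()` folds each pair into a `std::optional`, with an extra NaN guard on the double branch. Two hypothetical helpers expressing those folds (not part of the patch):
```
#include <cmath>
#include <optional>

// Hypothetical helper capturing the hasX()/x() -> std::optional fold used
// throughout buildColumnStatisticsFromProto(); 'has' and 'get' stand in for
// wrapper methods such as hasMinimum()/minimum(), e.g.
//   toOptional<int64_t>([&] { return s.hasMinimum(); },
//                       [&] { return s.minimum(); });
template <typename T, typename Has, typename Get>
std::optional<T> toOptional(Has&& has, Get&& get) {
  return has() ? std::optional<T>(get()) : std::nullopt;
}

// The NaN guard from the double-statistics branch: any NaN in min/max/sum
// invalidates all three, since comparisons against NaN are meaningless.
bool hasNan(
    const std::optional<double>& min,
    const std::optional<double>& max,
    const std::optional<double>& sum) {
  return (min && std::isnan(*min)) || (max && std::isnan(*max)) ||
      (sum && std::isnan(*sum));
}
```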
-add_library( +velox_add_library( velox_dwio_dwrf_reader BinaryStreamReader.cpp ColumnReader.cpp @@ -35,6 +35,11 @@ add_library( StripeReaderBase.cpp StripeStream.cpp) -target_link_libraries( - velox_dwio_dwrf_reader velox_dwio_common velox_dwio_dwrf_common velox_caching - velox_dwio_dwrf_utils fmt::fmt) +velox_link_libraries( + velox_dwio_dwrf_reader + velox_dwio_common + velox_dwio_dwrf_common + velox_caching + velox_dwio_dwrf_utils + velox_test_util + fmt::fmt) diff --git a/velox/dwio/dwrf/reader/ColumnReader.cpp b/velox/dwio/dwrf/reader/ColumnReader.cpp index 4e5f10e6bd67b..9a11e9ebb5679 100644 --- a/velox/dwio/dwrf/reader/ColumnReader.cpp +++ b/velox/dwio/dwrf/reader/ColumnReader.cpp @@ -17,6 +17,7 @@ #include "velox/dwio/dwrf/reader/ColumnReader.h" #include "velox/dwio/common/IntCodecCommon.h" #include "velox/dwio/common/IntDecoder.h" +#include "velox/dwio/common/ParallelFor.h" #include "velox/dwio/common/TypeUtils.h" #include "velox/dwio/common/exception/Exceptions.h" #include "velox/dwio/dwrf/common/DecoderUtil.h" @@ -133,14 +134,14 @@ void ColumnReader::readNulls( } ColumnReader::ColumnReader( - std::shared_ptr nodeType, + std::shared_ptr fileType, StripeStreams& stripe, const StreamLabels& streamLabels, FlatMapContext flatMapContext) - : nodeType_(std::move(nodeType)), + : fileType_(std::move(fileType)), memoryPool_(stripe.getMemoryPool()), flatMapContext_(std::move(flatMapContext)) { - EncodingKey encodingKey{nodeType_->id(), flatMapContext_.sequence}; + EncodingKey encodingKey{fileType_->id(), flatMapContext_.sequence}; std::unique_ptr stream = stripe.getStream( encodingKey.forKind(proto::Stream_Kind_PRESENT), streamLabels.label(), @@ -195,15 +196,15 @@ std::enable_if_t> expandBytes( } } -template +template class ByteRleColumnReader : public ColumnReader { private: std::unique_ptr rle; public: ByteRleColumnReader( - std::shared_ptr nodeType, TypePtr requestedType, + std::shared_ptr fileType, StripeStreams& stripe, const StreamLabels& streamLabels, std::function( @@ -211,12 +212,12 @@ class ByteRleColumnReader : public ColumnReader { const EncodingKey&)> creator, FlatMapContext flatMapContext) : ColumnReader( - std::move(nodeType), + std::move(fileType), stripe, streamLabels, std::move(flatMapContext)), requestedType_{std::move(requestedType)} { - EncodingKey encodingKey{nodeType_->id(), flatMapContext_.sequence}; + EncodingKey encodingKey{fileType_->id(), flatMapContext_.sequence}; rle = creator( stripe.getStream( encodingKey.forKind(proto::Stream_Kind_DATA), @@ -235,8 +236,8 @@ class ByteRleColumnReader : public ColumnReader { const TypePtr requestedType_; }; -template -uint64_t ByteRleColumnReader::skip( +template +uint64_t ByteRleColumnReader::skip( uint64_t numValues) { numValues = ColumnReader::skip(numValues); rle->skip(numValues); @@ -257,8 +258,8 @@ VectorPtr makeFlatVector( return flatVector; } -template -void ByteRleColumnReader::next( +template +void ByteRleColumnReader::next( uint64_t numValues, VectorPtr& result, const uint64_t* incomingNulls) { @@ -299,10 +300,10 @@ void ByteRleColumnReader::next( // Handle upcast if constexpr ( - !std::is_same_v && - (std::is_same_v || - sizeof(DataType) < sizeof(RequestedType))) { - expandBytes(valuesPtr, numValues); + !std::is_same_v && + (std::is_same_v || + sizeof(FileType) < sizeof(RequestedType))) { + expandBytes(valuesPtr, numValues); } } @@ -333,17 +334,6 @@ struct TemplatedReadHelper { } }; -template -struct TemplatedReadHelper { - static void nextValues( - IntDecoderT& decoder, - Date* data, - uint64_t numValues, - 
const uint64_t* nulls) { - decoder.nextInts(reinterpret_cast(data), numValues, nulls); - } -}; - template struct TemplatedReadHelper { static void nextValues( @@ -377,8 +367,8 @@ class DecimalColumnReader : public ColumnReader { public: DecimalColumnReader( - std::shared_ptr nodeType, TypePtr requestedType, + std::shared_ptr fileType, StripeStreams& stripe, const StreamLabels& streamLabels, FlatMapContext flatMapContext); @@ -393,18 +383,18 @@ class DecimalColumnReader : public ColumnReader { template DecimalColumnReader::DecimalColumnReader( - std::shared_ptr nodeType, TypePtr requestedType, + std::shared_ptr fileType, StripeStreams& stripe, const StreamLabels& streamLabels, FlatMapContext flatMapContext) : ColumnReader( - std::move(nodeType), + std::move(fileType), stripe, streamLabels, std::move(flatMapContext)), requestedType_(std::move(requestedType)) { - EncodingKey encodingKey{nodeType_->id(), flatMapContext_.sequence}; + EncodingKey encodingKey{fileType_->id(), flatMapContext_.sequence}; if constexpr (std::is_same_v) { scale_ = requestedType_->asShortDecimal().scale(); } else { @@ -483,8 +473,8 @@ template class IntegerDirectColumnReader : public ColumnReader { public: IntegerDirectColumnReader( - std::shared_ptr nodeType, TypePtr requestedType, + std::shared_ptr fileType, StripeStreams& stripe, const StreamLabels& streamLabels, uint32_t numBytes, @@ -504,19 +494,19 @@ class IntegerDirectColumnReader : public ColumnReader { template IntegerDirectColumnReader::IntegerDirectColumnReader( - std::shared_ptr nodeType, TypePtr requestedType, + std::shared_ptr fileType, StripeStreams& stripe, const StreamLabels& streamLabels, uint32_t numBytes, FlatMapContext flatMapContext) : ColumnReader( - std::move(nodeType), + std::move(fileType), stripe, streamLabels, std::move(flatMapContext)), requestedType_{std::move(requestedType)} { - EncodingKey encodingKey{nodeType_->id(), flatMapContext_.sequence}; + EncodingKey encodingKey{fileType_->id(), flatMapContext_.sequence}; auto data = encodingKey.forKind(proto::Stream_Kind_DATA); bool dataVInts = stripe.getUseVInts(data); if (stripe.format() == DwrfFormat::kDwrf) { @@ -580,8 +570,8 @@ template class IntegerDictionaryColumnReader : public ColumnReader { public: IntegerDictionaryColumnReader( - std::shared_ptr nodeType, TypePtr requestedType, + std::shared_ptr fileType, StripeStreams& stripe, const StreamLabels& streamLabels, uint32_t numBytes, @@ -648,19 +638,19 @@ class IntegerDictionaryColumnReader : public ColumnReader { template IntegerDictionaryColumnReader::IntegerDictionaryColumnReader( - std::shared_ptr nodeType, TypePtr requestedType, + std::shared_ptr fileType, StripeStreams& stripe, const StreamLabels& streamLabels, uint32_t numBytes, FlatMapContext flatMapContext) : ColumnReader( - std::move(nodeType), + std::move(fileType), stripe, streamLabels, std::move(flatMapContext)), requestedType_{std::move(requestedType)} { - EncodingKey encodingKey{nodeType_->id(), flatMapContext_.sequence}; + EncodingKey encodingKey{fileType_->id(), flatMapContext_.sequence}; auto encoding = stripe.getEncoding(encodingKey); dictionarySize = encoding.dictionarysize(); @@ -767,7 +757,7 @@ class TimestampColumnReader : public ColumnReader { public: TimestampColumnReader( - std::shared_ptr nodeType, + std::shared_ptr fileType, StripeStreams& stripe, const StreamLabels& streamLabels, FlatMapContext flatMapContext); @@ -780,16 +770,16 @@ class TimestampColumnReader : public ColumnReader { }; TimestampColumnReader::TimestampColumnReader( - std::shared_ptr nodeType, 
+ std::shared_ptr fileType, StripeStreams& stripe, const StreamLabels& streamLabels, FlatMapContext flatMapContext) : ColumnReader( - std::move(nodeType), + std::move(fileType), stripe, streamLabels, std::move(flatMapContext)) { - EncodingKey encodingKey{nodeType_->id(), flatMapContext_.sequence}; + EncodingKey encodingKey{fileType_->id(), flatMapContext_.sequence}; RleVersion vers = convertRleVersion(stripe.getEncoding(encodingKey).kind()); auto data = encodingKey.forKind(proto::Stream_Kind_DATA); bool vints = stripe.getUseVInts(data); @@ -873,8 +863,8 @@ template class FloatingPointColumnReader : public ColumnReader { public: FloatingPointColumnReader( - std::shared_ptr nodeType, TypePtr requestedType, + std::shared_ptr fileType, StripeStreams& stripe, const StreamLabels& streamLabels, FlatMapContext flatMapContext); @@ -927,19 +917,19 @@ class FloatingPointColumnReader : public ColumnReader { template FloatingPointColumnReader::FloatingPointColumnReader( - std::shared_ptr nodeType, TypePtr requestedType, + std::shared_ptr fileType, StripeStreams& stripe, const StreamLabels& streamLabels, FlatMapContext flatMapContext) : ColumnReader( - std::move(nodeType), + std::move(fileType), stripe, streamLabels, std::move(flatMapContext)), requestedType_{std::move(requestedType)}, inputStream(stripe.getStream( - EncodingKey{nodeType_->id(), flatMapContext_.sequence}.forKind( + EncodingKey{fileType_->id(), flatMapContext_.sequence}.forKind( proto::Stream_Kind_DATA), streamLabels.label(), true)), @@ -956,7 +946,7 @@ uint64_t FloatingPointColumnReader::skip(uint64_t numValues) { if (remaining >= toSkip) { bufferPointer += toSkip; } else { - inputStream->Skip(static_cast(toSkip - remaining)); + inputStream->SkipInt64(toSkip - remaining); bufferEnd = nullptr; bufferPointer = nullptr; } @@ -1118,7 +1108,6 @@ class StringDictionaryColumnReader : public ColumnReader { std::unique_ptr strideDictStream_; std::unique_ptr> strideDictLengthDecoder_; - FlatVectorPtr combinedDictionaryValues_; FlatVectorPtr dictionaryValues_; @@ -1140,14 +1129,14 @@ class StringDictionaryColumnReader : public ColumnReader { }; StringDictionaryColumnReader::StringDictionaryColumnReader( - std::shared_ptr nodeType, + std::shared_ptr fileType, StripeStreams& stripe, const StreamLabels& streamLabels, const EncodingKey& encodingKey, const RleVersion& rleVersion, FlatMapContext flatMapContext) : ColumnReader( - std::move(nodeType), + std::move(fileType), stripe, streamLabels, std::move(flatMapContext)), @@ -1158,7 +1147,7 @@ StringDictionaryColumnReader::StringDictionaryColumnReader( encodingKey.forKind(proto::Stream_Kind_DICTIONARY_DATA), streamLabels.label(), false)), - returnFlatVector_(stripe.getRowReaderOptions().getReturnFlatVector()) { + returnFlatVector_(stripe.rowReaderOptions().returnFlatVector()) { MakeRleDecoderParams params{ .encodingKey = encodingKey, .stripe = stripe, @@ -1424,7 +1413,7 @@ void StringDictionaryColumnReader::readDictionaryVector( combinedDictionaryValues_ = std::make_shared>( &memoryPool_, - nodeType_->type(), + fileType_->type(), BufferPtr(nullptr), // TODO nulls dictionaryCount_ + strideDictCount_ /*length*/, values, @@ -1446,7 +1435,7 @@ void StringDictionaryColumnReader::readDictionaryVector( dictionaryValues_ = std::make_shared>( &memoryPool_, - nodeType_->type(), + fileType_->type(), BufferPtr(nullptr), // TODO nulls dictionaryCount_ /*length*/, values, @@ -1618,7 +1607,7 @@ class StringDirectColumnReader : public ColumnReader { public: StringDirectColumnReader( - std::shared_ptr nodeType, + 
@@ -1618,7 +1607,7 @@ class StringDirectColumnReader : public ColumnReader {
  public:
   StringDirectColumnReader(
-      std::shared_ptr<const dwio::common::TypeWithId> nodeType,
+      std::shared_ptr<const dwio::common::TypeWithId> fileType,
       StripeStreams& stripe,
       const StreamLabels& streamLabels,
       FlatMapContext flatMapContext);
@@ -1631,16 +1620,16 @@ class StringDirectColumnReader : public ColumnReader {
 };
 
 StringDirectColumnReader::StringDirectColumnReader(
-    std::shared_ptr<const dwio::common::TypeWithId> nodeType,
+    std::shared_ptr<const dwio::common::TypeWithId> fileType,
     StripeStreams& stripe,
     const StreamLabels& streamLabels,
     FlatMapContext flatMapContext)
     : ColumnReader(
-          std::move(nodeType),
+          std::move(fileType),
          stripe,
          streamLabels,
          std::move(flatMapContext)) {
-  EncodingKey encodingKey{nodeType_->id(), flatMapContext_.sequence};
+  EncodingKey encodingKey{fileType_->id(), flatMapContext_.sequence};
   RleVersion rleVersion = convertRleVersion(stripe.getEncoding(encodingKey).kind());
   auto lenId = encodingKey.forKind(proto::Stream_Kind_LENGTH);
@@ -1667,7 +1656,7 @@ uint64_t StringDirectColumnReader::skip(uint64_t numValues) {
     totalBytes += computeSize(buffer.data(), nullptr, step);
     done += step;
   }
-  blobStream->Skip(static_cast<int32_t>(totalBytes));
+  blobStream->SkipInt64(static_cast<int64_t>(totalBytes));
   return numValues;
 }
@@ -1751,7 +1740,7 @@ void StringDirectColumnReader::next(
   } else {
     result = std::make_shared<FlatVector<StringView>>(
         &memoryPool_,
-        nodeType_->type(),
+        fileType_->type(),
         nulls,
         numValues,
         values,
@@ -1764,14 +1753,19 @@ class StructColumnReader : public ColumnReader {
  private:
   const std::shared_ptr<const dwio::common::TypeWithId> requestedType_;
   std::vector<std::unique_ptr<ColumnReader>> children_;
+  folly::Executor* executor_;
+  std::unique_ptr<dwio::common::ParallelFor> parallelForOnChildren_;
 
  public:
   StructColumnReader(
       const std::shared_ptr<const dwio::common::TypeWithId>& requestedType,
-      const std::shared_ptr<const dwio::common::TypeWithId>& dataType,
+      const std::shared_ptr<const dwio::common::TypeWithId>& fileType,
       StripeStreams& stripe,
       const StreamLabels& streamLabels,
-      FlatMapContext flatMapContext);
+      folly::Executor* executor,
+      size_t decodingParallelismFactor,
+      FlatMapContext flatMapContext,
+      ColumnReaderFactory& factory);
 
   ~StructColumnReader() override = default;
 
   uint64_t skip(uint64_t numValues) override;
@@ -1794,39 +1788,50 @@ FlatMapContext makeCopyWithNullDecoder(FlatMapContext& original) {
 // that consumes current column projection which is to be refactored
 StructColumnReader::StructColumnReader(
     const std::shared_ptr<const dwio::common::TypeWithId>& requestedType,
-    const std::shared_ptr<const dwio::common::TypeWithId>& dataType,
+    const std::shared_ptr<const dwio::common::TypeWithId>& fileType,
     StripeStreams& stripe,
     const StreamLabels& streamLabels,
-    FlatMapContext flatMapContext)
-    : ColumnReader(dataType, stripe, streamLabels, std::move(flatMapContext)),
-      requestedType_{requestedType} {
+    folly::Executor* executor,
+    size_t decodingParallelismFactor,
+    FlatMapContext flatMapContext,
+    ColumnReaderFactory& factory)
+    : ColumnReader(fileType, stripe, streamLabels, std::move(flatMapContext)),
+      requestedType_{requestedType},
+      executor_{executor} {
   DWIO_ENSURE_EQ(
-      nodeType_->id(),
-      dataType->id(),
-      "nodeType and dataType id mismatch in StructColumnReader#init");
-  EncodingKey encodingKey{nodeType_->id(), flatMapContext_.sequence};
+      fileType_->id(),
+      fileType->id(),
+      "fileType and fileType id mismatch in StructColumnReader#init");
+  EncodingKey encodingKey{fileType_->id(), flatMapContext_.sequence};
   auto encoding = static_cast<int64_t>(stripe.getEncoding(encodingKey).kind());
   DWIO_ENSURE_EQ(
       encoding,
       proto::ColumnEncoding_Kind_DIRECT,
       "Unknown encoding for StructColumnReader");
+  // Can parallelize if top level and doesn't have any flatmap children
+  bool canParallelize = fileType->parent() == nullptr; // isTopLevel ?
+
   // count the number of selected sub-columns
   const auto& cs = stripe.getColumnSelector();
-  auto project = stripe.getRowReaderOptions().getProjectSelectedType();
+  auto project = stripe.rowReaderOptions().projectSelectedType();
   for (uint64_t i = 0; i < requestedType_->size(); ++i) {
     auto& child = requestedType_->childAt(i);
 
     // if the requested field is not in file, we either return null reader
     // or constant reader based on its expression
     if (cs.shouldReadNode(child->id())) {
-      if (i < nodeType_->size()) {
-        children_.push_back(ColumnReader::build(
+      if (i < fileType_->size()) {
+        auto childColumnReader = factory.build(
             child,
-            nodeType_->childAt(i),
+            fileType_->childAt(i),
             stripe,
             streamLabels.append(folly::to<std::string>(i)),
-            makeCopyWithNullDecoder(flatMapContext_)));
+            executor,
+            decodingParallelismFactor,
+            makeCopyWithNullDecoder(flatMapContext_));
+        canParallelize = canParallelize && !childColumnReader->isFlatMap();
+        children_.push_back(std::move(childColumnReader));
       } else {
         children_.push_back(
            std::make_unique<NullColumnReader>(stripe, child->type()));
@@ -1835,6 +1840,12 @@ StructColumnReader::StructColumnReader(
       children_.emplace_back();
     }
   }
+
+  parallelForOnChildren_ = std::make_unique<dwio::common::ParallelFor>(
+      executor,
+      0,
+      children_.size(),
+      canParallelize ? decodingParallelismFactor : 0);
 }
 
 uint64_t StructColumnReader::skip(uint64_t numValues) {
@@ -1858,10 +1869,9 @@ void StructColumnReader::next(
     // the parent vector.
     childrenVectors = rowVector->children();
     DWIO_ENSURE_GE(childrenVectors.size(), children_.size());
-  }
 
-  if (result) {
-    result->resize(numValues, false);
+    // Resize rowVector
+    rowVector->unsafeResize(numValues, false);
   }
 
   BufferPtr nulls = readNulls(numValues, result, incomingNulls);
@@ -1879,12 +1889,14 @@ void StructColumnReader::next(
     childrenVectorsPtr = &childrenVectors;
   }
 
-  for (uint64_t i = 0; i < children_.size(); ++i) {
-    auto& reader = children_[i];
-    if (reader) {
-      reader->next(numValues, (*childrenVectorsPtr)[i], nullsPtr);
-    }
-  }
+  VELOX_CHECK(parallelForOnChildren_, "ParallelFor should be initialized");
+  parallelForOnChildren_->execute(
+      [this, numValues, childrenVectorsPtr, nullsPtr](size_t i) {
+        auto& reader = children_[i];
+        if (reader) {
+          reader->next(numValues, (*childrenVectorsPtr)[i], nullsPtr);
+        }
+      });
 
   if (result) {
     result->setNullCount(nullCount);
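This is the heart of the change: instead of decoding struct children one by one, `next()` hands the per-child work to a `ParallelFor` sized in the constructor (factor 0 when the reader is nested or has flatmap children, which forces the serial path). The real helper lives in `velox/dwio/common`; the sketch below is a minimal stand-in under that assumed contract, showing why `execute()` must block until every child finishes before `next()` returns:

```
#include <folly/Executor.h>

#include <cstddef>
#include <functional>
#include <latch>

// Minimal stand-in for a ParallelFor-style helper (assumed interface, not
// the actual velox/dwio/common implementation): run fn(i) for i in
// [from, to), fanning out to an executor when a parallelism factor is set.
class ParallelForSketch {
 public:
  ParallelForSketch(
      folly::Executor* executor,
      size_t from,
      size_t to,
      size_t parallelismFactor)
      : executor_{executor},
        from_{from},
        to_{to},
        parallelismFactor_{parallelismFactor} {}

  void execute(const std::function<void(size_t)>& fn) {
    // Factor 0 (or no executor) degenerates to the old serial loop.
    if (executor_ == nullptr || parallelismFactor_ == 0) {
      for (size_t i = from_; i < to_; ++i) {
        fn(i);
      }
      return;
    }
    // One task per index; the real helper batches indices by the factor.
    std::latch done{static_cast<std::ptrdiff_t>(to_ - from_)};
    for (size_t i = from_; i < to_; ++i) {
      executor_->add([&fn, &done, i] {
        fn(i);
        done.count_down();
      });
    }
    // Block until all children are decoded; next() must not return early.
    done.wait();
  }

 private:
  folly::Executor* const executor_;
  const size_t from_;
  const size_t to_;
  const size_t parallelismFactor_;
};
```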
@@ -1922,10 +1934,13 @@ class ListColumnReader : public ColumnReader {
  public:
   ListColumnReader(
       const std::shared_ptr<const dwio::common::TypeWithId>& requestedType,
-      const std::shared_ptr<const dwio::common::TypeWithId>& dataType,
+      const std::shared_ptr<const dwio::common::TypeWithId>& fileType,
       StripeStreams& stripe,
       const StreamLabels& streamLabels,
-      FlatMapContext flatMapContext);
+      FlatMapContext flatMapContext,
+      folly::Executor* executor,
+      size_t decodingParallelismFactor,
+      ColumnReaderFactory& factory);
 
   ~ListColumnReader() override = default;
 
   uint64_t skip(uint64_t numValues) override;
@@ -1936,14 +1951,17 @@
 ListColumnReader::ListColumnReader(
     const std::shared_ptr<const dwio::common::TypeWithId>& requestedType,
-    const std::shared_ptr<const dwio::common::TypeWithId>& dataType,
+    const std::shared_ptr<const dwio::common::TypeWithId>& fileType,
     StripeStreams& stripe,
     const StreamLabels& streamLabels,
-    FlatMapContext flatMapContext)
-    : ColumnReader(dataType, stripe, streamLabels, std::move(flatMapContext)),
+    FlatMapContext flatMapContext,
+    folly::Executor* executor,
+    size_t decodingParallelismFactor,
+    ColumnReaderFactory& factory)
+    : ColumnReader(fileType, stripe, streamLabels, std::move(flatMapContext)),
      requestedType_{requestedType} {
-  DWIO_ENSURE_EQ(nodeType_->id(), dataType->id(), "working on the same node");
-  EncodingKey encodingKey{nodeType_->id(), flatMapContext_.sequence};
+  DWIO_ENSURE_EQ(fileType_->id(), fileType->id(), "working on the same node");
+  EncodingKey encodingKey{fileType_->id(), flatMapContext_.sequence};
 
   // count the number of selected sub-columns
   RleVersion vers = convertRleVersion(stripe.getEncoding(encodingKey).kind());
@@ -1959,11 +1977,13 @@ ListColumnReader::ListColumnReader(
   const auto& cs = stripe.getColumnSelector();
   auto& childType = requestedType_->childAt(0);
   if (cs.shouldReadNode(childType->id())) {
-    child = ColumnReader::build(
+    child = factory.build(
         childType,
-        nodeType_->childAt(0),
+        fileType_->childAt(0),
         stripe,
         streamLabels,
+        executor,
+        decodingParallelismFactor,
         makeCopyWithNullDecoder(flatMapContext_));
   }
 }
@@ -2087,10 +2107,13 @@ class MapColumnReader : public ColumnReader {
  public:
   MapColumnReader(
       const std::shared_ptr<const dwio::common::TypeWithId>& requestedType,
-      const std::shared_ptr<const dwio::common::TypeWithId>& dataType,
+      const std::shared_ptr<const dwio::common::TypeWithId>& fileType,
       StripeStreams& stripe,
       const StreamLabels& streamLabels,
-      FlatMapContext flatMapContext);
+      FlatMapContext flatMapContext,
+      folly::Executor* executor,
+      size_t decodingParallelismFactor,
+      ColumnReaderFactory& factory);
 
   ~MapColumnReader() override = default;
 
   uint64_t skip(uint64_t numValues) override;
@@ -2101,14 +2124,17 @@
 MapColumnReader::MapColumnReader(
     const std::shared_ptr<const dwio::common::TypeWithId>& requestedType,
-    const std::shared_ptr<const dwio::common::TypeWithId>& dataType,
+    const std::shared_ptr<const dwio::common::TypeWithId>& fileType,
     StripeStreams& stripe,
     const StreamLabels& streamLabels,
-    FlatMapContext flatMapContext)
-    : ColumnReader(dataType, stripe, streamLabels, std::move(flatMapContext)),
+    FlatMapContext flatMapContext,
+    folly::Executor* executor,
+    size_t decodingParallelismFactor,
+    ColumnReaderFactory& factory)
+    : ColumnReader(fileType, stripe, streamLabels, std::move(flatMapContext)),
       requestedType_{requestedType} {
-  DWIO_ENSURE_EQ(nodeType_->id(), dataType->id(), "working on the same node");
-  EncodingKey encodingKey{nodeType_->id(), flatMapContext_.sequence};
+  DWIO_ENSURE_EQ(fileType_->id(), fileType->id(), "working on the same node");
+  EncodingKey encodingKey{fileType_->id(), flatMapContext_.sequence};
 
   // Determine if the key and/or value columns are selected
   RleVersion vers = convertRleVersion(stripe.getEncoding(encodingKey).kind());
@@ -2124,25 +2150,29 @@ MapColumnReader::MapColumnReader(
   const auto& cs = stripe.getColumnSelector();
   auto& keyType = requestedType_->childAt(0);
   if (cs.shouldReadNode(keyType->id())) {
-    keyReader = ColumnReader::build(
+    keyReader = factory.build(
         keyType,
-        nodeType_->childAt(0),
+        fileType_->childAt(0),
         stripe,
         streamLabels,
+        executor,
+        decodingParallelismFactor,
         makeCopyWithNullDecoder(flatMapContext_));
   }
 
   auto& valueType = requestedType_->childAt(1);
   if (cs.shouldReadNode(valueType->id())) {
-    elementReader = ColumnReader::build(
+    elementReader = factory.build(
         valueType,
-        nodeType_->childAt(1),
+        fileType_->childAt(1),
         stripe,
         streamLabels,
+        executor,
+        decodingParallelismFactor,
         makeCopyWithNullDecoder(flatMapContext_));
   }
 
-  VLOG(1) << "[Map] Initialized map column reader for node " << nodeType_->id();
+  VLOG(1) << "[Map] Initialized map column reader for node " << fileType_->id();
 }
 
 uint64_t MapColumnReader::skip(uint64_t numValues) {
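A second structural change threads through every complex-type reader: the static `ColumnReader::build` call is replaced by `factory.build(...)`, so the executor and `decodingParallelismFactor` reach every level of the recursive construction. A simplified, hypothetical sketch of that shape (none of these types are the real Velox declarations):

```
#include <cstddef>
#include <memory>
#include <vector>

// Hypothetical, simplified types -- not the real Velox declarations.
struct Executor;

struct SchemaNode {
  std::vector<SchemaNode> children;
};

struct Reader {
  std::vector<std::unique_ptr<Reader>> children;
};

// Because construction goes through a factory object rather than a static
// function, settings like the executor and parallelism factor are carried
// into every recursive build of child readers.
struct ReaderFactory {
  Executor* executor = nullptr;
  size_t decodingParallelismFactor = 0;

  std::unique_ptr<Reader> build(const SchemaNode& node) {
    auto reader = std::make_unique<Reader>();
    for (const auto& child : node.children) {
      reader->children.push_back(build(child)); // same factory, same settings
    }
    return reader;
  }
};
```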
   switch (requestedType->kind()) {
     case TypeKind::BOOLEAN:
       return std::make_unique<ByteRleColumnReader<DataT, bool>>(
-          nodeType,
           std::move(requestedType),
+          fileType,
           stripe,
           streamLabels,
           RleDecoderFactory<DataT>::get(),
           std::move(flatMapContext));
     case TypeKind::TINYINT:
       return std::make_unique<ByteRleColumnReader<DataT, int8_t>>(
-          nodeType,
           std::move(requestedType),
+          fileType,
           stripe,
           streamLabels,
           RleDecoderFactory<DataT>::get(),
           std::move(flatMapContext));
     case TypeKind::SMALLINT:
       return std::make_unique<ByteRleColumnReader<DataT, int16_t>>(
-          nodeType,
           std::move(requestedType),
+          fileType,
           stripe,
           streamLabels,
           RleDecoderFactory<DataT>::get(),
           std::move(flatMapContext));
     case TypeKind::INTEGER:
       return std::make_unique<ByteRleColumnReader<DataT, int32_t>>(
-          nodeType,
           std::move(requestedType),
+          fileType,
           stripe,
           streamLabels,
           RleDecoderFactory<DataT>::get(),
           std::move(flatMapContext));
     case TypeKind::BIGINT:
       return std::make_unique<ByteRleColumnReader<DataT, int64_t>>(
-          nodeType,
           std::move(requestedType),
+          fileType,
           stripe,
           streamLabels,
           RleDecoderFactory<DataT>::get(),
@@ -2332,8 +2362,8 @@ std::unique_ptr<ColumnReader> buildByteRleColumnReader(
 template