From dcc82c91d3d52147d38c6c89b5e8a5794fe395d1 Mon Sep 17 00:00:00 2001 From: "Ma, Rong" Date: Thu, 11 Apr 2024 09:47:24 +0800 Subject: [PATCH] address comments --- velox/functions/sparksql/Hash.cpp | 91 ++++++++++++++++--------------- 1 file changed, 47 insertions(+), 44 deletions(-) diff --git a/velox/functions/sparksql/Hash.cpp b/velox/functions/sparksql/Hash.cpp index 7ec03cc011e6..f1a1279dce52 100644 --- a/velox/functions/sparksql/Hash.cpp +++ b/velox/functions/sparksql/Hash.cpp @@ -29,6 +29,8 @@ const int32_t kDefaultSeed = 42; struct Murmur3Hash; struct XxHash64; +/// A template struct that contains the seed and return type of the hash +/// function. template struct HashTraits {}; @@ -44,12 +46,9 @@ struct HashTraits { using ReturnType = int64_t; }; -template < - typename HashClass, - typename SeedType, - typename ReturnType, - typename T> -ReturnType hashOne(T input, SeedType seed) { +// Computes the hash value of input using the hash function in HashClass. +template +ReturnType hashOne(int32_t input, SeedType seed) { return HashClass::hashInt32(input, seed); } @@ -83,54 +82,56 @@ ReturnType hashOne(StringView input, SeedType seed) { return HashClass::hashBytes(input, seed); } +/// Class to compute hashes identical to one produced by Spark. +/// Hashes are computed using the algorithm implemented in HashClass. template < typename HashClass, - TypeKind kind, typename SeedType = typename HashTraits::SeedType, typename ReturnType = typename HashTraits::ReturnType> -class PrimitiveVectorHasher; +class SparkVectorHasher { + public: + SparkVectorHasher(DecodedVector& decoded) : decoded_(decoded) {} + + virtual ~SparkVectorHasher() = default; + + // Compute the hash value of input vector at index. + ReturnType hashAt(vector_size_t index, SeedType seed) { + if (decoded_.isNullAt(index)) { + return seed; + } + return hashNotNull(index, seed); + } + + virtual ReturnType hashNotNull(vector_size_t index, SeedType seed) = 0; + + protected: + const DecodedVector& decoded_; +}; template < typename HashClass, + TypeKind kind, typename SeedType = typename HashTraits::SeedType, typename ReturnType = typename HashTraits::ReturnType> -class ArrayVectorHasher; +class PrimitiveVectorHasher; template < typename HashClass, typename SeedType = typename HashTraits::SeedType, typename ReturnType = typename HashTraits::ReturnType> -class MapVectorHasher; +class ArrayVectorHasher; template < typename HashClass, typename SeedType = typename HashTraits::SeedType, typename ReturnType = typename HashTraits::ReturnType> -class RowVectorHasher; +class MapVectorHasher; template < typename HashClass, typename SeedType = typename HashTraits::SeedType, typename ReturnType = typename HashTraits::ReturnType> -class SparkVectorHasher { - public: - // Compute the hash value of input vector at index. - ReturnType hashAt(vector_size_t index, SeedType seed) { - if (decoded_.isNullAt(index)) { - return seed; - } - return hashNotNull(index, seed); - } - - virtual ReturnType hashNotNull(vector_size_t index, SeedType seed) = 0; - - SparkVectorHasher(DecodedVector& decoded) : decoded_(decoded) {} - - virtual ~SparkVectorHasher() = default; - - protected: - const DecodedVector& decoded_; -}; +class RowVectorHasher; template std::shared_ptr> createPrimitiveVectorHasher( @@ -141,18 +142,20 @@ std::shared_ptr> createPrimitiveVectorHasher( template std::shared_ptr> createVectorHasher( DecodedVector& decoded) { - auto baseType = decoded.base()->type(); - if (baseType->isPrimitiveType()) { - return VELOX_DYNAMIC_SCALAR_TEMPLATE_TYPE_DISPATCH( - createPrimitiveVectorHasher, HashClass, baseType->kind(), decoded); - } else if (baseType->isArray()) { - return std::make_shared>(decoded); - } else if (baseType->isMap()) { - return std::make_shared>(decoded); - } else if (baseType->isRow()) { - return std::make_shared>(decoded); - } - VELOX_UNREACHABLE(); + switch (decoded.base()->typeKind()) { + case TypeKind::ARRAY: + return std::make_shared>(decoded); + case TypeKind::MAP: + return std::make_shared>(decoded); + case TypeKind::ROW: + return std::make_shared>(decoded); + default: + return VELOX_DYNAMIC_SCALAR_TEMPLATE_TYPE_DISPATCH( + createPrimitiveVectorHasher, + HashClass, + decoded.base()->typeKind(), + decoded); + } } template < @@ -288,7 +291,7 @@ void applyWithType( SeedType hashSeed = seed ? *seed : kDefaultSeed; auto& result = *resultRef->as>(); - rows.applyToSelected([&](int row) { result.set(row, hashSeed); }); + rows.applyToSelected([&](auto row) { result.set(row, hashSeed); }); exec::LocalSelectivityVector selectedMinusNulls(context); @@ -304,7 +307,7 @@ void applyWithType( } auto hasher = createVectorHasher(*decoded); - selected->applyToSelected([&](int row) { + selected->applyToSelected([&](auto row) { result.set(row, hasher->hashNotNull(row, result.valueAt(row))); }); }