diff --git a/velox/functions/sparksql/Hash.cpp b/velox/functions/sparksql/Hash.cpp index 19fc7a794012..92631c5dfffd 100644 --- a/velox/functions/sparksql/Hash.cpp +++ b/velox/functions/sparksql/Hash.cpp @@ -29,23 +29,6 @@ const int32_t kDefaultSeed = 42; struct Murmur3Hash; struct XxHash64; -// A template struct that contains the seed and return type of the hash -// function. -template -struct HashTraits {}; - -template <> -struct HashTraits { - using SeedType = int32_t; - using ReturnType = int32_t; -}; - -template <> -struct HashTraits { - using SeedType = int64_t; - using ReturnType = int64_t; -}; - // Computes the hash value of input using the hash function in HashClass. template ReturnType hashOne(int32_t input, SeedType seed) { @@ -82,39 +65,26 @@ ReturnType hashOne(StringView input, SeedType seed) { return HashClass::hashBytes(input, seed); } -template < - typename HashClass, - TypeKind kind, - typename SeedType = typename HashTraits::SeedType, - typename ReturnType = typename HashTraits::ReturnType> +template class PrimitiveVectorHasher; -template < - typename HashClass, - typename SeedType = typename HashTraits::SeedType, - typename ReturnType = typename HashTraits::ReturnType> +template class ArrayVectorHasher; -template < - typename HashClass, - typename SeedType = typename HashTraits::SeedType, - typename ReturnType = typename HashTraits::ReturnType> +template class MapVectorHasher; -template < - typename HashClass, - typename SeedType = typename HashTraits::SeedType, - typename ReturnType = typename HashTraits::ReturnType> +template class RowVectorHasher; // Class to compute hashes identical to one produced by Spark. // Hashes are computed using the algorithm implemented in HashClass. -template < - typename HashClass, - typename SeedType = typename HashTraits::SeedType, - typename ReturnType = typename HashTraits::ReturnType> +template class SparkVectorHasher { public: + using SeedType = typename HashClass::SeedType; + using ReturnType = typename HashClass::ReturnType; + SparkVectorHasher(DecodedVector& decoded) : decoded_(decoded) {} virtual ~SparkVectorHasher() = default; @@ -131,17 +101,14 @@ class SparkVectorHasher { ReturnType hashNotNullAt(vector_size_t index, SeedType seed) { switch (decoded_.base()->typeKind()) { case TypeKind::ARRAY: - return static_cast*>( - this) - ->hashNotNullAt(index, seed); + return static_cast*>(this)->hashValueAt( + index, seed); case TypeKind::MAP: - return static_cast*>( - this) - ->hashNotNullAt(index, seed); + return static_cast*>(this)->hashValueAt( + index, seed); case TypeKind::ROW: - return static_cast*>( - this) - ->hashNotNullAt(index, seed); + return static_cast*>(this)->hashValueAt( + index, seed); default: return VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH( hashPrimitive, decoded_.base()->typeKind(), index, seed); @@ -154,10 +121,8 @@ class SparkVectorHasher { private: template ReturnType hashPrimitive(vector_size_t index, SeedType seed) { - return static_cast< - PrimitiveVectorHasher*>( - this) - ->hashNotNullAt(index, seed); + return static_cast*>(this) + ->hashValueAt(index, seed); } }; @@ -186,17 +151,16 @@ std::shared_ptr> createVectorHasher( } } -template < - typename HashClass, - TypeKind kind, - typename SeedType, - typename ReturnType> +template class PrimitiveVectorHasher : public SparkVectorHasher { public: + using SeedType = typename HashClass::SeedType; + using ReturnType = typename HashClass::ReturnType; + PrimitiveVectorHasher(DecodedVector& decoded) : SparkVectorHasher(decoded) {} - ReturnType hashNotNullAt(vector_size_t index, SeedType seed) { + ReturnType hashValueAt(vector_size_t index, SeedType seed) { return hashOne( this->decoded_.template valueAt::NativeType>( index), @@ -204,9 +168,12 @@ class PrimitiveVectorHasher : public SparkVectorHasher { } }; -template +template class ArrayVectorHasher : public SparkVectorHasher { public: + using SeedType = typename HashClass::SeedType; + using ReturnType = typename HashClass::ReturnType; + ArrayVectorHasher(DecodedVector& decoded) : SparkVectorHasher(decoded) { base_ = decoded.base()->as(); @@ -217,7 +184,7 @@ class ArrayVectorHasher : public SparkVectorHasher { elementHasher_ = createVectorHasher(decodedElements_); } - ReturnType hashNotNullAt(vector_size_t index, SeedType seed) { + ReturnType hashValueAt(vector_size_t index, SeedType seed) { auto size = base_->sizeAt(indices_[index]); auto offset = base_->offsetAt(indices_[index]); @@ -235,9 +202,12 @@ class ArrayVectorHasher : public SparkVectorHasher { std::shared_ptr> elementHasher_; }; -template +template class MapVectorHasher : public SparkVectorHasher { public: + using SeedType = typename HashClass::SeedType; + using ReturnType = typename HashClass::ReturnType; + MapVectorHasher(DecodedVector& decoded) : SparkVectorHasher(decoded) { base_ = decoded.base()->as(); @@ -250,7 +220,7 @@ class MapVectorHasher : public SparkVectorHasher { valueHasher_ = createVectorHasher(decodedValues_); } - ReturnType hashNotNullAt(vector_size_t index, SeedType seed) { + ReturnType hashValueAt(vector_size_t index, SeedType seed) { auto size = base_->sizeAt(indices_[index]); auto offset = base_->offsetAt(indices_[index]); @@ -271,9 +241,12 @@ class MapVectorHasher : public SparkVectorHasher { std::shared_ptr> valueHasher_; }; -template +template class RowVectorHasher : public SparkVectorHasher { public: + using SeedType = typename HashClass::SeedType; + using ReturnType = typename HashClass::ReturnType; + RowVectorHasher(DecodedVector& decoded) : SparkVectorHasher(decoded) { base_ = decoded.base()->as(); @@ -288,7 +261,7 @@ class RowVectorHasher : public SparkVectorHasher { } } - ReturnType hashNotNullAt(vector_size_t index, SeedType seed) { + ReturnType hashValueAt(vector_size_t index, SeedType seed) { ReturnType result = seed; for (auto i = 0; i < base_->childrenSize(); ++i) { result = hashers_[i]->hashAt(indices_[index], result); @@ -307,8 +280,8 @@ class RowVectorHasher : public SparkVectorHasher { // HashClass contains the function like hashInt32 template < typename HashClass, - typename SeedType = typename HashTraits::SeedType, - typename ReturnType = typename HashTraits::ReturnType> + typename SeedType = typename HashClass::SeedType, + typename ReturnType = typename HashClass::ReturnType> void applyWithType( const SelectivityVector& rows, std::vector& args, // Not using const ref so we can reuse args @@ -354,6 +327,9 @@ void applyWithType( class Murmur3Hash final { public: + using SeedType = int32_t; + using ReturnType = int32_t; + static uint32_t hashInt32(int32_t input, uint32_t seed) { uint32_t k1 = mixK1(input); uint32_t h1 = mixH1(seed, k1); @@ -458,6 +434,9 @@ class Murmur3HashFunction final : public exec::VectorFunction { class XxHash64 final { public: + using SeedType = int64_t; + using ReturnType = int64_t; + static uint64_t hashInt32(const int32_t input, uint64_t seed) { int64_t hash = seed + PRIME64_5 + 4L; hash ^= static_cast((input & 0xFFFFFFFFL) * PRIME64_1);