diff --git a/velox/functions/sparksql/Hash.cpp b/velox/functions/sparksql/Hash.cpp
index 6c776d74c0bd..c425e3cac3e2 100644
--- a/velox/functions/sparksql/Hash.cpp
+++ b/velox/functions/sparksql/Hash.cpp
@@ -73,6 +73,8 @@ void applyWithType(
       CASE(VARBINARY, hash.hashBytes, StringView);
       CASE(REAL, hash.hashFloat, float);
       CASE(DOUBLE, hash.hashDouble, double);
+      CASE(HUGEINT, hash.hashLongDecimal, int128_t);
+      CASE(TIMESTAMP, hash.hashTimestamp, Timestamp);
 #undef CASE
       default:
         VELOX_NYI(
@@ -139,6 +141,20 @@ class Murmur3Hash final {
     return fmix(h1, input.size());
   }
 
+  // Hashes a 128-bit decimal value: DecimalUtil::toByteArray writes it into
+  // `out` and reports the byte count, and only those bytes are hashed.
+  uint32_t hashLongDecimal(int128_t input, uint32_t seed) {
+    int32_t length = 0;
+    char out[sizeof(int128_t)];
+    DecimalUtil::toByteArray(input, out, length);
+    return hashBytes(StringView(out, length), seed);
+  }
+
+  // Hashes a timestamp through hashInt64 using its value in microseconds.
+  uint32_t hashTimestamp(Timestamp input, uint32_t seed) {
+    return hashInt64(input.toMicros(), seed);
+  }
+
  private:
   uint32_t mixK1(uint32_t k1) {
     k1 *= 0xcc9e2d51;
@@ -245,6 +261,22 @@ class XxHash64 final {
     return fmix(hash);
   }
 
+  // Hashes a 128-bit decimal value: DecimalUtil::toByteArray writes it into
+  // `out` and reports the byte count, and only those bytes are hashed.
+  // The seed is taken as 64 bits so a wide seed is not truncated on entry.
+  // NOTE(review): assumes the sibling XxHash64 methods accept a 64-bit seed.
+  int64_t hashLongDecimal(int128_t input, uint64_t seed) {
+    int32_t length = 0;
+    char out[sizeof(int128_t)];
+    DecimalUtil::toByteArray(input, out, length);
+    return hashBytes(StringView(out, length), seed);
+  }
+
+  // Hashes a timestamp through hashInt64 using its value in microseconds.
+  int64_t hashTimestamp(Timestamp input, uint64_t seed) {
+    return hashInt64(input.toMicros(), seed);
+  }
+
  private:
   uint64_t fmix(uint64_t hash) {
     hash ^= hash >> 33;
diff --git a/velox/functions/sparksql/Hash.h b/velox/functions/sparksql/Hash.h
index fb381874e4f0..6f931af4c9a8 100644
--- a/velox/functions/sparksql/Hash.h
+++ b/velox/functions/sparksql/Hash.h
@@ -22,11 +22,11 @@ namespace facebook::velox::functions::sparksql {
 //   - Integer types (tinyint, smallint, integer, bigint)
 //   - Varchar, varbinary
 //   - Real, double
-//
-// TODO:
 //   - Decimal
 //   - Date
 //   - Timestamp
+//
+// TODO:
 //   - Row, Array: hash the elements in order
 //   - Map: iterate over map, hashing key then value. Since map ordering is
 //        unspecified, hashing logically equivalent maps may result in
@@ -51,10 +51,11 @@ std::shared_ptr makeHashWithSeed(
 //   - Integer types (byte, short, int, long)
 //   - String, Binary
 //   - Float, Double
+//   - Decimal
+//   - Date
+//   - Timestamp
 //
 // Unsupported:
-//   - Decimal
-//   - Datetime
 //   - Structs, Arrays: hash the elements in order
 //   - Maps: iterate over map, hashing key then value. Since map ordering is
 //        unspecified, hashing logically equivalent maps may result in
diff --git a/velox/functions/sparksql/tests/HashTest.cpp b/velox/functions/sparksql/tests/HashTest.cpp
index 714db7f5b943..422d63643e56 100644
--- a/velox/functions/sparksql/tests/HashTest.cpp
+++ b/velox/functions/sparksql/tests/HashTest.cpp
@@ -38,6 +38,21 @@ TEST_F(HashTest, String) {
   EXPECT_EQ(hash(std::nullopt), 42);
 }
 
+TEST_F(HashTest, longDecimal) {
+  EXPECT_EQ(hash<int128_t>(12345678), -277285195);
+  EXPECT_EQ(hash<int128_t>(0), -783713497);
+  EXPECT_EQ(hash<int128_t>(DecimalUtil::kLongDecimalMin), 1400911110);
+  EXPECT_EQ(hash<int128_t>(DecimalUtil::kLongDecimalMax), -817514053);
+  EXPECT_EQ(hash<int128_t>(-12345678), -1198355617);
+  EXPECT_EQ(hash<int128_t>(std::nullopt), 42);
+}
+
+// Spark CLI select timestamp_micros(12345678) to get the Timestamp.
+// select hash(Timestamp("1970-01-01 00:00:12.345678")) to get the hash value.
+TEST_F(HashTest, Timestamp) {
+  EXPECT_EQ(hash<Timestamp>(Timestamp::fromMicros(12345678)), 1402875301);
+}
+
 TEST_F(HashTest, Int64) {
   EXPECT_EQ(hash(0xcafecafedeadbeef), -256235155);
   EXPECT_EQ(hash(0xdeadbeefcafecafe), 673261790);
diff --git a/velox/functions/sparksql/tests/XxHash64Test.cpp b/velox/functions/sparksql/tests/XxHash64Test.cpp
index 15ae146c6a2f..09162f4a0279 100644
--- a/velox/functions/sparksql/tests/XxHash64Test.cpp
+++ b/velox/functions/sparksql/tests/XxHash64Test.cpp
@@ -44,6 +44,25 @@ TEST_F(XxHash64Test, varchar) {
   EXPECT_EQ(xxhash64(std::nullopt), 42);
 }
 
+TEST_F(XxHash64Test, longDecimal) {
+  EXPECT_EQ(xxhash64<int128_t>(12345678), 4541350547708072824);
+  EXPECT_EQ(xxhash64<int128_t>(0), -8959994473701255385);
+  EXPECT_EQ(
+      xxhash64<int128_t>(DecimalUtil::kLongDecimalMin), -2254039905620870768);
+  EXPECT_EQ(
+      xxhash64<int128_t>(DecimalUtil::kLongDecimalMax), -47190729175993179);
+  EXPECT_EQ(xxhash64<int128_t>(-12345678), -7692719129258511951);
+  EXPECT_EQ(xxhash64<int128_t>(std::nullopt), 42);
+}
+
+// Spark CLI select timestamp_micros(12345678) to get the Timestamp.
+// select xxhash64(Timestamp("1970-01-01 00:00:12.345678")) to get the hash
+// value.
+TEST_F(XxHash64Test, Timestamp) {
+  EXPECT_EQ(
+      xxhash64<Timestamp>(Timestamp::fromMicros(12345678)), 782671362992292307);
+}
+
 TEST_F(XxHash64Test, int64) {
   EXPECT_EQ(xxhash64(0xcafecafedeadbeef), -6259772178006417012);
   EXPECT_EQ(xxhash64(0xdeadbeefcafecafe), -1700188678616701932);