This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 87b8f2b3a7ab [SPARK-51664][SQL] Support the TIME data type in the Hash expression
87b8f2b3a7ab is described below

commit 87b8f2b3a7ab8021c6aeb94fe56326b296b87c98
Author: Max Gekk <max.g...@gmail.com>
AuthorDate: Mon Mar 31 11:09:27 2025 +0300

    [SPARK-51664][SQL] Support the TIME data type in the Hash expression

    ### What changes were proposed in this pull request?
    In this PR, I propose to support the new data type TIME in `HashExpression`, generating the hash in the same way as for the underlying type `Long`.

    ### Why are the changes needed?
    The affected expressions are used in core components of Spark SQL:
    - Murmur3Hash: generating partition IDs,
    - HiveHash: bucketing,
    - XxHash64: Bloom filters.

    ### Does this PR introduce _any_ user-facing change?
    No.

    ### How was this patch tested?
    By running the new test:
    ```
    $ build/sbt "test:testOnly *HashExpressionsSuite"
    ```

    ### Was this patch authored or co-authored using generative AI tooling?
    No.

    Closes #50456 from MaxGekk/time-hash-expr.

    Lead-authored-by: Max Gekk <max.g...@gmail.com>
    Co-authored-by: Maxim Gekk <max.g...@gmail.com>
    Signed-off-by: Max Gekk <max.g...@gmail.com>
---
 .../scala/org/apache/spark/sql/catalyst/expressions/hash.scala | 4 +++-
 .../spark/sql/catalyst/expressions/HashExpressionsSuite.scala  | 9 ++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
index ac493d19df1b..7cb645e601d3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
@@ -256,6 +256,8 @@ case class Crc32(child: Expression)
  *             input with seed.
  *  - binary: use murmur3 to hash the bytes with seed.
  *  - string: get the bytes of string and hash it.
+ *  - time: it stores the long value of `microseconds` since midnight; use
+ *          murmur3 to hash the long input with seed.
  *  - array: The `result` starts with seed, then use `result` as seed, recursively
  *           calculate hash value for each element, and assign the element hash
  *           value to `result`.
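To make the doc comment above concrete, here is a minimal, self-contained Scala sketch (not part of the patch; `TimeHashSketch` and `toMicrosOfDay` are hypothetical names introduced only for this illustration) showing the `Long` that the hash functions actually see for a TIME value:

```scala
import java.time.LocalTime

// Illustrative only: per the doc comment above, a TIME value is stored as a
// Long counting microseconds since midnight, so hashing a TIME reduces to
// hashing that Long. `toNanoOfDay` is standard JDK API; the rest is a
// hypothetical helper for this sketch.
object TimeHashSketch {
  def toMicrosOfDay(t: LocalTime): Long = t.toNanoOfDay / 1000L

  def main(args: Array[String]): Unit = {
    // The same value that the new test below uses.
    val t = LocalTime.of(23, 50, 59, 123456000)
    println(toMicrosOfDay(t)) // prints 85859123456 (microseconds since midnight)
  }
}
```

Given that representation, the one-line dispatch change below is enough: `TimeType` simply rides the existing `genHashLong` path already used for `LongType`.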
@@ -507,7 +509,7 @@ abstract class HashExpression[E] extends Expression {
     case NullType => ""
     case BooleanType => genHashBoolean(input, result)
     case ByteType | ShortType | IntegerType | DateType => genHashInt(input, result)
-    case LongType => genHashLong(input, result)
+    case LongType | _: TimeType => genHashLong(input, result)
     case TimestampType | TimestampNTZType => genHashTimestamp(input, result)
     case FloatType => genHashFloat(input, result)
     case DoubleType => genHashDouble(input, result)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala
index 019c953a3b0a..dddc33aa4358 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala
@@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import java.nio.charset.StandardCharsets
-import java.time.{Duration, Period, ZoneId, ZoneOffset}
+import java.time.{Duration, LocalTime, Period, ZoneId, ZoneOffset}
 
 import scala.collection.mutable.ArrayBuffer
 import scala.language.implicitConversions
@@ -754,6 +754,13 @@ class HashExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkResult(Literal.create(-0F, FloatType), Literal.create(0F, FloatType))
   }
 
+  test("Support TimeType") {
+    val time = Literal.create(LocalTime.of(23, 50, 59, 123456000), TimeType())
+    checkEvaluation(Murmur3Hash(Seq(time), 10), 258472763)
+    checkEvaluation(XxHash64(Seq(time), 10), -9197489935839400467L)
+    checkEvaluation(HiveHash(Seq(time)), -40222445)
+  }
+
   private def testHash(inputSchema: StructType): Unit = {
     val inputGenerator = RandomDataGenerator.forType(inputSchema, nullable = false).get
     val toRow = ExpressionEncoder(inputSchema).createSerializer()

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org