This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 9e14f5f959d4 [SPARK-53111][SQL][PYTHON][CONNECT] Implement the
time_diff function in PySpark
9e14f5f959d4 is described below
commit 9e14f5f959d4abba4d713a4165d97b59c431123c
Author: Uros Bojanic <[email protected]>
AuthorDate: Wed Oct 15 15:52:55 2025 +0800
[SPARK-53111][SQL][PYTHON][CONNECT] Implement the time_diff function in
PySpark
### What changes were proposed in this pull request?
Implement the `time_diff` function in PySpark & PySpark Connect API.
### Why are the changes needed?
Expand API support for the `time_diff` function.
### Does this PR introduce _any_ user-facing change?
Yes, the new function is now available in Python API.
### How was this patch tested?
Added appropriate Python function tests.
- pyspark.sql.tests.test_functions
- pyspark.sql.tests.connect.test_parity_functions
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #51829 from uros-db/python-time_diff.
Authored-by: Uros Bojanic <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
.../source/reference/pyspark.sql/functions.rst | 1 +
python/pyspark/sql/connect/functions/builtin.py | 7 ++++
python/pyspark/sql/functions/__init__.py | 1 +
python/pyspark/sql/functions/builtin.py | 43 ++++++++++++++++++++++
python/pyspark/sql/tests/test_functions.py | 18 +++++++--
5 files changed, 66 insertions(+), 4 deletions(-)
diff --git a/python/docs/source/reference/pyspark.sql/functions.rst
b/python/docs/source/reference/pyspark.sql/functions.rst
index 003fdc0a00b5..e4175707aecd 100644
--- a/python/docs/source/reference/pyspark.sql/functions.rst
+++ b/python/docs/source/reference/pyspark.sql/functions.rst
@@ -299,6 +299,7 @@ Date and Timestamp Functions
timestamp_micros
timestamp_millis
timestamp_seconds
+ time_diff
time_trunc
to_date
to_time
diff --git a/python/pyspark/sql/connect/functions/builtin.py
b/python/pyspark/sql/connect/functions/builtin.py
index aee4a7572a35..2668b7a526fd 100644
--- a/python/pyspark/sql/connect/functions/builtin.py
+++ b/python/pyspark/sql/connect/functions/builtin.py
@@ -3650,6 +3650,13 @@ def timestamp_seconds(col: "ColumnOrName") -> Column:
timestamp_seconds.__doc__ = pysparkfuncs.timestamp_seconds.__doc__
+def time_diff(unit: "ColumnOrName", start: "ColumnOrName", end:
"ColumnOrName") -> Column:
+ return _invoke_function_over_columns("time_diff", unit, start, end)
+
+
+time_diff.__doc__ = pysparkfuncs.time_diff.__doc__
+
+
def time_trunc(unit: "ColumnOrName", time: "ColumnOrName") -> Column:
return _invoke_function_over_columns("time_trunc", unit, time)
diff --git a/python/pyspark/sql/functions/__init__.py
b/python/pyspark/sql/functions/__init__.py
index 7c3f4cbc1a4f..e1b320c98f7f 100644
--- a/python/pyspark/sql/functions/__init__.py
+++ b/python/pyspark/sql/functions/__init__.py
@@ -248,6 +248,7 @@ __all__ = [ # noqa: F405
"timestamp_micros",
"timestamp_millis",
"timestamp_seconds",
+ "time_diff",
"time_trunc",
"to_date",
"to_time",
diff --git a/python/pyspark/sql/functions/builtin.py
b/python/pyspark/sql/functions/builtin.py
index 0dd0aea7bced..24baace54621 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -12710,6 +12710,49 @@ def timestamp_seconds(col: "ColumnOrName") -> Column:
return _invoke_function_over_columns("timestamp_seconds", col)
+@_try_remote_functions
+def time_diff(unit: "ColumnOrName", start: "ColumnOrName", end:
"ColumnOrName") -> Column:
+ """
+ Returns the difference between two times, measured in the specified unit.
+
+ .. versionadded:: 4.1.0
+
+ Parameters
+ ----------
+ unit : :class:`~pyspark.sql.Column` or column name
+ The unit to measure the time difference in. Supported units are: "HOUR",
"MINUTE", "SECOND",
+ "MILLISECOND", and "MICROSECOND". The unit is case-insensitive.
+ start : :class:`~pyspark.sql.Column` or column name
+ A starting time.
+ end : :class:`~pyspark.sql.Column` or column name
+ An ending time.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ The difference between two times, in the specified unit.
+
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.date_diff`
+ :meth:`pyspark.sql.functions.timestamp_diff`
+
+ Examples
+ --------
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame(
+ ... [("HOUR", "13:08:15", "21:30:28")], ['unit', 'start',
'end']).withColumn("start",
+ ... sf.col("start").cast("time")).withColumn("end",
sf.col("end").cast("time"))
+ >>> df.select('*', sf.time_diff('unit', 'start', 'end')).show()
+ +----+--------+--------+---------------------------+
+ |unit| start| end|time_diff(unit, start, end)|
+ +----+--------+--------+---------------------------+
+ |HOUR|13:08:15|21:30:28| 8|
+ +----+--------+--------+---------------------------+
+ """
+ return _invoke_function_over_columns("time_diff", unit, start, end)
+
+
@_try_remote_functions
def time_trunc(unit: "ColumnOrName", time: "ColumnOrName") -> Column:
"""
diff --git a/python/pyspark/sql/tests/test_functions.py
b/python/pyspark/sql/tests/test_functions.py
index 91e519c6f8c7..41c07a61eb1e 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -81,10 +81,7 @@ class FunctionsTestsMixin:
missing_in_py = jvm_fn_set.difference(py_fn_set)
# Functions that we expect to be missing in python until they are
added to pyspark
- expected_missing_in_py = set(
- # TODO(SPARK-53108): Implement the time_diff function in Python
- ["time_diff"]
- )
+ expected_missing_in_py = set()
self.assertEqual(
expected_missing_in_py, missing_in_py, "Missing functions in
pyspark not as expected"
@@ -403,6 +400,19 @@ class FunctionsTestsMixin:
rndn2 = df.select("key", F.randn(0)).collect()
self.assertEqual(sorted(rndn1), sorted(rndn2))
+ def test_time_diff(self):
+ # SPARK-53111: test the time_diff function.
+ df = self.spark.range(1).select(
+ F.lit("hour").alias("unit"),
+ F.lit(datetime.time(20, 30, 29)).alias("start"),
+ F.lit(datetime.time(21, 30, 29)).alias("end"),
+ )
+ result = 1
+ row_from_col = df.select(F.time_diff(df.unit, df.start,
df.end)).first()
+ self.assertEqual(row_from_col[0], result)
+ row_from_name = df.select(F.time_diff("unit", "start", "end")).first()
+ self.assertEqual(row_from_name[0], result)
+
def test_time_trunc(self):
# SPARK-53110: test the time_trunc function.
df = self.spark.range(1).select(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]