This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new a8cdae8948c1 [SPARK-53112][SQL][PYTHON][CONNECT] Support TIME in the make_timestamp_ntz and try_make_timestamp_ntz functions in PySpark
a8cdae8948c1 is described below
commit a8cdae8948c1b96bdb2a2b6de8866f94567698bd
Author: Yicong-Huang <[email protected]>
AuthorDate: Thu Sep 25 11:18:13 2025 +0800
[SPARK-53112][SQL][PYTHON][CONNECT] Support TIME in the make_timestamp_ntz and try_make_timestamp_ntz functions in PySpark
### What changes were proposed in this pull request?
Add support for creating local date-times from `date` and `time` fields in the `make_timestamp_ntz` and `try_make_timestamp_ntz` functions, in both the PySpark and PySpark Connect APIs.
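A minimal sketch of the two calling forms now accepted, mirroring the updated doctests (assumes a Spark build with TIME support and an active session):

```python
from datetime import date, time

from pyspark.sql import SparkSession
import pyspark.sql.functions as sf

spark = SparkSession.builder.getOrCreate()

df = spark.range(1).select(
    sf.lit(date(2014, 12, 28)).alias("d"),
    sf.lit(time(6, 30, 45, 887000)).alias("t"),
)

# Existing form: six individual field columns.
df.select(
    sf.make_timestamp_ntz(
        sf.lit(2014), sf.lit(12), sf.lit(28), sf.lit(6), sf.lit(30), sf.lit(45.887)
    )
).show(truncate=False)

# New form: keyword-only `date` and `time` columns.
df.select(sf.make_timestamp_ntz(date=df.d, time=df.t)).show(truncate=False)
```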
### Why are the changes needed?
Expand API support for the `make_timestamp_ntz` and
`try_make_timestamp_ntz` functions.
### Does this PR introduce _any_ user-facing change?
Yes, `make_timestamp_ntz` and `try_make_timestamp_ntz` now accept keyword-only `date` and `time` arguments in the Python API.
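For example, mixing the two argument groups is rejected eagerly on the client side (a sketch; assumes an active session):

```python
import datetime

from pyspark.errors import PySparkValueError
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf

spark = SparkSession.builder.getOrCreate()

try:
    # The field-based group (years, ..., secs) and the date/time group
    # cannot be combined in one call.
    sf.try_make_timestamp_ntz(
        years=sf.lit(2024),
        date=sf.lit(datetime.date(2024, 5, 22)),
    )
except PySparkValueError as e:
    # The CANNOT_SET_TOGETHER error names the conflicting groups:
    # years|months|days|hours|mins|secs and date|time
    print(e)
```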
### How was this patch tested?
Added appropriate Python function tests.
- pyspark.sql.tests.test_functions
- pyspark.sql.tests.connect.test_parity_functions
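For instance, a pared-down version of one of the added date/time checks (a sketch assuming a Spark build with TIME support):

```python
import datetime

from pyspark.sql import Row, SparkSession
import pyspark.sql.functions as F
from pyspark.testing import assertDataFrameEqual

spark = SparkSession.builder.getOrCreate()

df = spark.range(1).select(
    F.lit(datetime.date(2024, 5, 22)).alias("date"),
    F.lit(datetime.time(10, 30, 0)).alias("time"),
)
# try_make_timestamp_ntz returns NULL on invalid inputs instead of raising.
actual = df.select(F.try_make_timestamp_ntz(date=df.date, time=df.time))
assertDataFrameEqual(actual, [Row(datetime.datetime(2024, 5, 22, 10, 30))])
```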
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #51831 from uros-db/python-try_make_timestamp_ntz.
Lead-authored-by: Yicong-Huang <[email protected]>
Co-authored-by: Uros Bojanic <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/sql/connect/functions/builtin.py | 102 ++++++-
python/pyspark/sql/functions/builtin.py | 228 ++++++++++++---
python/pyspark/sql/tests/test_functions.py | 374 +++++++++++++++++++++++-
3 files changed, 660 insertions(+), 44 deletions(-)
diff --git a/python/pyspark/sql/connect/functions/builtin.py b/python/pyspark/sql/connect/functions/builtin.py
index d506262fc119..ca514e36b1e7 100644
--- a/python/pyspark/sql/connect/functions/builtin.py
+++ b/python/pyspark/sql/connect/functions/builtin.py
@@ -4019,6 +4019,7 @@ def try_make_timestamp_ltz(
try_make_timestamp_ltz.__doc__ = pysparkfuncs.try_make_timestamp_ltz.__doc__
+@overload
def make_timestamp_ntz(
years: "ColumnOrName",
months: "ColumnOrName",
@@ -4027,14 +4028,59 @@ def make_timestamp_ntz(
mins: "ColumnOrName",
secs: "ColumnOrName",
) -> Column:
- return _invoke_function_over_columns(
- "make_timestamp_ntz", years, months, days, hours, mins, secs
- )
+ ...
+
+
+@overload
+def make_timestamp_ntz(
+ *,
+ date: "ColumnOrName",
+ time: "ColumnOrName",
+) -> Column:
+ ...
+
+
+def make_timestamp_ntz(
+ years: Optional["ColumnOrName"] = None,
+ months: Optional["ColumnOrName"] = None,
+ days: Optional["ColumnOrName"] = None,
+ hours: Optional["ColumnOrName"] = None,
+ mins: Optional["ColumnOrName"] = None,
+ secs: Optional["ColumnOrName"] = None,
+ *,
+ date: Optional["ColumnOrName"] = None,
+ time: Optional["ColumnOrName"] = None,
+) -> Column:
+ if years is not None:
+ if any(arg is not None for arg in [date, time]):
+ raise PySparkValueError(
+ errorClass="CANNOT_SET_TOGETHER",
+ messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+ )
+ return _invoke_function_over_columns(
+ "make_timestamp_ntz",
+ cast("ColumnOrName", years),
+ cast("ColumnOrName", months),
+ cast("ColumnOrName", days),
+ cast("ColumnOrName", hours),
+ cast("ColumnOrName", mins),
+ cast("ColumnOrName", secs),
+ )
+ else:
+ if any(arg is not None for arg in [years, months, days, hours, mins, secs]):
+ raise PySparkValueError(
+ errorClass="CANNOT_SET_TOGETHER",
+ messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+ )
+ return _invoke_function_over_columns(
+ "make_timestamp_ntz", cast("ColumnOrName", date), cast("ColumnOrName", time)
+ )
make_timestamp_ntz.__doc__ = pysparkfuncs.make_timestamp_ntz.__doc__
+@overload
def try_make_timestamp_ntz(
years: "ColumnOrName",
months: "ColumnOrName",
@@ -4043,9 +4089,53 @@ def try_make_timestamp_ntz(
mins: "ColumnOrName",
secs: "ColumnOrName",
) -> Column:
- return _invoke_function_over_columns(
- "try_make_timestamp_ntz", years, months, days, hours, mins, secs
- )
+ ...
+
+
+@overload
+def try_make_timestamp_ntz(
+ *,
+ date: "ColumnOrName",
+ time: "ColumnOrName",
+) -> Column:
+ ...
+
+
+def try_make_timestamp_ntz(
+ years: Optional["ColumnOrName"] = None,
+ months: Optional["ColumnOrName"] = None,
+ days: Optional["ColumnOrName"] = None,
+ hours: Optional["ColumnOrName"] = None,
+ mins: Optional["ColumnOrName"] = None,
+ secs: Optional["ColumnOrName"] = None,
+ *,
+ date: Optional["ColumnOrName"] = None,
+ time: Optional["ColumnOrName"] = None,
+) -> Column:
+ if years is not None:
+ if any(arg is not None for arg in [date, time]):
+ raise PySparkValueError(
+ errorClass="CANNOT_SET_TOGETHER",
+ messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+ )
+ return _invoke_function_over_columns(
+ "try_make_timestamp_ntz",
+ cast("ColumnOrName", years),
+ cast("ColumnOrName", months),
+ cast("ColumnOrName", days),
+ cast("ColumnOrName", hours),
+ cast("ColumnOrName", mins),
+ cast("ColumnOrName", secs),
+ )
+ else:
+ if any(arg is not None for arg in [years, months, days, hours, mins, secs]):
+ raise PySparkValueError(
+ errorClass="CANNOT_SET_TOGETHER",
+ messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+ )
+ return _invoke_function_over_columns(
+ "try_make_timestamp_ntz", cast("ColumnOrName", date), cast("ColumnOrName", time)
+ )
try_make_timestamp_ntz.__doc__ = pysparkfuncs.try_make_timestamp_ntz.__doc__
diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index fb5d8ea46196..8236b3565bb8 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -25165,7 +25165,7 @@ def try_make_timestamp_ltz(
)
-@_try_remote_functions
+@overload
def make_timestamp_ntz(
years: "ColumnOrName",
months: "ColumnOrName",
@@ -25173,31 +25173,78 @@ def make_timestamp_ntz(
hours: "ColumnOrName",
mins: "ColumnOrName",
secs: "ColumnOrName",
+) -> Column:
+ ...
+
+
+@overload
+def make_timestamp_ntz(
+ *,
+ date: "ColumnOrName",
+ time: "ColumnOrName",
+) -> Column:
+ ...
+
+
+@_try_remote_functions
+def make_timestamp_ntz(
+ years: Optional["ColumnOrName"] = None,
+ months: Optional["ColumnOrName"] = None,
+ days: Optional["ColumnOrName"] = None,
+ hours: Optional["ColumnOrName"] = None,
+ mins: Optional["ColumnOrName"] = None,
+ secs: Optional["ColumnOrName"] = None,
+ *,
+ date: Optional["ColumnOrName"] = None,
+ time: Optional["ColumnOrName"] = None,
) -> Column:
"""
- Create local date-time from years, months, days, hours, mins, secs fields.
- If the configuration `spark.sql.ansi.enabled` is false, the function returns NULL
- on invalid inputs. Otherwise, it will throw an error instead.
+ Create local date-time from years, months, days, hours, mins, secs fields. Alternatively, try to
+ create local date-time from date and time fields. If the configuration `spark.sql.ansi.enabled`
+ is false, the function returns NULL on invalid inputs. Otherwise, it will throw an error.
.. versionadded:: 3.5.0
+ .. versionchanged:: 4.1.0
+ Added support for creating timestamps from date and time.
+
Parameters
----------
- years : :class:`~pyspark.sql.Column` or column name
- The year to represent, from 1 to 9999
- months : :class:`~pyspark.sql.Column` or column name
- The month-of-year to represent, from 1 (January) to 12 (December)
- days : :class:`~pyspark.sql.Column` or column name
- The day-of-month to represent, from 1 to 31
- hours : :class:`~pyspark.sql.Column` or column name
- The hour-of-day to represent, from 0 to 23
- mins : :class:`~pyspark.sql.Column` or column name
- The minute-of-hour to represent, from 0 to 59
- secs : :class:`~pyspark.sql.Column` or column name
+ years : :class:`~pyspark.sql.Column` or column name, optional
+ The year to represent, from 1 to 9999.
+ Required when creating timestamps from individual components.
+ Must be used with months, days, hours, mins, and secs.
+ months : :class:`~pyspark.sql.Column` or column name, optional
+ The month-of-year to represent, from 1 (January) to 12 (December).
+ Required when creating timestamps from individual components.
+ Must be used with years, days, hours, mins, and secs.
+ days : :class:`~pyspark.sql.Column` or column name, optional
+ The day-of-month to represent, from 1 to 31.
+ Required when creating timestamps from individual components.
+ Must be used with years, months, hours, mins, and secs.
+ hours : :class:`~pyspark.sql.Column` or column name, optional
+ The hour-of-day to represent, from 0 to 23.
+ Required when creating timestamps from individual components.
+ Must be used with years, months, days, mins, and secs.
+ mins : :class:`~pyspark.sql.Column` or column name, optional
+ The minute-of-hour to represent, from 0 to 59.
+ Required when creating timestamps from individual components.
+ Must be used with years, months, days, hours, and secs.
+ secs : :class:`~pyspark.sql.Column` or column name, optional
The second-of-minute and its micro-fraction to represent, from 0 to 60.
- The value can be either an integer like 13 , or a fraction like 13.123.
+ The value can be either an integer like 13, or a fraction like 13.123.
If the sec argument equals to 60, the seconds field is set
to 0 and 1 minute is added to the final timestamp.
+ Required when creating timestamps from individual components.
+ Must be used with years, months, days, hours, and mins.
+ date : :class:`~pyspark.sql.Column` or column name, optional
+ The date to represent, in valid DATE format.
+ Required when creating timestamps from date and time components.
+ Must be used with time parameter only.
+ time : :class:`~pyspark.sql.Column` or column name, optional
+ The time to represent, in valid TIME format.
+ Required when creating timestamps from date and time components.
+ Must be used with date parameter only.
Returns
-------
@@ -25219,6 +25266,8 @@ def make_timestamp_ntz(
--------
>>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
+ Example 1: Make local date-time from years, months, days, hours, mins, secs.
+
>>> import pyspark.sql.functions as sf
>>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887]],
... ['year', 'month', 'day', 'hour', 'min', 'sec'])
@@ -25231,14 +25280,50 @@ def make_timestamp_ntz(
|2014-12-28 06:30:45.887 |
+----------------------------------------------------+
+ Example 2: Make local date-time from date and time.
+
+ >>> import pyspark.sql.functions as sf
+ >>> from datetime import date, time
+ >>> df = spark.range(1).select(
+ ... sf.lit(date(2014, 12, 28)).alias("date"),
+ ... sf.lit(time(6, 30, 45, 887000)).alias("time")
+ ... )
+ >>> df.select(sf.make_timestamp_ntz(date=df.date, time=df.time)).show(truncate=False)
+ +------------------------------+
+ |make_timestamp_ntz(date, time)|
+ +------------------------------+
+ |2014-12-28 06:30:45.887 |
+ +------------------------------+
+
>>> spark.conf.unset("spark.sql.session.timeZone")
"""
- return _invoke_function_over_columns(
- "make_timestamp_ntz", years, months, days, hours, mins, secs
- )
+ if years is not None:
+ if any(arg is not None for arg in [date, time]):
+ raise PySparkValueError(
+ errorClass="CANNOT_SET_TOGETHER",
+ messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+ )
+ return _invoke_function_over_columns(
+ "make_timestamp_ntz",
+ cast("ColumnOrName", years),
+ cast("ColumnOrName", months),
+ cast("ColumnOrName", days),
+ cast("ColumnOrName", hours),
+ cast("ColumnOrName", mins),
+ cast("ColumnOrName", secs),
+ )
+ else:
+ if any(arg is not None for arg in [years, months, days, hours, mins, secs]):
+ raise PySparkValueError(
+ errorClass="CANNOT_SET_TOGETHER",
+ messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+ )
+ return _invoke_function_over_columns(
+ "make_timestamp_ntz", cast("ColumnOrName", date), cast("ColumnOrName", time)
+ )
-@_try_remote_functions
+@overload
def try_make_timestamp_ntz(
years: "ColumnOrName",
months: "ColumnOrName",
@@ -25246,30 +25331,78 @@ def try_make_timestamp_ntz(
hours: "ColumnOrName",
mins: "ColumnOrName",
secs: "ColumnOrName",
+) -> Column:
+ ...
+
+
+@overload
+def try_make_timestamp_ntz(
+ *,
+ date: "ColumnOrName",
+ time: "ColumnOrName",
+) -> Column:
+ ...
+
+
+@_try_remote_functions
+def try_make_timestamp_ntz(
+ years: Optional["ColumnOrName"] = None,
+ months: Optional["ColumnOrName"] = None,
+ days: Optional["ColumnOrName"] = None,
+ hours: Optional["ColumnOrName"] = None,
+ mins: Optional["ColumnOrName"] = None,
+ secs: Optional["ColumnOrName"] = None,
+ *,
+ date: Optional["ColumnOrName"] = None,
+ time: Optional["ColumnOrName"] = None,
) -> Column:
"""
- Try to create local date-time from years, months, days, hours, mins, secs fields.
- The function returns NULL on invalid inputs.
+ Try to create local date-time from years, months, days, hours, mins, secs fields. Alternatively,
+ try to create local date-time from date and time fields. The function returns NULL on invalid
+ inputs.
.. versionadded:: 4.0.0
+ .. versionchanged:: 4.1.0
+ Added support for creating timestamps from date and time.
+
Parameters
----------
- years : :class:`~pyspark.sql.Column` or column name
- The year to represent, from 1 to 9999
- months : :class:`~pyspark.sql.Column` or column name
- The month-of-year to represent, from 1 (January) to 12 (December)
- days : :class:`~pyspark.sql.Column` or column name
- The day-of-month to represent, from 1 to 31
- hours : :class:`~pyspark.sql.Column` or column name
- The hour-of-day to represent, from 0 to 23
- mins : :class:`~pyspark.sql.Column` or column name
- The minute-of-hour to represent, from 0 to 59
- secs : :class:`~pyspark.sql.Column` or column name
+ years : :class:`~pyspark.sql.Column` or column name, optional
+ The year to represent, from 1 to 9999.
+ Required when creating timestamps from individual components.
+ Must be used with months, days, hours, mins, and secs.
+ months : :class:`~pyspark.sql.Column` or column name, optional
+ The month-of-year to represent, from 1 (January) to 12 (December).
+ Required when creating timestamps from individual components.
+ Must be used with years, days, hours, mins, and secs.
+ days : :class:`~pyspark.sql.Column` or column name, optional
+ The day-of-month to represent, from 1 to 31.
+ Required when creating timestamps from individual components.
+ Must be used with years, months, hours, mins, and secs.
+ hours : :class:`~pyspark.sql.Column` or column name, optional
+ The hour-of-day to represent, from 0 to 23.
+ Required when creating timestamps from individual components.
+ Must be used with years, months, days, mins, and secs.
+ mins : :class:`~pyspark.sql.Column` or column name, optional
+ The minute-of-hour to represent, from 0 to 59.
+ Required when creating timestamps from individual components.
+ Must be used with years, months, days, hours, and secs.
+ secs : :class:`~pyspark.sql.Column` or column name, optional
The second-of-minute and its micro-fraction to represent, from 0 to 60.
- The value can be either an integer like 13 , or a fraction like 13.123.
+ The value can be either an integer like 13, or a fraction like 13.123.
If the sec argument equals to 60, the seconds field is set
to 0 and 1 minute is added to the final timestamp.
+ Required when creating timestamps from individual components.
+ Must be used with years, months, days, hours, and mins.
+ date : :class:`~pyspark.sql.Column` or column name, optional
+ The date to represent, in valid DATE format.
+ Required when creating timestamps from date and time components.
+ Must be used with time parameter only.
+ time : :class:`~pyspark.sql.Column` or column name, optional
+ The time to represent, in valid TIME format.
+ Required when creating timestamps from date and time components.
+ Must be used with date parameter only.
Returns
-------
@@ -25321,9 +25454,30 @@ def try_make_timestamp_ntz(
>>> spark.conf.unset("spark.sql.session.timeZone")
"""
- return _invoke_function_over_columns(
- "try_make_timestamp_ntz", years, months, days, hours, mins, secs
- )
+ if years is not None:
+ if any(arg is not None for arg in [date, time]):
+ raise PySparkValueError(
+ errorClass="CANNOT_SET_TOGETHER",
+ messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+ )
+ return _invoke_function_over_columns(
+ "try_make_timestamp_ntz",
+ cast("ColumnOrName", years),
+ cast("ColumnOrName", months),
+ cast("ColumnOrName", days),
+ cast("ColumnOrName", hours),
+ cast("ColumnOrName", mins),
+ cast("ColumnOrName", secs),
+ )
+ else:
+ if any(arg is not None for arg in [years, months, days, hours, mins, secs]):
+ raise PySparkValueError(
+ errorClass="CANNOT_SET_TOGETHER",
+ messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+ )
+ return _invoke_function_over_columns(
+ "try_make_timestamp_ntz", cast("ColumnOrName", date), cast("ColumnOrName", time)
+ )
@_try_remote_functions
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index 2fde3edc2486..e39609ac615d 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -471,13 +471,18 @@ class FunctionsTestsMixin:
assertDataFrameEqual(actual, [Row(None)])
def test_try_make_timestamp_ntz(self):
+ """Test cases for try_make_timestamp_ntz with 6-parameter and date/time forms."""
+
+ # Test 1: Valid 6 positional arguments
data = [(2024, 5, 22, 10, 30, 0)]
+ result = datetime.datetime(2024, 5, 22, 10, 30)
df = self.spark.createDataFrame(data, ["year", "month", "day", "hour", "minute", "second"])
actual = df.select(
F.try_make_timestamp_ntz(df.year, df.month, df.day, df.hour, df.minute, df.second)
)
- assertDataFrameEqual(actual, [Row(datetime.datetime(2024, 5, 22, 10, 30))])
+ assertDataFrameEqual(actual, [Row(result)])
+ # Test 2: Invalid input (month=13) - should return NULL
data = [(2024, 13, 22, 10, 30, 0)]
df = self.spark.createDataFrame(data, ["year", "month", "day", "hour", "minute", "second"])
actual = df.select(
@@ -485,6 +490,161 @@ class FunctionsTestsMixin:
)
assertDataFrameEqual(actual, [Row(None)])
+ # Test 3: Date/time keyword arguments
+ df = self.spark.range(1).select(
+ F.lit(datetime.date(2024, 5, 22)).alias("date"),
+ F.lit(datetime.time(10, 30, 0)).alias("time"),
+ )
+ actual = df.select(F.try_make_timestamp_ntz(date=df.date, time=df.time))
+ assertDataFrameEqual(actual, [Row(result)])
+
+ # Test 4: All 6 keyword arguments
+ df_full = self.spark.createDataFrame(
+ [(2024, 5, 22, 10, 30, 45)], ["year", "month", "day", "hour", "minute", "second"]
+ )
+ actual = df_full.select(
+ F.try_make_timestamp_ntz(
+ years=df_full.year,
+ months=df_full.month,
+ days=df_full.day,
+ hours=df_full.hour,
+ mins=df_full.minute,
+ secs=df_full.second,
+ )
+ )
+ expected = datetime.datetime(2024, 5, 22, 10, 30, 45)
+ assertDataFrameEqual(actual, [Row(expected)])
+
+ # Test 5: Only year provided - should raise Exception for missing required parameters
+ with self.assertRaises(Exception):
+ F.try_make_timestamp_ntz(years=df_full.year)
+
+ # Test 6: Partial parameters - should raise Exception for missing required parameters
+ with self.assertRaises(Exception):
+ F.try_make_timestamp_ntz(years=df_full.year, months=df_full.month, days=df_full.day)
+
+ # Test 7: Partial parameters - should raise Exception for missing required parameters
+ with self.assertRaises(Exception):
+ F.try_make_timestamp_ntz(
+ years=df_full.year, months=df_full.month, days=df_full.day, hours=df_full.hour
+ )
+
+ # Test 8: Fractional seconds
+ df_frac = self.spark.createDataFrame(
+ [(2024, 5, 22, 10, 30, 45.123)], ["year", "month", "day", "hour", "minute", "second"]
+ )
+ actual = df_frac.select(
+ F.try_make_timestamp_ntz(
+ df_frac.year,
+ df_frac.month,
+ df_frac.day,
+ df_frac.hour,
+ df_frac.minute,
+ df_frac.second,
+ )
+ )
+ expected_frac = datetime.datetime(2024, 5, 22, 10, 30, 45, 123000)
+ assertDataFrameEqual(actual, [Row(expected_frac)])
+
+ # Test 9: Edge case - February 29 in leap year (full 6 parameters)
+ df_leap = self.spark.createDataFrame(
+ [(2024, 2, 29, 0, 0, 0)], ["year", "month", "day", "hour", "minute", "second"]
+ )
+ actual = df_leap.select(
+ F.try_make_timestamp_ntz(
+ df_leap.year,
+ df_leap.month,
+ df_leap.day,
+ df_leap.hour,
+ df_leap.minute,
+ df_leap.second,
+ )
+ )
+ expected_leap = datetime.datetime(2024, 2, 29, 0, 0, 0)
+ assertDataFrameEqual(actual, [Row(expected_leap)])
+
+ # Test 10: Edge case - February 29 in non-leap year (should return NULL)
+ df_non_leap = self.spark.createDataFrame(
+ [(2023, 2, 29, 0, 0, 0)], ["year", "month", "day", "hour", "minute", "second"]
+ )
+ actual = df_non_leap.select(
+ F.try_make_timestamp_ntz(
+ df_non_leap.year,
+ df_non_leap.month,
+ df_non_leap.day,
+ df_non_leap.hour,
+ df_non_leap.minute,
+ df_non_leap.second,
+ )
+ )
+ assertDataFrameEqual(actual, [Row(None)])
+
+ # Test 11: Minimum valid values (full 6 parameters)
+ df_min = self.spark.createDataFrame(
+ [(1, 1, 1, 0, 0, 0)], ["year", "month", "day", "hour", "minute", "second"]
+ )
+ actual = df_min.select(
+ F.try_make_timestamp_ntz(
+ df_min.year, df_min.month, df_min.day, df_min.hour, df_min.minute, df_min.second
+ )
+ )
+ expected_min = datetime.datetime(1, 1, 1, 0, 0, 0)
+ assertDataFrameEqual(actual, [Row(expected_min)])
+
+ # Test 12: Maximum valid hour/minute/second
+ df_max_time = self.spark.createDataFrame(
+ [(2024, 5, 22, 23, 59, 59)], ["year", "month", "day", "hour", "minute", "second"]
+ )
+ actual = df_max_time.select(
+ F.try_make_timestamp_ntz(
+ df_max_time.year,
+ df_max_time.month,
+ df_max_time.day,
+ df_max_time.hour,
+ df_max_time.minute,
+ df_max_time.second,
+ )
+ )
+ expected_max_time = datetime.datetime(2024, 5, 22, 23, 59, 59)
+ assertDataFrameEqual(actual, [Row(expected_max_time)])
+
+ # Test 13: Invalid hour (should return NULL)
+ df_invalid_hour = self.spark.createDataFrame(
+ [(2024, 5, 22, 25, 0, 0)], ["year", "month", "day", "hour", "minute", "second"]
+ )
+ actual = df_invalid_hour.select(
+ F.try_make_timestamp_ntz(
+ df_invalid_hour.year,
+ df_invalid_hour.month,
+ df_invalid_hour.day,
+ df_invalid_hour.hour,
+ df_invalid_hour.minute,
+ df_invalid_hour.second,
+ )
+ )
+ assertDataFrameEqual(actual, [Row(None)])
+
+ # Test 14: Valid date/time combination with NULL date
+ df = self.spark.range(1).select(
+ F.lit(None).cast("date").alias("date"), F.lit(datetime.time(10, 30, 0)).alias("time")
+ )
+ actual = df.select(F.try_make_timestamp_ntz(date=df.date, time=df.time))
+ assertDataFrameEqual(actual, [Row(None)])
+
+ # Test 15: Valid date/time combination with NULL time
+ df = self.spark.range(1).select(
+ F.lit(datetime.date(2024, 5, 22)).alias("date"), F.lit(None).cast("time").alias("time")
+ )
+ actual = df.select(F.try_make_timestamp_ntz(date=df.date, time=df.time))
+ assertDataFrameEqual(actual, [Row(None)])
+
+ # Test 16: Mixed parameter usage should raise PySparkValueError
+ with self.assertRaises(PySparkValueError) as context:
+ F.try_make_timestamp_ntz(years=df_full.year, date=df_full.year)
+ error_msg = str(context.exception)
+ self.assertIn("CANNOT_SET_TOGETHER", error_msg)
+ self.assertIn("years|months|days|hours|mins|secs and date|time", error_msg)
+
def test_string_functions(self):
string_functions = [
"upper",
@@ -679,6 +839,218 @@ class FunctionsTestsMixin:
self.assertIsInstance(row_from_name[0], datetime.time)
self.assertEqual(row_from_name[0], result)
+ def test_make_timestamp_ntz(self):
+ """Comprehensive test cases for make_timestamp_ntz with various arguments and edge cases."""
+
+ # Test 1: Basic 6 positional arguments
+ data = [(2024, 5, 22, 10, 30, 0)]
+ result = datetime.datetime(2024, 5, 22, 10, 30)
+ df = self.spark.createDataFrame(data, ["year", "month", "day", "hour", "minute", "second"])
+
+ actual = df.select(
+ F.make_timestamp_ntz(df.year, df.month, df.day, df.hour, df.minute, df.second)
+ )
+ assertDataFrameEqual(actual, [Row(result)])
+
+ # Test 2: All 6 keyword arguments
+ actual = df.select(
+ F.make_timestamp_ntz(
+ years=df.year,
+ months=df.month,
+ days=df.day,
+ hours=df.hour,
+ mins=df.minute,
+ secs=df.second,
+ )
+ )
+ assertDataFrameEqual(actual, [Row(result)])
+
+ # Test 3: Date/time keyword arguments
+ df_dt = self.spark.range(1).select(
+ F.lit(datetime.date(2024, 5, 22)).alias("date"),
+ F.lit(datetime.time(10, 30, 0)).alias("time"),
+ )
+ actual = df_dt.select(F.make_timestamp_ntz(date=df_dt.date, time=df_dt.time))
+ assertDataFrameEqual(actual, [Row(result)])
+
+ # Test 4: Fractional seconds with positional arguments
+ data_frac = [(2024, 5, 22, 10, 30, 45.123)]
+ result_frac = datetime.datetime(2024, 5, 22, 10, 30, 45, 123000)
+ df_frac = self.spark.createDataFrame(
+ data_frac, ["year", "month", "day", "hour", "minute", "second"]
+ )
+
+ actual = df_frac.select(
+ F.make_timestamp_ntz(
+ df_frac.year,
+ df_frac.month,
+ df_frac.day,
+ df_frac.hour,
+ df_frac.minute,
+ df_frac.second,
+ )
+ )
+ assertDataFrameEqual(actual, [Row(result_frac)])
+
+ # Test 5: Fractional seconds with keyword arguments
+ actual = df_frac.select(
+ F.make_timestamp_ntz(
+ years=df_frac.year,
+ months=df_frac.month,
+ days=df_frac.day,
+ hours=df_frac.hour,
+ mins=df_frac.minute,
+ secs=df_frac.second,
+ )
+ )
+ assertDataFrameEqual(actual, [Row(result_frac)])
+
+ # Test 6: Fractional seconds with date/time arguments
+ df_dt_frac = self.spark.range(1).select(
+ F.lit(datetime.date(2024, 5, 22)).alias("date"),
+ F.lit(datetime.time(10, 30, 45, 123000)).alias("time"),
+ )
+ actual = df_dt_frac.select(F.make_timestamp_ntz(date=df_dt_frac.date, time=df_dt_frac.time))
+ assertDataFrameEqual(actual, [Row(result_frac)])
+
+ # Test 7: Edge case - February 29 in leap year
+ df_leap = self.spark.createDataFrame(
+ [(2024, 2, 29, 0, 0, 0)], ["year", "month", "day", "hour", "minute", "second"]
+ )
+ expected_leap = datetime.datetime(2024, 2, 29, 0, 0, 0)
+ actual = df_leap.select(
+ F.make_timestamp_ntz(
+ df_leap.year,
+ df_leap.month,
+ df_leap.day,
+ df_leap.hour,
+ df_leap.minute,
+ df_leap.second,
+ )
+ )
+ assertDataFrameEqual(actual, [Row(expected_leap)])
+
+ # Test 8: Maximum valid time values
+ df_max = self.spark.createDataFrame(
+ [(2024, 12, 31, 23, 59, 59)], ["year", "month", "day", "hour", "minute", "second"]
+ )
+ expected_max = datetime.datetime(2024, 12, 31, 23, 59, 59)
+ actual = df_max.select(
+ F.make_timestamp_ntz(
+ df_max.year, df_max.month, df_max.day, df_max.hour, df_max.minute, df_max.second
+ )
+ )
+ assertDataFrameEqual(actual, [Row(expected_max)])
+
+ # Test 9: Minimum valid values
+ df_min = self.spark.createDataFrame(
+ [(1, 1, 1, 0, 0, 0)], ["year", "month", "day", "hour", "minute", "second"]
+ )
+ expected_min = datetime.datetime(1, 1, 1, 0, 0, 0)
+ actual = df_min.select(
+ F.make_timestamp_ntz(
+ df_min.year, df_min.month, df_min.day, df_min.hour, df_min.minute, df_min.second
+ )
+ )
+ assertDataFrameEqual(actual, [Row(expected_min)])
+
+ # Test 10: Mixed positional and keyword (should work for valid combinations)
+ actual = df.select(
+ F.make_timestamp_ntz(
+ df.year, df.month, df.day, hours=df.hour, mins=df.minute, secs=df.second
+ )
+ )
+ assertDataFrameEqual(actual, [Row(result)])
+
+ # Test 11: Using literal values
+ actual = self.spark.range(1).select(
+ F.make_timestamp_ntz(F.lit(2024), F.lit(5), F.lit(22), F.lit(10), F.lit(30), F.lit(0))
+ )
+ assertDataFrameEqual(actual, [Row(result)])
+
+ # Test 12: Using string column names
+ actual = df.select(F.make_timestamp_ntz("year", "month", "day", "hour", "minute", "second"))
+ assertDataFrameEqual(actual, [Row(result)])
+
+ # Error handling tests
+
+ # Test 13: Mixing timestamp and date/time keyword arguments
+ with self.assertRaises(PySparkValueError) as context:
+ df_dt.select(
+ F.make_timestamp_ntz(years=df.year, date=df_dt.date, time=df_dt.time)
+ ).collect()
+ error_msg = str(context.exception)
+ self.assertIn("CANNOT_SET_TOGETHER", error_msg)
+ self.assertIn("years|months|days|hours|mins|secs and date|time", error_msg)
+
+ # Test 14: Incomplete keyword arguments - should raise Exception for None values
+ with self.assertRaises(Exception):
+ F.make_timestamp_ntz(years=df.year, months=df.month, days=df.day)
+
+ # Test 15: Only one keyword argument - should raise Exception for None values
+ with self.assertRaises(Exception):
+ F.make_timestamp_ntz(years=df.year)
+
+ # Test 16: Only date without time - should raise Exception for None values
+ with self.assertRaises(Exception):
+ F.make_timestamp_ntz(date=df_dt.date)
+
+ # Test 17: Invalid data types - should raise exception for invalid string to int cast
+ with self.assertRaises(Exception):
+ self.spark.range(1).select(
+ F.make_timestamp_ntz(
+ F.lit("invalid"), F.lit(5), F.lit(22), F.lit(10), F.lit(30), F.lit(0)
+ )
+ ).collect()
+
+ # Test 18: Out of range values (month=13) - should raise exception for invalid date
+ df_invalid = self.spark.createDataFrame(
+ [(2024, 13, 22, 10, 30, 0)], ["year", "month", "day", "hour", "minute", "second"]
+ )
+ with self.assertRaises(Exception):
+ df_invalid.select(
+ F.make_timestamp_ntz(
+ df_invalid.year,
+ df_invalid.month,
+ df_invalid.day,
+ df_invalid.hour,
+ df_invalid.minute,
+ df_invalid.second,
+ )
+ ).collect()
+
+ # Test 19: Out of range values (hour=25) - should raise exception for invalid time
+ df_invalid_hour = self.spark.createDataFrame(
+ [(2024, 5, 22, 25, 30, 0)], ["year", "month", "day", "hour", "minute", "second"]
+ )
+ with self.assertRaises(Exception):
+ df_invalid_hour.select(
+ F.make_timestamp_ntz(
+ df_invalid_hour.year,
+ df_invalid_hour.month,
+ df_invalid_hour.day,
+ df_invalid_hour.hour,
+ df_invalid_hour.minute,
+ df_invalid_hour.second,
+ )
+ ).collect()
+
+ # Test 20: February 29 in non-leap year
+ df_non_leap = self.spark.createDataFrame(
+ [(2023, 2, 29, 0, 0, 0)], ["year", "month", "day", "hour", "minute", "second"]
+ )
+ with self.assertRaises(Exception): # Should raise runtime exception for invalid date
+ df_non_leap.select(
+ F.make_timestamp_ntz(
+ df_non_leap.year,
+ df_non_leap.month,
+ df_non_leap.day,
+ df_non_leap.hour,
+ df_non_leap.minute,
+ df_non_leap.second,
+ )
+ ).collect()
+
def test_make_date(self):
# SPARK-36554: expose make_date expression
df = self.spark.createDataFrame([(2020, 6, 26)], ["Y", "M", "D"])