This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 7b404cb8b173 [SPARK-55621][PYTHON] Fix ambiguous and unnecessary
unicode usage
7b404cb8b173 is described below
commit 7b404cb8b1732084644b38c42fcf7279d42e8abb
Author: Tian Gao <[email protected]>
AuthorDate: Tue Mar 3 09:28:25 2026 +0900
[SPARK-55621][PYTHON] Fix ambiguous and unnecessary unicode usage
### What changes were proposed in this pull request?
Fixed all the unnecessary and ambiguous unicode character usage.
A set of `ruff` rules are also added to prevent future regressions.
### Why are the changes needed?
We should avoid non-ascii unicode characters as much as
possible. There are a few rationales behind it:
* Sometimes it's just wrong. e.g. `‘index’` vs `'index'`
* Some editors (e.g. VSCode) will highlight it as a warning, and some
editors/terminals might not display it well
* It's difficult to keep consistency because people don't know how to type
those characters
* For docstrings, it could actually be displayed somewhere while users are
using it and unicode could cause problems
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
`ruff check` passed.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #54410 from gaogaotiantian/fix-ascii.
Authored-by: Tian Gao <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
pyproject.toml | 5 ++
python/pyspark/ml/connect/base.py | 2 +-
python/pyspark/pandas/accessors.py | 2 +-
python/pyspark/pandas/base.py | 2 +-
python/pyspark/pandas/config.py | 2 +-
python/pyspark/pandas/frame.py | 60 ++++++++---------
python/pyspark/pandas/generic.py | 10 +--
python/pyspark/pandas/groupby.py | 2 +-
python/pyspark/pandas/indexes/base.py | 2 +-
python/pyspark/pandas/indexes/datetimes.py | 2 +-
python/pyspark/pandas/indexes/multi.py | 2 +-
python/pyspark/pandas/indexing.py | 2 +-
python/pyspark/pandas/namespace.py | 8 +--
python/pyspark/pandas/plot/core.py | 6 +-
python/pyspark/pandas/series.py | 20 +++---
python/pyspark/pandas/strings.py | 26 ++++----
python/pyspark/pandas/window.py | 2 +-
python/pyspark/sql/conversion.py | 4 +-
python/pyspark/sql/plot/core.py | 2 +-
python/pyspark/testing/utils.py | 4 +-
.../upstream/pyarrow/test_pyarrow_array_cast.py | 26 ++++----
.../upstream/pyarrow/test_pyarrow_type_coercion.py | 78 +++++++++++-----------
22 files changed, 137 insertions(+), 132 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index 68a6cba0acb5..bb8685f89005 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,6 +39,11 @@ exclude = [
[tool.ruff.lint]
extend-select = [
"G010", # logging-warn
+ # ambiguous unicode character
+ "RUF001", # string
+ "RUF002", # docstring
+ "RUF003", # comment
+ # ambiguous unicode character end
"RUF100", # unused-noqa
]
ignore = [
diff --git a/python/pyspark/ml/connect/base.py
b/python/pyspark/ml/connect/base.py
index 32c72d590745..92fe1ae4e32c 100644
--- a/python/pyspark/ml/connect/base.py
+++ b/python/pyspark/ml/connect/base.py
@@ -155,7 +155,7 @@ class Transformer(Params, metaclass=ABCMeta):
) -> Union[DataFrame, pd.DataFrame]:
"""
Transforms the input dataset.
- The dataset can be either pandas dataframe or spark dataframe,
+ The dataset can be either pandas dataframe or spark dataframe,
if it is a spark DataFrame, the result of transformation is a new
spark DataFrame
that contains all existing columns and output columns with names,
If it is a pandas DataFrame, the result of transformation is a shallow
copy
diff --git a/python/pyspark/pandas/accessors.py
b/python/pyspark/pandas/accessors.py
index 359c2db7cb7d..ea6d33c8fa18 100644
--- a/python/pyspark/pandas/accessors.py
+++ b/python/pyspark/pandas/accessors.py
@@ -78,7 +78,7 @@ class PandasOnSparkFrameMethods:
- 'distributed-sequence' : a sequence that increases one by one,
by group-by and group-map approach in a distributed manner.
- - 'distributed' : a monotonically increasing sequence simply by
using PySpark’s
+ - 'distributed' : a monotonically increasing sequence simply by
using PySpark's
monotonically_increasing_id function in a fully distributed
manner.
column : string or tuple of string
diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py
index a43fd7112cbd..5a39b581e986 100644
--- a/python/pyspark/pandas/base.py
+++ b/python/pyspark/pandas/base.py
@@ -1449,7 +1449,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
Parameters
----------
dropna : bool, default True
- Don’t include NaN in the count.
+ Don't include NaN in the count.
approx: bool, default False
If False, will use the exact algorithm and return the exact number
of unique.
If True, it uses the HyperLogLog approximate algorithm, which is
significantly faster
diff --git a/python/pyspark/pandas/config.py b/python/pyspark/pandas/config.py
index e89bcf835f8b..2049e1fcef5f 100644
--- a/python/pyspark/pandas/config.py
+++ b/python/pyspark/pandas/config.py
@@ -250,7 +250,7 @@ _options: List[Option] = [
key="compute.isin_limit",
doc=(
"'compute.isin_limit' sets the limit for filtering by
'Column.isin(list)'. "
- "If the length of the ‘list’ is above the limit, broadcast join is
used instead "
+ "If the length of the 'list' is above the limit, broadcast join is
used instead "
"for better performance."
),
default=80,
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index f3072c2f4fec..6227e5ad26c6 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -170,7 +170,7 @@ if TYPE_CHECKING:
# Two patterns basically seek the footer string from Pandas'
REPR_PATTERN = re.compile(r"\n\n\[(?P<rows>[0-9]+) rows x (?P<columns>[0-9]+)
columns\]$")
REPR_HTML_PATTERN = re.compile(
- r"\n\<p\>(?P<rows>[0-9]+) rows × (?P<columns>[0-9]+)
columns\<\/p\>\n\<\/div\>$"
+ r"\n\<p\>(?P<rows>[0-9]+) rows × (?P<columns>[0-9]+)
columns\<\/p\>\n\<\/div\>$" # noqa: RUF001
)
@@ -2594,10 +2594,10 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
for the column names.
index : bool, default True
Write row names (index).
- na_rep : str, default ‘NaN’
+ na_rep : str, default 'NaN'
Missing data representation.
formatters : list of functions or dict of {str: function}, optional
- Formatter functions to apply to columns’ elements by position or
name. The result of
+ Formatter functions to apply to columns' elements by position or
name. The result of
each function must be a Unicode string. List must be of length
equal to the number of
columns.
float_format : str, optional
@@ -2610,9 +2610,9 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
bold_rows : bool, default False
Make the row labels bold in the output.
column_format : str, optional
- The columns format as specified in LaTeX table format e.g. ‘rcl’
for 3 columns. By
- default, ‘l’ will be used for all columns except columns of
numbers, which default
- to ‘r’.
+ The columns format as specified in LaTeX table format e.g. 'rcl'
for 3 columns. By
+ default, 'l' will be used for all columns except columns of
numbers, which default
+ to 'r'.
longtable : bool, optional
By default the value will be read from the pandas config module.
Use a longtable
environment instead of tabular. Requires adding a
usepackage{longtable} to your LaTeX
@@ -2621,14 +2621,14 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
By default the value will be read from the pandas config module.
When set to False
prevents from escaping latex special characters in column names.
encoding : str, optional
- A string representing the encoding to use in the output file,
defaults to ‘ascii’ on
- Python 2 and ‘utf-8’ on Python 3.
- decimal : str, default ‘.’
- Character recognized as decimal separator, e.g. ‘,’ in Europe.
+ A string representing the encoding to use in the output file,
defaults to 'ascii' on
+ Python 2 and 'utf-8' on Python 3.
+ decimal : str, default '.'
+ Character recognized as decimal separator, e.g. ',' in Europe.
multicolumn : bool, default True
Use multicolumn to enhance MultiIndex columns. The default will be
read from the config
module.
- multicolumn_format : str, default ‘l’
+ multicolumn_format : str, default 'l'
The alignment for multicolumns, similar to column_format The
default will be read from
the config module.
multirow : bool, default False
@@ -4920,7 +4920,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
The axis to use. 0 or 'index' for row-wise (count unique values
per column),
1 or 'columns' for column-wise (count unique values per row).
dropna : bool, default True
- Don’t include NaN in the count.
+ Don't include NaN in the count.
approx: bool, default False
If False, will use the exact algorithm and return the exact number
of unique.
If True, it uses the HyperLogLog approximate algorithm, which is
significantly faster
@@ -7482,7 +7482,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
If a string is given, must be the name of a level If list-like,
elements must
be names or positional indexes of levels.
- axis: {0 or ‘index’, 1 or ‘columns’}, default 0
+ axis: {0 or 'index', 1 or 'columns'}, default 0
Returns
-------
@@ -7834,7 +7834,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
na_position : {'first', 'last'}, default 'last'
`first` puts NaNs at the beginning, `last` puts NaNs at the end
ignore_index : bool, default False
- If True, the resulting axis will be labeled 0, 1, …, n - 1.
+ If True, the resulting axis will be labeled 0, 1, ..., n - 1.
Returns
-------
@@ -7942,7 +7942,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
kind : str, default None
pandas-on-Spark does not allow specifying the sorting algorithm
now,
default None
- na_position : {‘first’, ‘last’}, default ‘last’
+ na_position : {'first', 'last'}, default 'last'
first puts NaNs at the beginning, last puts NaNs at the end. Not
implemented for
MultiIndex.
ignore_index : bool, default False
@@ -8984,12 +8984,12 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
how: {'left', 'right', 'outer', 'inner'}, default 'left'
How to handle the operation of the two objects.
- * left: use `left` frame’s index (or column if on is specified).
- * right: use `right`’s index.
- * outer: form union of `left` frame’s index (or column if on is
specified) with
- right’s index, and sort it. lexicographically.
- * inner: form intersection of `left` frame’s index (or column if
on is specified)
- with `right`’s index, preserving the order of the `left`’s one.
+ * left: use `left` frame's index (or column if on is specified).
+ * right: use `right`'s index.
+ * outer: form union of `left` frame's index (or column if on is
specified) with
+ right's index, and sort it. lexicographically.
+ * inner: form intersection of `left` frame's index (or column if
on is specified)
+ with `right`'s index, preserving the order of the `left`'s one.
lsuffix : str, default ''
Suffix to use from left frame's overlapping columns.
rsuffix : str, default ''
@@ -9055,8 +9055,8 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
K3 A3 None
Another option to join using the key columns is to use the on
parameter. DataFrame.join
- always uses right’s index but we can use any column in df. This method
does not preserve
- the original DataFrame’s index in the result unlike pandas.
+ always uses right's index but we can use any column in df. This method
does not preserve
+ the original DataFrame's index in the result unlike pandas.
>>> join_psdf = psdf1.join(psdf2.set_index('key'), on='key')
>>> join_psdf.index
@@ -9859,9 +9859,9 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
For numeric data, the result's index will include ``count``,
``mean``, ``std``, ``min``, ``25%``, ``50%``, ``75%``, ``max``.
- For object data (e.g. strings or timestamps), the result’s index will
include
+ For object data (e.g. strings or timestamps), the result's index will
include
``count``, ``unique``, ``top``, and ``freq``.
- The ``top`` is the most common value. The ``freq`` is the most common
value’s frequency.
+ The ``top`` is the most common value. The ``freq`` is the most common
value's frequency.
Timestamps also include the ``first`` and ``last`` items.
Examples
@@ -10268,12 +10268,12 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
Parameters
----------
labels: array-like, optional
- New labels / index to conform the axis specified by ‘axis’ to.
+ New labels / index to conform the axis specified by 'axis' to.
index, columns: array-like, optional
New labels / index to conform to, should be specified using
keywords.
Preferably an Index object to avoid duplicating data
axis: int or str, optional
- Axis to target. Can be either the axis name (‘index’, ‘columns’) or
+ Axis to target. Can be either the axis name ('index', 'columns') or
number (0, 1).
copy : bool, default True
Return a new object, even if the passed indexes are the same.
@@ -10669,7 +10669,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
are not set as `id_vars`.
var_name : scalar, default 'variable'
Name to use for the 'variable' column. If None it uses
`frame.columns.name` or
- ‘variable’.
+ 'variable'.
value_name : scalar, default 'value'
Name to use for the 'value' column.
@@ -11846,12 +11846,12 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
"""
Alter axes labels.
Function / dict values must be unique (1-to-1). Labels not contained
in a dict / Series
- will be left as-is. Extra labels listed don’t throw an error.
+ will be left as-is. Extra labels listed don't throw an error.
Parameters
----------
mapper : dict-like or function
- Dict-like or functions transformations to apply to that axis’
values.
+ Dict-like or functions transformations to apply to that axis'
values.
Use either `mapper` and `axis` to specify the axis to target with
`mapper`, or `index`
and `columns`.
index : dict-like or function
diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
index cfd566e20573..9e36fb9225eb 100644
--- a/python/pyspark/pandas/generic.py
+++ b/python/pyspark/pandas/generic.py
@@ -867,8 +867,8 @@ class Frame(object, metaclass=ABCMeta):
File path. If not specified, the result is returned as
a string.
lines: bool, default True
- If ‘orient’ is ‘records’ write out line delimited JSON format.
- Will throw ValueError if incorrect ‘orient’ since others are not
+ If 'orient' is 'records' write out line delimited JSON format.
+ Will throw ValueError if incorrect 'orient' since others are not
list like. It should be always True for now.
orient: str, default 'records'
It should be always 'records' for now.
@@ -1590,7 +1590,7 @@ class Frame(object, metaclass=ABCMeta):
self, axis: Optional[Axis] = None, skipna: bool = True, numeric_only:
bool = None
) -> Union[Scalar, "Series"]:
"""
- Return unbiased kurtosis using Fisher’s definition of kurtosis
(kurtosis of normal == 0.0).
+ Return unbiased kurtosis using Fisher's definition of kurtosis
(kurtosis of normal == 0.0).
Normalized by N-1.
Parameters
@@ -1795,8 +1795,8 @@ class Frame(object, metaclass=ABCMeta):
Parameters
----------
- axis: {0 or ‘index’, 1 or ‘columns’}, default 0
- If 0 or ‘index’ counts are generated for each column. If 1 or
‘columns’ counts are
+ axis: {0 or 'index', 1 or 'columns'}, default 0
+ If 0 or 'index' counts are generated for each column. If 1 or
'columns' counts are
generated for each row.
numeric_only: bool, default False
If True, include only float, int, boolean columns. This parameter
is mainly for
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index f9e8123555ad..5e47f9840811 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -3264,7 +3264,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
Parameters
----------
dropna : boolean, default True
- Don’t include NaN in the counts.
+ Don't include NaN in the counts.
Returns
-------
diff --git a/python/pyspark/pandas/indexes/base.py
b/python/pyspark/pandas/indexes/base.py
index 40b432556463..081d2836d701 100644
--- a/python/pyspark/pandas/indexes/base.py
+++ b/python/pyspark/pandas/indexes/base.py
@@ -565,7 +565,7 @@ class Index(IndexOpsMixin):
mapper : function, dict, or pd.Series
Mapping correspondence.
na_action : {None, 'ignore'}
- If ‘ignore’, propagate NA values, without passing them to the
mapping correspondence.
+ If 'ignore', propagate NA values, without passing them to the
mapping correspondence.
Returns
-------
diff --git a/python/pyspark/pandas/indexes/datetimes.py
b/python/pyspark/pandas/indexes/datetimes.py
index cd90e49dc7ee..6d7b723d0f11 100644
--- a/python/pyspark/pandas/indexes/datetimes.py
+++ b/python/pyspark/pandas/indexes/datetimes.py
@@ -75,7 +75,7 @@ class DatetimeIndex(Index):
yearfirst : bool, default False
If True parse dates in `data` with the year first order.
dtype : numpy.dtype or str, default None
- Note that the only NumPy dtype allowed is ‘datetime64[ns]’.
+ Note that the only NumPy dtype allowed is 'datetime64[ns]'.
copy : bool, default False
Make a copy of input ndarray.
name : label, default None
diff --git a/python/pyspark/pandas/indexes/multi.py
b/python/pyspark/pandas/indexes/multi.py
index aeb5c71ff186..55819ce59168 100644
--- a/python/pyspark/pandas/indexes/multi.py
+++ b/python/pyspark/pandas/indexes/multi.py
@@ -200,7 +200,7 @@ class MultiIndex(Index):
Parameters
----------
arrays: list / sequence of array-likes
- Each array-like gives one level’s value for each data point.
len(arrays)
+ Each array-like gives one level's value for each data point.
len(arrays)
is the number of levels.
sortorder: int or None
Level of sortedness (must be lexicographically sorted by that
level).
diff --git a/python/pyspark/pandas/indexing.py
b/python/pyspark/pandas/indexing.py
index 03d8054c7e2a..3ef01d72496f 100644
--- a/python/pyspark/pandas/indexing.py
+++ b/python/pyspark/pandas/indexing.py
@@ -776,7 +776,7 @@ class LocIndexerLike(IndexerLike, metaclass=ABCMeta):
if label in selected_column_labels_set
]
if selected_column_labels !=
selected_labels_in_internal_order:
- # If requested columns are in different order than the
DataFrame’s internal order,
+ # If requested columns are in different order than the
DataFrame's internal order,
# it returns early (no-op), matching pandas 3 behavior
for that edge case.
return
value = F.lit(value)
diff --git a/python/pyspark/pandas/namespace.py
b/python/pyspark/pandas/namespace.py
index 3ff617c0ee3f..0a1f2413e01f 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -279,9 +279,9 @@ def read_csv(
----------
path : str or list
Path(s) of the CSV file(s) to be read.
- sep : str, default ‘,’
+ sep : str, default ','
Delimiter to use. Non empty string.
- header : int, default ‘infer’
+ header : int, default 'infer'
Whether to use the column names, and the start of the data.
Default behavior is to infer the column names: if no names are passed
the behavior is identical to `header=0` and column names are inferred
from
@@ -303,7 +303,7 @@ def read_csv(
If callable, the callable function will be evaluated against the
column names,
returning names where the callable function evaluates to `True`.
dtype : Type name or dict of column -> type, default None
- Data type for data or columns. E.g. {‘a’: np.float64, ‘b’: np.int32}
Use str or object
+ Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
Use str or object
together with suitable na_values settings to preserve and not
interpret dtype.
nrows : int, default None
Number of rows to read from the CSV file.
@@ -2102,7 +2102,7 @@ def timedelta_range(
TimedeltaIndex(['2 days', '3 days', '4 days'], dtype='timedelta64[ns]',
freq=None)
The freq parameter specifies the frequency of the TimedeltaIndex.
- Only fixed frequencies can be passed, non-fixed frequencies such as ‘M’
(month end) will raise.
+ Only fixed frequencies can be passed, non-fixed frequencies such as 'M'
(month end) will raise.
>>> ps.timedelta_range(start='1 day', end='2 days', freq='6h')
... # doctest: +NORMALIZE_WHITESPACE
diff --git a/python/pyspark/pandas/plot/core.py
b/python/pyspark/pandas/plot/core.py
index 8c50d7401918..1d296a383f19 100644
--- a/python/pyspark/pandas/plot/core.py
+++ b/python/pyspark/pandas/plot/core.py
@@ -573,7 +573,7 @@ class PandasOnSparkPlotAccessor(PandasObject):
"""
Plot DataFrame/Series as lines.
- This function is useful to plot lines using DataFrame’s values
+ This function is useful to plot lines using DataFrame's values
as coordinates.
Parameters
@@ -904,12 +904,12 @@ class PandasOnSparkPlotAccessor(PandasObject):
def hist(self, bins=10, **kwds):
"""
- Draw one histogram of the DataFrame’s columns.
+ Draw one histogram of the DataFrame's columns.
A `histogram`_ is a representation of the distribution of data.
This function calls :meth:`plotting.backend.plot`,
on each series in the DataFrame, resulting in one histogram per column.
- This is useful when the DataFrame’s Series are in a similar scale.
+ This is useful when the DataFrame's Series are in a similar scale.
.. _histogram: https://en.wikipedia.org/wiki/Histogram
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index f1fb0069fa74..4eddbe5ad8ad 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -2568,7 +2568,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
index : single label or list-like
Redundant for application on Series, but index can be used instead
of labels.
columns : single label or list-like
- No change is made to the Series; use ‘index’ or ‘labels’ instead.
+ No change is made to the Series; use 'index' or 'labels' instead.
.. versionadded:: 3.4.0
level : int or level name, optional
@@ -2977,7 +2977,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
na_position : {'first', 'last'}, default 'last'
`first` puts NaNs at the beginning, `last` puts NaNs at the end
ignore_index : bool, default False
- If True, the resulting axis will be labeled 0, 1, …, n - 1.
+ If True, the resulting axis will be labeled 0, 1, ..., n - 1.
.. versionadded:: 3.4.0
@@ -3104,11 +3104,11 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
kind : str, default None
pandas-on-Spark does not allow specifying the sorting algorithm
now,
default None
- na_position : {‘first’, ‘last’}, default ‘last’
+ na_position : {'first', 'last'}, default 'last'
first puts NaNs at the beginning, last puts NaNs at the end. Not
implemented for
MultiIndex.
ignore_index : bool, default False
- If True, the resulting axis will be labeled 0, 1, …, n - 1.
+ If True, the resulting axis will be labeled 0, 1, ..., n - 1.
.. versionadded:: 3.4.0
@@ -4999,11 +4999,11 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
- Dicts can be used to specify different replacement values
for different
existing values.
- For example, {'a': 'b', 'y': 'z'} replaces the value ‘a’
with ‘b’ and ‘y’
- with ‘z’. To use a dict in this way the value parameter
should be None.
+ For example, {'a': 'b', 'y': 'z'} replaces the value 'a'
with 'b' and 'y'
+ with 'z'. To use a dict in this way the value parameter
should be None.
- For a DataFrame a dict can specify that different values
should be replaced
in different columns. For example, {'a': 1, 'b': 'z'} looks
for the value 1
- in column ‘a’ and the value ‘z’ in column ‘b’ and replaces
these values with
+ in column 'a' and the value 'z' in column 'b' and replaces
these values with
whatever is specified in value.
The value parameter should not be None in this case.
You can treat this as a special case of passing two lists
except that you are
@@ -6758,9 +6758,9 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
----------
value : scalar
Values to insert into self.
- side : {‘left’, ‘right’}, optional
- If ‘left’, the index of the first suitable location found is given.
- If ‘right’, return the last such index. If there is no suitable
index,
+ side : {'left', 'right'}, optional
+ If 'left', the index of the first suitable location found is given.
+ If 'right', return the last such index. If there is no suitable
index,
return either 0 or N (where N is the length of self).
Returns
diff --git a/python/pyspark/pandas/strings.py b/python/pyspark/pandas/strings.py
index 05d678d123f4..1fb6a0c505a5 100644
--- a/python/pyspark/pandas/strings.py
+++ b/python/pyspark/pandas/strings.py
@@ -908,7 +908,7 @@ class StringMethods:
4 False
dtype: bool
- Returning ‘house’ or ‘dog’ when either expression occurs in a string.
+ Returning 'house' or 'dog' when either expression occurs in a string.
>>> s1.str.contains('house|dog', regex=True)
0 False
@@ -941,7 +941,7 @@ class StringMethods:
Ensure pat is a not a literal pattern when regex is set to True.
Note in the following example one might expect only s2[1] and s2[3]
- to return True. However, ‘.0’ as a regex matches any character followed
+ to return True. However, '.0' as a regex matches any character followed
by a 0.
>>> s2 = ps.Series(['40','40.0','41','41.0','35'])
@@ -1116,7 +1116,7 @@ class StringMethods:
--------
>>> s = ps.Series(['Lion', 'Monkey', 'Rabbit'])
- The search for the pattern ‘Monkey’ returns one match:
+ The search for the pattern 'Monkey' returns one match:
>>> s.str.findall('Monkey')
0 []
@@ -1124,7 +1124,7 @@ class StringMethods:
2 []
dtype: object
- On the other hand, the search for the pattern ‘MONKEY’ doesn’t return
+ On the other hand, the search for the pattern 'MONKEY' doesn't return
any match:
>>> s.str.findall('MONKEY')
@@ -1134,7 +1134,7 @@ class StringMethods:
dtype: object
Flags can be added to the pattern or regular expression. For instance,
- to find the pattern ‘MONKEY’ ignoring the case:
+ to find the pattern 'MONKEY' ignoring the case:
>>> import re
>>> s.str.findall('MONKEY', flags=re.IGNORECASE)
@@ -1153,7 +1153,7 @@ class StringMethods:
dtype: object
Regular expressions are supported too. For instance, the search for all
- the strings ending with the word ‘on’ is shown next:
+ the strings ending with the word 'on' is shown next:
>>> s.str.findall('on$')
0 [on]
@@ -1259,7 +1259,7 @@ class StringMethods:
1 [cat, None, dog]
dtype: object
- Join all lists using a ‘-‘. The list containing None will produce None.
+ Join all lists using a '-'. The list containing None will produce None.
>>> s.str.join('-')
0 lion-elephant-zebra
@@ -1417,7 +1417,7 @@ class StringMethods:
Parameters
----------
- form : {‘NFC’, ‘NFKC’, ‘NFD’, ‘NFKD’}
+ form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
Unicode form.
Returns
@@ -1440,7 +1440,7 @@ class StringMethods:
width : int
Minimum width of resulting string; additional characters will be
filled with character defined in `fillchar`.
- side : {‘left’, ‘right’, ‘both’}, default ‘left’
+ side : {'left', 'right', 'both'}, default 'left'
Side from which to fill resulting string.
fillchar : str, default ' '
Additional character for filling, default is whitespace.
@@ -2308,20 +2308,20 @@ class StringMethods:
def zfill(self, width: int) -> "ps.Series":
"""
- Pad strings in the Series by prepending ‘0’ characters.
+ Pad strings in the Series by prepending '0' characters.
- Strings in the Series are padded with ‘0’ characters on the left of the
+ Strings in the Series are padded with '0' characters on the left of the
string to reach a total string length width. Strings in the Series with
length greater or equal to width are unchanged.
- Differs from :func:`str.zfill` which has special handling for ‘+’/’-‘
+ Differs from :func:`str.zfill` which has special handling for '+'/'-'
in the string.
Parameters
----------
width : int
Minimum length of resulting string; strings with length less than
- width be prepended with ‘0’ characters.
+ width be prepended with '0' characters.
Returns
-------
diff --git a/python/pyspark/pandas/window.py b/python/pyspark/pandas/window.py
index 591fa7d82875..56692c120d91 100644
--- a/python/pyspark/pandas/window.py
+++ b/python/pyspark/pandas/window.py
@@ -587,7 +587,7 @@ class Rolling(RollingLike[FrameLike]):
Value between 0 and 1 providing the quantile to compute.
.. deprecated:: 4.0.0
- This will be renamed to ‘q’ in a future version.
+ This will be renamed to 'q' in a future version.
accuracy : int, optional
Default accuracy of approximation. Larger value means better
accuracy.
diff --git a/python/pyspark/sql/conversion.py b/python/pyspark/sql/conversion.py
index 97db4563ad2c..ca6d6c61f1ef 100644
--- a/python/pyspark/sql/conversion.py
+++ b/python/pyspark/sql/conversion.py
@@ -262,7 +262,7 @@ class PandasToArrowConversion:
# TODO(SPARK-55502): Unify UDTF and regular UDF conversion paths to
# eliminate the is_udtf flag.
Regular UDFs only catch ArrowInvalid to preserve legacy behavior
where
- e.g. string→decimal must raise an error. (default False)
+ e.g. string->decimal must raise an error. (default False)
Returns
-------
@@ -304,7 +304,7 @@ class PandasToArrowConversion:
"""Convert a single column (Series or DataFrame) to an Arrow Array.
Uses field.name for error messages instead of series.name to avoid
- copying the Series via rename() — a ~20% overhead on the hot path.
+ copying the Series via rename() - a ~20% overhead on the hot path.
"""
if isinstance(col, pd.DataFrame):
assert isinstance(field.dataType, StructType)
diff --git a/python/pyspark/sql/plot/core.py b/python/pyspark/sql/plot/core.py
index 526f4897d390..c3fdaeb3aa07 100644
--- a/python/pyspark/sql/plot/core.py
+++ b/python/pyspark/sql/plot/core.py
@@ -458,7 +458,7 @@ class PySparkPlotAccessor:
self, column: Optional[Union[str, List[str]]] = None, bins: int = 10,
**kwargs: Any
) -> "Figure":
"""
- Draw one histogram of the DataFrame’s columns.
+ Draw one histogram of the DataFrame's columns.
A `histogram`_ is a representation of the distribution of data.
diff --git a/python/pyspark/testing/utils.py b/python/pyspark/testing/utils.py
index 35670d910b54..2c4239eb7662 100644
--- a/python/pyspark/testing/utils.py
+++ b/python/pyspark/testing/utils.py
@@ -505,7 +505,7 @@ def assertSchemaEqual(
expected : StructType
The expected schema, for comparison with the actual schema.
ignoreNullable : bool, default True
- Specifies whether a column’s nullable property is included when
checking for
+ Specifies whether a column's nullable property is included when
checking for
schema equality.
When set to `True` (default), the nullable property of the columns
being compared
is not taken into account and the columns will be considered equal
even if they have
@@ -715,7 +715,7 @@ def assertDataFrameEqual(
The absolute tolerance, used in asserting approximate equality for
float values in actual
and expected. Set to 1e-8 by default. (See Notes)
ignoreNullable : bool, default True
- Specifies whether a column’s nullable property is included when
checking for
+ Specifies whether a column's nullable property is included when
checking for
schema equality.
When set to `True` (default), the nullable property of the columns
being compared
is not taken into account and the columns will be considered equal
even if they have
diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_cast.py
b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_cast.py
index 6242a6d784da..ce19458328aa 100644
--- a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_cast.py
+++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_cast.py
@@ -28,8 +28,8 @@ Each mode generates separate golden files to capture their
different behaviors.
Each cell in the golden file uses the value@type format:
-- Success: [0, 1, null]@int16 — element values via scalar.as_py() and Arrow
type after cast
-- Failure: ERR@ArrowNotImplementedError — the exception class name
+- Success: [0, 1, null]@int16 - element values via scalar.as_py() and Arrow
type after cast
+- Failure: ERR@ArrowNotImplementedError - the exception class name
## Regenerating Golden Files
@@ -511,9 +511,9 @@ class PyArrowScalarTypeCastTests(_PyArrowCastTestBase):
ARM (aarch64/arm64): Unsafe float-to-integer casts produce different
results
than x86 due to IEEE 754 implementation-defined behavior:
- - ARM FCVT instructions saturate on overflow (inf→MAX, -inf→MIN, nan→0)
+ - ARM FCVT instructions saturate on overflow (inf->MAX, -inf->MIN,
nan->0)
- x86 SSE/AVX returns "integer indefinite" values
- - Negative float → unsigned int: ARM saturates to 0, x86 may wrap
+ - Negative float -> unsigned int: ARM saturates to 0, x86 may wrap
The golden files are generated on x86; ARM values are hardcoded below.
"""
overrides = {}
@@ -537,13 +537,13 @@ class PyArrowScalarTypeCastTests(_PyArrowCastTestBase):
if platform.machine() in ("aarch64", "arm64"):
overrides.update(
{
- # float16:standard [0.0, 1.5, -1.5, None] → unsigned int
types
+ # float16:standard [0.0, 1.5, -1.5, None] -> unsigned int
types
# -1.5 saturates to 0 on ARM, wraps on x86
("float16:standard", "uint8"): "[0, 1, 0, None]@uint8",
("float16:standard", "uint16"): "[0, 1, 0, None]@uint16",
("float16:standard", "uint32"): "[0, 1, 0, None]@uint32",
("float16:standard", "uint64"): "[0, 1, 0, None]@uint64",
- # float16:special [inf, nan, None] → integer types
+ # float16:special [inf, nan, None] -> integer types
("float16:special", "int8"): "[-1, 0, None]@int8",
("float16:special", "int16"): "[-1, 0, None]@int16",
("float16:special", "int32"): "[2147483647, 0,
None]@int32",
@@ -552,11 +552,11 @@ class PyArrowScalarTypeCastTests(_PyArrowCastTestBase):
("float16:special", "uint16"): "[65535, 0, None]@uint16",
("float16:special", "uint32"): "[4294967295, 0,
None]@uint32",
("float16:special", "uint64"): "[18446744073709551615, 0,
None]@uint64",
- # float32:standard [0.0, 1.5, -1.5, None] → unsigned int
types
+ # float32:standard [0.0, 1.5, -1.5, None] -> unsigned int
types
("float32:standard", "uint8"): "[0, 1, 0, None]@uint8",
("float32:standard", "uint32"): "[0, 1, 0, None]@uint32",
("float32:standard", "uint64"): "[0, 1, 0, None]@uint64",
- # float32:special [inf, -inf, nan, None] → integer types
+ # float32:special [inf, -inf, nan, None] -> integer types
("float32:special", "int8"): "[-1, 0, 0, None]@int8",
("float32:special", "int16"): "[-1, 0, 0, None]@int16",
("float32:special", "int32"): "[2147483647, -2147483648,
0, None]@int32",
@@ -568,11 +568,11 @@ class PyArrowScalarTypeCastTests(_PyArrowCastTestBase):
("float32:special", "uint16"): "[65535, 0, 0,
None]@uint16",
("float32:special", "uint32"): "[4294967295, 0, 0,
None]@uint32",
("float32:special", "uint64"): "[18446744073709551615, 0,
0, None]@uint64",
- # float64:standard [0.0, 1.5, -1.5, None] → unsigned int
types
+ # float64:standard [0.0, 1.5, -1.5, None] -> unsigned int
types
("float64:standard", "uint8"): "[0, 1, 0, None]@uint8",
("float64:standard", "uint16"): "[0, 1, 0, None]@uint16",
("float64:standard", "uint64"): "[0, 1, 0, None]@uint64",
- # float64:special [inf, -inf, nan, None] → integer types
+ # float64:special [inf, -inf, nan, None] -> integer types
("float64:special", "int8"): "[-1, 0, 0, None]@int8",
("float64:special", "int16"): "[-1, 0, 0, None]@int16",
("float64:special", "int32"): "[-1, 0, 0, None]@int32",
@@ -592,17 +592,17 @@ class PyArrowScalarTypeCastTests(_PyArrowCastTestBase):
# differences in LLVM code generation between the two
platforms.
overrides.update(
{
- # negative float → uint8/uint16: macOS ARM wraps
(255/65535),
+ # negative float -> uint8/uint16: macOS ARM wraps
(255/65535),
# Linux ARM saturates to 0
("float16:standard", "uint8"): "[0, 1, 255,
None]@uint8",
("float16:standard", "uint16"): "[0, 1, 65535,
None]@uint16",
("float32:standard", "uint8"): "[0, 1, 255,
None]@uint8",
("float64:standard", "uint8"): "[0, 1, 255,
None]@uint8",
("float64:standard", "uint16"): "[0, 1, 65535,
None]@uint16",
- # negative float → uint32: macOS ARM saturates to 0,
+ # negative float -> uint32: macOS ARM saturates to 0,
# Linux ARM wraps to 4294967295 (matching x86 golden)
("float64:standard", "uint32"): "[0, 1, 0,
None]@uint32",
- # special float → int32: macOS ARM saturates
(INT32_MAX/MIN),
+ # special float -> int32: macOS ARM saturates
(INT32_MAX/MIN),
# Linux ARM gives -1/0
("float64:special", "int32"): "[2147483647,
-2147483648, 0, None]@int32",
}
diff --git
a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_type_coercion.py
b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_type_coercion.py
index 7c4b38094663..aeb51c9020cd 100644
--- a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_type_coercion.py
+++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_type_coercion.py
@@ -325,9 +325,9 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
# Same timezone
([dt_utc], pa.timestamp("us", tz="UTC"), [dt_utc]),
([dt_sg], pa.timestamp("us", tz="Asia/Singapore"), [dt_sg]),
- # Cross timezone conversion (SG +8 → UTC, so 20:00 SG = 12:00 UTC)
+ # Cross timezone conversion (SG +8 -> UTC, so 20:00 SG = 12:00 UTC)
([dt_sg], pa.timestamp("us", tz="UTC"), [dt_utc]),
- # LA -8 → UTC, so 4:00 LA = 12:00 UTC
+ # LA -8 -> UTC, so 4:00 LA = 12:00 UTC
([dt_la], pa.timestamp("us", tz="UTC"), [dt_utc]),
# Naive to tz-aware (treated as UTC)
([dt_naive], pa.timestamp("us", tz="UTC"), [dt_utc]),
@@ -336,7 +336,7 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
]
self._run_coercion_tests_with_values(tz_cases)
- # Mixed timezones → same instant
+ # Mixed timezones -> same instant
ts_mixed = [
datetime.datetime(2024, 1, 1, 12, 0, 0, tzinfo=utc_tz),
datetime.datetime(2024, 1, 1, 20, 0, 0, tzinfo=sg_tz),
@@ -346,7 +346,7 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
self.assertEqual(values[0], values[1])
self.assertEqual(values[1], values[2])
- # Positive/negative UTC offsets → same instant
+ # Positive/negative UTC offsets -> same instant
ts_tokyo = pa.array(
[datetime.datetime(2024, 1, 1, 21, 0, tzinfo=tokyo_tz)],
type=pa.timestamp("us", tz="UTC"),
@@ -361,9 +361,9 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
# (data, target_type, expected_values)
cross_type_ok_with_values = [
- # int → date (epoch days)
+ # int -> date (epoch days)
([19723], pa.date32(), [datetime.date(2024, 1, 1)]),
- # binary ↔ string (UTF-8)
+ # binary <-> string (UTF-8)
([b"hello", b"world"], pa.string(), ["hello", "world"]),
(["hello", "world"], pa.binary(), [b"hello", b"world"]),
]
@@ -371,7 +371,7 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
# (data, target_type, expected_values)
cross_type_ok = [
- # int → timestamp (epoch seconds: 1704067200 = 2024-01-01 00:00:00
UTC)
+ # int -> timestamp (epoch seconds: 1704067200 = 2024-01-01
00:00:00 UTC)
([1704067200], pa.timestamp("s"), [datetime.datetime(2024, 1, 1,
0, 0, 0)]),
]
self._run_coercion_tests_with_values(cross_type_ok)
@@ -380,27 +380,27 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
# (data, target_type)
cross_type_fail = [
- # numeric → string
+ # numeric -> string
([1, 2, 3], pa.string()),
([1.5, 2.5], pa.string()),
([True, False], pa.string()),
- # string → numeric
+ # string -> numeric
(["1", "2"], pa.int64()),
(["1.5", "2.5"], pa.float64()),
(["true"], pa.bool_()),
- # bool ↔ int
+ # bool <-> int
([True, False], pa.int64()),
([1, 0, 1], pa.bool_()),
- # temporal → numeric
+ # temporal -> numeric
([datetime.date(2024, 1, 1)], pa.int64()),
([datetime.datetime(2024, 1, 1)], pa.int64()),
([datetime.time(12, 0)], pa.int64()),
([datetime.timedelta(days=1)], pa.int64()),
- # date → timestamp
+ # date -> timestamp
([datetime.date(2024, 1, 1)], pa.timestamp("us")),
- # binary → numeric
+ # binary -> numeric
([b"hello"], pa.int64()),
- # nested → scalar
+ # nested -> scalar
([[1, 2, 3]], pa.int64()),
([{"a": 1}], pa.int64()),
]
@@ -412,10 +412,10 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
# (data, target_type, expected_values)
precision_loss = [
- # float → int (truncation, not rounding)
+ # float -> int (truncation, not rounding)
([1.9, 2.1, 3.7], pa.int64(), [1, 2, 3]),
([-1.9, -2.1], pa.int64(), [-1, -2]),
- # decimal → int (truncation)
+ # decimal -> int (truncation)
([Decimal("1.9"), Decimal("2.1")], pa.int64(), [1, 2]),
]
self._run_coercion_tests_with_values(precision_loss)
@@ -431,7 +431,7 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
self.assertEqual(pa.array([t],
type=pa.time32("ms")).to_pylist()[0].microsecond, 123000)
self.assertEqual(pa.array([t],
type=pa.time32("s")).to_pylist()[0].microsecond, 0)
- # float64 → float32 precision loss
+ # float64 -> float32 precision loss
large_float = 1.23456789012345678
result = pa.array([large_float], type=pa.float32()).to_pylist()[0]
self.assertNotEqual(result, large_float)
@@ -463,7 +463,7 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
# ==== 3.1 Numpy-backed Series ====
# (data, target_type, expected_values)
numpy_cases = [
- # Int types → float
+ # Int types -> float
(pd.Series([1, 2, 3], dtype="int8"), pa.float64(), [1.0, 2.0,
3.0]),
(pd.Series([1, 2, 3], dtype="int16"), pa.float64(), [1.0, 2.0,
3.0]),
(pd.Series([1, 2, 3], dtype="int32"), pa.float64(), [1.0, 2.0,
3.0]),
@@ -472,10 +472,10 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
(pd.Series([1, 2, 3], dtype="uint16"), pa.float64(), [1.0, 2.0,
3.0]),
(pd.Series([1, 2, 3], dtype="uint32"), pa.float64(), [1.0, 2.0,
3.0]),
(pd.Series([1, 2, 3], dtype="uint64"), pa.float64(), [1.0, 2.0,
3.0]),
- # Float types → int
+ # Float types -> int
(pd.Series([1.0, 2.0, 3.0], dtype="float32"), pa.int64(), [1, 2,
3]),
(pd.Series([1.0, 2.0, 3.0], dtype="float64"), pa.int64(), [1, 2,
3]),
- # Float ↔ float
+ # Float <-> float
(pd.Series([1.0, 2.0], dtype="float32"), pa.float64(), [1.0, 2.0]),
(pd.Series([1.0, 2.0], dtype="float64"), pa.float32(), [1.0, 2.0]),
# Narrowing
@@ -507,12 +507,12 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
),
(pd.Series([int64_min, int64_max], dtype="int64"), pa.int64(),
[int64_min, int64_max]),
(pd.Series([int8_min, 0, int8_max], dtype="int8"), pa.int64(),
[int8_min, 0, int8_max]),
- # NaN to int → None (pandas-specific behavior)
+ # NaN to int -> None (pandas-specific behavior)
(pd.Series([nan, 1.0], dtype="float64"), pa.int64(), [None, 1]),
]
self._run_coercion_tests_with_values(numpy_cases)
- # Special float values (NaN/Inf) - type only
+ # Special float values (NaN/Inf) -> type only
for data, target in [
(pd.Series([nan, 1.0], dtype="float64"), pa.float64()),
(pd.Series([inf, neg_inf], dtype="float64"), pa.float64()),
@@ -520,14 +520,14 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
]:
self.assertEqual(pa.array(data, type=target).type, target)
- # numpy int → decimal128 does NOT work
+ # numpy int -> decimal128 does NOT work
with self.assertRaises(pa.ArrowInvalid):
pa.array(pd.Series([1, 2, 3], dtype="int64"),
type=pa.decimal128(10, 0))
# ==== 3.2 Nullable Extension Types ====
# (data, target_type, expected_values)
nullable_cases = [
- # Int types → float
+ # Int types -> float
(pd.Series([1, 2, 3], dtype=pd.Int8Dtype()), pa.float64(), [1.0,
2.0, 3.0]),
(pd.Series([1, 2, 3], dtype=pd.Int16Dtype()), pa.float64(), [1.0,
2.0, 3.0]),
(pd.Series([1, 2, 3], dtype=pd.Int32Dtype()), pa.float64(), [1.0,
2.0, 3.0]),
@@ -591,7 +591,7 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
# ==== 3.3 ArrowDtype-backed Series ====
# (data, target_type, expected_values)
arrow_cases = [
- # Int types → float
+ # Int types -> float
(pd.Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int8())),
pa.float64(), [1.0, 2.0, 3.0]),
(pd.Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int16())),
pa.float64(), [1.0, 2.0, 3.0]),
(pd.Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int32())),
pa.float64(), [1.0, 2.0, 3.0]),
@@ -665,7 +665,7 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
]:
self.assertEqual(pa.array(data, type=target).type, target)
- # ArrowDtype int64 → decimal128 requires sufficient precision (19
digits)
+ # ArrowDtype int64 -> decimal128 requires sufficient precision (19
digits)
s = pd.Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int64()))
self.assertEqual(
pa.array(s, type=pa.decimal128(19, 0)).to_pylist(),
@@ -677,7 +677,7 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
# ==== 3.4 Datetime Types ====
# (data, target_type, expected_values)
datetime_cases = [
- # datetime64[ns] (numpy-backed) → various resolutions
+ # datetime64[ns] (numpy-backed) -> various resolutions
(
pd.Series(pd.to_datetime(["2024-01-01", "2024-01-02"])),
pa.timestamp("us"),
@@ -781,18 +781,18 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
# In PyArrow < 19, ArrowDtype-backed Series keeps original type
# (data, target_type, expected_type_in_pyarrow18)
no_coercion_cases = [
- # Int types with float target → keeps original int type
+ # Int types with float target -> keeps original int type
(pd.Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int8())),
pa.float64(), pa.int8()),
(pd.Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int16())),
pa.float64(), pa.int16()),
(pd.Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int32())),
pa.float64(), pa.int32()),
(pd.Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int64())),
pa.float64(), pa.int64()),
- # Float types with int target → keeps original float type
+ # Float types with int target -> keeps original float type
(pd.Series([1.0, 2.0], dtype=pd.ArrowDtype(pa.float32())),
pa.int64(), pa.float32()),
(pd.Series([1.0, 2.0], dtype=pd.ArrowDtype(pa.float64())),
pa.int64(), pa.float64()),
- # Float with different float target → keeps original float type
+ # Float with different float target -> keeps original float type
(pd.Series([1.0, 2.0], dtype=pd.ArrowDtype(pa.float32())),
pa.float64(), pa.float32()),
(pd.Series([1.0, 2.0], dtype=pd.ArrowDtype(pa.float64())),
pa.float32(), pa.float64()),
- # Int with narrower int target → keeps original int type
+ # Int with narrower int target -> keeps original int type
(pd.Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int64())), pa.int8(),
pa.int64()),
(pd.Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int64())),
pa.int16(), pa.int64()),
]
@@ -825,7 +825,7 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
# ==== 4.1 All Int/Float Types ====
# (data, target_type, expected_values)
numeric_cases = [
- # Int types → float64
+ # Int types -> float64
(np.array([1, 2, 3], dtype=np.int8), pa.float64(), [1.0, 2.0,
3.0]),
(np.array([1, 2, 3], dtype=np.int16), pa.float64(), [1.0, 2.0,
3.0]),
(np.array([1, 2, 3], dtype=np.int32), pa.float64(), [1.0, 2.0,
3.0]),
@@ -834,11 +834,11 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
(np.array([1, 2, 3], dtype=np.uint16), pa.float64(), [1.0, 2.0,
3.0]),
(np.array([1, 2, 3], dtype=np.uint32), pa.float64(), [1.0, 2.0,
3.0]),
(np.array([1, 2, 3], dtype=np.uint64), pa.float64(), [1.0, 2.0,
3.0]),
- # Float types → int64
+ # Float types -> int64
(np.array([1.0, 2.0, 3.0], dtype=np.float16), pa.int64(), [1, 2,
3]),
(np.array([1.0, 2.0, 3.0], dtype=np.float32), pa.int64(), [1, 2,
3]),
(np.array([1.0, 2.0, 3.0], dtype=np.float64), pa.int64(), [1, 2,
3]),
- # Float ↔ float
+ # Float <-> float
(np.array([1.0, 2.0], dtype=np.float32), pa.float64(), [1.0, 2.0]),
(np.array([1.0, 2.0], dtype=np.float64), pa.float32(), [1.0, 2.0]),
# Widening
@@ -858,19 +858,19 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
]
self._run_coercion_tests_with_values(numeric_cases)
- # numpy int64 → decimal128 does NOT work
+ # numpy int64 -> decimal128 does NOT work
with self.assertRaises(pa.ArrowInvalid):
pa.array(np.array([1, 2, 3], dtype=np.int64),
type=pa.decimal128(10, 0))
# ==== 4.2 Boundary Values ====
# (data, target_type, expected_values)
boundary_cases = [
- # Int min/max → same type
+ # Int min/max -> same type
(np.array([int8_min, int8_max], dtype=np.int8), pa.int8(),
[int8_min, int8_max]),
(np.array([int16_min, int16_max], dtype=np.int16), pa.int16(),
[int16_min, int16_max]),
(np.array([int32_min, int32_max], dtype=np.int32), pa.int32(),
[int32_min, int32_max]),
(np.array([int64_min, int64_max], dtype=np.int64), pa.int64(),
[int64_min, int64_max]),
- # Int min/max → float64 (exact for smaller types)
+ # Int min/max -> float64 (exact for smaller types)
(
np.array([int8_min, int8_max], dtype=np.int8),
pa.float64(),
@@ -893,7 +893,7 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
self._run_coercion_tests_with_values(boundary_cases)
# ==== 4.3 Special Float Values ====
- # NaN/Inf → float (type check only, NaN equality is tricky)
+ # NaN/Inf -> float (type check only, NaN equality is tricky)
for data, target in [
(np.array([nan, 1.0, nan], dtype=np.float64), pa.float64()),
(np.array([inf, neg_inf], dtype=np.float64), pa.float64()),
@@ -1015,7 +1015,7 @@ class PyArrowTypeCoercionTests(unittest.TestCase):
# (data, target_type, expected_values)
struct_cases = [
- # Field type coercion (int → float)
+ # Field type coercion (int -> float)
(
[{"x": 1, "y": 2}, {"x": 3, "y": 4}],
pa.struct([("x", pa.float64()), ("y", pa.float64())]),
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]