This is an automated email from the ASF dual-hosted git repository.
ueshin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 9e6b25718688 [SPARK-55700][PS] Fix handling integer keys on Series
with non-integer index
9e6b25718688 is described below
commit 9e6b2571868812cc960f79ac5ced6eda6b89266f
Author: Takuya Ueshin <[email protected]>
AuthorDate: Thu Feb 26 11:57:25 2026 -0800
[SPARK-55700][PS] Fix handling integer keys on Series with non-integer index
### What changes were proposed in this pull request?
Fixes handling integer keys on Series with non-integer index.
### Why are the changes needed?
With pandas 3, integer keys on a Series with a non-integer index are no
longer treated as positional indexes.
For example:
```py
>>> dates = pd.date_range("20130101", periods=6)
>>> pdf = pd.DataFrame(np.random.randn(6, 4), index=dates,
columns=list("ABCD"))
```
- pandas 2
```py
>>> pdf.A[4]
<stdin>:1: FutureWarning: Series.__getitem__ treating keys as positions is
deprecated. In a future version, integer keys will always be treated as labels
(consistent with DataFrame behavior). To access a value by position, use
`ser.iloc[pos]`
np.float64(-1.2836101861392761)
```
- pandas 3
```py
>>> pdf.A[4]
Traceback (most recent call last):
...
KeyError: 4
```
### Does this PR introduce _any_ user-facing change?
Yes, it will behave more like pandas 3.
### How was this patch tested?
Updated the related tests.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #54499 from ueshin/issues/SPARK-55700/treating_keys_as_positions.
Authored-by: Takuya Ueshin <[email protected]>
Signed-off-by: Takuya Ueshin <[email protected]>
---
python/pyspark/pandas/series.py | 27 +++++++++++++---------
.../pandas/tests/indexes/test_indexing_adv.py | 9 ++++++--
2 files changed, 23 insertions(+), 13 deletions(-)
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index 1015ff4db4d9..f1fb0069fa74 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -7354,19 +7354,24 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
)
def __getitem__(self, key: Any) -> Any:
- if type(key) == int and not isinstance(self.index.spark.data_type,
(IntegerType, LongType)):
- warnings.warn(
- "Series.__getitem__ treating keys as positions is deprecated. "
- "In a future version, integer keys will always be treated as
labels "
- "(consistent with DataFrame behavior). "
- "To access a value by position, use `ser.iloc[pos]`",
- FutureWarning,
+ if LooseVersion(pd.__version__) < "3.0.0":
+ treating_keys_as_positions = type(key) == int and not isinstance(
+ self.index.spark.data_type, (IntegerType, LongType)
)
+ if treating_keys_as_positions:
+ warnings.warn(
+ "Series.__getitem__ treating keys as positions is
deprecated. "
+ "In a future version, integer keys will always be treated
as labels "
+ "(consistent with DataFrame behavior). "
+ "To access a value by position, use `ser.iloc[pos]`",
+ FutureWarning,
+ )
+ else:
+ treating_keys_as_positions = False
try:
- if (isinstance(key, slice) and any(type(n) == int for n in
[key.start, key.stop])) or (
- type(key) == int
- and not isinstance(self.index.spark.data_type, (IntegerType,
LongType))
- ):
+ if (
+ isinstance(key, slice) and any(type(n) == int for n in
[key.start, key.stop])
+ ) or treating_keys_as_positions:
# Seems like pandas Series always uses int as positional
search when slicing
# with ints, searches based on index values when the value is
int.
return self.iloc[key]
diff --git a/python/pyspark/pandas/tests/indexes/test_indexing_adv.py
b/python/pyspark/pandas/tests/indexes/test_indexing_adv.py
index fdebdcbd0002..90c9d5dbb0ef 100644
--- a/python/pyspark/pandas/tests/indexes/test_indexing_adv.py
+++ b/python/pyspark/pandas/tests/indexes/test_indexing_adv.py
@@ -22,6 +22,7 @@ import numpy as np
import pandas as pd
from pyspark import pandas as ps
+from pyspark.loose_version import LooseVersion
from pyspark.pandas.exceptions import SparkPandasNotImplementedError
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
@@ -339,8 +340,12 @@ class IndexingAdvMixin:
self.assert_eq(psdf[10:3], pdf[10:3], almost=True)
# Index loc search
- self.assert_eq(psdf.A[4], pdf.A[4])
- self.assert_eq(psdf.A[3], pdf.A[3])
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(psdf.A[4], pdf.A[4])
+ self.assert_eq(psdf.A[3], pdf.A[3])
+ else:
+ with self.assertRaises(KeyError):
+ psdf.A[4]
# Positional iloc search
self.assert_eq(psdf.A[:4], pdf.A[:4], almost=True)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]