This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 7c12ff62ac03 [SPARK-50988][ML][PYTHON][CONNECT] Fix uid inconsistencies for estimator and model
7c12ff62ac03 is described below
commit 7c12ff62ac0348a852f32a191b9f623ed7379a08
Author: Bobby Wang <[email protected]>
AuthorDate: Sun Jan 26 17:53:32 2025 +0800
[SPARK-50988][ML][PYTHON][CONNECT] Fix uid inconsistencies for estimator and model
### What changes were proposed in this pull request?
The uid of a model trained by an estimator does not match the uid of that estimator, which is a bug. This PR fixes the issue so that the fitted model inherits its parent estimator's uid.
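In user-facing terms, fitting an estimator over Spark Connect now yields a model whose `uid` matches its parent estimator, as classic (non-Connect) PySpark ML already guarantees. A minimal sketch of the expected behavior; the toy DataFrame and parameter values below are illustrative assumptions, not part of this change:

```python
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

# Toy dataset purely for illustration; column names and values are assumptions.
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1.0, Vectors.dense(1.0, 2.0)), (0.0, Vectors.dense(2.0, 1.0))],
    ["label", "features"],
)

lor = LogisticRegression(maxIter=1)
model = lor.fit(df)

# With this fix the fitted model inherits its parent estimator's uid.
assert model.uid == lor.uid
```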
### Why are the changes needed?
To fix the bug so that an estimator and the model it produces share the same uid.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
The CI passes
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #49674 from wbo4958/uid.
Authored-by: Bobby Wang <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/ml/tests/test_als.py | 14 +++++++-------
python/pyspark/ml/tests/test_classification.py | 10 ++++++++++
python/pyspark/ml/tests/test_clustering.py | 3 +++
python/pyspark/ml/tests/test_feature.py | 19 +++++++++++++++++++
python/pyspark/ml/tests/test_fpm.py | 2 +-
python/pyspark/ml/tests/test_regression.py | 4 ++++
python/pyspark/ml/util.py | 4 +++-
python/pyspark/ml/wrapper.py | 2 +-
8 files changed, 48 insertions(+), 10 deletions(-)
diff --git a/python/pyspark/ml/tests/test_als.py b/python/pyspark/ml/tests/test_als.py
index bd6cd1cb212f..299d56c70325 100644
--- a/python/pyspark/ml/tests/test_als.py
+++ b/python/pyspark/ml/tests/test_als.py
@@ -48,13 +48,19 @@ class ALSTestsMixin:
self.assertEqual(als.getSeed(), 1)
self.assertEqual(als.getMaxIter(), 2)
+ model = als.fit(df)
+
# Estimator save & load
with tempfile.TemporaryDirectory(prefix="ALS") as d:
als.write().overwrite().save(d)
als2 = ALS.load(d)
self.assertEqual(str(als), str(als2))
- model = als.fit(df)
+ model.write().overwrite().save(d)
+ model2 = ALSModel.load(d)
+ self.assertEqual(str(model), str(model2))
+
+ self.assertEqual(als.uid, model.uid)
self.assertEqual(model.rank, 10)
self.assertEqual(model.itemFactors.columns, ["id", "features"])
@@ -84,12 +90,6 @@ class ALSTestsMixin:
self.assertEqual(output4.columns, ["item", "recommendations"])
self.assertEqual(output4.count(), 3)
- # Model save & load
- with tempfile.TemporaryDirectory(prefix="als_model") as d:
- model.write().overwrite().save(d)
- model2 = ALSModel.load(d)
- self.assertEqual(str(model), str(model2))
-
def test_ambiguous_column(self):
data = self.spark.createDataFrame(
[[1, 15, 1], [1, 2, 2], [2, 3, 4], [2, 2, 5]],
diff --git a/python/pyspark/ml/tests/test_classification.py b/python/pyspark/ml/tests/test_classification.py
index 8ee2dcac5c12..0aa2ecb5ca84 100644
--- a/python/pyspark/ml/tests/test_classification.py
+++ b/python/pyspark/ml/tests/test_classification.py
@@ -64,6 +64,7 @@ class ClassificationTestsMixin:
self.assertEqual(nb.getWeightCol(), "weight")
model = nb.fit(df)
+ self.assertEqual(model.uid, nb.uid)
self.assertEqual(model.numClasses, 2)
self.assertEqual(model.numFeatures, 2)
self.assertTrue(
@@ -126,6 +127,7 @@ class ClassificationTestsMixin:
upperBoundsOnIntercepts=Vectors.dense(0.0),
)
lor_model = lor.fit(df)
+ self.assertEqual(lor.uid, lor_model.uid)
def check_result(model: LogisticRegressionModel) -> None:
self.assertTrue(
@@ -159,6 +161,7 @@ class ClassificationTestsMixin:
upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0),
)
lor_model = lor.fit(df)
+ self.assertEqual(lor.uid, lor_model.uid)
def check_result(model: LogisticRegressionModel) -> None:
expected = [
@@ -196,6 +199,7 @@ class ClassificationTestsMixin:
lor = LogisticRegression(weightCol="weight")
model = lor.fit(df)
+ self.assertEqual(lor.uid, model.uid)
# status changes 1
for t in [0.0, 0.1, 0.2, 0.5, 1.0]:
@@ -224,6 +228,7 @@ class ClassificationTestsMixin:
)
lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
model = lr.fit(df)
+ self.assertEqual(lr.uid, model.uid)
self.assertTrue(model.hasSummary)
s = model.summary
# test that api is callable and returns expected types
@@ -385,6 +390,7 @@ class ClassificationTestsMixin:
self.assertEqual(svc.getRegParam(), 1.0)
model = svc.fit(df)
+ self.assertEqual(svc.uid, model.uid)
self.assertEqual(model.numClasses, 2)
self.assertEqual(model.numFeatures, 2)
self.assertTrue(np.allclose(model.intercept, 0.025877458475338313, atol=1e-4))
@@ -464,6 +470,7 @@ class ClassificationTestsMixin:
self.assertEqual(dt.getLeafCol(), "leaf")
model = dt.fit(df)
+ self.assertEqual(dt.uid, model.uid)
self.assertEqual(model.numClasses, 2)
self.assertEqual(model.numFeatures, 2)
self.assertEqual(model.depth, 2)
@@ -531,6 +538,7 @@ class ClassificationTestsMixin:
self.assertEqual(gbt.getLeafCol(), "leaf")
model = gbt.fit(df)
+ self.assertEqual(gbt.uid, model.uid)
self.assertEqual(model.numClasses, 2)
self.assertEqual(model.numFeatures, 2)
# TODO(SPARK-50843): Support access submodel in TreeEnsembleModel
@@ -609,6 +617,7 @@ class ClassificationTestsMixin:
self.assertEqual(rf.getLeafCol(), "leaf")
model = rf.fit(df)
+ self.assertEqual(rf.uid, model.uid)
self.assertEqual(model.numClasses, 2)
self.assertEqual(model.numFeatures, 2)
# TODO(SPARK-50843): Support access submodel in TreeEnsembleModel
@@ -695,6 +704,7 @@ class ClassificationTestsMixin:
self.assertEqual(rf.getLeafCol(), "leaf")
model = rf.fit(df)
+ self.assertEqual(rf.uid, model.uid)
self.assertEqual(model.numClasses, 3)
self.assertEqual(model.numFeatures, 2)
# TODO(SPARK-50843): Support access submodel in TreeEnsembleModel
diff --git a/python/pyspark/ml/tests/test_clustering.py b/python/pyspark/ml/tests/test_clustering.py
index a6685914eab8..e6013d10fa8e 100644
--- a/python/pyspark/ml/tests/test_clustering.py
+++ b/python/pyspark/ml/tests/test_clustering.py
@@ -64,6 +64,7 @@ class ClusteringTestsMixin:
self.assertEqual(km.getWeightCol(), "weight")
model = km.fit(df)
+ self.assertEqual(km.uid, model.uid)
# TODO: support KMeansModel.numFeatures in Python
# self.assertEqual(model.numFeatures, 2)
@@ -132,6 +133,7 @@ class ClusteringTestsMixin:
self.assertEqual(bkm.getWeightCol(), "weight")
model = bkm.fit(df)
+ self.assertEqual(bkm.uid, model.uid)
# TODO: support KMeansModel.numFeatures in Python
# self.assertEqual(model.numFeatures, 2)
@@ -203,6 +205,7 @@ class ClusteringTestsMixin:
self.assertEqual(gmm.getSeed(), 1)
model = gmm.fit(df)
+ self.assertEqual(gmm.uid, model.uid)
# TODO: support GMM.numFeatures in Python
# self.assertEqual(model.numFeatures, 2)
self.assertEqual(len(model.weights), 2)
diff --git a/python/pyspark/ml/tests/test_feature.py b/python/pyspark/ml/tests/test_feature.py
index 1424ed4947e2..d7bd5ef4a1fc 100644
--- a/python/pyspark/ml/tests/test_feature.py
+++ b/python/pyspark/ml/tests/test_feature.py
@@ -123,6 +123,7 @@ class FeatureTestsMixin:
# single input
si = StringIndexer(inputCol="label1", outputCol="index1")
model = si.fit(df.select("label1"))
+ self.assertEqual(si.uid, model.uid)
# read/write
with tempfile.TemporaryDirectory(prefix="string_indexer") as tmp_dir:
@@ -183,6 +184,7 @@ class FeatureTestsMixin:
pca = PCA(k=2, inputCol="features", outputCol="pca_features")
model = pca.fit(df)
+ self.assertEqual(pca.uid, model.uid)
self.assertEqual(model.getK(), 2)
self.assertTrue(
np.allclose(model.explainedVariance.toArray(), [0.79439, 0.20560], atol=1e-4)
@@ -272,6 +274,7 @@ class FeatureTestsMixin:
self.assertEqual(scaler.getOutputCol(), "scaled")
model = scaler.fit(df)
+ self.assertEqual(scaler.uid, model.uid)
self.assertTrue(np.allclose(model.mean.toArray(), [1.66666667], atol=1e-4))
self.assertTrue(np.allclose(model.std.toArray(), [1.52752523], atol=1e-4))
@@ -311,6 +314,7 @@ class FeatureTestsMixin:
self.assertEqual(scaler.getOutputCol(), "scaled")
model = scaler.fit(df)
+ self.assertEqual(scaler.uid, model.uid)
self.assertTrue(np.allclose(model.maxAbs.toArray(), [3.0], atol=1e-4))
output = model.transform(df)
@@ -349,6 +353,7 @@ class FeatureTestsMixin:
self.assertEqual(scaler.getOutputCol(), "scaled")
model = scaler.fit(df)
+ self.assertEqual(scaler.uid, model.uid)
self.assertTrue(np.allclose(model.originalMax.toArray(), [3.0], atol=1e-4))
self.assertTrue(np.allclose(model.originalMin.toArray(), [0.0], atol=1e-4))
@@ -388,6 +393,7 @@ class FeatureTestsMixin:
self.assertEqual(scaler.getOutputCol(), "scaled")
model = scaler.fit(df)
+ self.assertEqual(scaler.uid, model.uid)
self.assertTrue(np.allclose(model.range.toArray(), [3.0], atol=1e-4))
self.assertTrue(np.allclose(model.median.toArray(), [2.0], atol=1e-4))
@@ -422,6 +428,7 @@ class FeatureTestsMixin:
self.assertEqual(selector.getOutputCol(), "selectedFeatures")
model = selector.fit(df)
+ self.assertEqual(selector.uid, model.uid)
self.assertEqual(model.selectedFeatures, [2])
output = model.transform(df)
@@ -456,6 +463,7 @@ class FeatureTestsMixin:
self.assertEqual(selector.getSelectionThreshold(), 1)
model = selector.fit(df)
+ self.assertEqual(selector.uid, model.uid)
self.assertEqual(model.selectedFeatures, [3])
output = model.transform(df)
@@ -487,6 +495,7 @@ class FeatureTestsMixin:
self.assertEqual(selector.getOutputCol(), "selectedFeatures")
model = selector.fit(df)
+ self.assertEqual(selector.uid, model.uid)
self.assertEqual(model.selectedFeatures, [2])
output = model.transform(df)
@@ -516,6 +525,7 @@ class FeatureTestsMixin:
self.assertEqual(w2v.getMaxIter(), 1)
model = w2v.fit(df)
+ self.assertEqual(w2v.uid, model.uid)
self.assertEqual(model.getVectors().columns, ["word", "vector"])
self.assertEqual(model.getVectors().count(), 3)
@@ -567,6 +577,7 @@ class FeatureTestsMixin:
self.assertEqual(imputer.getOutputCols(), ["out_a", "out_b"])
model = imputer.fit(df)
+ self.assertEqual(imputer.uid, model.uid)
self.assertEqual(model.surrogateDF.columns, ["a", "b"])
self.assertEqual(model.surrogateDF.count(), 1)
self.assertEqual(list(model.surrogateDF.head()), [3.0, 4.0])
@@ -598,6 +609,7 @@ class FeatureTestsMixin:
self.assertEqual(cv.getOutputCol(), "vectors")
model = cv.fit(df)
+ self.assertEqual(cv.uid, model.uid)
self.assertEqual(sorted(model.vocabulary), ["a", "b", "c"])
output = model.transform(df)
@@ -624,6 +636,7 @@ class FeatureTestsMixin:
self.assertEqual(encoder.getOutputCols(), ["output"])
model = encoder.fit(df)
+ self.assertEqual(encoder.uid, model.uid)
self.assertEqual(model.categorySizes, [3])
output = model.transform(df)
@@ -900,6 +913,7 @@ class FeatureTestsMixin:
self.assertListEqual(idf.params, [idf.inputCol, idf.minDocFreq, idf.outputCol])
model = idf.fit(df, {idf.outputCol: "idf"})
+ self.assertEqual(idf.uid, model.uid)
# self.assertEqual(
# model.uid, idf.uid, "Model should inherit the UID from its parent estimator."
# )
@@ -1012,6 +1026,7 @@ class FeatureTestsMixin:
)
cv = CountVectorizer(binary=True, inputCol="words", outputCol="features")
model = cv.fit(dataset)
+ self.assertEqual(cv.uid, model.uid)
transformedList = model.transform(dataset).select("features", "expected").collect()
@@ -1047,6 +1062,8 @@ class FeatureTestsMixin:
)
cv = CountVectorizer(inputCol="words", outputCol="features")
model1 = cv.setMaxDF(3).fit(dataset)
+ self.assertEqual(cv.uid, model1.uid)
+
self.assertEqual(model1.vocabulary, ["b", "c", "d"])
transformedList1 = model1.transform(dataset).select("features", "expected").collect()
@@ -1119,6 +1136,8 @@ class FeatureTestsMixin:
# Does not index label by default since it's numeric type.
rf = RFormula(formula="y ~ x + s")
model = rf.fit(df)
+ self.assertEqual(rf.uid, model.uid)
+
transformedDF = model.transform(df)
self.assertEqual(transformedDF.head().label, 1.0)
# Force to index label.
diff --git a/python/pyspark/ml/tests/test_fpm.py b/python/pyspark/ml/tests/test_fpm.py
index 61194cc1d0d2..cc8ead7127d6 100644
--- a/python/pyspark/ml/tests/test_fpm.py
+++ b/python/pyspark/ml/tests/test_fpm.py
@@ -47,7 +47,7 @@ class FPMTestsMixin:
self.assertEqual(fp.getNumPartitions(), 1)
model = fp.fit(df)
-
+ self.assertEqual(fp.uid, model.uid)
self.assertEqual(model.freqItemsets.columns, ["items", "freq"])
self.assertEqual(model.freqItemsets.count(), 54)
diff --git a/python/pyspark/ml/tests/test_regression.py b/python/pyspark/ml/tests/test_regression.py
index 16a94a6e0f67..ed357127d983 100644
--- a/python/pyspark/ml/tests/test_regression.py
+++ b/python/pyspark/ml/tests/test_regression.py
@@ -71,6 +71,7 @@ class RegressionTestsMixin:
self.assertEqual(lr.getWeightCol(), "weight")
model = lr.fit(df)
+ self.assertEqual(lr.uid, model.uid)
self.assertEqual(model.numFeatures, 2)
self.assertTrue(np.allclose(model.scale, 1.0, atol=1e-4))
self.assertTrue(np.allclose(model.intercept, -0.35, atol=1e-4))
@@ -280,6 +281,7 @@ class RegressionTestsMixin:
self.assertEqual(dt.getLeafCol(), "leaf")
model = dt.fit(df)
+ self.assertEqual(dt.uid, model.uid)
self.assertEqual(model.numFeatures, 2)
self.assertEqual(model.depth, 2)
self.assertEqual(model.numNodes, 5)
@@ -337,6 +339,7 @@ class RegressionTestsMixin:
self.assertEqual(gbt.getLeafCol(), "leaf")
model = gbt.fit(df)
+ self.assertEqual(gbt.uid, model.uid)
self.assertEqual(model.numFeatures, 2)
# TODO(SPARK-50843): Support access submodel in TreeEnsembleModel
# model.trees
@@ -412,6 +415,7 @@ class RegressionTestsMixin:
self.assertEqual(rf.getLeafCol(), "leaf")
model = rf.fit(df)
+ self.assertEqual(rf.uid, model.uid)
self.assertEqual(model.numFeatures, 2)
# TODO(SPARK-50843): Support access submodel in TreeEnsembleModel
# model.trees
diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py
index 309f8452ac79..34bee0599a02 100644
--- a/python/pyspark/ml/util.py
+++ b/python/pyspark/ml/util.py
@@ -135,7 +135,9 @@ def try_remote_fit(f: FuncT) -> FuncT:
(_, properties, _) = client.execute_command(command)
model_info = deserialize(properties)
client.add_ml_cache(model_info.obj_ref.id)
- return model_info.obj_ref.id
+ model = self._create_model(model_info.obj_ref.id)
+ model._resetUid(self.uid)
+ return self._copyValues(model)
else:
return f(self, dataset)
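The hunk above makes the Connect client wrap the cached remote model reference in a client-side model object, reset that model's uid to the estimator's, and copy the estimator's params onto it. A rough standalone sketch of that uid/param-propagation pattern follows; `RemoteEstimator`, `RemoteModel`, and the `ref_id` argument are hypothetical stand-ins for illustration, not PySpark APIs:

```python
# Hypothetical stand-ins illustrating the propagation done in try_remote_fit;
# these classes are not part of PySpark.
class RemoteModel:
    def __init__(self, ref_id):
        self.ref_id = ref_id   # id of the model object cached on the server
        self.uid = None
        self.params = {}

    def _resetUid(self, uid):
        self.uid = uid
        return self


class RemoteEstimator:
    def __init__(self, uid, **params):
        self.uid = uid
        self.params = dict(params)

    def _create_model(self, ref_id):
        return RemoteModel(ref_id)

    def _copyValues(self, model):
        model.params.update(self.params)   # analogous to Params._copyValues
        return model

    def fit(self, ref_id):
        model = self._create_model(ref_id)
        model._resetUid(self.uid)          # model now shares the estimator's uid
        return self._copyValues(model)


est = RemoteEstimator("ALS_4f2a9c", maxIter=2)   # hypothetical uid and param
model = est.fit("obj-ref-123")
assert model.uid == est.uid
assert model.params["maxIter"] == 2
```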
diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py
index 40670fdb84a2..f88045e718a5 100644
--- a/python/pyspark/ml/wrapper.py
+++ b/python/pyspark/ml/wrapper.py
@@ -387,7 +387,6 @@ class JavaEstimator(JavaParams, Estimator[JM], metaclass=ABCMeta):
"""
raise NotImplementedError()
- @try_remote_fit
def _fit_java(self, dataset: DataFrame) -> "JavaObject":
"""
Fits a Java model to the input dataset.
@@ -407,6 +406,7 @@ class JavaEstimator(JavaParams, Estimator[JM], metaclass=ABCMeta):
self._transfer_params_to_java()
return self._java_obj.fit(dataset._jdf)
+ @try_remote_fit
def _fit(self, dataset: DataFrame) -> JM:
java_model = self._fit_java(dataset)
model = self._create_model(java_model)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]