This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 6baddd08866d [SPARK-50921][ML][PYTHON][CONNECT] Support MultilayerPerceptronClassifier on Connect
6baddd08866d is described below
commit 6baddd08866da755e7e2875e04eb71d608f00df4
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Mon Jan 27 16:21:13 2025 +0800
[SPARK-50921][ML][PYTHON][CONNECT] Support MultilayerPerceptronClassifier on Connect
### What changes were proposed in this pull request?
Support MultilayerPerceptronClassifier on Connect
### Why are the changes needed?
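For feature parity with classic (non-Connect) Spark ML: this adds MultilayerPerceptronClassifier support on Connect.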
### Does this PR introduce _any_ user-facing change?
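Yes: MultilayerPerceptronClassifier and MultilayerPerceptronClassificationModel can now be used on Spark Connect.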
### How was this patch tested?
Added `test_mlp` to `python/pyspark/ml/tests/test_classification.py`.
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #49686 from zhengruifeng/ml_connect_mlp.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
.../services/org.apache.spark.ml.Estimator | 1 +
.../services/org.apache.spark.ml.Transformer | 1 +
.../MultilayerPerceptronClassifier.scala | 4 +-
python/pyspark/ml/tests/test_classification.py | 98 ++++++++++++++++++++++
.../org/apache/spark/sql/connect/ml/MLUtils.scala | 1 +
5 files changed, 104 insertions(+), 1 deletion(-)
diff --git a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator
index 97526bf1a0c0..ef56903de5e0 100644
--- a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator
+++ b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator
@@ -22,6 +22,7 @@
org.apache.spark.ml.classification.NaiveBayes
org.apache.spark.ml.classification.LinearSVC
org.apache.spark.ml.classification.LogisticRegression
+org.apache.spark.ml.classification.MultilayerPerceptronClassifier
org.apache.spark.ml.classification.DecisionTreeClassifier
org.apache.spark.ml.classification.RandomForestClassifier
org.apache.spark.ml.classification.GBTClassifier
diff --git a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
index c6faa54c147b..c973a9899878 100644
--- a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
+++ b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
@@ -38,6 +38,7 @@ org.apache.spark.ml.feature.HashingTF
org.apache.spark.ml.classification.NaiveBayesModel
org.apache.spark.ml.classification.LinearSVCModel
org.apache.spark.ml.classification.LogisticRegressionModel
+org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel
org.apache.spark.ml.classification.DecisionTreeClassificationModel
org.apache.spark.ml.classification.RandomForestClassificationModel
org.apache.spark.ml.classification.GBTClassificationModel
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala
index 106282b9dc3a..a09bf7a7aa36 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala
@@ -22,7 +22,7 @@ import org.apache.hadoop.fs.Path
import org.apache.spark.annotation.Since
import org.apache.spark.ml.ann.{FeedForwardTopology, FeedForwardTrainer}
import org.apache.spark.ml.feature.OneHotEncoderModel
-import org.apache.spark.ml.linalg.Vector
+import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util._
@@ -283,6 +283,8 @@ class MultilayerPerceptronClassificationModel private[ml] (
with MultilayerPerceptronParams with Serializable with MLWritable
with HasTrainingSummary[MultilayerPerceptronClassificationTrainingSummary]{
+ private[ml] def this() = this(Identifiable.randomUID("mlpc"), Vectors.empty)
+
@Since("1.6.0")
override lazy val numFeatures: Int = $(layers).head
diff --git a/python/pyspark/ml/tests/test_classification.py b/python/pyspark/ml/tests/test_classification.py
index 0aa2ecb5ca84..bea622db9079 100644
--- a/python/pyspark/ml/tests/test_classification.py
+++ b/python/pyspark/ml/tests/test_classification.py
@@ -44,6 +44,10 @@ from pyspark.ml.classification import (
BinaryRandomForestClassificationTrainingSummary,
GBTClassifier,
GBTClassificationModel,
+ MultilayerPerceptronClassifier,
+ MultilayerPerceptronClassificationModel,
+ MultilayerPerceptronClassificationSummary,
+ MultilayerPerceptronClassificationTrainingSummary,
)
@@ -760,6 +764,100 @@ class ClassificationTestsMixin:
self.assertEqual(str(model), str(model2))
self.assertEqual(model.toDebugString, model2.toDebugString)
+ def test_mlp(self):
+ df = (
+ self.spark.createDataFrame(
+ [
+ (1.0, 1.0, Vectors.dense(0.0, 5.0)),
+ (0.0, 2.0, Vectors.dense(1.0, 2.0)),
+ (1.0, 3.0, Vectors.dense(2.0, 1.0)),
+ (0.0, 4.0, Vectors.dense(3.0, 3.0)),
+ ],
+ ["label", "weight", "features"],
+ )
+ .coalesce(1)
+ .sortWithinPartitions("weight")
+ )
+
+ mlp = MultilayerPerceptronClassifier(
+ layers=[2, 2],
+ maxIter=1,
+ seed=1,
+ )
+ self.assertEqual(mlp.getLayers(), [2, 2])
+ self.assertEqual(mlp.getMaxIter(), 1)
+ self.assertEqual(mlp.getSeed(), 1)
+
+ model = mlp.fit(df)
+ self.assertEqual(mlp.uid, model.uid)
+ self.assertEqual(model.numClasses, 2)
+ self.assertEqual(model.numFeatures, 2)
+ self.assertTrue(
+ np.allclose(
+ model.weights.toArray(),
+ [
+ 0.43562736294302623,
+ 0.364580202422002,
+ -1.4112729385978997,
+ -1.2643591053546168,
+ 1.1512595235805883,
+ 0.7857317704872436,
+ ],
+ atol=1e-4,
+ ),
+ model.weights,
+ )
+
+ vec = Vectors.dense(0.0, 5.0)
+ pred = model.predict(vec)
+ self.assertEqual(pred, 1.0)
+ pred = model.predictRaw(vec)
+ self.assertTrue(
+ np.allclose(pred.toArray(), [-5.905105169408911, -5.53606375628584], atol=1e-4), pred
+ )
+ pred = model.predictProbability(vec)
+ self.assertTrue(
+ np.allclose(pred.toArray(), [0.4087726702431394, 0.5912273297568605], atol=1e-4), pred
+ )
+
+ output = model.transform(df)
+ expected_cols = [
+ "label",
+ "weight",
+ "features",
+ "rawPrediction",
+ "probability",
+ "prediction",
+ ]
+ self.assertEqual(output.columns, expected_cols)
+ self.assertEqual(output.count(), 4)
+
+ # model summary
+ self.assertTrue(model.hasSummary)
+ summary = model.summary()
+ self.assertIsInstance(summary, MultilayerPerceptronClassificationSummary)
+ self.assertIsInstance(summary, MultilayerPerceptronClassificationTrainingSummary)
+ self.assertEqual(summary.labels, [0.0, 1.0])
+ self.assertEqual(summary.accuracy, 0.75)
+ self.assertEqual(summary.predictions.columns, expected_cols)
+
+ summary2 = model.evaluate(df)
+ self.assertIsInstance(summary2, MultilayerPerceptronClassificationSummary)
+ self.assertNotIsInstance(summary2, MultilayerPerceptronClassificationTrainingSummary)
+ self.assertEqual(summary2.labels, [0.0, 1.0])
+ self.assertEqual(summary2.accuracy, 0.75)
+ self.assertEqual(summary2.predictions.columns, expected_cols)
+
+ # Model save & load
+ with tempfile.TemporaryDirectory(prefix="mlpc") as d:
+ mlp.write().overwrite().save(d)
+ mlp2 = MultilayerPerceptronClassifier.load(d)
+ self.assertEqual(str(mlp), str(mlp2))
+
+ model.write().overwrite().save(d)
+ model2 = MultilayerPerceptronClassificationModel.load(d)
+ self.assertEqual(str(model), str(model2))
+
class ClassificationTests(ClassificationTestsMixin, unittest.TestCase):
def setUp(self) -> None:
diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala
index 75aed57ae2d2..86655c7045bf 100644
--- a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala
+++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala
@@ -530,6 +530,7 @@ private[ml] object MLUtils {
Set("intercept", "coefficients", "interceptVector", "coefficientMatrix", "evaluate")),
(classOf[LogisticRegressionSummary], Set("probabilityCol", "featuresCol")),
(classOf[BinaryLogisticRegressionSummary], Set("scoreCol")),
+ (classOf[MultilayerPerceptronClassificationModel], Set("weights", "evaluate")),
// Regression Models
(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]