(spark) branch master updated: [SPARK-50921][ML][PYTHON][CONNECT] Support MultilayerPerceptronClassifier on Connect

ruifengz Mon, 27 Jan 2025 00:21:38 -0800

This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 6baddd08866d [SPARK-50921][ML][PYTHON][CONNECT] Support MultilayerPerceptronClassifier on Connect
6baddd08866d is described below

commit 6baddd08866da755e7e2875e04eb71d608f00df4
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Mon Jan 27 16:21:13 2025 +0800

    [SPARK-50921][ML][PYTHON][CONNECT] Support MultilayerPerceptronClassifier on Connect
    
    ### What changes were proposed in this pull request?
    Support MultilayerPerceptronClassifier on Connect
    
    ### Why are the changes needed?
    feature parity
    
    ### Does this PR introduce _any_ user-facing change?
    yes
    
    ### How was this patch tested?
    added test
    
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #49686 from zhengruifeng/ml_connect_mlp.
    
    Authored-by: Ruifeng Zheng <[email protected]>
    Signed-off-by: Ruifeng Zheng <[email protected]>
---
 .../services/org.apache.spark.ml.Estimator         |  1 +
 .../services/org.apache.spark.ml.Transformer       |  1 +
 .../MultilayerPerceptronClassifier.scala           |  4 +-
 python/pyspark/ml/tests/test_classification.py     | 98 ++++++++++++++++++++++
 .../org/apache/spark/sql/connect/ml/MLUtils.scala  |  1 +
 5 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator
index 97526bf1a0c0..ef56903de5e0 100644
--- a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator
+++ b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator
@@ -22,6 +22,7 @@
 org.apache.spark.ml.classification.NaiveBayes
 org.apache.spark.ml.classification.LinearSVC
 org.apache.spark.ml.classification.LogisticRegression
+org.apache.spark.ml.classification.MultilayerPerceptronClassifier
 org.apache.spark.ml.classification.DecisionTreeClassifier
 org.apache.spark.ml.classification.RandomForestClassifier
 org.apache.spark.ml.classification.GBTClassifier
diff --git a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
index c6faa54c147b..c973a9899878 100644
--- a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
+++ b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
@@ -38,6 +38,7 @@ org.apache.spark.ml.feature.HashingTF
 org.apache.spark.ml.classification.NaiveBayesModel
 org.apache.spark.ml.classification.LinearSVCModel
 org.apache.spark.ml.classification.LogisticRegressionModel
+org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel
 org.apache.spark.ml.classification.DecisionTreeClassificationModel
 org.apache.spark.ml.classification.RandomForestClassificationModel
 org.apache.spark.ml.classification.GBTClassificationModel
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala
index 106282b9dc3a..a09bf7a7aa36 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala
@@ -22,7 +22,7 @@ import org.apache.hadoop.fs.Path
 import org.apache.spark.annotation.Since
 import org.apache.spark.ml.ann.{FeedForwardTopology, FeedForwardTrainer}
 import org.apache.spark.ml.feature.OneHotEncoderModel
-import org.apache.spark.ml.linalg.Vector
+import org.apache.spark.ml.linalg.{Vector, Vectors}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util._
@@ -283,6 +283,8 @@ class MultilayerPerceptronClassificationModel private[ml] (
   with MultilayerPerceptronParams with Serializable with MLWritable
   with HasTrainingSummary[MultilayerPerceptronClassificationTrainingSummary]{
 
+  private[ml] def this() = this(Identifiable.randomUID("mlpc"), Vectors.empty)
+
   @Since("1.6.0")
   override lazy val numFeatures: Int = $(layers).head
 
diff --git a/python/pyspark/ml/tests/test_classification.py b/python/pyspark/ml/tests/test_classification.py
index 0aa2ecb5ca84..bea622db9079 100644
--- a/python/pyspark/ml/tests/test_classification.py
+++ b/python/pyspark/ml/tests/test_classification.py
@@ -44,6 +44,10 @@ from pyspark.ml.classification import (
     BinaryRandomForestClassificationTrainingSummary,
     GBTClassifier,
     GBTClassificationModel,
+    MultilayerPerceptronClassifier,
+    MultilayerPerceptronClassificationModel,
+    MultilayerPerceptronClassificationSummary,
+    MultilayerPerceptronClassificationTrainingSummary,
 )
 
 
@@ -760,6 +764,100 @@ class ClassificationTestsMixin:
             self.assertEqual(str(model), str(model2))
             self.assertEqual(model.toDebugString, model2.toDebugString)
 
+    def test_mlp(self):
+        df = (
+            self.spark.createDataFrame(
+                [
+                    (1.0, 1.0, Vectors.dense(0.0, 5.0)),
+                    (0.0, 2.0, Vectors.dense(1.0, 2.0)),
+                    (1.0, 3.0, Vectors.dense(2.0, 1.0)),
+                    (0.0, 4.0, Vectors.dense(3.0, 3.0)),
+                ],
+                ["label", "weight", "features"],
+            )
+            .coalesce(1)
+            .sortWithinPartitions("weight")
+        )
+
+        mlp = MultilayerPerceptronClassifier(
+            layers=[2, 2],
+            maxIter=1,
+            seed=1,
+        )
+        self.assertEqual(mlp.getLayers(), [2, 2])
+        self.assertEqual(mlp.getMaxIter(), 1)
+        self.assertEqual(mlp.getSeed(), 1)
+
+        model = mlp.fit(df)
+        self.assertEqual(mlp.uid, model.uid)
+        self.assertEqual(model.numClasses, 2)
+        self.assertEqual(model.numFeatures, 2)
+        self.assertTrue(
+            np.allclose(
+                model.weights.toArray(),
+                [
+                    0.43562736294302623,
+                    0.364580202422002,
+                    -1.4112729385978997,
+                    -1.2643591053546168,
+                    1.1512595235805883,
+                    0.7857317704872436,
+                ],
+                atol=1e-4,
+            ),
+            model.weights,
+        )
+
+        vec = Vectors.dense(0.0, 5.0)
+        pred = model.predict(vec)
+        self.assertEqual(pred, 1.0)
+        pred = model.predictRaw(vec)
+        self.assertTrue(
+            np.allclose(pred.toArray(), [-5.905105169408911, -5.53606375628584], atol=1e-4), pred
+        )
+        pred = model.predictProbability(vec)
+        self.assertTrue(
+            np.allclose(pred.toArray(), [0.4087726702431394, 0.5912273297568605], atol=1e-4), pred
+        )
+
+        output = model.transform(df)
+        expected_cols = [
+            "label",
+            "weight",
+            "features",
+            "rawPrediction",
+            "probability",
+            "prediction",
+        ]
+        self.assertEqual(output.columns, expected_cols)
+        self.assertEqual(output.count(), 4)
+
+        # model summary
+        self.assertTrue(model.hasSummary)
+        summary = model.summary()
+        self.assertIsInstance(summary, MultilayerPerceptronClassificationSummary)
+        self.assertIsInstance(summary, MultilayerPerceptronClassificationTrainingSummary)
+        self.assertEqual(summary.labels, [0.0, 1.0])
+        self.assertEqual(summary.accuracy, 0.75)
+        self.assertEqual(summary.predictions.columns, expected_cols)
+
+        summary2 = model.evaluate(df)
+        self.assertIsInstance(summary2, MultilayerPerceptronClassificationSummary)
+        self.assertNotIsInstance(summary2, MultilayerPerceptronClassificationTrainingSummary)
+        self.assertEqual(summary2.labels, [0.0, 1.0])
+        self.assertEqual(summary2.accuracy, 0.75)
+        self.assertEqual(summary2.predictions.columns, expected_cols)
+
+        # Model save & load
+        with tempfile.TemporaryDirectory(prefix="mlpc") as d:
+            mlp.write().overwrite().save(d)
+            mlp2 = MultilayerPerceptronClassifier.load(d)
+            self.assertEqual(str(mlp), str(mlp2))
+
+            model.write().overwrite().save(d)
+            model2 = MultilayerPerceptronClassificationModel.load(d)
+            self.assertEqual(str(model), str(model2))
+
 
 class ClassificationTests(ClassificationTestsMixin, unittest.TestCase):
     def setUp(self) -> None:
diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala
index 75aed57ae2d2..86655c7045bf 100644
--- a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala
+++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala
@@ -530,6 +530,7 @@ private[ml] object MLUtils {
       Set("intercept", "coefficients", "interceptVector", "coefficientMatrix", "evaluate")),
     (classOf[LogisticRegressionSummary], Set("probabilityCol", "featuresCol")),
     (classOf[BinaryLogisticRegressionSummary], Set("scoreCol")),
+    (classOf[MultilayerPerceptronClassificationModel], Set("weights", "evaluate")),
 
     // Regression Models
     (


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [SPARK-50921][ML][PYTHON][CONNECT] Support MultilayerPerceptronClassifier on Connect

Reply via email to