This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new e2a0171aeb43 [SPARK-45414][SQL][TESTS] Add regression tests for XML
mixed type serialization
e2a0171aeb43 is described below
commit e2a0171aeb4370a2903940430707eb74db58d5ab
Author: David Roberts <[email protected]>
AuthorDate: Mon Feb 23 00:43:05 2026 +0800
[SPARK-45414][SQL][TESTS] Add regression tests for XML mixed type
serialization
This adds two regression tests for SPARK-45414 to prevent reintroduction of
the bug where string tag content was misplaced when writing XML with mixed
column types (structs, arrays, and strings).
Tests verify:
- String columns between and after nested types write correctly
- Attributes mixed with string elements serialize properly
### What changes were proposed in this pull request?
Add two regression tests for SPARK-45414 to prevent reintroduction of the
bug where string tag content could be misplaced when writing XML with mixed
column types.
### Why are the changes needed?
SPARK-45414 reported a bug in the spark-xml library where string content
was misplaced when writing structs with mixed types. While this bug was fixed
during spark-xml integration into Spark core, there were no tests ensuring this
specific scenario remains correct. These tests provide coverage for the bug
scenario.
### Does this PR introduce _any_ user-facing change?
No, this only adds test coverage.
### How was this patch tested?
The new tests were run successfully.
### Was this patch authored or co-authored using generative AI tooling?
Yes, co-authored with Claude Sonnet 4.5.
Generated-by: Claude Sonnet 4.5
Closes #53857 from jdavidroberts/spark-45414-xml-tests.
Lead-authored-by: David Roberts <[email protected]>
Co-authored-by: jdavidroberts
<[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../xml/parsers/StaxXmlGeneratorSuite.scala | 106 +++++++++++++++++++++
1 file changed, 106 insertions(+)
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/parsers/StaxXmlGeneratorSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/parsers/StaxXmlGeneratorSuite.scala
index 1798d32d8a2c..fefe54c7bae9 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/parsers/StaxXmlGeneratorSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/parsers/StaxXmlGeneratorSuite.scala
@@ -75,4 +75,110 @@ final class StaxXmlGeneratorSuite extends
SharedSparkSession {
assert(df.collect().toSeq === newDf.collect().toSeq)
}
+ // SPARK-45414: Test for string tag content misplacement issue (found in
spark-xml library)
+ // with mixed column types
+ test("SPARK-45414: write mixed types with string columns between and after
nested types") {
+ import org.apache.spark.sql.Row
+
+ // Create a schema with mixed types: struct, array, and string columns
+ // This reproduces the scenario from SPARK-45414 where string content gets
misplaced
+ val schema = StructType(
+ Seq(
+ StructField("id", IntegerType, nullable = false),
+ StructField(
+ "metadata",
+ StructType(Seq(StructField("version", StringType),
StructField("timestamp", LongType))),
+ nullable = true),
+ StructField("description", StringType, nullable = true), // String
between nested types
+ StructField("tags", ArrayType(StringType), nullable = true),
+ StructField("color", StringType, nullable = true), // String at the end
+ StructField("numbers", ArrayType(IntegerType), nullable = true)))
+
+ val data = Seq(
+ Row(1, Row("v1.0", 1000L), "MyDescription", Array("tag1", "tag2"),
"Red", Array(1, 2, 3)),
+ Row(2, Row("v2.0", 2000L), "AnotherDescription", Array("tag3"), "Blue",
Array(4, 5)))
+
+ val df = spark.createDataFrame(spark.sparkContext.parallelize(data),
schema)
+
+ // Write to XML
+ val targetFile =
+
Files.createTempDirectory("StaxXmlGeneratorSuite").resolve("mixed-types.xml").toString
+ df.write.option("rowTag", "item").xml(targetFile)
+
+ // Read back and verify the content is in correct XML tags
+ val readDf = spark.read.option("rowTag",
"item").schema(schema).xml(targetFile)
+ val results = readDf.collect()
+
+ // Verify structure is preserved
+ assert(results.length === 2)
+
+ // Verify first row - ensure no data misplacement
+ assert(results(0).getAs[Int]("id") === 1)
+ val metadata1 = results(0).getAs[Row]("metadata")
+ assert(metadata1.getAs[String]("version") === "v1.0")
+ assert(metadata1.getAs[Long]("timestamp") === 1000L)
+ // Critical: ensure "MyDescription" is in description field, not in tags
or color
+ assert(results(0).getAs[String]("description") === "MyDescription")
+ assert(results(0).getAs[Seq[String]]("tags") === Seq("tag1", "tag2"))
+ // Critical: ensure "Red" is in color field, not misplaced
+ assert(results(0).getAs[String]("color") === "Red")
+ assert(results(0).getAs[Seq[Int]]("numbers") === Seq(1, 2, 3))
+
+ // Verify second row
+ assert(results(1).getAs[Int]("id") === 2)
+ val metadata2 = results(1).getAs[Row]("metadata")
+ assert(metadata2.getAs[String]("version") === "v2.0")
+ assert(metadata2.getAs[Long]("timestamp") === 2000L)
+ assert(results(1).getAs[String]("description") === "AnotherDescription")
+ assert(results(1).getAs[Seq[String]]("tags") === Seq("tag3"))
+ assert(results(1).getAs[String]("color") === "Blue")
+ assert(results(1).getAs[Seq[Int]]("numbers") === Seq(4, 5))
+ }
+
+ // SPARK-45414: Test with attributes mixed with elements
+ test("SPARK-45414: write mixed types with attributes and string elements") {
+ import org.apache.spark.sql.Row
+
+ // Schema with attributes (using _ prefix) and string elements
+ val schema = StructType(
+ Seq(
+ StructField("_id", IntegerType, nullable = false), // attribute
+ StructField(
+ "nested",
+ StructType(
+ Seq(
+ StructField("_attr1", StringType), // attribute
+ StructField("value", StringType))),
+ nullable = true),
+ StructField("description", StringType, nullable = true), // element
+ StructField("items", ArrayType(IntegerType), nullable = true),
+ StructField("name", StringType, nullable = true) // element at end
+ ))
+
+ val data = Seq(Row(100, Row("attrValue", "nestedValue"), "DescText",
Array(1, 2), "ItemName"))
+
+ val df = spark.createDataFrame(spark.sparkContext.parallelize(data),
schema)
+
+ val targetFile =
+
Files.createTempDirectory("StaxXmlGeneratorSuite").resolve("mixed-attrs.xml").toString
+ df.write.option("rowTag", "record").option("attributePrefix",
"_").xml(targetFile)
+
+ val readDf = spark.read
+ .option("rowTag", "record")
+ .option("attributePrefix", "_")
+ .schema(schema)
+ .xml(targetFile)
+ val results = readDf.collect()
+
+ assert(results.length === 1)
+ assert(results(0).getAs[Int]("_id") === 100)
+ val nested = results(0).getAs[Row]("nested")
+ assert(nested.getAs[String]("_attr1") === "attrValue")
+ assert(nested.getAs[String]("value") === "nestedValue")
+ // Critical: ensure string elements are not misplaced
+ assert(results(0).getAs[String]("description") === "DescText")
+ assert(results(0).getAs[Seq[Int]]("items") === Seq(1, 2))
+ assert(results(0).getAs[String]("name") === "ItemName")
+ }
+
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]