This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.1 by this push:
new 421c474832da [SPARK-54099][SQL] XML variant parser should fall back to
string on decimal parsing errors
421c474832da is described below
commit 421c474832da2da5411ab07c611b02f4f35c9cf5
Author: Xiaonan Yang <[email protected]>
AuthorDate: Mon Nov 3 08:03:08 2025 +0900
[SPARK-54099][SQL] XML variant parser should fall back to string on decimal
parsing errors
### What changes were proposed in this pull request?
When parsing XML data with `parse_xml` that contains decimal numbers with
very large
exponents (e.g., "1E+2147483647"), the conversion to Variant type fails
with:
```
java.lang.ArithmeticException: BigInteger would overflow supported range
at java.base/java.math.BigDecimal.setScale(BigDecimal.java:3000)
at
org.apache.spark.sql.catalyst.xml.StaxXmlParser$.org$apache$spark$sql$catalyst$xml$StaxXmlParser$$appendXMLCharacterToVariant(StaxXmlParser.scala:1335)
```
This happens because the parser calls `setScale(0)` to normalize decimals that have a negative scale. When
the scale is extremely negative (e.g., -2147483647), `setScale(0)` attempts to
multiply the unscaled value by 10^2147483647, causing BigInteger overflow.
This PR catches all non-fatal errors raised while parsing a string as a decimal
in the XML variant parser and falls back to treating the value as a string.
### Why are the changes needed?
Bug fix.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
New UT.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #52801 from xiaonanyang-db/SPARK-54099.
Authored-by: Xiaonan Yang <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
(cherry picked from commit 88eef06c221d9e1ec9e64e17296c20733572e3aa)
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../spark/sql/catalyst/xml/StaxXmlParser.scala | 25 +++++++++++-----------
.../datasources/xml/XmlVariantSuite.scala | 20 +++++++++++++++++
2 files changed, 33 insertions(+), 12 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
index 1a47fa7bd43f..3d6c57c9f746 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
@@ -1278,18 +1278,19 @@ object StaxXmlParser {
// Try parsing the value as decimal
val decimalParser = ExprUtils.getDecimalParser(options.locale)
- allCatch opt decimalParser(value) match {
- case Some(decimalValue) =>
- var d = decimalValue
- if (d.scale() < 0) {
- d = d.setScale(0)
- }
- if (d.scale <= VariantUtil.MAX_DECIMAL16_PRECISION &&
- d.precision <= VariantUtil.MAX_DECIMAL16_PRECISION) {
- builder.appendDecimal(d)
- return
- }
- case _ =>
+ try {
+ var d = decimalParser(value)
+ if (d.scale() < 0) {
+ d = d.setScale(0)
+ }
+ if (d.scale <= VariantUtil.MAX_DECIMAL16_PRECISION &&
+ d.precision <= VariantUtil.MAX_DECIMAL16_PRECISION) {
+ builder.appendDecimal(d)
+ return
+ }
+ } catch {
+ case NonFatal(_) =>
+ // Ignore the exception and parse it as a string below
}
// If the character is of other primitive types, parse it as a string
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlVariantSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlVariantSuite.scala
index 5f5e017038b7..15713e759b81 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlVariantSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlVariantSuite.scala
@@ -942,6 +942,26 @@ class XmlVariantSuite extends QueryTest with
SharedSparkSession with TestXmlData
.map(_.getString(0).replaceAll("\\s+", ""))
assert(xmlResult.head === xmlStr)
}
+
+ test(
+ "[SPARK-54099] XML variant parser should fall back to string " +
+ "when failing to parse decimal values"
+ ) {
+ // Decimals with extreme exponents. Converting these values to Decimal is expected to fail
+ // with ArithmeticException inside the variant parser, which should fall back to string:
+ val decimalString = Seq(
+ "1E+2147483647", // Maximum int exponent - scale would be -2147483647
+ "5E+1000000000", // 1 billion exponent
+ "1.23E+999999999", // Very large exponent
+ "0.001E+2147483640" // Still results in huge effective exponent
+ )
+ decimalString.foreach { str =>
+ testParser(
+ xml = s"<ROW><decimal>$str</decimal></ROW>",
+ expectedJsonStr = s"""{"decimal":"$str"}"""
+ )
+ }
+ }
}
class XmlVariantSuiteWithLegacyParser extends XmlVariantSuite {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]