This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new cc0ea60d6eee [SPARK-47218][SQL] XML: Ignore commented row tags in XML
tokenizer
cc0ea60d6eee is described below
commit cc0ea60d6eeeeb21fb5012512f0470cb669c911d
Author: Yousof Hosny <[email protected]>
AuthorDate: Fri Mar 1 09:53:42 2024 +0900
[SPARK-47218][SQL] XML: Ignore commented row tags in XML tokenizer
### What changes were proposed in this pull request?
Added some logic to handle comments in XMLTokenizer. Comments are ignored
anytime they appear. Anything enclosed in a leftmost `` will be ignored.
Does not handle nested comments:
` -->` will leave the last `-->` dangling.
Does not handle comments *within* row tags
- `<ROW> ...` will fail
- `<ROW id="1" > ...` will fail
These two cases are not considered valid XML according to the official spec
(no nested comments, no comments within tags)
### Why are the changes needed?
This is a bug which results in incorrect data being parsed.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Unit test covering different cases of where comments may be placed relative
to row tags.
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #45325 from yhosny/xml-tokenizer.
Lead-authored-by: Yousof Hosny <[email protected]>
Co-authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../spark/sql/catalyst/xml/StaxXmlParser.scala | 56 ++++++++++++++++++++++
.../test-data/xml-resources/commented-row.xml | 25 ++++++++++
.../sql/execution/datasources/xml/XmlSuite.scala | 10 ++++
3 files changed, 91 insertions(+)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
index 544a761219cd..a4d15695145d 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
@@ -624,6 +624,8 @@ class XmlTokenizer(
private var buffer = new StringBuilder()
private val startTag = s"<${options.rowTag}>"
private val endTag = s"</${options.rowTag}>"
+ private val commentStart = s"<!--"
+ private val commentEnd = s"-->"
/**
* Finds the start of the next record.
@@ -671,9 +673,35 @@ class XmlTokenizer(
nextString
}
+ private def readUntilCommentClose(): Boolean = {
+ var i = 0
+ while (true) {
+ val cOrEOF = reader.read()
+ if (cOrEOF == -1) {
+ // End of file.
+ return false
+ }
+ val c = cOrEOF.toChar
+ if (c == commentEnd(i)) {
+ if (i >= commentEnd.length - 1) {
+ // Found comment close.
+ return true
+ }
+ i += 1
+ } else {
+ i = 0
+ }
+ }
+
+ // Unreachable (a comment tag must close)
+ false
+ }
+
private def readUntilStartElement(): Boolean = {
currentStartTag = startTag
var i = 0
+ var commentIdx = 0
+
while (true) {
val cOrEOF = reader.read()
if (cOrEOF == -1) { // || (i == 0 && getFilePosition() > end)) {
@@ -681,6 +709,19 @@ class XmlTokenizer(
return false
}
val c = cOrEOF.toChar
+
+ if (c == commentStart(commentIdx)) {
+ if (commentIdx >= commentStart.length - 1) {
+ // If a comment beigns we must ignore all character until its end
+ commentIdx = 0
+ readUntilCommentClose()
+ } else {
+ commentIdx += 1
+ }
+ } else {
+ commentIdx = 0
+ }
+
if (c == startTag(i)) {
if (i >= startTag.length - 1) {
// Found start tag.
@@ -708,6 +749,8 @@ class XmlTokenizer(
// Index into the start or end tag that has matched so far
var si = 0
var ei = 0
+ // Index into the start of a comment tag that matched so far
+ var commentIdx = 0
// How many other start tags enclose the one that's started already?
var depth = 0
// Previously read character
@@ -730,6 +773,19 @@ class XmlTokenizer(
val c = cOrEOF.toChar
buffer.append(c)
+ if (c == commentStart(commentIdx)) {
+ if (commentIdx >= commentStart.length - 1) {
+ // If a comment beigns we must ignore everything until its end
+ buffer.setLength(buffer.length - commentStart.length)
+ commentIdx = 0
+ readUntilCommentClose()
+ } else {
+ commentIdx += 1
+ }
+ } else {
+ commentIdx = 0
+ }
+
if (c == '>' && prevC != '/') {
canSelfClose = false
}
diff --git
a/sql/core/src/test/resources/test-data/xml-resources/commented-row.xml
b/sql/core/src/test/resources/test-data/xml-resources/commented-row.xml
new file mode 100644
index 000000000000..c014391eda80
--- /dev/null
+++ b/sql/core/src/test/resources/test-data/xml-resources/commented-row.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0"?>
+<!-- Should read 1 to 8 without any 0s -->
+<ROWSET>
+ <ROW><a>1</a></ROW>
+ <!-- <ROW><a>0</a></ROW>-->
+ <ROW><a>2</a></ROW>
+ <!-- <ROW><a>0</a></ROW>-->
+ <ROW>
+ <!-- Just another comment -->
+ <a> <!-- before a value --> 3 <!-- after a value --></a>
+ <!-- <ROW>
+ <a>3</a>
+ </ROW> -->
+ </ROW>
+ <!-- --><ROW><!----><!----><a><!---->4<!----></a><!----></ROW><!-- -->
+ <!-- -->
<!----><ROW><a>5</a></ROW><!--<ROW><a>0</a></ROW>--><ROW><a>6</a></ROW>
+ <ROW><a>7</a></ROW>
+ <!-- <ROW><a>0</a></ROW>-->
+ <ROW> <!-- 0 </ROW> --> <!-- --> <a> 8 </a> </ROW>
+ <ROW><a>9</a></ROW>
+ <NOTROW><!----></NOTROW>
+ <!----><!----><NOTROW><!----><!----></NOTROW><!----><!---->
+ <ROW><a>10</a></ROW>
+ <!-- <ROW><a>0</a></ROW>
+</ROWSET>
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
index d7f5ee1874c2..2194f76e7da6 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
@@ -2540,6 +2540,16 @@ class XmlSuite
checkAnswer(df, Seq(Row(Row(Array(1, 2, 3), Array(1, 2)))))
}
+ test("ignore commented row tags") {
+ val results = spark.read.format("xml")
+ .option("rowTag", "ROW")
+ .option("multiLine", "true")
+ .load(getTestResourcePath(resDir + "commented-row.xml"))
+
+ val expectedResults = Seq.range(1, 11).map(Row(_))
+ checkAnswer(results, expectedResults)
+ }
+
test("capture values interspersed between elements - nested struct") {
val xmlString =
s"""
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]