(spark) branch master updated: [SPARK-47218][SQL] XML: Ignore commented row tags in XML tokenizer

gurwls223 Thu, 29 Feb 2024 16:54:00 -0800

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new cc0ea60d6eee [SPARK-47218][SQL] XML: Ignore commented row tags in XML 
tokenizer
cc0ea60d6eee is described below

commit cc0ea60d6eeeeb21fb5012512f0470cb669c911d
Author: Yousof Hosny <[email protected]>
AuthorDate: Fri Mar 1 09:53:42 2024 +0900

    [SPARK-47218][SQL] XML: Ignore commented row tags in XML tokenizer
    
    ### What changes were proposed in this pull request?
    
    Added some logic to handle comments in XMLTokenizer. Comments are ignored 
anytime they appear. Anything enclosed in a leftmost `` will be ignored.
    
    Does not handle nested comments:
    ` -->` will leave the last `-->` dangling.
    
    Does not handle comments *within* row tags
    - `<ROW> ...` will fail
    - `<ROW id="1" > ...` will fail
    
    These two cases are not considered valid XML according to the official spec 
(no nested comments, no comments within tags)
    
    ### Why are the changes needed?
    
    This is a bug which results in incorrect data being parsed.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No
    
    ### How was this patch tested?
    
    Unit test covering different cases of where comments may be placed relative 
to row tags.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No
    
    Closes #45325 from yhosny/xml-tokenizer.
    
    Lead-authored-by: Yousof Hosny <[email protected]>
    Co-authored-by: Hyukjin Kwon <[email protected]>
    Signed-off-by: Hyukjin Kwon <[email protected]>
---
 .../spark/sql/catalyst/xml/StaxXmlParser.scala     | 56 ++++++++++++++++++++++
 .../test-data/xml-resources/commented-row.xml      | 25 ++++++++++
 .../sql/execution/datasources/xml/XmlSuite.scala   | 10 ++++
 3 files changed, 91 insertions(+)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
index 544a761219cd..a4d15695145d 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
@@ -624,6 +624,8 @@ class XmlTokenizer(
   private var buffer = new StringBuilder()
   private val startTag = s"<${options.rowTag}>"
   private val endTag = s"</${options.rowTag}>"
+  private val commentStart = s"<!--"
+  private val commentEnd = s"-->"
 
     /**
    * Finds the start of the next record.
@@ -671,9 +673,35 @@ class XmlTokenizer(
       nextString
     }
 
+  private def readUntilCommentClose(): Boolean = {
+    var i = 0
+    while (true) {
+      val cOrEOF = reader.read()
+      if (cOrEOF == -1) {
+        // End of file.
+        return false
+      }
+      val c = cOrEOF.toChar
+      if (c == commentEnd(i)) {
+        if (i >= commentEnd.length - 1) {
+          // Found comment close.
+          return true
+        }
+        i += 1
+      } else {
+        i = 0
+      }
+    }
+
+    // Unreachable (a comment tag must close)
+    false
+  }
+
   private def readUntilStartElement(): Boolean = {
     currentStartTag = startTag
     var i = 0
+    var commentIdx = 0
+
     while (true) {
       val cOrEOF = reader.read()
       if (cOrEOF == -1) { // || (i == 0 && getFilePosition() > end)) {
@@ -681,6 +709,19 @@ class XmlTokenizer(
         return false
       }
       val c = cOrEOF.toChar
+
+      if (c == commentStart(commentIdx)) {
+        if (commentIdx >= commentStart.length - 1) {
+          //  If a comment beigns we must ignore all character until its end
+          commentIdx = 0
+          readUntilCommentClose()
+        } else {
+          commentIdx += 1
+        }
+      } else {
+        commentIdx = 0
+      }
+
       if (c == startTag(i)) {
         if (i >= startTag.length - 1) {
           // Found start tag.
@@ -708,6 +749,8 @@ class XmlTokenizer(
     // Index into the start or end tag that has matched so far
     var si = 0
     var ei = 0
+    // Index into the start of a comment tag that matched so far
+    var commentIdx = 0
     // How many other start tags enclose the one that's started already?
     var depth = 0
     // Previously read character
@@ -730,6 +773,19 @@ class XmlTokenizer(
       val c = cOrEOF.toChar
       buffer.append(c)
 
+      if (c == commentStart(commentIdx)) {
+        if (commentIdx >= commentStart.length - 1) {
+          //  If a comment beigns we must ignore everything until its end
+          buffer.setLength(buffer.length - commentStart.length)
+          commentIdx = 0
+          readUntilCommentClose()
+        } else {
+          commentIdx += 1
+        }
+      } else {
+        commentIdx = 0
+      }
+
       if (c == '>' && prevC != '/') {
         canSelfClose = false
       }
diff --git 
a/sql/core/src/test/resources/test-data/xml-resources/commented-row.xml 
b/sql/core/src/test/resources/test-data/xml-resources/commented-row.xml
new file mode 100644
index 000000000000..c014391eda80
--- /dev/null
+++ b/sql/core/src/test/resources/test-data/xml-resources/commented-row.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0"?>
+<!-- Should read 1 to 8 without any 0s -->
+<ROWSET>
+    <ROW><a>1</a></ROW>
+    <!-- <ROW><a>0</a></ROW>-->
+    <ROW><a>2</a></ROW>
+    <!-- <ROW><a>0</a></ROW>-->
+    <ROW>
+        <!-- Just another comment -->
+        <a> <!-- before a value --> 3 <!-- after a value --></a>
+        <!-- <ROW>
+             <a>3</a>
+            </ROW> -->
+    </ROW>
+    <!-- --><ROW><!----><!----><a><!---->4<!----></a><!----></ROW><!-- -->
+    <!-- --> 
<!----><ROW><a>5</a></ROW><!--<ROW><a>0</a></ROW>--><ROW><a>6</a></ROW>
+    <ROW><a>7</a></ROW>
+    <!-- <ROW><a>0</a></ROW>-->
+    <ROW> <!-- 0 </ROW> --> <!-- --> <a> 8 </a> </ROW>
+    <ROW><a>9</a></ROW>
+    <NOTROW><!----></NOTROW>
+    <!----><!----><NOTROW><!----><!----></NOTROW><!----><!---->
+    <ROW><a>10</a></ROW>
+    <!-- <ROW><a>0</a></ROW>
+</ROWSET>
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
index d7f5ee1874c2..2194f76e7da6 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
@@ -2540,6 +2540,16 @@ class XmlSuite
     checkAnswer(df, Seq(Row(Row(Array(1, 2, 3), Array(1, 2)))))
   }
 
+  test("ignore commented row tags") {
+    val results = spark.read.format("xml")
+      .option("rowTag", "ROW")
+      .option("multiLine", "true")
+      .load(getTestResourcePath(resDir + "commented-row.xml"))
+
+    val expectedResults = Seq.range(1, 11).map(Row(_))
+    checkAnswer(results, expectedResults)
+  }
+
   test("capture values interspersed between elements - nested struct") {
     val xmlString =
       s"""


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [SPARK-47218][SQL] XML: Ignore commented row tags in XML tokenizer

Reply via email to