This is an automated email from the ASF dual-hosted git repository.

jiadongb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/texera.git


The following commit(s) were added to refs/heads/main by this push:
     new ac909a07dc fix(operator): correct regex escaping in WordCloud operator 
(#4261)
ac909a07dc is described below

commit ac909a07dc656689a946fcc8cf08fbfe6b4595b5
Author: Jiadong Bai <[email protected]>
AuthorDate: Thu Mar 5 16:06:00 2026 -0800

    fix(operator): correct regex escaping in WordCloud operator (#4261)
    
    ### What changes were proposed in this PR?
    
    Fixed two issues in `WordCloudOpDesc.scala`:
    
    1. **Regex escaping bug**: The `pyb` refactor in #4189 changed
    `manipulateTable()` from `s"..."` to `pyb"""..."""`, but the regex `\\w`
    was not adjusted. In `s"..."`, `\\` is the escape for one backslash, so `\\w`
    yields `\w`. In triple-quoted `pyb"""..."""`, backslashes are literal, so `\\w`
    stays as `\\w` — producing `r'\\w'` in Python, which matches a literal
    backslash + `w` instead of word characters. This caused all rows to be
    filtered out, resulting in: *"text column does not contain words or
    contains only nulls."* Fixed by changing to `\w`.
    
    2. **Duplicate statement**: Removed a duplicate `Map(...)` line in
    `getOutputSchemas`.
    
    Added unit tests to verify the regex pattern is correct.
    
    ### Any related issues, documentation, discussions?
    
    Regression introduced by #4189.
    
    ### How was this PR tested?
    
    Added `WordCloudOpDescSpec` with tests that verify:
    - `manipulateTable()` uses `r'\w'` (not `r'\\w'`)
    - Text column name appears in the code generated by `manipulateTable()` and `createWordCloudFigure()`
    
    All tests pass.
    
    ### Was this PR authored or co-authored using generative AI tooling?
    
    Generated-by: Claude Code (Claude Opus 4.6)
    
    ---------
    
    Co-authored-by: Claude Opus 4.6 <[email protected]>
---
 .../visualization/wordCloud/WordCloudOpDesc.scala  |  3 +-
 .../wordCloud/WordCloudOpDescSpec.scala            | 57 ++++++++++++++++++++++
 2 files changed, 58 insertions(+), 2 deletions(-)

diff --git 
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/visualization/wordCloud/WordCloudOpDesc.scala
 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/visualization/wordCloud/WordCloudOpDesc.scala
index 19861f4a14..9daf6dd67b 100644
--- 
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/visualization/wordCloud/WordCloudOpDesc.scala
+++ 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/visualization/wordCloud/WordCloudOpDesc.scala
@@ -52,7 +52,6 @@ class WordCloudOpDesc extends PythonOperatorDescriptor {
     val outputSchema = Schema()
       .add("html-content", AttributeType.STRING)
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
-    Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }
 
   override def operatorInfo: OperatorInfo =
@@ -67,7 +66,7 @@ class WordCloudOpDesc extends PythonOperatorDescriptor {
   def manipulateTable(): PythonTemplateBuilder = {
     pyb"""
        |        table.dropna(subset = [$textColumn], inplace = True) #remove 
missing values
-       |        table = table[table[$textColumn].str.contains(r'\\w', 
regex=True)]
+       |        table = table[table[$textColumn].str.contains(r'\w', 
regex=True)]
        |"""
   }
 
diff --git 
a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/visualization/wordCloud/WordCloudOpDescSpec.scala
 
b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/visualization/wordCloud/WordCloudOpDescSpec.scala
new file mode 100644
index 0000000000..7be95c3a7a
--- /dev/null
+++ 
b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/visualization/wordCloud/WordCloudOpDescSpec.scala
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.visualization.wordCloud
+
+import org.scalatest.BeforeAndAfter
+import org.scalatest.flatspec.AnyFlatSpec
+
+class WordCloudOpDescSpec extends AnyFlatSpec with BeforeAndAfter {
+  // Checks the Python code text produced by WordCloudOpDesc's template methods.
+  var opDesc: WordCloudOpDesc = _ // reassigned in `before`, so each test gets a new descriptor
+
+  before {
+    opDesc = new WordCloudOpDesc()
+  }
+
+  it should "use correct regex pattern to match word characters" in {
+    opDesc.textColumn = "text_col"
+    val code = opDesc.manipulateTable().plain // generated template text searched by the assertions
+    assert(
+      code.contains("""r'\w'"""), // triple-quoted literal: the backslash is literal, so this looks for r'\w'
+      "regex should use single backslash \\w to match word characters"
+    )
+    assert(
+      !code.contains("""r'\\w'"""), // the two-backslash form r'\\w' must not appear
+      "regex should not use double backslash \\\\w which matches literal 
backslash+w"
+    )
+  }
+
+  it should "include the text column in manipulateTable" in {
+    opDesc.textColumn = "my_text"
+    val code = opDesc.manipulateTable().plain
+    assert(code.contains("my_text")) // the configured column name must show up in the generated text
+  }
+
+  it should "include the text column in createWordCloudFigure" in {
+    opDesc.textColumn = "my_text"
+    val code = opDesc.createWordCloudFigure().plain
+    assert(code.contains("my_text")) // same check for the figure-building snippet
+  }
+}

Reply via email to