This is an automated email from the ASF dual-hosted git repository.
jiadongb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/texera.git
The following commit(s) were added to refs/heads/main by this push:
new ac909a07dc fix(operator): correct regex escaping in WordCloud operator
(#4261)
ac909a07dc is described below
commit ac909a07dc656689a946fcc8cf08fbfe6b4595b5
Author: Jiadong Bai <[email protected]>
AuthorDate: Thu Mar 5 16:06:00 2026 -0800
fix(operator): correct regex escaping in WordCloud operator (#4261)
### What changes were proposed in this PR?
Fixed two issues in `WordCloudOpDesc.scala`:
1. **Regex escaping bug**: The `pyb` refactor in #4189 changed
`manipulateTable()` from `s"..."` to `pyb"""..."""`, but the regex `\\w`
was not adjusted. In `s"..."`, `\\` is the escape sequence for a single
backslash, so `\\w` produces `\w`. In triple-quoted `pyb"""..."""`,
backslashes are literal, so `\\w` stays as `\\w` — producing `r'\\w'` in
Python, which matches a literal
backslash + `w` instead of word characters. This caused all rows to be
filtered out, resulting in: *"text column does not contain words or
contains only nulls."* Fixed by changing to `\w`.
2. **Duplicate statement**: Removed a duplicate `Map(...)` line in
`getOutputSchemas`.
Added unit tests to verify the regex pattern is correct.
### Any related issues, documentation, discussions?
Regression introduced by #4189.
### How was this PR tested?
Added `WordCloudOpDescSpec` with tests that verify:
- `manipulateTable()` uses `r'\w'` (not `r'\\w'`)
- Text column name appears in the code generated by both
`manipulateTable()` and `createWordCloudFigure()`
All tests pass.
### Was this PR authored or co-authored using generative AI tooling?
Generated-by: Claude Code (Claude Opus 4.6)
---------
Co-authored-by: Claude Opus 4.6 <[email protected]>
---
.../visualization/wordCloud/WordCloudOpDesc.scala | 3 +-
.../wordCloud/WordCloudOpDescSpec.scala | 57 ++++++++++++++++++++++
2 files changed, 58 insertions(+), 2 deletions(-)
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/visualization/wordCloud/WordCloudOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/visualization/wordCloud/WordCloudOpDesc.scala
index 19861f4a14..9daf6dd67b 100644
--- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/visualization/wordCloud/WordCloudOpDesc.scala
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/visualization/wordCloud/WordCloudOpDesc.scala
@@ -52,7 +52,6 @@ class WordCloudOpDesc extends PythonOperatorDescriptor {
val outputSchema = Schema()
.add("html-content", AttributeType.STRING)
Map(operatorInfo.outputPorts.head.id -> outputSchema)
- Map(operatorInfo.outputPorts.head.id -> outputSchema)
}
override def operatorInfo: OperatorInfo =
@@ -67,7 +66,7 @@ class WordCloudOpDesc extends PythonOperatorDescriptor {
def manipulateTable(): PythonTemplateBuilder = {
pyb"""
| table.dropna(subset = [$textColumn], inplace = True) #remove missing values
- | table = table[table[$textColumn].str.contains(r'\\w', regex=True)]
+ | table = table[table[$textColumn].str.contains(r'\w', regex=True)]
|"""
}
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/visualization/wordCloud/WordCloudOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/visualization/wordCloud/WordCloudOpDescSpec.scala
new file mode 100644
index 0000000000..7be95c3a7a
--- /dev/null
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/visualization/wordCloud/WordCloudOpDescSpec.scala
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.visualization.wordCloud
+
+import org.scalatest.BeforeAndAfter
+import org.scalatest.flatspec.AnyFlatSpec
+
+class WordCloudOpDescSpec extends AnyFlatSpec with BeforeAndAfter {
+
+ var opDesc: WordCloudOpDesc = _
+
+ before {
+ opDesc = new WordCloudOpDesc()
+ }
+
+ it should "use correct regex pattern to match word characters" in {
+ opDesc.textColumn = "text_col"
+ val code = opDesc.manipulateTable().plain
+ assert(
+ code.contains("""r'\w'"""),
+ "regex should use single backslash \\w to match word characters"
+ )
+ assert(
+ !code.contains("""r'\\w'"""),
+ "regex should not use double backslash \\\\w which matches literal backslash+w"
+ )
+ }
+
+ it should "include the text column in manipulateTable" in {
+ opDesc.textColumn = "my_text"
+ val code = opDesc.manipulateTable().plain
+ assert(code.contains("my_text"))
+ }
+
+ it should "include the text column in createWordCloudFigure" in {
+ opDesc.textColumn = "my_text"
+ val code = opDesc.createWordCloudFigure().plain
+ assert(code.contains("my_text"))
+ }
+}