This is an automated email from the ASF dual-hosted git repository.

github-merge-queue[bot] pushed a commit to branch 
gh-readonly-queue/main/pr-5658-73c76f51920b0900de67bbc0baa1ee5be5b87bf0
in repository https://gitbox.apache.org/repos/asf/texera.git

commit 190823f7562ba8c0bb2a515b0ce4823cf640e049
Author: Xinyuan Lin <[email protected]>
AuthorDate: Fri Jun 12 17:19:41 2026 -0700

    test(workflow-operator): add unit test coverage for CaseSensitiveAnalyzer 
(#5658)
    
    ### What changes were proposed in this PR?
    
    Pin behavior of the Lucene `Analyzer` used by the keyword-search
    operator when the user opts into case-sensitive matching. The
    abstraction skips the lowercasing pipeline used by `StandardAnalyzer`,
    so a regression here would silently downgrade case-sensitive search. No
    production-code changes.
    
    | Spec | Source class | Tests |
    | --- | --- | --- |
    | `CaseSensitiveAnalyzerSpec` | `CaseSensitiveAnalyzer` | 13 |
    
    Spec file name follows the `<srcClassName>Spec.scala` one-to-one
    convention.
    
    **Behavior pinned**
    
    | Surface | Contract |
    | --- | --- |
    | Mixed-case input | every emitted token preserves its original case |
    | All-uppercase / all-lowercase tokens | preserved (no normalization in
    either direction) |
    | Single-space splitting | tokens are separated cleanly |
    | Tabs and newlines | also split tokens |
    | Collapsed whitespace runs | no empty tokens emitted |
    | Embedded punctuation (`abc,def`) | stays one token
    (`WhitespaceTokenizer` only splits on whitespace) |
    | Sentence-final punctuation (`Hello, world!`) | stays attached
    (`Hello,`, `world!`) |
    | Empty input | no tokens |
    | Pure-whitespace input | no tokens |
    | `StopFilter` with `CharArraySet.EMPTY_SET` | English stop words (`the`
    / `and` / `a`) are NOT removed (vs `StandardAnalyzer`'s default
    behavior) |
    | Different field names | same tokenization (field-name independent) |
    | Successive `tokenStream` calls | each gets its own independent stream
    |
    
    The harness uses the canonical Lucene `reset → incrementToken → end →
    close` lifecycle and collects `CharTermAttribute` values into a buffer —
    same pattern any future analyzer spec in this codebase should follow.
    
    ### Any related issues, documentation, discussions?
    
    Closes #5654.
    
    ### How was this PR tested?
    
    Pure unit-test addition; verified locally with:
    
    - `sbt "WorkflowOperator/testOnly
    org.apache.texera.amber.operator.keywordSearch.CaseSensitiveAnalyzerSpec"`
    — 13 tests, all green
    - `sbt scalafmtCheckAll` — clean
    - CI to confirm
    
    ### Was this PR authored or co-authored using generative AI tooling?
    
    Generated-by: Claude Code (Opus 4.7 [1M context])
---
 .../keywordSearch/CaseSensitiveAnalyzerSpec.scala  | 182 +++++++++++++++++++++
 1 file changed, 182 insertions(+)

diff --git 
a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/keywordSearch/CaseSensitiveAnalyzerSpec.scala
 
b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/keywordSearch/CaseSensitiveAnalyzerSpec.scala
new file mode 100644
index 0000000000..776db23a4c
--- /dev/null
+++ 
b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/keywordSearch/CaseSensitiveAnalyzerSpec.scala
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.keywordSearch
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
+import org.scalatest.flatspec.AnyFlatSpec
+
+import java.io.StringReader
+import scala.collection.mutable.ArrayBuffer
+
+class CaseSensitiveAnalyzerSpec extends AnyFlatSpec {
+
+  // 
---------------------------------------------------------------------------
+  // Helper — drive an Analyzer over a single input and collect the emitted
+  // token strings.
+  //
+  // The Lucene TokenStream lifecycle is: reset → while incrementToken → end →
+  // close. Skipping reset() raises IllegalStateException on some Lucene
+  // versions; we follow the canonical contract to keep the spec robust.
+  // 
---------------------------------------------------------------------------
+
+  private def tokensOf(fieldName: String, input: String): List[String] = {
+    val analyzer = new CaseSensitiveAnalyzer
+    val stream = analyzer.tokenStream(fieldName, new StringReader(input))
+    val termAttr = stream.addAttribute(classOf[CharTermAttribute])
+    val out = ArrayBuffer.empty[String]
+    try {
+      stream.reset()
+      while (stream.incrementToken()) {
+        out.append(termAttr.toString)
+      }
+      stream.end()
+    } finally {
+      stream.close()
+      analyzer.close()
+    }
+    out.toList
+  }
+
+  // 
---------------------------------------------------------------------------
+  // Case preservation — the whole point of CaseSensitiveAnalyzer is to
+  // SKIP the lowercasing pipeline used by StandardAnalyzer.
+  // 
---------------------------------------------------------------------------
+
+  "CaseSensitiveAnalyzer" should "preserve case in every emitted token" in {
+    assert(tokensOf("body", "Hello World") == List("Hello", "World"))
+  }
+
+  it should "preserve mixed-case tokens (e.g. CamelCase identifiers)" in {
+    assert(tokensOf("body", "FooBar BazQux") == List("FooBar", "BazQux"))
+  }
+
+  it should "preserve all-uppercase tokens" in {
+    assert(tokensOf("body", "URL HTTP HTML") == List("URL", "HTTP", "HTML"))
+  }
+
+  it should "preserve all-lowercase tokens (no upcasing either)" in {
+    assert(tokensOf("body", "alpha beta gamma") == List("alpha", "beta", 
"gamma"))
+  }
+
+  // 
---------------------------------------------------------------------------
+  // Whitespace tokenization — the underlying tokenizer is
+  // WhitespaceTokenizer; pin its splitting behavior.
+  // 
---------------------------------------------------------------------------
+
+  "CaseSensitiveAnalyzer (whitespace tokenizer)" should
+    "split on a single space" in {
+    assert(tokensOf("body", "a b c") == List("a", "b", "c"))
+  }
+
+  it should "split on tabs and newlines" in {
+    assert(tokensOf("body", "a\tb\nc") == List("a", "b", "c"))
+  }
+
+  it should "collapse runs of whitespace (no empty tokens emitted)" in {
+    assert(tokensOf("body", "a   b\n\nc") == List("a", "b", "c"))
+  }
+
+  // 
---------------------------------------------------------------------------
+  // Punctuation — WhitespaceTokenizer keeps punctuation attached to
+  // adjacent characters (it only splits on whitespace).
+  // 
---------------------------------------------------------------------------
+
+  it should
+    "leave punctuation attached to tokens (WhitespaceTokenizer only splits on 
whitespace)" in {
+    // `"abc,def"` has no whitespace inside, so it stays one token.
+    assert(tokensOf("body", "abc,def") == List("abc,def"))
+    // Sentence-final punctuation also stays attached.
+    assert(tokensOf("body", "Hello, world!") == List("Hello,", "world!"))
+  }
+
+  // 
---------------------------------------------------------------------------
+  // Edge cases
+  // 
---------------------------------------------------------------------------
+
+  "CaseSensitiveAnalyzer (empty input)" should "produce no tokens" in {
+    assert(tokensOf("body", "") == Nil)
+  }
+
+  it should "produce no tokens for pure-whitespace input" in {
+    assert(tokensOf("body", "   \t\n  ") == Nil)
+  }
+
+  // 
---------------------------------------------------------------------------
+  // StopFilter — empty stop-word set; nothing should be filtered out.
+  // 
---------------------------------------------------------------------------
+
+  "CaseSensitiveAnalyzer (StopFilter with CharArraySet.EMPTY_SET)" should
+    "NOT remove common English stop words (the / and / of / a)" in {
+    // StandardAnalyzer's default stop set would strip "the", "and",
+    // "of", "a"; this analyzer is built with `CharArraySet.EMPTY_SET`
+    // so every token survives. Pin that explicitly.
+    val out = tokensOf("body", "the quick and a brown fox jumps of off")
+    assert(
+      out == List("the", "quick", "and", "a", "brown", "fox", "jumps", "of", 
"off")
+    )
+  }
+
+  // 
---------------------------------------------------------------------------
+  // Field-name independence — tokenStream uses the same pipeline
+  // regardless of field name, and each call gets its own TokenStream.
+  // 
---------------------------------------------------------------------------
+
+  "CaseSensitiveAnalyzer" should
+    "produce the same tokens for the same input across different field names" 
in {
+    val a = tokensOf("title", "Hello World")
+    val b = tokensOf("body", "Hello World")
+    assert(a == b, "field name must not change tokenization")
+  }
+
+  it should
+    "return independent TokenStreams for successive tokenStream calls on the 
SAME analyzer instance" in {
+    // Reuse one analyzer across two tokenStream calls — consuming the
+    // first stream must not affect the second. The helper would create
+    // a fresh analyzer per call, masking the intra-analyzer reuse
+    // behavior; do the lifecycle manually here.
+    val analyzer = new CaseSensitiveAnalyzer
+    try {
+      def collect(input: String): List[String] = {
+        val stream = analyzer.tokenStream("body", new 
java.io.StringReader(input))
+        val termAttr = stream.addAttribute(classOf[CharTermAttribute])
+        val out = ArrayBuffer.empty[String]
+        try {
+          stream.reset()
+          while (stream.incrementToken()) {
+            out.append(termAttr.toString)
+          }
+          stream.end()
+        } finally {
+          stream.close()
+        }
+        out.toList
+      }
+      val first = collect("alpha Beta GAMMA")
+      val second = collect("alpha Beta GAMMA")
+      assert(first == List("alpha", "Beta", "GAMMA"))
+      assert(second == first, "second tokenStream call must not be affected by 
the first")
+      // Different input on the SAME analyzer also produces correct tokens.
+      val third = collect("foo bar")
+      assert(third == List("foo", "bar"))
+    } finally {
+      analyzer.close()
+    }
+  }
+}

Reply via email to