This is an automated email from the ASF dual-hosted git repository.

tingchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 441a2df742 Add tests to clarify the limits of base64 encoded string 
detector. (#15497)
441a2df742 is described below

commit 441a2df74292c90dc099301c9d06864be95f2f21
Author: Ting Chen <tingc...@uber.com>
AuthorDate: Fri Apr 11 16:58:46 2025 -0700

    Add tests to clarify the limits of base64 encoded string detector. (#15497)
---
 .../local/recordtransformer/SchemaConformingTransformer.java       | 7 ++++---
 .../local/recordtransformer/SchemaConformingTransformerTest.java   | 7 +++++++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformer.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformer.java
index a049d37e67..554409821a 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformer.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformer.java
@@ -94,8 +94,8 @@ import org.slf4j.LoggerFactory;
  * }
  * Apart from the basic transformation above, this transformer today also does 
the following additional tasks (which in
  * future can be decoupled from this transformer):
- *    1. Put all field + value pair in a special column "_mergedTextIndex" to 
facilitate text indexing and search. This
- *       extra step can be enabled via mergedTextIndexFieldSpec.
+ *    1. Put all field + value pairs in a special column "_mergedTextIndex" to 
facilitate full text indexing and search.
+ *       This extra step can be enabled via mergedTextIndexFieldSpec.
  *    2. Allow users to tag certain fields in the input record not to be 
included in the catch-all field.
  * </pre>
  * <p>
@@ -338,7 +338,8 @@ public class SchemaConformingTransformer implements 
RecordTransformer {
       putExtrasField(_transformerConfig.getUnindexableExtrasField(), 
_unindexableExtrasFieldType,
           extraFieldsContainer.getUnindexableExtras(), outputRecord);
 
-      // Generate merged text index
+      // Generate merged text index. This optional step puts all field + value 
pairs in the input record in a special
+      // column "_mergedTextIndex" to perform full text indexing and search.
       if (null != _mergedTextIndexFieldSpec && !mergedTextIndexMap.isEmpty()) {
         List<String> luceneDocuments = 
getLuceneDocumentsFromMergedTextIndexMap(mergedTextIndexMap);
         if (_mergedTextIndexFieldSpec.isSingleValueField()) {
diff --git 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerTest.java
 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerTest.java
index 32985f9832..6ed751f0d4 100644
--- 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerTest.java
+++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerTest.java
@@ -1037,13 +1037,20 @@ public class SchemaConformingTransformerTest {
     String binaryDataWithTrailingPeriods = "ABCxyz12345-_+/=..";
     String binaryDataWithRandomPeriods = "A.BCxy.z12345-_+/=..";
     String shortBinaryData = "short";
+    String longBinaryDataWithColon = 
"field:1:1:v1Cgy+ypzk8yf9JzsdkBjvZ1jM8Mem/BTtNilst64Df/34xmJzeRstmihpfrWZ";
+    String jsonBinaryData = 
"{\"field\":\"text:1:1:v1Cgy+ypzk8yf9JzsdkBjvZ1jM8Mem/BTtNilst64Df/34xmJzeRstmihpfrWZ\"}";
     int minLength = 10;
 
+    // A space is not expected in a base64 encoded string.
     assertFalse(SchemaConformingTransformer.base64ValueFilter(text.getBytes(), 
minLength));
     
assertTrue(SchemaConformingTransformer.base64ValueFilter(binaryData.getBytes(), 
minLength));
     
assertTrue(SchemaConformingTransformer.base64ValueFilter(binaryDataWithTrailingPeriods.getBytes(),
 minLength));
     
assertFalse(SchemaConformingTransformer.base64ValueFilter(binaryDataWithRandomPeriods.getBytes(),
 minLength));
     
assertFalse(SchemaConformingTransformer.base64ValueFilter(shortBinaryData.getBytes(),
 minLength));
+    // A colon (:) is not expected in a base64 encoded string.
+    
assertFalse(SchemaConformingTransformer.base64ValueFilter(longBinaryDataWithColon.getBytes(),
 minLength));
+    // A JSON string is not detected as a base64 encoded string even if one of 
its fields holds a base64 encoded string.
+    
assertFalse(SchemaConformingTransformer.base64ValueFilter(jsonBinaryData.getBytes(),
 minLength));
   }
 
   @Test


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

Reply via email to