This is an automated email from the ASF dual-hosted git repository. tingchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push: new 441a2df742 Add tests to clarify the limits of base64 encoded string detector. (#15497) 441a2df742 is described below commit 441a2df74292c90dc099301c9d06864be95f2f21 Author: Ting Chen <tingc...@uber.com> AuthorDate: Fri Apr 11 16:58:46 2025 -0700 Add tests to clarify the limits of base64 encoded string detector. (#15497) --- .../local/recordtransformer/SchemaConformingTransformer.java | 7 ++++--- .../local/recordtransformer/SchemaConformingTransformerTest.java | 7 +++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformer.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformer.java index a049d37e67..554409821a 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformer.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformer.java @@ -94,8 +94,8 @@ import org.slf4j.LoggerFactory; * } * Apart from the basic transformation above, this transformer today also does the following additional tasks (which in * future can be decoupled from this transformer): - * 1. Put all field + value pair in a special column "_mergedTextIndex" to facilitate text indexing and search. This - * extra step can be enabled via mergedTextIndexFieldSpec. + * 1. Put all field + value pair in a special column "_mergedTextIndex" to facilitate full text indexing and search. + * This extra step can be enabled via mergedTextIndexFieldSpec. * 2. Allow users to tag certain fields in the input record not to be included in the catch-all field. 
* </pre> * <p> @@ -338,7 +338,8 @@ public class SchemaConformingTransformer implements RecordTransformer { putExtrasField(_transformerConfig.getUnindexableExtrasField(), _unindexableExtrasFieldType, extraFieldsContainer.getUnindexableExtras(), outputRecord); - // Generate merged text index + // Generate merged text index. This optional step puts all field + value pairs in the input record in a special + // column "_mergedTextIndex" to perform full text indexing and search. if (null != _mergedTextIndexFieldSpec && !mergedTextIndexMap.isEmpty()) { List<String> luceneDocuments = getLuceneDocumentsFromMergedTextIndexMap(mergedTextIndexMap); if (_mergedTextIndexFieldSpec.isSingleValueField()) { diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerTest.java index 32985f9832..6ed751f0d4 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerTest.java @@ -1037,13 +1037,20 @@ public class SchemaConformingTransformerTest { String binaryDataWithTrailingPeriods = "ABCxyz12345-_+/=.."; String binaryDataWithRandomPeriods = "A.BCxy.z12345-_+/=.."; String shortBinaryData = "short"; + String longBinaryDataWithColon = "field:1:1:v1Cgy+ypzk8yf9JzsdkBjvZ1jM8Mem/BTtNilst64Df/34xmJzeRstmihpfrWZ"; + String jsonBinaryData = "{\"field\":\"text:1:1:v1Cgy+ypzk8yf9JzsdkBjvZ1jM8Mem/BTtNilst64Df/34xmJzeRstmihpfrWZ\"}"; int minLength = 10; + // A space is not expected in a base64 encoded string. 
assertFalse(SchemaConformingTransformer.base64ValueFilter(text.getBytes(), minLength)); assertTrue(SchemaConformingTransformer.base64ValueFilter(binaryData.getBytes(), minLength)); assertTrue(SchemaConformingTransformer.base64ValueFilter(binaryDataWithTrailingPeriods.getBytes(), minLength)); assertFalse(SchemaConformingTransformer.base64ValueFilter(binaryDataWithRandomPeriods.getBytes(), minLength)); assertFalse(SchemaConformingTransformer.base64ValueFilter(shortBinaryData.getBytes(), minLength)); + // A colon : is not expected in a base64 encoded string. + assertFalse(SchemaConformingTransformer.base64ValueFilter(longBinaryDataWithColon.getBytes(), minLength)); + // A JSON string is not detected as a base64 encoded string even if one of its fields contains base64 encoded data. + assertFalse(SchemaConformingTransformer.base64ValueFilter(jsonBinaryData.getBytes(), minLength)); } @Test --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org