This is an automated email from the ASF dual-hosted git repository. jackie pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push: new df053536c2 Clean up and enhance functions in SchemaConformingTransformer (#14546) df053536c2 is described below commit df053536c2bda57a3d41566c7b12ccc737610244 Author: lnbest0707 <106711887+lnbest0707-u...@users.noreply.github.com> AuthorDate: Wed Dec 4 11:13:54 2024 -0800 Clean up and enhance functions in SchemaConformingTransformer (#14546) --- .../SchemaConformingTransformerV2.java | 244 +++++------ .../SchemaConformingTransformerV2Test.java | 483 ++++++++++++++------- .../SchemaConformingTransformerV2Config.java | 166 +++++-- 3 files changed, 550 insertions(+), 343 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2.java index 2aed00f0c3..78962fd5ee 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2.java @@ -28,6 +28,7 @@ import java.util.Deque; import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; import javax.annotation.Nonnull; @@ -49,13 +50,13 @@ import org.apache.pinot.spi.utils.JsonUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + /** * This transformer evolves from {@link SchemaConformingTransformer} and is designed to support extra cases for * better text searching: * - Support over-lapping schema fields, in which case it could support schema column "a" and "a.b" at the same time. * And it only allows primitive type fields to be the value. * - Extract flattened key-value pairs as mergedTextIndex for better text searching. 
- * - Add shingle index tokenization functionality for extremely large text fields. * <p> * For example, consider this record: * <pre> @@ -129,8 +130,8 @@ import org.slf4j.LoggerFactory; public class SchemaConformingTransformerV2 implements RecordTransformer { private static final Logger _logger = LoggerFactory.getLogger(SchemaConformingTransformerV2.class); private static final int MAXIMUM_LUCENE_DOCUMENT_SIZE = 32766; - private static final String MIN_DOCUMENT_LENGTH_DESCRIPTION = - "key length + `:` + shingle index overlap length + one non-overlap char"; + private static final List<String> MERGED_TEXT_INDEX_SUFFIX_TO_EXCLUDE = Arrays.asList("_logtype", "_dictionaryVars", + "_encodedVars"); private final boolean _continueOnError; private final SchemaConformingTransformerV2Config _transformerConfig; @@ -143,6 +144,7 @@ public class SchemaConformingTransformerV2 implements RecordTransformer { @Nullable private PinotMeter _realtimeMergedTextIndexTruncatedDocumentSizeMeter = null; private String _tableName; + private int _jsonKeyValueSeparatorByteCount; private long _mergedTextIndexDocumentBytesCount = 0L; private long _mergedTextIndexDocumentCount = 0L; @@ -171,6 +173,8 @@ public class SchemaConformingTransformerV2 implements RecordTransformer { _tableName = tableConfig.getTableName(); _schemaTree = validateSchemaAndCreateTree(schema, _transformerConfig); _serverMetrics = ServerMetrics.get(); + _jsonKeyValueSeparatorByteCount = _transformerConfig.getJsonKeyValueSeparator() + .getBytes(java.nio.charset.StandardCharsets.UTF_8).length; } /** @@ -189,6 +193,20 @@ public class SchemaConformingTransformerV2 implements RecordTransformer { SchemaConformingTransformer.getAndValidateExtrasFieldType(schema, indexableExtrasFieldName); } + Map<String, String> columnNameToJsonKeyPathMap = transformerConfig.getColumnNameToJsonKeyPathMap(); + for (Map.Entry<String, String> entry : columnNameToJsonKeyPathMap.entrySet()) { + String columnName = entry.getKey(); + FieldSpec fieldSpec = 
schema.getFieldSpecFor(entry.getKey()); + Preconditions.checkState(null != fieldSpec, "Field '%s' doesn't exist in schema", columnName); + } + Set<String> preserveFieldNames = transformerConfig.getFieldPathsToPreserveInput(); + for (String preserveFieldName : preserveFieldNames) { + Preconditions.checkState( + columnNameToJsonKeyPathMap.containsValue(preserveFieldName) + || schema.getFieldSpecFor(preserveFieldName) != null, + "Preserved path '%s' doesn't exist in columnNameToJsonKeyPathMap or schema", preserveFieldName); + } + validateSchemaAndCreateTree(schema, transformerConfig); } @@ -264,7 +282,7 @@ public class SchemaConformingTransformerV2 implements RecordTransformer { currentNode = childNode; } } - currentNode.setColumn(jsonKeyPathToColumnNameMap.get(field)); + currentNode.setColumn(jsonKeyPathToColumnNameMap.get(field), schema); } return rootNode; @@ -303,9 +321,9 @@ public class SchemaConformingTransformerV2 implements RecordTransformer { if (null != _mergedTextIndexFieldSpec && !mergedTextIndexMap.isEmpty()) { List<String> luceneDocuments = getLuceneDocumentsFromMergedTextIndexMap(mergedTextIndexMap); if (_mergedTextIndexFieldSpec.isSingleValueField()) { - outputRecord.putValue(_transformerConfig.getMergedTextIndexField(), String.join(" ", luceneDocuments)); + outputRecord.putValue(_mergedTextIndexFieldSpec.getName(), String.join(" ", luceneDocuments)); } else { - outputRecord.putValue(_transformerConfig.getMergedTextIndexField(), luceneDocuments); + outputRecord.putValue(_mergedTextIndexFieldSpec.getName(), luceneDocuments); } } } catch (Exception e) { @@ -382,23 +400,33 @@ public class SchemaConformingTransformerV2 implements RecordTransformer { String keyJsonPath = String.join(".", jsonPath); + Set<String> fieldPathsToDrop = _transformerConfig.getFieldPathsToDrop(); + if (null != fieldPathsToDrop && fieldPathsToDrop.contains(keyJsonPath)) { + return extraFieldsContainer; + } + + SchemaTreeNode currentNode = + parentNode == null ? 
null : parentNode.getChild(key, _transformerConfig.isUseAnonymousDotInFieldNames()); if (_transformerConfig.getFieldPathsToPreserveInput().contains(keyJsonPath) || _transformerConfig.getFieldPathsToPreserveInputWithIndex().contains(keyJsonPath)) { - outputRecord.putValue(keyJsonPath, value); + if (currentNode != null) { + outputRecord.putValue(currentNode.getColumnName(), currentNode.getValue(value)); + } else { + outputRecord.putValue(keyJsonPath, value); + } if (_transformerConfig.getFieldPathsToPreserveInputWithIndex().contains(keyJsonPath)) { flattenAndAddToMergedTextIndexMap(mergedTextIndexMap, keyJsonPath, value); } return extraFieldsContainer; } + String unindexableFieldSuffix = _transformerConfig.getUnindexableFieldSuffix(); + isIndexable = isIndexable && (null == unindexableFieldSuffix || !key.endsWith(unindexableFieldSuffix)); - Set<String> fieldPathsToDrop = _transformerConfig.getFieldPathsToDrop(); - if (null != fieldPathsToDrop && fieldPathsToDrop.contains(keyJsonPath)) { + // return in advance to truncate the subtree if nothing left to be added + if (currentNode == null && !storeIndexableExtras && !storeUnindexableExtras) { return extraFieldsContainer; } - SchemaTreeNode currentNode = parentNode == null ? null : parentNode.getChild(key); - String unindexableFieldSuffix = _transformerConfig.getUnindexableFieldSuffix(); - isIndexable = isIndexable && (null == unindexableFieldSuffix || !key.endsWith(unindexableFieldSuffix)); if (value == null) { return extraFieldsContainer; } @@ -413,12 +441,14 @@ public class SchemaConformingTransformerV2 implements RecordTransformer { if (_transformerConfig.getFieldsToDoubleIngest().contains(keyJsonPath)) { extraFieldsContainer.addIndexableEntry(key, value); } - mergedTextIndexMap.put(keyJsonPath, value); + mergedTextIndexMap.put(currentNode.getColumnName(), value); } else { // The field is not mapped to one of the dedicated columns in the Pinot table schema. 
Thus it will be put // into the extraField column of the table. if (storeIndexableExtras) { - extraFieldsContainer.addIndexableEntry(key, value); + if (!_transformerConfig.getFieldPathsToSkipStorage().contains(keyJsonPath)) { + extraFieldsContainer.addIndexableEntry(key, value); + } mergedTextIndexMap.put(keyJsonPath, value); } } @@ -439,7 +469,7 @@ public class SchemaConformingTransformerV2 implements RecordTransformer { /** * Generate a Lucene document based on the provided key-value pair. - * The index document follows this format: "val:key". + * The index document follows this format: "val" + jsonKeyValueSeparator + "key". * @param kv used to generate text index documents * @param indexDocuments a list to store the generated index documents * @param mergedTextIndexDocumentMaxLength which we enforce via truncation during document generation @@ -475,129 +505,30 @@ public class SchemaConformingTransformerV2 implements RecordTransformer { private void addLuceneDoc(List<String> indexDocuments, Integer mergedTextIndexDocumentMaxLength, String key, String val) { - // TODO: theoretically, the key length + 1 could cause integer overflow. But in reality, upstream message size - // limit usually could not reach that high. We should revisit this if we see any issue. 
- if (key.length() + 1 > MAXIMUM_LUCENE_DOCUMENT_SIZE) { + if (key.length() + _jsonKeyValueSeparatorByteCount > MAXIMUM_LUCENE_DOCUMENT_SIZE) { _logger.error("The provided key's length is too long, text index document cannot be truncated"); return; } // Truncate the value to ensure the generated index document is less or equal to mergedTextIndexDocumentMaxLength - // The value length should be the mergedTextIndexDocumentMaxLength minus ":" character (length 1) minus key length - int valueTruncationLength = mergedTextIndexDocumentMaxLength - 1 - key.length(); + // The value length should be the mergedTextIndexDocumentMaxLength minus key length, and then minus the byte length + // of ":" or the specified Json key value separator character + int valueTruncationLength = mergedTextIndexDocumentMaxLength - _jsonKeyValueSeparatorByteCount - key.length(); if (val.length() > valueTruncationLength) { _realtimeMergedTextIndexTruncatedDocumentSizeMeter = _serverMetrics .addMeteredTableValue(_tableName, ServerMeter.REALTIME_MERGED_TEXT_IDX_TRUNCATED_DOCUMENT_SIZE, - key.length() + 1 + val.length(), _realtimeMergedTextIndexTruncatedDocumentSizeMeter); + key.length() + _jsonKeyValueSeparatorByteCount + val.length(), + _realtimeMergedTextIndexTruncatedDocumentSizeMeter); val = val.substring(0, valueTruncationLength); } - _mergedTextIndexDocumentBytesCount += key.length() + 1 + val.length(); + _mergedTextIndexDocumentBytesCount += key.length() + _jsonKeyValueSeparatorByteCount + val.length(); _mergedTextIndexDocumentCount += 1; _serverMetrics.setValueOfTableGauge(_tableName, ServerGauge.REALTIME_MERGED_TEXT_IDX_DOCUMENT_AVG_LEN, _mergedTextIndexDocumentBytesCount / _mergedTextIndexDocumentCount); - indexDocuments.add(val + ":" + key); - } - - /** - * Implement shingling for the merged text index based on the provided key-value pair. - * Each shingled index document retains the format of a standard index document: "val:key". 
However, "val" now - * denotes a sliding window of characters on the value. The total length of each shingled index document - * (key length + shingled value length + 1)must be less than or equal to shingleIndexMaxLength. The starting index - * of the sliding window for the value is increased by shinglingOverlapLength for every new shingled document. - * All shingle index documents, except for the last one, should have the maximum possible length. If the minimum - * document length (shingling overlap length + key length + 1) exceeds the maximum Lucene document size - * (MAXIMUM_LUCENE_DOCUMENT_SIZE), shingling is disabled, and the value is truncated to match the maximum Lucene - * document size. If shingleIndexMaxLength is lower than the required minimum document length and also lower than - * the maximum - * Lucene document size, shingleIndexMaxLength is adjusted to match the maximum Lucene document size. - * - * Note that the most important parameter, the shingleIndexOverlapLength, is the maximum search length that will yield - * results with 100% accuracy. - * - * Example: key-> "key", value-> "0123456789ABCDEF", max length: 10, shingling overlap length: 3 - * Generated documents: - * 012345:key - * 345678:key - * 6789AB:key - * 9ABCDE:key - * CDEF:key - * Any query with a length of 7 will yield no results, such as "0123456" or "6789ABC". - * Any query with a length of 3 will yield results with 100% accuracy (i.e. is always guaranteed to be searchable). - * Any query with a length between 4 and 6 (inclusive) has indeterminate accuracy. - * E.g. for queries with length 5, "12345", "789AB" will hit, while "23456" will miss. - * - * @param kv used to generate shingle text index documents - * @param shingleIndexDocuments a list to store the generated shingle index documents - * @param shingleIndexMaxLength the maximum length of each shingle index document. 
Needs to be greater than the - * length of the key and shingleIndexOverlapLength + 1, and must be lower or equal - * to MAXIMUM_LUCENE_DOCUMENT_SIZE. - * @param shingleIndexOverlapLength the number of characters in the kv-pair's value shared by two adjacent shingle - * index documents. If null, the overlap length will be defaulted to half of the max - * document length. - */ - public void generateShingleTextIndexDocument(Map.Entry<String, Object> kv, List<String> shingleIndexDocuments, - int shingleIndexMaxLength, int shingleIndexOverlapLength) { - String key = kv.getKey(); - String val; - // To avoid redundant leading and tailing '"', only convert to JSON string if the value is a list or an array - if (kv.getValue() instanceof Collection || kv.getValue() instanceof Object[]) { - try { - val = JsonUtils.objectToString(kv.getValue()); - } catch (JsonProcessingException e) { - val = kv.getValue().toString(); - } - } else { - val = kv.getValue().toString(); - } - final int valLength = val.length(); - final int documentSuffixLength = key.length() + 1; - final int minDocumentLength = documentSuffixLength + shingleIndexOverlapLength + 1; - - if (shingleIndexOverlapLength >= valLength) { - if (_logger.isDebugEnabled()) { - _logger.warn( - "The shingleIndexOverlapLength {} is longer than the value length {}. Shingling will not be applied since " - + "only one document will be generated.", shingleIndexOverlapLength, valLength); - } - generateTextIndexLuceneDocument(kv, shingleIndexDocuments, shingleIndexMaxLength); - return; - } - - if (minDocumentLength > MAXIMUM_LUCENE_DOCUMENT_SIZE) { - _logger.debug("The minimum document length {} (" + MIN_DOCUMENT_LENGTH_DESCRIPTION - + ") exceeds the limit of maximum Lucene document size " + MAXIMUM_LUCENE_DOCUMENT_SIZE - + ". 
Value will be truncated and shingling will not be applied.", minDocumentLength); - generateTextIndexLuceneDocument(kv, shingleIndexDocuments, shingleIndexMaxLength); - return; - } - - // This logging becomes expensive if user accidentally sets a very low shingleIndexMaxLength - if (shingleIndexMaxLength < minDocumentLength) { - _logger.debug("The shingleIndexMaxLength {} is smaller than the minimum document length {} (" - + MIN_DOCUMENT_LENGTH_DESCRIPTION + "). Increasing the shingleIndexMaxLength to maximum Lucene document size " - + MAXIMUM_LUCENE_DOCUMENT_SIZE + ".", shingleIndexMaxLength, minDocumentLength); - shingleIndexMaxLength = MAXIMUM_LUCENE_DOCUMENT_SIZE; - } - - // Shingle window slide length is the index position on the value which we shall advance on every iteration. - // We ensure shingleIndexMaxLength >= minDocumentLength so that shingleWindowSlideLength >= 1. - int shingleWindowSlideLength = shingleIndexMaxLength - shingleIndexOverlapLength - documentSuffixLength; - - // Generate shingle index documents - // When starting_idx + shingleIndexOverlapLength >= valLength, there are no new characters to capture, then we stop - // the shingle document generation loop. - // We ensure that shingleIndexOverlapLength < valLength so that this loop will be entered at lease once. 
- for (int i = 0; i + shingleIndexOverlapLength < valLength; i += shingleWindowSlideLength) { - String documentValStr = val.substring(i, Math.min(i + shingleIndexMaxLength - documentSuffixLength, valLength)); - String shingleIndexDocument = documentValStr + ":" + key; - shingleIndexDocuments.add(shingleIndexDocument); - _mergedTextIndexDocumentBytesCount += shingleIndexDocument.length(); - ++_mergedTextIndexDocumentCount; - } - _serverMetrics.setValueOfTableGauge(_tableName, ServerGauge.REALTIME_MERGED_TEXT_IDX_DOCUMENT_AVG_LEN, - _mergedTextIndexDocumentBytesCount / _mergedTextIndexDocumentCount); + addKeyValueToDocuments(indexDocuments, key, val, _transformerConfig.isReverseTextIndexKeyValueOrder(), + _transformerConfig.isOptimizeCaseInsensitiveSearch()); } private void flattenAndAddToMergedTextIndexMap(Map<String, Object> mergedTextIndexMap, String key, Object value) { @@ -643,23 +574,42 @@ public class SchemaConformingTransformerV2 implements RecordTransformer { private List<String> getLuceneDocumentsFromMergedTextIndexMap(Map<String, Object> mergedTextIndexMap) { final Integer mergedTextIndexDocumentMaxLength = _transformerConfig.getMergedTextIndexDocumentMaxLength(); final @Nullable - Integer mergedTextIndexShinglingOverlapLength = _transformerConfig.getMergedTextIndexShinglingOverlapLength(); List<String> luceneDocuments = new ArrayList<>(); mergedTextIndexMap.entrySet().stream().filter(kv -> null != kv.getKey() && null != kv.getValue()) .filter(kv -> !_transformerConfig.getMergedTextIndexPathToExclude().contains(kv.getKey())).filter( kv -> !base64ValueFilter(kv.getValue().toString().getBytes(), _transformerConfig.getMergedTextIndexBinaryDocumentDetectionMinLength())).filter( - kv -> _transformerConfig.getMergedTextIndexSuffixToExclude().stream() - .anyMatch(suffix -> !kv.getKey().endsWith(suffix))).forEach(kv -> { - if (null == mergedTextIndexShinglingOverlapLength) { - generateTextIndexLuceneDocument(kv, luceneDocuments, mergedTextIndexDocumentMaxLength); 
- } else { - generateShingleTextIndexDocument(kv, luceneDocuments, mergedTextIndexDocumentMaxLength, - mergedTextIndexShinglingOverlapLength); - } + kv -> !MERGED_TEXT_INDEX_SUFFIX_TO_EXCLUDE.stream() + .anyMatch(suffix -> kv.getKey().endsWith(suffix))).forEach(kv -> { + generateTextIndexLuceneDocument(kv, luceneDocuments, mergedTextIndexDocumentMaxLength); }); return luceneDocuments; } + + private void addKeyValueToDocuments(List<String> documents, String key, String value, boolean addInReverseOrder, + boolean addCaseInsensitiveVersion) { + addKeyValueToDocumentWithOrder(documents, key, value, addInReverseOrder); + + // To optimize the case insensitive search, add the lower case version if applicable + // Note that we only check the value as Key is always case-sensitive search + if (addCaseInsensitiveVersion && value.chars().anyMatch(Character::isUpperCase)) { + addKeyValueToDocumentWithOrder(documents, key, value.toLowerCase(Locale.ENGLISH), addInReverseOrder); + } + } + + private void addKeyValueToDocumentWithOrder(List<String> documents, String key, String value, + boolean addInReverseOrder) { + // Not doing refactor here to avoid allocating new intermediate string + if (addInReverseOrder) { + documents.add(_transformerConfig.getMergedTextIndexBeginOfDocAnchor() + value + + _transformerConfig.getJsonKeyValueSeparator() + key + + _transformerConfig.getMergedTextIndexEndOfDocAnchor()); + } else { + documents.add(_transformerConfig.getMergedTextIndexBeginOfDocAnchor() + key + + _transformerConfig.getJsonKeyValueSeparator() + value + + _transformerConfig.getMergedTextIndexEndOfDocAnchor()); + } + } } /** @@ -677,16 +627,16 @@ public class SchemaConformingTransformerV2 implements RecordTransformer { */ class SchemaTreeNode { private boolean _isColumn; - private Map<String, SchemaTreeNode> _children; + private final Map<String, SchemaTreeNode> _children; // Taking the example of key "x.y.z", the keyName will be "z" and the parentPath will be "x.y" // Root node would 
have keyName as "" and parentPath as null // Root node's children will have keyName as the first level key and parentPath as "" @Nonnull - private String _keyName; + private final String _keyName; @Nullable private String _columnName; @Nullable - private String _parentPath; + private final String _parentPath; private FieldSpec _fieldSpec; public SchemaTreeNode(String keyName, String parentPath, Schema schema) { @@ -700,11 +650,12 @@ class SchemaTreeNode { return _isColumn; } - public void setColumn(String columnName) { + public void setColumn(String columnName, Schema schema) { if (columnName == null) { _columnName = getJsonKeyPath(); } else { _columnName = columnName; + _fieldSpec = schema.getFieldSpecFor(columnName); } _isColumn = true; } @@ -728,10 +679,26 @@ class SchemaTreeNode { return child; } - public SchemaTreeNode getChild(String key) { + private SchemaTreeNode getChild(String key) { return _children.get(key); } + public SchemaTreeNode getChild(String key, boolean useAnonymousDot) { + if (useAnonymousDot && key.contains(".")) { + SchemaTreeNode node = this; + for (String subKey : key.split("\\.")) { + if (node != null) { + node = node.getChild(subKey); + } else { + return null; + } + } + return node; + } else { + return getChild(key); + } + } + public String getKeyName() { return _keyName; } @@ -751,6 +718,9 @@ class SchemaTreeNode { if (value instanceof Object[]) { return JsonUtils.objectToString(Arrays.asList((Object[]) value)); } + if (value instanceof Map) { + return JsonUtils.objectToString(value); + } } catch (JsonProcessingException e) { return value.toString(); } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2Test.java index d004f703f6..45c021977a 100644 --- 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2Test.java @@ -28,12 +28,10 @@ import com.fasterxml.jackson.databind.node.NumericNode; import com.fasterxml.jackson.databind.node.ObjectNode; import com.fasterxml.jackson.databind.node.TextNode; import java.io.IOException; -import java.util.AbstractMap; -import java.util.ArrayList; -import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; import javax.annotation.Nonnull; @@ -50,7 +48,6 @@ import org.testng.Assert; import org.testng.annotations.Test; import static org.mockito.Mockito.mock; -import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; import static org.testng.AssertJUnit.fail; @@ -63,9 +60,12 @@ public class SchemaConformingTransformerV2Test { private static final String MERGED_TEXT_INDEX_FIELD_NAME = "__mergedTextIndex"; private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final JsonNodeFactory N = OBJECT_MAPPER.getNodeFactory(); + private static final String TEST_JSON_MESSAGE_NAME = "message"; + private static final String TEST_JSON_MESSAGE_LOGTYPE_NAME = "message_logtype"; private static final String TEST_JSON_ARRAY_FIELD_NAME = "arrayField"; private static final String TEST_JSON_NULL_FIELD_NAME = "nullField"; private static final String TEST_JSON_STRING_FIELD_NAME = "stringField"; + private static final String TEST_JSON_DOT_FIELD_NAME = "dotField.dotSuffix"; private static final String TEST_JSON_MAP_FIELD_NAME = "mapField"; private static final String TEST_JSON_MAP_EXTRA_FIELD_NAME = "mapFieldExtra"; private static final String TEST_JSON_MAP_NO_IDX_FIELD_NAME = "mapField_noIndex"; @@ -75,6 +75,7 
@@ public class SchemaConformingTransformerV2Test { private static final ArrayNode TEST_JSON_ARRAY_NODE = N.arrayNode().add(0).add(1).add(2).add(3); private static final NullNode TEST_JSON_NULL_NODE = N.nullNode(); private static final TextNode TEST_JSON_STRING_NODE = N.textNode("a"); + private static final TextNode TEST_JSON_STRING_NODE_WITH_UPEERCASE = N.textNode("aA_123"); private static final NumericNode TEST_INT_NODE = N.numberNode(9); private static final TextNode TEST_JSON_STRING_NO_IDX_NODE = N.textNode("z"); private static final CustomObjectNode TEST_JSON_MAP_NODE = @@ -91,6 +92,9 @@ public class SchemaConformingTransformerV2Test { CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME, TEST_JSON_ARRAY_NODE) .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE).set(TEST_JSON_INT_NO_IDX_FIELD_NAME, TEST_INT_NODE) .set(TEST_JSON_STRING_NO_IDX_FIELD_NAME, TEST_JSON_STRING_NO_IDX_NODE); + private static final String JSON_KEY_VALUE_SEPARATOR = "\u001e"; + private static final String MERGED_TEXT_INDEX_BOD_ANCHOR = "\u0002"; + private static final String MERGED_TEXT_INDEX_EOD_ANCHOR = "\u0003"; static { ServerMetrics.register(mock(ServerMetrics.class)); @@ -103,7 +107,8 @@ public class SchemaConformingTransformerV2Test { IngestionConfig ingestionConfig = new IngestionConfig(); SchemaConformingTransformerV2Config schemaConformingTransformerV2Config = new SchemaConformingTransformerV2Config(true, INDEXABLE_EXTRAS_FIELD_NAME, true, UNINDEXABLE_EXTRAS_FIELD_NAME, - UNINDEXABLE_FIELD_SUFFIX, null, null, null, null, null, null, null, null, null); + UNINDEXABLE_FIELD_SUFFIX, null, null, null, null, null, null, false, null, null, null, null, null, null, + null, null, null, null); ingestionConfig.setSchemaConformingTransformerV2Config(schemaConformingTransformerV2Config); return new TableConfigBuilder(TableType.OFFLINE).setTableName("testTable").setIngestionConfig(ingestionConfig) .build(); @@ -111,12 +116,17 @@ public class SchemaConformingTransformerV2Test { 
private static TableConfig createDefaultTableConfig(String indexableExtrasField, String unindexableExtrasField, String unindexableFieldSuffix, Set<String> fieldPathsToDrop, Set<String> fieldPathsToPreserve, - Set<String> fieldPathToPreserverWithIndex, String mergedTextIndexField) { + Set<String> fieldPathsToPreserveWithIndex, Map<String, String> columnNameToJsonKeyPathMap, + String mergedTextIndexField, boolean useAnonymousDotInFieldNames, boolean optimizeCaseInsensitiveSearch, + Boolean reverseTextIndexKeyValueOrder) { IngestionConfig ingestionConfig = new IngestionConfig(); SchemaConformingTransformerV2Config schemaConformingTransformerV2Config = new SchemaConformingTransformerV2Config(indexableExtrasField != null, indexableExtrasField, unindexableExtrasField != null, unindexableExtrasField, unindexableFieldSuffix, fieldPathsToDrop, - fieldPathsToPreserve, fieldPathToPreserverWithIndex, mergedTextIndexField, null, null, null, null, null); + fieldPathsToPreserve, fieldPathsToPreserveWithIndex, null, columnNameToJsonKeyPathMap, + mergedTextIndexField, useAnonymousDotInFieldNames, optimizeCaseInsensitiveSearch, + reverseTextIndexKeyValueOrder, null, null, null, + null, null, JSON_KEY_VALUE_SEPARATOR, MERGED_TEXT_INDEX_BOD_ANCHOR, MERGED_TEXT_INDEX_EOD_ANCHOR); ingestionConfig.setSchemaConformingTransformerV2Config(schemaConformingTransformerV2Config); return new TableConfigBuilder(TableType.OFFLINE).setTableName("testTable").setIngestionConfig(ingestionConfig) .build(); @@ -137,6 +147,7 @@ public class SchemaConformingTransformerV2Test { { "arrayField" : [ 0, 1, 2, 3 ], "stringField" : "a", + "dotField.dotSuffix" : "a", "mapField" : { "arrayField" : [ 0, 1, 2, 3 ], "stringField" : "a" @@ -153,6 +164,7 @@ public class SchemaConformingTransformerV2Test { */ final CustomObjectNode inputJsonNode = CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE).set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE) + .set(TEST_JSON_DOT_FIELD_NAME, TEST_JSON_STRING_NODE) 
.set(TEST_JSON_NESTED_MAP_FIELD_NAME, CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE).set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE)); @@ -165,6 +177,7 @@ public class SchemaConformingTransformerV2Test { "json_data" : { "arrayField" : [ 0, 1, 2, 3 ], "stringField" : "a", + "dotField.dotSuffix" : "a", "mapField" : { "arrayField" : [ 0, 1, 2, 3 ], "stringField" : "a" @@ -184,19 +197,22 @@ public class SchemaConformingTransformerV2Test { // The input json node stripped of null fields. final CustomObjectNode inputJsonNodeWithoutNullFields = CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD) + .set(TEST_JSON_DOT_FIELD_NAME, TEST_JSON_STRING_NODE) .set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD).set(TEST_JSON_NESTED_MAP_FIELD_NAME, - CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD) - .set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD)); + CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD) + .set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD)); expectedJsonNode = CustomObjectNode.create().set(INDEXABLE_EXTRAS_FIELD_NAME, inputJsonNodeWithoutNullFields); - transformWithIndexableFields(schema, inputJsonNode, expectedJsonNode); + transformWithIndexableFields(schema, inputJsonNode, expectedJsonNode, true); - // Three dedicated columns in schema, only two are populated, one ignored + // Four dedicated columns in schema, only two are populated, two ignored /* { "arrayField":[0, 1, 2, 3], "nestedFields.stringField":"a", "<indexableExtras>":{ + "dotField.dotSuffix" : "a", // it is not loaded to dedicated column because we do not enable anonymous dot in + field names "mapField": { "arrayField":[0, 1, 2, 3], "stringField":"a" @@ -214,6 +230,7 @@ public class SchemaConformingTransformerV2Test { */ schema = createDefaultSchemaBuilder().addMultiValueDimension(TEST_JSON_ARRAY_FIELD_NAME, DataType.INT) .addSingleValueDimension(TEST_JSON_MAP_FIELD_NAME, 
DataType.STRING) + .addSingleValueDimension(TEST_JSON_DOT_FIELD_NAME, DataType.STRING) .addSingleValueDimension(TEST_JSON_NESTED_MAP_FIELD_NAME + "." + TEST_JSON_STRING_FIELD_NAME, DataType.STRING) .build(); expectedJsonNode = CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME, TEST_JSON_ARRAY_NODE) @@ -221,16 +238,18 @@ public class SchemaConformingTransformerV2Test { .set(INDEXABLE_EXTRAS_FIELD_NAME, CustomObjectNode.create().set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD) .setAll(TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD.deepCopy().removeAndReturn(TEST_JSON_ARRAY_FIELD_NAME)) + .set(TEST_JSON_DOT_FIELD_NAME, TEST_JSON_STRING_NODE) .set(TEST_JSON_NESTED_MAP_FIELD_NAME, CustomObjectNode.create().setAll( - TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD.deepCopy().removeAndReturn(TEST_JSON_STRING_FIELD_NAME)) + TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD.deepCopy().removeAndReturn(TEST_JSON_STRING_FIELD_NAME)) .set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD))); - transformWithIndexableFields(schema, inputJsonNode, expectedJsonNode); + transformWithIndexableFields(schema, inputJsonNode, expectedJsonNode, false); // 8 dedicated columns, only 6 are populated /* { "arrayField" : [ 0, 1, 2, 3 ], "stringField" : "a", + "dotField.dotSuffix" : "a", "nestedField.arrayField" : [ 0, 1, 2, 3 ], "nestedField.stringField" : "a", "json_data" : { @@ -250,6 +269,7 @@ public class SchemaConformingTransformerV2Test { schema = createDefaultSchemaBuilder().addMultiValueDimension(TEST_JSON_ARRAY_FIELD_NAME, DataType.INT) .addSingleValueDimension(TEST_JSON_NULL_FIELD_NAME, DataType.STRING) .addSingleValueDimension(TEST_JSON_STRING_FIELD_NAME, DataType.STRING) + .addSingleValueDimension(TEST_JSON_DOT_FIELD_NAME, DataType.STRING) .addSingleValueDimension(TEST_JSON_MAP_FIELD_NAME, DataType.JSON) .addMultiValueDimension(TEST_JSON_NESTED_MAP_FIELD_NAME + "." + TEST_JSON_ARRAY_FIELD_NAME, DataType.INT) .addSingleValueDimension(TEST_JSON_NESTED_MAP_FIELD_NAME + "." 
+ TEST_JSON_NULL_FIELD_NAME, DataType.STRING) @@ -259,11 +279,12 @@ public class SchemaConformingTransformerV2Test { expectedJsonNode = CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD) .set(TEST_JSON_NESTED_MAP_FIELD_NAME + "." + TEST_JSON_ARRAY_FIELD_NAME, TEST_JSON_ARRAY_NODE) .set(TEST_JSON_NESTED_MAP_FIELD_NAME + "." + TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE) + .set(TEST_JSON_DOT_FIELD_NAME, TEST_JSON_STRING_NODE) .set(INDEXABLE_EXTRAS_FIELD_NAME, CustomObjectNode.create().set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD) .set(TEST_JSON_NESTED_MAP_FIELD_NAME, CustomObjectNode.create().set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD))); - transformWithIndexableFields(schema, inputJsonNode, expectedJsonNode); + transformWithIndexableFields(schema, inputJsonNode, expectedJsonNode, true); } @Test @@ -274,6 +295,7 @@ public class SchemaConformingTransformerV2Test { "stringField":"a", "intField_noIndex":9, "string_noIndex":"z", + "message": "a", "mapField":{ "arrayField":[0, 1, 2, 3], "stringField":"a", @@ -300,18 +322,20 @@ public class SchemaConformingTransformerV2Test { */ final CustomObjectNode inputJsonNode = CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE).set(TEST_JSON_ARRAY_FIELD_NAME, TEST_JSON_ARRAY_NODE) - .set(TEST_JSON_NULL_FIELD_NAME, TEST_JSON_NULL_NODE).set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE) + .set(TEST_JSON_NULL_FIELD_NAME, TEST_JSON_NULL_NODE) + .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE_WITH_UPEERCASE) .set(TEST_JSON_INT_NO_IDX_FIELD_NAME, TEST_INT_NODE) .set(TEST_JSON_STRING_NO_IDX_FIELD_NAME, TEST_JSON_STRING_NO_IDX_NODE) + .set(TEST_JSON_MESSAGE_NAME, TEST_JSON_STRING_NODE) .set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITH_NO_IDX) .set(TEST_JSON_MAP_NO_IDX_FIELD_NAME, TEST_JSON_MAP_NODE).set(TEST_JSON_NESTED_MAP_FIELD_NAME, - CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE).set(TEST_JSON_ARRAY_FIELD_NAME, - TEST_JSON_ARRAY_NODE) - 
.set(TEST_JSON_NULL_FIELD_NAME, TEST_JSON_NULL_NODE) - .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE) - .set(TEST_JSON_INT_NO_IDX_FIELD_NAME, TEST_INT_NODE) - .set(TEST_JSON_STRING_NO_IDX_FIELD_NAME, TEST_JSON_STRING_NO_IDX_NODE) - .set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITH_NO_IDX)); + CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE).set(TEST_JSON_ARRAY_FIELD_NAME, + TEST_JSON_ARRAY_NODE) + .set(TEST_JSON_NULL_FIELD_NAME, TEST_JSON_NULL_NODE) + .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE) + .set(TEST_JSON_INT_NO_IDX_FIELD_NAME, TEST_INT_NODE) + .set(TEST_JSON_STRING_NO_IDX_FIELD_NAME, TEST_JSON_STRING_NO_IDX_NODE) + .set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITH_NO_IDX)); CustomObjectNode expectedJsonNode; CustomObjectNode expectedJsonNodeWithMergedTextIndex; @@ -324,6 +348,7 @@ public class SchemaConformingTransformerV2Test { "indexableExtras":{ "arrayField":[0, 1, 2, 3], "stringField":"a", + "stringField":"aA_123", "mapField":{ "arrayField":[0, 1, 2, 3], "stringField":"a" @@ -358,18 +383,18 @@ public class SchemaConformingTransformerV2Test { } }, __mergedTextIndex: [ - // See the value of expectedJsonNodeWithMergedTextIndex + see the value of expectedJsonNodeWithMergedTextIndex ] } */ expectedJsonNode = CustomObjectNode.create().set(INDEXABLE_EXTRAS_FIELD_NAME, + CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME, TEST_JSON_ARRAY_NODE) + .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE_WITH_UPEERCASE) + .set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD).set(TEST_JSON_NESTED_MAP_FIELD_NAME, CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME, TEST_JSON_ARRAY_NODE) .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE) - .set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD) - .set(TEST_JSON_NESTED_MAP_FIELD_NAME, - CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME, TEST_JSON_ARRAY_NODE) - .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE) - 
.set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD))) + .set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD))) + .set(UNINDEXABLE_EXTRAS_FIELD_NAME, CustomObjectNode.create().set(TEST_JSON_INT_NO_IDX_FIELD_NAME, TEST_INT_NODE) .set(TEST_JSON_STRING_NO_IDX_FIELD_NAME, TEST_JSON_STRING_NO_IDX_NODE) @@ -381,16 +406,55 @@ public class SchemaConformingTransformerV2Test { .set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NO_IDX_NODE))); transformWithUnIndexableFieldsAndMergedTextIndex(schemaBuilder.build(), inputJsonNode, expectedJsonNode); - expectedJsonNodeWithMergedTextIndex = expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME, - N.arrayNode().add("[0,1,2,3]:arrayField").add("0:arrayField").add("1:arrayField").add("2:arrayField") - .add("3:arrayField").add("a:stringField").add("[0,1,2,3]:mapField.arrayField").add("0:mapField.arrayField") - .add("1:mapField.arrayField").add("2:mapField.arrayField").add("3:mapField.arrayField") - .add("a:mapField.stringField").add("[0,1,2,3]:nestedFields.arrayField").add("0:nestedFields.arrayField") - .add("1:nestedFields.arrayField").add("2:nestedFields.arrayField").add("3:nestedFields.arrayField") - .add("a:nestedFields.stringField").add("[0,1,2,3]:nestedFields.mapField.arrayField") - .add("0:nestedFields.mapField.arrayField").add("1:nestedFields.mapField.arrayField") - .add("2:nestedFields.mapField.arrayField").add("3:nestedFields.mapField.arrayField") - .add("a:nestedFields.mapField.stringField")); + expectedJsonNodeWithMergedTextIndex = expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME, N.arrayNode() + .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" + JSON_KEY_VALUE_SEPARATOR + "arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR + "arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR + "arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add( + 
MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR + "arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR + "arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "aA_123" + JSON_KEY_VALUE_SEPARATOR + "stringField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" + JSON_KEY_VALUE_SEPARATOR + "mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR + "mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR + "mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR + "mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR + "mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR + "mapField.stringField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.stringField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" + JSON_KEY_VALUE_SEPARATOR + 
"nestedFields.mapField" + + ".arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.stringField" + + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR + "message" + MERGED_TEXT_INDEX_EOD_ANCHOR)); transformWithUnIndexableFieldsAndMergedTextIndex( schemaBuilder.addMultiValueDimension(MERGED_TEXT_INDEX_FIELD_NAME, DataType.STRING).build(), inputJsonNode, expectedJsonNodeWithMergedTextIndex); @@ -409,6 +473,7 @@ public class SchemaConformingTransformerV2Test { "mapField":{ "arrayField":[0, 1, 2, 3], "stringField":"a" + "stringField":"aA_123" }, "nestedFields":{ "arrayField":[0, 1, 2, 3], @@ -446,7 +511,7 @@ public class SchemaConformingTransformerV2Test { expectedJsonNode = CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME, TEST_JSON_ARRAY_NODE) .set(TEST_JSON_NESTED_MAP_FIELD_NAME + "." 
+ TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE) .set(INDEXABLE_EXTRAS_FIELD_NAME, - CustomObjectNode.create().set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE) + CustomObjectNode.create().set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE_WITH_UPEERCASE) .set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD) .set(TEST_JSON_NESTED_MAP_FIELD_NAME, CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME, TEST_JSON_ARRAY_NODE) @@ -463,16 +528,55 @@ public class SchemaConformingTransformerV2Test { .set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NO_IDX_NODE))); transformWithUnIndexableFieldsAndMergedTextIndex(schemaBuilder.build(), inputJsonNode, expectedJsonNode); - expectedJsonNodeWithMergedTextIndex = expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME, - N.arrayNode().add("[0,1,2,3]:arrayField").add("0:arrayField").add("1:arrayField").add("2:arrayField") - .add("3:arrayField").add("a:stringField").add("[0,1,2,3]:mapField.arrayField").add("0:mapField.arrayField") - .add("1:mapField.arrayField").add("2:mapField.arrayField").add("3:mapField.arrayField") - .add("a:mapField.stringField").add("[0,1,2,3]:nestedFields.arrayField").add("0:nestedFields.arrayField") - .add("1:nestedFields.arrayField").add("2:nestedFields.arrayField").add("3:nestedFields.arrayField") - .add("a:nestedFields.stringField").add("[0,1,2,3]:nestedFields.mapField.arrayField") - .add("0:nestedFields.mapField.arrayField").add("1:nestedFields.mapField.arrayField") - .add("2:nestedFields.mapField.arrayField").add("3:nestedFields.mapField.arrayField") - .add("a:nestedFields.mapField.stringField")); + expectedJsonNodeWithMergedTextIndex = expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME, N.arrayNode() + .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" + JSON_KEY_VALUE_SEPARATOR + "arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR + "arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add( + 
MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR + "arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR + "arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR + "arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "aA_123" + JSON_KEY_VALUE_SEPARATOR + "stringField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" + JSON_KEY_VALUE_SEPARATOR + "mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR + "mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR + "mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR + "mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR + "mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR + "mapField.stringField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR + 
"nestedFields.stringField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.stringField" + + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR + "message" + MERGED_TEXT_INDEX_EOD_ANCHOR)); transformWithUnIndexableFieldsAndMergedTextIndex( schemaBuilder.addMultiValueDimension(MERGED_TEXT_INDEX_FIELD_NAME, DataType.STRING).build(), inputJsonNode, expectedJsonNodeWithMergedTextIndex); @@ -490,6 +594,7 @@ public class SchemaConformingTransformerV2Test { { "arrayField":[0, 1, 2, 3], "stringField":"a", + "stringField":"aA_123", "nestedFields.arrayField":[0, 1, 2, 3], "nestedFields.stringField":"a", "indexableExtras":{ @@ -530,7 +635,7 @@ public class SchemaConformingTransformerV2Test { } */ expectedJsonNode = CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME, TEST_JSON_ARRAY_NODE) - .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE) + .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE_WITH_UPEERCASE) .set(TEST_JSON_NESTED_MAP_FIELD_NAME + "." + TEST_JSON_ARRAY_FIELD_NAME, TEST_JSON_ARRAY_NODE) .set(TEST_JSON_NESTED_MAP_FIELD_NAME + "." 
+ TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE) .set(INDEXABLE_EXTRAS_FIELD_NAME, @@ -548,16 +653,55 @@ public class SchemaConformingTransformerV2Test { .set(TEST_JSON_STRING_NO_IDX_FIELD_NAME, TEST_JSON_STRING_NO_IDX_NODE) .set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NO_IDX_NODE))); transformWithUnIndexableFieldsAndMergedTextIndex(schemaBuilder.build(), inputJsonNode, expectedJsonNode); - expectedJsonNodeWithMergedTextIndex = expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME, - N.arrayNode().add("[0,1,2,3]:arrayField").add("0:arrayField").add("1:arrayField").add("2:arrayField") - .add("3:arrayField").add("a:stringField").add("[0,1,2,3]:mapField.arrayField").add("0:mapField.arrayField") - .add("1:mapField.arrayField").add("2:mapField.arrayField").add("3:mapField.arrayField") - .add("a:mapField.stringField").add("[0,1,2,3]:nestedFields.arrayField").add("0:nestedFields.arrayField") - .add("1:nestedFields.arrayField").add("2:nestedFields.arrayField").add("3:nestedFields.arrayField") - .add("a:nestedFields.stringField").add("[0,1,2,3]:nestedFields.mapField.arrayField") - .add("0:nestedFields.mapField.arrayField").add("1:nestedFields.mapField.arrayField") - .add("2:nestedFields.mapField.arrayField").add("3:nestedFields.mapField.arrayField") - .add("a:nestedFields.mapField.stringField")); + expectedJsonNodeWithMergedTextIndex = expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME, N.arrayNode() + .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" + JSON_KEY_VALUE_SEPARATOR + "arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR + "arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR + "arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR + "arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR + 
"arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "aA_123" + JSON_KEY_VALUE_SEPARATOR + "stringField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" + JSON_KEY_VALUE_SEPARATOR + "mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR + "mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR + "mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR + "mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR + "mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR + "mapField.stringField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.stringField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.arrayField" + + 
MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.arrayField" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.stringField" + + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR + "message" + MERGED_TEXT_INDEX_EOD_ANCHOR)); transformWithUnIndexableFieldsAndMergedTextIndex( schemaBuilder.addMultiValueDimension(MERGED_TEXT_INDEX_FIELD_NAME, DataType.STRING).build(), inputJsonNode, expectedJsonNodeWithMergedTextIndex); @@ -568,12 +712,14 @@ public class SchemaConformingTransformerV2Test { /* { "arrayField":[0, 1, 2, 3], + "message_logtype": "a", "stringField":"a", "intField_noIndex":9, "string_noIndex":"z", "mapField":{ "arrayField":[0, 1, 2, 3], "stringField":"a", + "stringField":"aA_123", "intField_noIndex":9, "string_noIndex":"z" }, @@ -590,6 +736,7 @@ public class SchemaConformingTransformerV2Test { "nestedFields":{ "arrayField":[0, 1, 2, 3], "stringField":"a", + "stringField":"aA_123", "intField_noIndex":9, "string_noIndex":"z", "mapField":{ @@ -603,38 +750,49 @@ public class SchemaConformingTransformerV2Test { */ final CustomObjectNode inputJsonNode = CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE).set(TEST_JSON_ARRAY_FIELD_NAME, TEST_JSON_ARRAY_NODE) - .set(TEST_JSON_NULL_FIELD_NAME, TEST_JSON_NULL_NODE).set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE) + .set(TEST_JSON_MESSAGE_NAME, TEST_JSON_STRING_NODE) + .set(TEST_JSON_MESSAGE_LOGTYPE_NAME, TEST_JSON_STRING_NODE) + .set(TEST_JSON_NULL_FIELD_NAME, TEST_JSON_NULL_NODE) + .set(TEST_JSON_STRING_FIELD_NAME, 
TEST_JSON_STRING_NODE_WITH_UPEERCASE) .set(TEST_JSON_INT_NO_IDX_FIELD_NAME, TEST_INT_NODE) .set(TEST_JSON_STRING_NO_IDX_FIELD_NAME, TEST_JSON_STRING_NO_IDX_NODE) .set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITH_NO_IDX) .set(TEST_JSON_MAP_EXTRA_FIELD_NAME, TEST_JSON_MAP_NODE_WITH_NO_IDX) .set(TEST_JSON_MAP_NO_IDX_FIELD_NAME, TEST_JSON_MAP_NODE).set(TEST_JSON_NESTED_MAP_FIELD_NAME, - CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE).set(TEST_JSON_ARRAY_FIELD_NAME, - TEST_JSON_ARRAY_NODE) - .set(TEST_JSON_NULL_FIELD_NAME, TEST_JSON_NULL_NODE) - .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE) - .set(TEST_JSON_INT_NO_IDX_FIELD_NAME, TEST_INT_NODE) - .set(TEST_JSON_STRING_NO_IDX_FIELD_NAME, TEST_JSON_STRING_NO_IDX_NODE) - .set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITH_NO_IDX)); + CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE).set(TEST_JSON_ARRAY_FIELD_NAME, + TEST_JSON_ARRAY_NODE) + .set(TEST_JSON_NULL_FIELD_NAME, TEST_JSON_NULL_NODE) + .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE) + .set(TEST_JSON_INT_NO_IDX_FIELD_NAME, TEST_INT_NODE) + .set(TEST_JSON_STRING_NO_IDX_FIELD_NAME, TEST_JSON_STRING_NO_IDX_NODE) + .set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITH_NO_IDX)); CustomObjectNode expectedJsonNode; CustomObjectNode expectedJsonNodeWithMergedTextIndex; Schema.SchemaBuilder schemaBuilder; - String destColumnName = "someMeaningfulName"; + String destStrColumnName = "mystringname_all_lowercases"; + String destMapColumnName = "myMapName"; // make array field as single value STRING, test the conversion function - // ignore the column nestedFields + // drop the column nestedFields.mapFields // preserve the entire mapField value + // preserve the nestedFields.arrayField value and test the conversion function // map the column someMeaningfulName to nestedFields.stringField - schemaBuilder = createDefaultSchemaBuilder().addSingleValueDimension("arrayField", DataType.STRING) - .addSingleValueDimension(TEST_JSON_MAP_FIELD_NAME, 
DataType.STRING) - .addSingleValueDimension(TEST_JSON_MAP_EXTRA_FIELD_NAME, DataType.STRING) - .addSingleValueDimension(TEST_JSON_NESTED_MAP_FIELD_NAME, DataType.JSON) - .addSingleValueDimension(destColumnName, DataType.STRING); + // abandon the json_data extra field + // mergedTextIndex should contain columns that are not in the preserved or dropped lists + // mergedTextIndex should contain message_logtype + schemaBuilder = createDefaultSchemaBuilder().addSingleValueDimension(TEST_JSON_ARRAY_FIELD_NAME, DataType.STRING) + .addSingleValueDimension(TEST_JSON_STRING_FIELD_NAME, DataType.STRING) + .addSingleValueDimension(TEST_JSON_MESSAGE_LOGTYPE_NAME, DataType.STRING) + .addSingleValueDimension(destMapColumnName, DataType.STRING) + .addSingleValueDimension(TEST_JSON_MAP_EXTRA_FIELD_NAME, DataType.JSON) + .addSingleValueDimension(TEST_JSON_NESTED_MAP_FIELD_NAME + "." + TEST_JSON_ARRAY_FIELD_NAME, DataType.STRING) + .addSingleValueDimension(destStrColumnName, DataType.STRING); Map<String, String> keyMapping = new HashMap<>() { { - put(destColumnName, TEST_JSON_NESTED_MAP_FIELD_NAME + "." + TEST_JSON_STRING_FIELD_NAME); + put(destStrColumnName, TEST_JSON_NESTED_MAP_FIELD_NAME + "." + TEST_JSON_STRING_FIELD_NAME); + put(destMapColumnName, TEST_JSON_MAP_FIELD_NAME); } }; Set<String> pathToDrop = new HashSet<>() { @@ -645,6 +803,7 @@ public class SchemaConformingTransformerV2Test { Set<String> pathToPreserve = new HashSet<>() { { add(TEST_JSON_MAP_FIELD_NAME); + add(TEST_JSON_NESTED_MAP_FIELD_NAME + "." 
+ TEST_JSON_ARRAY_FIELD_NAME); } }; Set<String> pathToPreserveWithIndex = new HashSet<>() { @@ -656,10 +815,14 @@ public class SchemaConformingTransformerV2Test { /* { "arrayField":[0,1,2,3], - "nestedFields.stringField":"a", - "mapField":{ + "message_logtype": "a", + "nestedFields.arrayField":[0,1,2,3], + "stringField":"aA_123", + "mystringname_all_lowercases":"a", + "myMapName":{ "arrayField":[0,1,2,3], "stringField":"a", + "stringField":"aA_123", "intField_noIndex":9, "string_noIndex":"z" }, @@ -675,6 +838,7 @@ public class SchemaConformingTransformerV2Test { "arrayField":[0, 1, 2, 3], } }, + "nestedFields.arrayField":[0,1,2,3], "unindexableExtras":{ "intField_noIndex":9, "string_noIndex":"z", @@ -688,16 +852,21 @@ public class SchemaConformingTransformerV2Test { } }, __mergedTextIndex: [ - // check expectedJsonNodeWithMergedTextIndex + // check mergedTextIndexNode + ], + __mergedTextIndex_delimeter: [ + // check mergedTextIndexNode ] } */ expectedJsonNode = CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME, N.textNode("[0,1,2,3]")) - .set(destColumnName, TEST_JSON_STRING_NODE).set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITH_NO_IDX) - .set(TEST_JSON_MAP_EXTRA_FIELD_NAME, TEST_JSON_MAP_NODE_WITH_NO_IDX).set(INDEXABLE_EXTRAS_FIELD_NAME, - CustomObjectNode.create().set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE) - .set(TEST_JSON_NESTED_MAP_FIELD_NAME, - CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME, TEST_JSON_ARRAY_NODE))) + .set(TEST_JSON_MESSAGE_LOGTYPE_NAME, TEST_JSON_STRING_NODE) + .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE_WITH_UPEERCASE) + .set(destStrColumnName, TEST_JSON_STRING_NODE) + // For a single-value field, the value is serialized to a string, so its format differs slightly + .set(destMapColumnName, N.textNode("{\"arrayField\":[0,1,2,3],\"stringField\":\"a\",\"intField_noIndex\":9," + "\"stringField_noIndex\":\"z\"}")).set(TEST_JSON_MAP_EXTRA_FIELD_NAME, TEST_JSON_MAP_NODE_WITH_NO_IDX) + 
.set(TEST_JSON_NESTED_MAP_FIELD_NAME + "." + TEST_JSON_ARRAY_FIELD_NAME, N.textNode("[0,1,2,3]")) .set(UNINDEXABLE_EXTRAS_FIELD_NAME, CustomObjectNode.create().set(TEST_JSON_INT_NO_IDX_FIELD_NAME, TEST_INT_NODE) @@ -707,45 +876,77 @@ public class SchemaConformingTransformerV2Test { CustomObjectNode.create().set(TEST_JSON_INT_NO_IDX_FIELD_NAME, TEST_INT_NODE) .set(TEST_JSON_STRING_NO_IDX_FIELD_NAME, TEST_JSON_STRING_NO_IDX_NODE))); - expectedJsonNodeWithMergedTextIndex = expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME, - N.arrayNode().add("0:arrayField").add("1:arrayField").add("2:arrayField").add("3:arrayField") - .add("[0,1,2,3]:arrayField").add("a:stringField").add("[0,1,2,3]:nestedFields.arrayField") - .add("0:nestedFields.arrayField").add("1:nestedFields.arrayField").add("2:nestedFields.arrayField") - .add("3:nestedFields.arrayField").add("a:nestedFields.stringField") - .add("[0,1,2,3]:mapFieldExtra.arrayField").add("a:mapFieldExtra.stringField") - .add("0:mapFieldExtra.arrayField").add("1:mapFieldExtra.arrayField").add("2:mapFieldExtra.arrayField") - .add("3:mapFieldExtra.arrayField")); - transformKeyValueTransformation( + JsonNode mergedTextIndexNode = N.arrayNode().add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "arrayField" + JSON_KEY_VALUE_SEPARATOR + "0" + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "arrayField" + JSON_KEY_VALUE_SEPARATOR + "1" + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "arrayField" + JSON_KEY_VALUE_SEPARATOR + "2" + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "arrayField" + JSON_KEY_VALUE_SEPARATOR + "3" + MERGED_TEXT_INDEX_EOD_ANCHOR) + .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "arrayField" + JSON_KEY_VALUE_SEPARATOR + "[0,1,2,3]" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + destStrColumnName + JSON_KEY_VALUE_SEPARATOR + "a" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + TEST_JSON_STRING_FIELD_NAME + 
JSON_KEY_VALUE_SEPARATOR + + TEST_JSON_STRING_NODE_WITH_UPEERCASE.textValue() + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + TEST_JSON_STRING_FIELD_NAME + JSON_KEY_VALUE_SEPARATOR + + TEST_JSON_STRING_NODE_WITH_UPEERCASE.textValue().toLowerCase(Locale.ENGLISH) + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "mapFieldExtra.arrayField" + JSON_KEY_VALUE_SEPARATOR + "[0,1,2,3]" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "mapFieldExtra.stringField" + JSON_KEY_VALUE_SEPARATOR + "a" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "mapFieldExtra.arrayField" + JSON_KEY_VALUE_SEPARATOR + "0" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "mapFieldExtra.arrayField" + JSON_KEY_VALUE_SEPARATOR + "1" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "mapFieldExtra.arrayField" + JSON_KEY_VALUE_SEPARATOR + "2" + + MERGED_TEXT_INDEX_EOD_ANCHOR).add( + MERGED_TEXT_INDEX_BOD_ANCHOR + "mapFieldExtra.arrayField" + JSON_KEY_VALUE_SEPARATOR + "3" + + MERGED_TEXT_INDEX_EOD_ANCHOR); + expectedJsonNodeWithMergedTextIndex = + expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME, mergedTextIndexNode); + transformKeyValueTransformation(null, UNINDEXABLE_EXTRAS_FIELD_NAME, + MERGED_TEXT_INDEX_FIELD_NAME, schemaBuilder.addMultiValueDimension(MERGED_TEXT_INDEX_FIELD_NAME, DataType.STRING).build(), keyMapping, pathToDrop, pathToPreserve, pathToPreserveWithIndex, inputJsonNode, expectedJsonNodeWithMergedTextIndex); } - private void transformWithIndexableFields(Schema schema, JsonNode inputRecordJsonNode, JsonNode ouputRecordJsonNode) { - testTransform(INDEXABLE_EXTRAS_FIELD_NAME, null, null, schema, null, null, null, null, + private void transformWithIndexableFields(Schema schema, JsonNode inputRecordJsonNode, JsonNode ouputRecordJsonNode, + boolean useAnonymousDotInFieldNames) { + testTransform(INDEXABLE_EXTRAS_FIELD_NAME, null, null, 
useAnonymousDotInFieldNames, false, false, schema, null, + null, null, null, inputRecordJsonNode.toString(), ouputRecordJsonNode.toString()); } private void transformWithUnIndexableFieldsAndMergedTextIndex(Schema schema, JsonNode inputRecordJsonNode, JsonNode ouputRecordJsonNode) { - testTransform(INDEXABLE_EXTRAS_FIELD_NAME, UNINDEXABLE_EXTRAS_FIELD_NAME, MERGED_TEXT_INDEX_FIELD_NAME, schema, - null, null, null, null, inputRecordJsonNode.toString(), ouputRecordJsonNode.toString()); + testTransform(INDEXABLE_EXTRAS_FIELD_NAME, UNINDEXABLE_EXTRAS_FIELD_NAME, null, true, false, null, schema, null, + null, + null, null, inputRecordJsonNode.toString(), ouputRecordJsonNode.toString()); } - private void transformKeyValueTransformation(Schema schema, Map<String, String> keyMapping, - Set<String> fieldPathsToDrop, Set<String> fieldPathsToPreserve, Set<String> fieldPathsToPreserveWithIndex, - JsonNode inputRecordJsonNode, JsonNode ouputRecordJsonNode) { - testTransform(INDEXABLE_EXTRAS_FIELD_NAME, UNINDEXABLE_EXTRAS_FIELD_NAME, MERGED_TEXT_INDEX_FIELD_NAME, schema, - keyMapping, fieldPathsToDrop, fieldPathsToPreserve, fieldPathsToPreserveWithIndex, - inputRecordJsonNode.toString(), ouputRecordJsonNode.toString()); + private void transformKeyValueTransformation(String indexableExtraField, String unindeableExtraField, + String mergedTextIndexField, Schema schema, Map<String, String> keyMapping, Set<String> fieldPathsToDrop, + Set<String> fieldPathsToPreserve, Set<String> fieldPathsToPreserveWithIndex, JsonNode inputRecordJsonNode, + JsonNode ouputRecordJsonNode) { + testTransform(indexableExtraField, unindeableExtraField, mergedTextIndexField, true, true, false, schema, + keyMapping, + fieldPathsToDrop, fieldPathsToPreserve, fieldPathsToPreserveWithIndex, inputRecordJsonNode.toString(), + ouputRecordJsonNode.toString()); } - private void testTransform(String indexableExtrasField, String unindexableExtrasField, String mergedTextIndexField, + private void testTransform(String 
indexableExtrasField, String unindexableExtrasField, + String mergedTextIndexField, boolean useAnonymousDotInFieldNames, boolean optimizeCaseInsensitiveSearch, + Boolean reverseTextIndexKeyValueOrder, Schema schema, Map<String, String> keyMapping, Set<String> fieldPathsToDrop, Set<String> fieldPathsToPreserve, Set<String> fieldPathsToPreserveWithIndex, String inputRecordJSONString, String expectedOutputRecordJSONString) { TableConfig tableConfig = createDefaultTableConfig(indexableExtrasField, unindexableExtrasField, UNINDEXABLE_FIELD_SUFFIX, - fieldPathsToDrop, fieldPathsToPreserve, fieldPathsToPreserveWithIndex, mergedTextIndexField); - tableConfig.getIngestionConfig().getSchemaConformingTransformerV2Config().setColumnNameToJsonKeyPathMap(keyMapping); + fieldPathsToDrop, fieldPathsToPreserve, fieldPathsToPreserveWithIndex, keyMapping, mergedTextIndexField, + useAnonymousDotInFieldNames, + optimizeCaseInsensitiveSearch, reverseTextIndexKeyValueOrder); GenericRow outputRecord = transformRow(tableConfig, schema, inputRecordJSONString); Map<String, Object> expectedOutputRecordMap = jsonStringToMap(expectedOutputRecordJSONString); @@ -809,7 +1010,7 @@ public class SchemaConformingTransformerV2Test { .addSingleValueDimension("a.b.c", DataType.INT).build(); SchemaConformingTransformerV2.validateSchema(schema, new SchemaConformingTransformerV2Config(null, INDEXABLE_EXTRAS_FIELD_NAME, null, null, null, null, null, null, - null, null, null, null, null, null)); + null, null, null, null, null, null, null, null, null, null, null, null, null, null)); } catch (Exception ex) { fail("Should not have thrown any exception when overlapping schema occurs"); } @@ -820,7 +1021,7 @@ public class SchemaConformingTransformerV2Test { .addSingleValueDimension("a.b", DataType.STRING).build(); SchemaConformingTransformerV2.validateSchema(schema, new SchemaConformingTransformerV2Config(null, INDEXABLE_EXTRAS_FIELD_NAME, null, null, null, null, null, null, - null, null, null, null, null, 
null)); + null, null, null, null, null, null, null, null, null, null, null, null, null, null)); } catch (Exception ex) { fail("Should not have thrown any exception when overlapping schema occurs"); } @@ -835,67 +1036,11 @@ public class SchemaConformingTransformerV2Test { String shortBinaryData = "short"; int minLength = 10; - assertFalse(_RECORD_TRANSFORMER.base64ValueFilter(text.getBytes(), minLength)); - assertTrue(_RECORD_TRANSFORMER.base64ValueFilter(binaryData.getBytes(), minLength)); - assertTrue(_RECORD_TRANSFORMER.base64ValueFilter(binaryDataWithTrailingPeriods.getBytes(), minLength)); - assertFalse(_RECORD_TRANSFORMER.base64ValueFilter(binaryDataWithRandomPeriods.getBytes(), minLength)); - assertFalse(_RECORD_TRANSFORMER.base64ValueFilter(shortBinaryData.getBytes(), minLength)); - } - - @Test - public void testShingleIndexTokenization() { - String key = "key"; - String value = "0123456789ABCDEFGHIJ"; - int shingleIndexMaxLength; - int shingleIndexOverlapLength; - List<String> expectedTokenValues; - - shingleIndexMaxLength = 8; - shingleIndexOverlapLength = 1; - expectedTokenValues = new ArrayList<>( - Arrays.asList("0123:key", "3456:key", "6789:key", "9ABC:key", "CDEF:key", "FGHI:key", "IJ:key")); - testShingleIndexWithParams(key, value, shingleIndexMaxLength, shingleIndexOverlapLength, expectedTokenValues); - - shingleIndexMaxLength = 8; - shingleIndexOverlapLength = 2; - expectedTokenValues = new ArrayList<>( - Arrays.asList("0123:key", "2345:key", "4567:key", "6789:key", "89AB:key", "ABCD:key", "CDEF:key", "EFGH:key", - "GHIJ:key")); - testShingleIndexWithParams(key, value, shingleIndexMaxLength, shingleIndexOverlapLength, expectedTokenValues); - - // If shingleIndexMaxLength is lower than the minimum required length for merged text index token - // (length of the key + shingling overlap length + 1), then the shingleIndexMaxLength is adjusted to - // the maximum Lucene token size (32766) - shingleIndexMaxLength = 1; - shingleIndexOverlapLength = 5; - 
expectedTokenValues = new ArrayList<>(Arrays.asList(value + ":" + key)); - testShingleIndexWithParams(key, value, shingleIndexMaxLength, shingleIndexOverlapLength, expectedTokenValues); - - // If shingleIndexOverlapLength is equal to or longer than the length of the value, shingling cannot be applied and - // only one token is generated. - shingleIndexMaxLength = 32766; - shingleIndexOverlapLength = 100; - expectedTokenValues = new ArrayList<>(Arrays.asList(value + ":" + key)); - testShingleIndexWithParams(key, value, shingleIndexMaxLength, shingleIndexOverlapLength, expectedTokenValues); - - // Other corner cases, where the result would be the same as if shingling has not been applied - shingleIndexMaxLength = 300; - shingleIndexOverlapLength = 10; - expectedTokenValues = new ArrayList<>(Arrays.asList(value + ":" + key)); - testShingleIndexWithParams(key, value, shingleIndexMaxLength, shingleIndexOverlapLength, expectedTokenValues); - } - - private void testShingleIndexWithParams(String key, String value, Integer shingleIndexMaxLength, - Integer shingleIndexOverlapLength, List<String> expectedTokenValues) { - Map.Entry<String, Object> kv = new AbstractMap.SimpleEntry<>(key, value); - List<String> shingleIndexTokens = new ArrayList<>(); - _RECORD_TRANSFORMER.generateShingleTextIndexDocument(kv, shingleIndexTokens, shingleIndexMaxLength, - shingleIndexOverlapLength); - int numTokens = shingleIndexTokens.size(); - assertEquals(numTokens, expectedTokenValues.size()); - for (int i = 0; i < numTokens; i++) { - assertEquals(shingleIndexTokens.get(i), expectedTokenValues.get(i)); - } + assertFalse(SchemaConformingTransformerV2.base64ValueFilter(text.getBytes(), minLength)); + assertTrue(SchemaConformingTransformerV2.base64ValueFilter(binaryData.getBytes(), minLength)); + assertTrue(SchemaConformingTransformerV2.base64ValueFilter(binaryDataWithTrailingPeriods.getBytes(), minLength)); + 
assertFalse(SchemaConformingTransformerV2.base64ValueFilter(binaryDataWithRandomPeriods.getBytes(), minLength)); + assertFalse(SchemaConformingTransformerV2.base64ValueFilter(shortBinaryData.getBytes(), minLength)); } static class CustomObjectNode extends ObjectNode { diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/SchemaConformingTransformerV2Config.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/SchemaConformingTransformerV2Config.java index 5bc8e3e340..9d076cbfc3 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/SchemaConformingTransformerV2Config.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/SchemaConformingTransformerV2Config.java @@ -16,20 +16,20 @@ * specific language governing permissions and limitations * under the License. */ + package org.apache.pinot.spi.config.table.ingestion; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonPropertyDescription; -import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; -import java.util.List; import java.util.Map; import java.util.Set; import javax.annotation.Nullable; import org.apache.pinot.spi.config.BaseJsonConfig; + public class SchemaConformingTransformerV2Config extends BaseJsonConfig { @JsonPropertyDescription("Enable indexable extras") private boolean _enableIndexableExtras = true; @@ -58,55 +58,76 @@ public class SchemaConformingTransformerV2Config extends BaseJsonConfig { + "input. 
This will NOT skip building mergedTextIndex for the field.") private Set<String> _fieldPathsToPreserveInputWithIndex = new HashSet<>(); + @JsonPropertyDescription("Array of flattened (dot-delimited) object paths not to store but only build " + + "mergedTextIndex for the field.") + private Set<String> _fieldPathsToSkipStorage = Set.of("message"); + @JsonPropertyDescription("Map from customized meaningful column name to json key path") private Map<String, String> _columnNameToJsonKeyPathMap = new HashMap<>(); @JsonPropertyDescription("mergedTextIndex field") private String _mergedTextIndexField = "__mergedTextIndex"; + @JsonPropertyDescription( + "If set to true {'a.b': 'c'} will be indexed in the same way as {'a': {'b': 'c}}. Otherwise, " + + "the former one will be ignored.") + private Boolean _useAnonymousDotInFieldNames = true; + + @JsonPropertyDescription("Whether to store extra lower cases value:key pairs in __mergedTextIndex to optimize case " + + "insensitive queries") + private Boolean _optimizeCaseInsensitiveSearch = false; + + @JsonPropertyDescription("Whether to store key and value in reverse order, if true store as value:key, else store" + + " as key:value") + private Boolean _reverseTextIndexKeyValueOrder = true; + @JsonPropertyDescription("mergedTextIndex document max length") private int _mergedTextIndexDocumentMaxLength = 32766; - @JsonPropertyDescription( - "Recall that merged text index document is in the format of <value:key>. " - + "The mergedTextIndex shingling overlap length refers to the " - + "maximum search length of the value that will yield results with " - + "100% accuracy. 
If the value is null, shingle index will be turned off " - + "and the value will be truncated such that the document is equal to " - + "_mergedTextIndexDocumentMaxLength" - ) - private @Nullable Integer _mergedTextIndexShinglingOverlapLength = null; - @JsonPropertyDescription("mergedTextIndex binary document detection minimum length") private Integer _mergedTextIndexBinaryDocumentDetectionMinLength = 512; @JsonPropertyDescription("Array of paths to exclude from merged text index.") private Set<String> _mergedTextIndexPathToExclude = new HashSet<>(); - // TODO: set default value from CLPRewriter once it open sourced - @JsonPropertyDescription("Array of suffix to exclude from merged text index.") - private List<String> _mergedTextIndexSuffixToExclude = Arrays.asList("_logtype", "_dictionaryVars", "_encodedVars"); + @JsonPropertyDescription("Anchor before merged text index value. Default is empty String") + private String _mergedTextIndexBeginOfDocAnchor = ""; + + @JsonPropertyDescription("Anchor after merged text index value. Default is empty String") + private String _mergedTextIndexEndOfDocAnchor = ""; @JsonPropertyDescription("Dedicated fields to double ingest into json_data column") private Set<String> _fieldsToDoubleIngest = new HashSet<>(); + @JsonPropertyDescription("Separator between key and value in json used in the Lucene index. 
Default is ':'.") + private String _jsonKeyValueSeparator = ":"; + @JsonCreator public SchemaConformingTransformerV2Config( @JsonProperty("enableIndexableExtras") @Nullable Boolean enableIndexableExtras, - @JsonProperty("indexableExtrasField") String indexableExtrasField, + @JsonProperty("indexableExtrasField") @Nullable String indexableExtrasField, @JsonProperty("enableUnindexableExtras") @Nullable Boolean enableUnindexableExtras, @JsonProperty("unindexableExtrasField") @Nullable String unindexableExtrasField, @JsonProperty("unindexableFieldSuffix") @Nullable String unindexableFieldSuffix, @JsonProperty("fieldPathsToDrop") @Nullable Set<String> fieldPathsToDrop, @JsonProperty("fieldPathsToKeepSameAsInput") @Nullable Set<String> fieldPathsToPreserveInput, @JsonProperty("fieldPathsToKeepSameAsInputWithIndex") @Nullable Set<String> fieldPathsToPreserveInputWithIndex, - @JsonProperty("mergedTextIndexField") @Nullable String mergedTextIndexField, + @JsonProperty("fieldPathsToSkipStorage") @Nullable Set<String> fieldPathsToSkipStorage, + @JsonProperty("columnNameToJsonKeyPathMap") @Nullable Map<String, String> columnNameToJsonKeyPathMap, + @JsonProperty("mergedTextIndexField") @Nullable String mergedTextIndexFields, + @JsonProperty("useAnonymousDotInFieldNames") @Nullable Boolean useAnonymousDotInFieldNames, + @JsonProperty("optimizeCaseInsensitiveSearch") @Nullable Boolean optimizeCaseInsensitiveSearch, + @JsonProperty("reverseTextIndexKeyValueOrder") @Nullable Boolean reverseTextIndexKeyValueOrder, @JsonProperty("mergedTextIndexDocumentMaxLength") @Nullable Integer mergedTextIndexDocumentMaxLength, - @JsonProperty("mergedTextIndexShinglingOverlapLength") @Nullable Integer mergedTextIndexShinglingOverlapLength, + @JsonProperty("mergedTextIndexBinaryTokenDetectionMinLength") + @Nullable Integer mergedTextIndexBinaryTokenDetectionMinLength, // Deprecated, add it to be backward compatible @JsonProperty("mergedTextIndexBinaryDocumentDetectionMinLength") @Nullable Integer 
mergedTextIndexBinaryDocumentDetectionMinLength, @JsonProperty("mergedTextIndexPathToExclude") @Nullable Set<String> mergedTextIndexPathToExclude, - @JsonProperty("fieldsToDoubleIngest") @Nullable Set<String> fieldsToDoubleIngest + @JsonProperty("fieldsToDoubleIngest") @Nullable Set<String> fieldsToDoubleIngest, + @JsonProperty("jsonKeyValueSeparator") @Nullable String jsonKeyValueSeparator, + @JsonProperty("mergedTextIndexBeginOfDocAnchor") @Nullable String mergedTextIndexBeginOfDocAnchor, + @JsonProperty("mergedTextIndexEndOfDocAnchor") @Nullable String mergedTextIndexEndOfDocAnchor ) { setEnableIndexableExtras(enableIndexableExtras); setIndexableExtrasField(indexableExtrasField); @@ -116,17 +137,30 @@ public class SchemaConformingTransformerV2Config extends BaseJsonConfig { setFieldPathsToDrop(fieldPathsToDrop); setFieldPathsToPreserveInput(fieldPathsToPreserveInput); setFieldPathsToPreserveInputWithIndex(fieldPathsToPreserveInputWithIndex); + setFieldPathsToSkipStorage(fieldPathsToSkipStorage); + setColumnNameToJsonKeyPathMap(columnNameToJsonKeyPathMap); - setMergedTextIndexField(mergedTextIndexField); + setMergedTextIndexField(mergedTextIndexFields); + setUseAnonymousDotInFieldNames(useAnonymousDotInFieldNames); + setOptimizeCaseInsensitiveSearch(optimizeCaseInsensitiveSearch); + setReverseTextIndexKeyValueOrder(reverseTextIndexKeyValueOrder); setMergedTextIndexDocumentMaxLength(mergedTextIndexDocumentMaxLength); - setMergedTextIndexShinglingDocumentOverlapLength(mergedTextIndexShinglingOverlapLength); + mergedTextIndexBinaryDocumentDetectionMinLength = mergedTextIndexBinaryDocumentDetectionMinLength == null + ? 
mergedTextIndexBinaryTokenDetectionMinLength : mergedTextIndexBinaryDocumentDetectionMinLength; setMergedTextIndexBinaryDocumentDetectionMinLength(mergedTextIndexBinaryDocumentDetectionMinLength); setMergedTextIndexPathToExclude(mergedTextIndexPathToExclude); setFieldsToDoubleIngest(fieldsToDoubleIngest); + setJsonKeyValueSeparator(jsonKeyValueSeparator); + setMergedTextIndexBeginOfDocAnchor(mergedTextIndexBeginOfDocAnchor); + setMergedTextIndexEndOfDocAnchor(mergedTextIndexEndOfDocAnchor); + } + + public Boolean isEnableIndexableExtras() { + return _enableIndexableExtras; } public SchemaConformingTransformerV2Config setEnableIndexableExtras(Boolean enableIndexableExtras) { - _enableIndexableExtras = enableIndexableExtras == null ? _enableUnindexableExtras : enableIndexableExtras; + _enableIndexableExtras = enableIndexableExtras == null ? _enableIndexableExtras : enableIndexableExtras; return this; } @@ -139,6 +173,10 @@ public class SchemaConformingTransformerV2Config extends BaseJsonConfig { return this; } + public Boolean isEnableUnindexableExtras() { + return _enableUnindexableExtras; + } + public SchemaConformingTransformerV2Config setEnableUnindexableExtras(Boolean enableUnindexableExtras) { _enableUnindexableExtras = enableUnindexableExtras == null ? _enableUnindexableExtras : enableUnindexableExtras; return this; @@ -181,6 +219,15 @@ public class SchemaConformingTransformerV2Config extends BaseJsonConfig { return this; } + public Set<String> getFieldPathsToSkipStorage() { + return _fieldPathsToSkipStorage; + } + + public SchemaConformingTransformerV2Config setFieldPathsToSkipStorage(Set<String> fieldPathsToSkipStorage) { + _fieldPathsToSkipStorage = fieldPathsToSkipStorage == null ? 
_fieldPathsToSkipStorage : fieldPathsToSkipStorage; + return this; + } + public Set<String> getFieldPathsToPreserveInputWithIndex() { return _fieldPathsToPreserveInputWithIndex; } @@ -189,7 +236,7 @@ public class SchemaConformingTransformerV2Config extends BaseJsonConfig { Set<String> fieldPathsToPreserveInputWithIndex) { _fieldPathsToPreserveInputWithIndex = fieldPathsToPreserveInputWithIndex == null ? _fieldPathsToPreserveInputWithIndex - : fieldPathsToPreserveInputWithIndex; + : fieldPathsToPreserveInputWithIndex; return this; } @@ -213,6 +260,36 @@ public class SchemaConformingTransformerV2Config extends BaseJsonConfig { return this; } + public Boolean isUseAnonymousDotInFieldNames() { + return _useAnonymousDotInFieldNames; + } + + public SchemaConformingTransformerV2Config setUseAnonymousDotInFieldNames(Boolean useAnonymousDotInFieldNames) { + _useAnonymousDotInFieldNames = useAnonymousDotInFieldNames == null ? _useAnonymousDotInFieldNames + : useAnonymousDotInFieldNames; + return this; + } + + public Boolean isOptimizeCaseInsensitiveSearch() { + return _optimizeCaseInsensitiveSearch; + } + + public SchemaConformingTransformerV2Config setOptimizeCaseInsensitiveSearch(Boolean optimizeCaseInsensitiveSearch) { + _optimizeCaseInsensitiveSearch = optimizeCaseInsensitiveSearch == null ? _optimizeCaseInsensitiveSearch + : optimizeCaseInsensitiveSearch; + return this; + } + + public Boolean isReverseTextIndexKeyValueOrder() { + return _reverseTextIndexKeyValueOrder; + } + + public SchemaConformingTransformerV2Config setReverseTextIndexKeyValueOrder(Boolean reverseTextIndexKeyValueOrder) { + _reverseTextIndexKeyValueOrder = reverseTextIndexKeyValueOrder == null ? 
_reverseTextIndexKeyValueOrder + : reverseTextIndexKeyValueOrder; + return this; + } + public Integer getMergedTextIndexDocumentMaxLength() { return _mergedTextIndexDocumentMaxLength; } @@ -225,16 +302,6 @@ public class SchemaConformingTransformerV2Config extends BaseJsonConfig { return this; } - public Integer getMergedTextIndexShinglingOverlapLength() { - return _mergedTextIndexShinglingOverlapLength; - } - - public SchemaConformingTransformerV2Config setMergedTextIndexShinglingDocumentOverlapLength( - Integer mergedTextIndexShinglingOverlapLength) { - _mergedTextIndexShinglingOverlapLength = mergedTextIndexShinglingOverlapLength; - return this; - } - public Integer getMergedTextIndexBinaryDocumentDetectionMinLength() { return _mergedTextIndexBinaryDocumentDetectionMinLength; } @@ -250,10 +317,6 @@ public class SchemaConformingTransformerV2Config extends BaseJsonConfig { return _mergedTextIndexPathToExclude; } - public List<String> getMergedTextIndexSuffixToExclude() { - return _mergedTextIndexSuffixToExclude; - } - public SchemaConformingTransformerV2Config setMergedTextIndexPathToExclude(Set<String> mergedTextIndexPathToExclude) { _mergedTextIndexPathToExclude = mergedTextIndexPathToExclude == null ? _mergedTextIndexPathToExclude : mergedTextIndexPathToExclude; @@ -268,4 +331,33 @@ public class SchemaConformingTransformerV2Config extends BaseJsonConfig { _fieldsToDoubleIngest = fieldsToDoubleIngest == null ? _fieldsToDoubleIngest : fieldsToDoubleIngest; return this; } + + public String getJsonKeyValueSeparator() { + return _jsonKeyValueSeparator; + } + + public void setJsonKeyValueSeparator(@Nullable String jsonKeyValueSeparator) { + _jsonKeyValueSeparator = jsonKeyValueSeparator == null ? 
":" : jsonKeyValueSeparator; + } + + public String getMergedTextIndexBeginOfDocAnchor() { + return _mergedTextIndexBeginOfDocAnchor; + } + + public SchemaConformingTransformerV2Config setMergedTextIndexBeginOfDocAnchor( + String mergedTextIndexBeginOfDocAnchor) { + _mergedTextIndexBeginOfDocAnchor = mergedTextIndexBeginOfDocAnchor == null + ? _mergedTextIndexBeginOfDocAnchor : mergedTextIndexBeginOfDocAnchor; + return this; + } + + public String getMergedTextIndexEndOfDocAnchor() { + return _mergedTextIndexEndOfDocAnchor; + } + + public SchemaConformingTransformerV2Config setMergedTextIndexEndOfDocAnchor(String mergedTextIndexEndOfDocAnchor) { + _mergedTextIndexEndOfDocAnchor = mergedTextIndexEndOfDocAnchor == null + ? _mergedTextIndexEndOfDocAnchor : mergedTextIndexEndOfDocAnchor; + return this; + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org