This is an automated email from the ASF dual-hosted git repository. tingchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push: new 7ddb7a46f6 In SchemaConformingTransformer, Flatten array into multiple entries each with a key and array value. (#13890) 7ddb7a46f6 is described below commit 7ddb7a46f6520e66247810e487e263dd9a7ba9c1 Author: Ting Chen <tingc...@uber.com> AuthorDate: Thu Sep 5 15:46:00 2024 -0700 In SchemaConformingTransformer, Flatten array into multiple entries each with a key and array value. (#13890) * Flatten array into multiple entries each with a key and array value. * Fix lint issues. * Address the comments --- .../SchemaConformingTransformerV2.java | 28 +++++++--- .../SchemaConformingTransformerV2Test.java | 59 +++++++++++++--------- 2 files changed, 57 insertions(+), 30 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2.java index 583003b3f5..47b629f522 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2.java @@ -430,7 +430,7 @@ public class SchemaConformingTransformerV2 implements RecordTransformer { } /** - * Generate an Lucene document based on the provided key-value pair. + * Generate a Lucene document based on the provided key-value pair. * The index document follows this format: "val:key". * @param kv used to generate text index documents * @param indexDocuments a list to store the generated index documents @@ -439,18 +439,34 @@ public class SchemaConformingTransformerV2 implements RecordTransformer { public void generateTextIndexLuceneDocument(Map.Entry<String, Object> kv, List<String> indexDocuments, Integer mergedTextIndexDocumentMaxLength) { String key = kv.getKey(); - String val; // To avoid redundant leading and tailing '"', only convert to JSON string if the value is a list or an array if (kv.getValue() instanceof Collection || kv.getValue() instanceof Object[]) { + // Add the entire array or collection as one string to the Lucene doc. try { - val = JsonUtils.objectToString(kv.getValue()); + addLuceneDoc(indexDocuments, mergedTextIndexDocumentMaxLength, key, JsonUtils.objectToString(kv.getValue())); + // To enable array contains search, we also add each array element with the key value pair to the Lucene doc. + // Currently it only supports 1 level flattening, any element deeper than 1 level will still stay nested. + if (kv.getValue() instanceof Collection) { + for (Object o : (Collection) kv.getValue()) { + addLuceneDoc(indexDocuments, mergedTextIndexDocumentMaxLength, key, JsonUtils.objectToString(o)); + } + } else if (kv.getValue() instanceof Object[]) { + for (Object o : (Object[]) kv.getValue()) { + addLuceneDoc(indexDocuments, mergedTextIndexDocumentMaxLength, key, JsonUtils.objectToString(o)); + } + } } catch (JsonProcessingException e) { - val = kv.getValue().toString(); + addLuceneDoc(indexDocuments, mergedTextIndexDocumentMaxLength, key, kv.getValue().toString()); } - } else { - val = kv.getValue().toString(); + return; } + // If the value is a single value + addLuceneDoc(indexDocuments, mergedTextIndexDocumentMaxLength, key, kv.getValue().toString()); + } + + private void addLuceneDoc(List<String> indexDocuments, Integer mergedTextIndexDocumentMaxLength, String key, + String val) { // TODO: theoretically, the key length + 1 could cause integer overflow. But in reality, upstream message size // limit usually could not reach that high. We should revisit this if we see any issue. if (key.length() + 1 > MAXIMUM_LUCENE_DOCUMENT_SIZE) { diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2Test.java index 6189f14d42..cd1d85dc1d 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2Test.java @@ -326,7 +326,7 @@ public class SchemaConformingTransformerV2Test { // No schema schemaBuilder = createDefaultSchemaBuilder(); - /* + /* Expected output { "indexableExtras":{ "arrayField":[0, 1, 2, 3], @@ -370,10 +370,7 @@ public class SchemaConformingTransformerV2Test { } }, __mergedTextIndex: [ - "[0, 1, 2, 3]:arrayField", "a:stringField", - "[0, 1, 2, 3]:mapField.arrayField", "a:mapField.stringField", - "[0, 1, 2, 3]:nestedFields.arrayField", "a:nestedFields.stringField", - "[0, 1, 2, 3]:nestedFields.mapField.arrayField", "a:nestedFields.mapField.stringField", + // See the value of expectedJsonNodeWithMergedTextIndex ] } */ @@ -397,9 +394,15 @@ public class SchemaConformingTransformerV2Test { transformWithUnIndexableFieldsAndMergedTextIndex(schemaBuilder.build(), inputJsonNode, expectedJsonNode); expectedJsonNodeWithMergedTextIndex = expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME, - N.arrayNode().add("[0,1,2,3]:arrayField").add("a:stringField").add("[0,1,2,3]:mapField.arrayField") - .add("a:mapField.stringField").add("[0,1,2,3]:nestedFields.arrayField").add("a:nestedFields.stringField") - .add("[0,1,2,3]:nestedFields.mapField.arrayField").add("a:nestedFields.mapField.stringField")); + N.arrayNode().add("[0,1,2,3]:arrayField").add("0:arrayField").add("1:arrayField").add("2:arrayField") + .add("3:arrayField").add("a:stringField").add("[0,1,2,3]:mapField.arrayField").add("0:mapField.arrayField") + .add("1:mapField.arrayField").add("2:mapField.arrayField").add("3:mapField.arrayField") + .add("a:mapField.stringField").add("[0,1,2,3]:nestedFields.arrayField").add("0:nestedFields.arrayField") + .add("1:nestedFields.arrayField").add("2:nestedFields.arrayField").add("3:nestedFields.arrayField") + .add("a:nestedFields.stringField").add("[0,1,2,3]:nestedFields.mapField.arrayField") + .add("0:nestedFields.mapField.arrayField").add("1:nestedFields.mapField.arrayField") + .add("2:nestedFields.mapField.arrayField").add("3:nestedFields.mapField.arrayField") + .add("a:nestedFields.mapField.stringField")); transformWithUnIndexableFieldsAndMergedTextIndex( schemaBuilder.addMultiValueDimension(MERGED_TEXT_INDEX_FIELD_NAME, DataType.STRING).build(), inputJsonNode, expectedJsonNodeWithMergedTextIndex); @@ -453,10 +456,7 @@ public class SchemaConformingTransformerV2Test { } }, __mergedTextIndex: [ - "[0, 1, 2, 3]:arrayField", "a:stringField", - "[0, 1, 2, 3]:mapField.arrayField", "a:mapField.stringField", - "[0, 1, 2, 3]:nestedFields.arrayField", "a:nestedFields.stringField", - "[0, 1, 2, 3]:nestedFields.mapField.arrayField", "a:nestedFields.mapField.stringField", + // See the value of expectedJsonNodeWithMergedTextIndex ] } */ @@ -480,9 +480,15 @@ public class SchemaConformingTransformerV2Test { transformWithUnIndexableFieldsAndMergedTextIndex(schemaBuilder.build(), inputJsonNode, expectedJsonNode); expectedJsonNodeWithMergedTextIndex = expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME, - N.arrayNode().add("[0,1,2,3]:arrayField").add("a:stringField").add("[0,1,2,3]:mapField.arrayField") - .add("a:mapField.stringField").add("[0,1,2,3]:nestedFields.arrayField").add("a:nestedFields.stringField") - .add("[0,1,2,3]:nestedFields.mapField.arrayField").add("a:nestedFields.mapField.stringField")); + N.arrayNode().add("[0,1,2,3]:arrayField").add("0:arrayField").add("1:arrayField").add("2:arrayField") + .add("3:arrayField").add("a:stringField").add("[0,1,2,3]:mapField.arrayField").add("0:mapField.arrayField") + .add("1:mapField.arrayField").add("2:mapField.arrayField").add("3:mapField.arrayField") + .add("a:mapField.stringField").add("[0,1,2,3]:nestedFields.arrayField").add("0:nestedFields.arrayField") + .add("1:nestedFields.arrayField").add("2:nestedFields.arrayField").add("3:nestedFields.arrayField") + .add("a:nestedFields.stringField").add("[0,1,2,3]:nestedFields.mapField.arrayField") + .add("0:nestedFields.mapField.arrayField").add("1:nestedFields.mapField.arrayField") + .add("2:nestedFields.mapField.arrayField").add("3:nestedFields.mapField.arrayField") + .add("a:nestedFields.mapField.stringField")); transformWithUnIndexableFieldsAndMergedTextIndex( schemaBuilder.addMultiValueDimension(MERGED_TEXT_INDEX_FIELD_NAME, DataType.STRING).build(), inputJsonNode, expectedJsonNodeWithMergedTextIndex); @@ -540,10 +546,7 @@ public class SchemaConformingTransformerV2Test { } }, __mergedTextIndex: [ - "[0, 1, 2, 3]:arrayField", "a:stringField", - "[0, 1, 2, 3]:mapField.arrayField", "a:mapField.stringField", - "[0, 1, 2, 3]:nestedFields.arrayField", "a:nestedFields.stringField", - "[0, 1, 2, 3]:nestedFields.mapField.arrayField", "a:nestedFields.mapField.stringField", + // See the value of expectedJsonNodeWithMergedTextIndex ] } */ @@ -567,9 +570,15 @@ public class SchemaConformingTransformerV2Test { .set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NO_IDX_NODE))); transformWithUnIndexableFieldsAndMergedTextIndex(schemaBuilder.build(), inputJsonNode, expectedJsonNode); expectedJsonNodeWithMergedTextIndex = expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME, - N.arrayNode().add("[0,1,2,3]:arrayField").add("a:stringField").add("[0,1,2,3]:mapField.arrayField") - .add("a:mapField.stringField").add("[0,1,2,3]:nestedFields.arrayField").add("a:nestedFields.stringField") - .add("[0,1,2,3]:nestedFields.mapField.arrayField").add("a:nestedFields.mapField.stringField")); + N.arrayNode().add("[0,1,2,3]:arrayField").add("0:arrayField").add("1:arrayField").add("2:arrayField") + .add("3:arrayField").add("a:stringField").add("[0,1,2,3]:mapField.arrayField").add("0:mapField.arrayField") + .add("1:mapField.arrayField").add("2:mapField.arrayField").add("3:mapField.arrayField") + .add("a:mapField.stringField").add("[0,1,2,3]:nestedFields.arrayField").add("0:nestedFields.arrayField") + .add("1:nestedFields.arrayField").add("2:nestedFields.arrayField").add("3:nestedFields.arrayField") + .add("a:nestedFields.stringField").add("[0,1,2,3]:nestedFields.mapField.arrayField") + .add("0:nestedFields.mapField.arrayField").add("1:nestedFields.mapField.arrayField") + .add("2:nestedFields.mapField.arrayField").add("3:nestedFields.mapField.arrayField") + .add("a:nestedFields.mapField.stringField")); transformWithUnIndexableFieldsAndMergedTextIndex( schemaBuilder.addMultiValueDimension(MERGED_TEXT_INDEX_FIELD_NAME, DataType.STRING).build(), inputJsonNode, expectedJsonNodeWithMergedTextIndex); @@ -713,8 +722,10 @@ public class SchemaConformingTransformerV2Test { .set(TEST_JSON_STRING_NO_IDX_FIELD_NAME, TEST_JSON_STRING_NO_IDX_NODE))); expectedJsonNodeWithMergedTextIndex = expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME, - N.arrayNode().add("[0,1,2,3]:arrayField").add("a:stringField").add("[0,1,2,3]:nestedFields.arrayField").add( - "a:nestedFields.stringField")); + N.arrayNode().add("0:arrayField").add("1:arrayField").add("2:arrayField").add("3:arrayField"). + add("[0,1,2,3]:arrayField").add("a:stringField").add("[0,1,2,3]:nestedFields.arrayField"). + add("0:nestedFields.arrayField").add("1:nestedFields.arrayField").add("2:nestedFields.arrayField"). + add("3:nestedFields.arrayField").add("a:nestedFields.stringField")); transformKeyValueTransformation( schemaBuilder.addMultiValueDimension(MERGED_TEXT_INDEX_FIELD_NAME, DataType.STRING).build(), keyMapping, pathToDrop, pathToPreserve, inputJsonNode, expectedJsonNodeWithMergedTextIndex); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org