lnbest0707-uber commented on code in PR #12788: URL: https://github.com/apache/pinot/pull/12788#discussion_r1554763002
########## pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/SchemaConformingTransformerV2Config.java: ########## @@ -0,0 +1,237 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.spi.config.table.ingestion; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonPropertyDescription; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import javax.annotation.Nullable; +import org.apache.pinot.spi.config.BaseJsonConfig; + +public class SchemaConformingTransformerV2Config extends BaseJsonConfig { + @JsonPropertyDescription("Enable indexable extras") + private boolean _enableIndexableExtras = true; + + @JsonPropertyDescription("Name of the field that should contain extra fields that are not part of the schema.") + private String _indexableExtrasField = "json_data"; + + @JsonPropertyDescription("Enable unindexable extras") + private boolean _enableUnindexableExtras = true; + + @JsonPropertyDescription( + "Like indexableExtrasField except it only contains fields with the 
suffix in unindexableFieldSuffix.") + private String _unindexableExtrasField = "json_data_no_idx"; + + @JsonPropertyDescription("The suffix of fields that must be stored in unindexableExtrasField") + private String _unindexableFieldSuffix = "_noindex"; + + @JsonPropertyDescription("Array of flattened (dot-delimited) object paths to drop") + private Set<String> _fieldPathsToDrop = new HashSet<>(); + + @JsonPropertyDescription("Map from customized meaningful column name to json key path") + private Map<String, String> _columnNameToJsonKeyPathMap = new HashMap<>(); + + @JsonPropertyDescription("mergedTextIndex field") + private String _mergedTextIndexField = "__mergedTextIndex"; + + @JsonPropertyDescription("mergedTextIndex token max length") + private int _mergedTextIndexTokenMaxLength = 32766; + + @JsonPropertyDescription( + "Recall that merged text index token is in the format of <value:key>. " + + "The mergedTextIndex shingling overlap length refers to the " + + "maximum search length of the value that will yield results with " + + "100% accuracy. 
If the value is null, shingle index will be turned off " + + "and the value will be truncated such that the token is equal to " + + "_mergedTextIndexTokenMaxLength" + ) + private @Nullable Integer _mergedTextIndexShinglingOverlapLength = null; + + @JsonPropertyDescription("mergedTextIndex binary token detection minimum length") + private Integer _mergedTextIndexBinaryTokenDetectionMinLength = 512; + + @JsonPropertyDescription("Array of paths to exclude from merged text index.") + private Set<String> _mergedTextIndexPathToExclude = new HashSet<>(); + + // TODO: set default value from CLPRewriter once it open sourced + @JsonPropertyDescription("Array of suffix to exclude from merged text index.") + private List<String> _mergedTextIndexSuffixToExclude = Arrays.asList("_logtype", "_dictionaryVars", "_encodedVars"); + + @JsonPropertyDescription("Dedicated fields to double ingest into json_data column") + private Set<String> _fieldsToDoubleIngest = new HashSet<>(); + + @JsonCreator + public SchemaConformingTransformerV2Config( + @JsonProperty("enableIndexableExtras") @Nullable Boolean enableIndexableExtras, + @JsonProperty("indexableExtrasField") String indexableExtrasField, + @JsonProperty("enableUnindexableExtras") @Nullable Boolean enableUnindexableExtras, + @JsonProperty("unindexableExtrasField") @Nullable String unindexableExtrasField, + @JsonProperty("unindexableFieldSuffix") @Nullable String unindexableFieldSuffix, + @JsonProperty("fieldPathsToDrop") @Nullable Set<String> fieldPathsToDrop, + @JsonProperty("mergedTextIndexField") @Nullable String mergedTextIndexField, + @JsonProperty("mergedTextIndexTokenMaxLength") @Nullable Integer mergedTextIndexTokenMaxLength, + @JsonProperty("mergedTextIndexShinglingOverlapLength") @Nullable Integer mergedTextIndexShinglingOverlapLength, + @JsonProperty("mergedTextIndexBinaryTokenDetectionMinLength") + @Nullable Integer mergedTextIndexBinaryTokenDetectionMinLength, + @JsonProperty("mergedTextIndexPathToExclude") @Nullable 
Set<String> mergedTextIndexPathToExclude, + @JsonProperty("fieldsToDoubleIngest") @Nullable Set<String> fieldsToDoubleIngest + ) { + setEnableIndexableExtras(enableIndexableExtras); + setIndexableExtrasField(indexableExtrasField); + setEnableUnindexableExtras(enableUnindexableExtras); + setUnindexableExtrasField(unindexableExtrasField); + setUnindexableFieldSuffix(unindexableFieldSuffix); + setFieldPathsToDrop(fieldPathsToDrop); + + setMergedTextIndexField(mergedTextIndexField); + setMergedTextIndexTokenMaxLength(mergedTextIndexTokenMaxLength); + setMergedTextIndexShinglingTokenOverlapLength(mergedTextIndexShinglingOverlapLength); + setMergedTextIndexBinaryTokenDetectionMinLength(mergedTextIndexBinaryTokenDetectionMinLength); + setMergedTextIndexPathToExclude(mergedTextIndexPathToExclude); + setFieldsToDoubleIngest(fieldsToDoubleIngest); + } + + public SchemaConformingTransformerV2Config setEnableIndexableExtras(Boolean enableIndexableExtras) { + _enableIndexableExtras = enableIndexableExtras == null ? _enableUnindexableExtras : enableIndexableExtras; + return this; + } + + public String getIndexableExtrasField() { + return _enableIndexableExtras ? _indexableExtrasField : null; + } + + public SchemaConformingTransformerV2Config setIndexableExtrasField(String indexableExtrasField) { + _indexableExtrasField = (null == indexableExtrasField) ? _indexableExtrasField : indexableExtrasField; + return this; + } + + public SchemaConformingTransformerV2Config setEnableUnindexableExtras(Boolean enableUnindexableExtras) { + _enableUnindexableExtras = enableUnindexableExtras == null ? _enableUnindexableExtras : enableUnindexableExtras; + return this; + } + + public String getUnindexableExtrasField() { + return _enableUnindexableExtras ? _unindexableExtrasField : null; + } + + public SchemaConformingTransformerV2Config setUnindexableExtrasField(String unindexableExtrasField) { + _unindexableExtrasField = (null == unindexableExtrasField) ? 
_unindexableExtrasField : unindexableExtrasField; + return this; + } + + public String getUnindexableFieldSuffix() { + return _unindexableFieldSuffix; + } + + public SchemaConformingTransformerV2Config setUnindexableFieldSuffix(String unindexableFieldSuffix) { + _unindexableFieldSuffix = (null == unindexableFieldSuffix) ? _unindexableFieldSuffix : unindexableFieldSuffix; + return this; + } + + public Set<String> getFieldPathsToDrop() { + return _fieldPathsToDrop; + } + + public SchemaConformingTransformerV2Config setFieldPathsToDrop(Set<String> fieldPathsToDrop) { + _fieldPathsToDrop = (null == fieldPathsToDrop) ? _fieldPathsToDrop : fieldPathsToDrop; + return this; + } + + public Map<String, String> getColumnNameToJsonKeyPathMap() { + return _columnNameToJsonKeyPathMap; + } + + public SchemaConformingTransformerV2Config setColumnNameToJsonKeyPathMap( + Map<String, String> columnNameToJsonKeyPathMap) { + _columnNameToJsonKeyPathMap = (null == columnNameToJsonKeyPathMap) + ? _columnNameToJsonKeyPathMap : columnNameToJsonKeyPathMap; + return this; + } + + public String getMergedTextIndexField() { + return _mergedTextIndexField; + } + + public SchemaConformingTransformerV2Config setMergedTextIndexField(String mergedTextIndexField) { + _mergedTextIndexField = (null == mergedTextIndexField) ? _mergedTextIndexField : mergedTextIndexField; + return this; + } + + public Integer getMergedTextIndexTokenMaxLength() { + return _mergedTextIndexTokenMaxLength; + } + + public SchemaConformingTransformerV2Config setMergedTextIndexTokenMaxLength( + Integer mergedTextIndexTokenMaxLength + ) { + _mergedTextIndexTokenMaxLength = (null == mergedTextIndexTokenMaxLength) + ? 
_mergedTextIndexTokenMaxLength : mergedTextIndexTokenMaxLength; + return this; + } + + public Integer getMergedTextIndexShinglingOverlapLength() { + return _mergedTextIndexShinglingOverlapLength; + } + + public SchemaConformingTransformerV2Config setMergedTextIndexShinglingTokenOverlapLength( + Integer mergedTextIndexShinglingOverlapLength) { + _mergedTextIndexShinglingOverlapLength = mergedTextIndexShinglingOverlapLength; Review Comment: We treat `mergedTextIndexShinglingOverlapLength = null` as a special case in the transformer, so there should be no side effects as long as we check for null at the point of use: `if (null == mergedTextIndexShinglingOverlapLength) { generateTextIndexToken(kv, luceneTokens, mergedTextIndexTokenMaxLength); } else { generateShingleTextIndexToken(kv, luceneTokens, mergedTextIndexTokenMaxLength, mergedTextIndexShinglingOverlapLength); }` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org