gortiz commented on code in PR #10184: URL: https://github.com/apache/pinot/pull/10184#discussion_r1148959765
########## pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java: ########## @@ -133,183 +122,144 @@ public void init(SegmentGeneratorConfig segmentCreationSpec, SegmentIndexCreatio return; } - Collection<FieldSpec> fieldSpecs = schema.getAllFieldSpecs(); - Set<String> invertedIndexColumns = new HashSet<>(); - for (String columnName : _config.getInvertedIndexCreationColumns()) { - Preconditions.checkState(schema.hasColumn(columnName), - "Cannot create inverted index for column: %s because it is not in schema", columnName); - invertedIndexColumns.add(columnName); - } + Map<String, FieldIndexConfigs> indexConfigs = segmentCreationSpec.getIndexConfigsByColName(); - Set<String> bloomFilterColumns = new HashSet<>(); - for (String columnName : _config.getBloomFilterCreationColumns()) { - Preconditions.checkState(schema.hasColumn(columnName), - "Cannot create bloom filter for column: %s because it is not in schema", columnName); - bloomFilterColumns.add(columnName); - } - - Set<String> rangeIndexColumns = new HashSet<>(); - for (String columnName : _config.getRangeIndexCreationColumns()) { - Preconditions.checkState(schema.hasColumn(columnName), - "Cannot create range index for column: %s because it is not in schema", columnName); - rangeIndexColumns.add(columnName); - } - - Set<String> textIndexColumns = new HashSet<>(); - for (String columnName : _config.getTextIndexCreationColumns()) { - Preconditions.checkState(schema.hasColumn(columnName), - "Cannot create text index for column: %s because it is not in schema", columnName); - textIndexColumns.add(columnName); - } - - Set<String> fstIndexColumns = new HashSet<>(); - for (String columnName : _config.getFSTIndexCreationColumns()) { - Preconditions.checkState(schema.hasColumn(columnName), - "Cannot create FST index for column: %s because it is not in schema", columnName); - fstIndexColumns.add(columnName); - } - - Map<String, JsonIndexConfig> jsonIndexConfigs = _config.getJsonIndexConfigs(); - for (String columnName : jsonIndexConfigs.keySet()) { - Preconditions.checkState(schema.hasColumn(columnName), - "Cannot create json index for column: %s because it is not in schema", columnName); - } - - Set<String> forwardIndexDisabledColumns = new HashSet<>(); - for (String columnName : _config.getForwardIndexDisabledColumns()) { - Preconditions.checkState(schema.hasColumn(columnName), String.format("Invalid config. Can't disable " - + "forward index creation for a column: %s that does not exist in schema", columnName)); - forwardIndexDisabledColumns.add(columnName); - } - - Map<String, H3IndexConfig> h3IndexConfigs = _config.getH3IndexConfigs(); - for (String columnName : h3IndexConfigs.keySet()) { - Preconditions.checkState(schema.hasColumn(columnName), - "Cannot create H3 index for column: %s because it is not in schema", columnName); - } + _creatorsByColAndIndex = Maps.newHashMapWithExpectedSize(indexConfigs.keySet().size()); - // Initialize creators for dictionary, forward index and inverted index - IndexingConfig indexingConfig = _config.getTableConfig().getIndexingConfig(); - int rangeIndexVersion = indexingConfig.getRangeIndexVersion(); - for (FieldSpec fieldSpec : fieldSpecs) { - // Ignore virtual columns + for (String columnName : indexConfigs.keySet()) { + FieldSpec fieldSpec = schema.getFieldSpecFor(columnName); + if (fieldSpec == null) { + Preconditions.checkState(schema.hasColumn(columnName), + "Cannot create inverted index for column: %s because it is not in schema", columnName); + } if (fieldSpec.isVirtualColumn()) { + LOGGER.warn("Ignoring index creation for virtual column " + columnName); continue; } - String columnName = fieldSpec.getName(); + FieldIndexConfigs originalConfig = indexConfigs.get(columnName); ColumnIndexCreationInfo columnIndexCreationInfo = indexCreationInfoMap.get(columnName); Preconditions.checkNotNull(columnIndexCreationInfo, "Missing index creation info for column: %s", columnName); boolean dictEnabledColumn = createDictionaryForColumn(columnIndexCreationInfo, segmentCreationSpec, fieldSpec); - Preconditions.checkState(dictEnabledColumn || !invertedIndexColumns.contains(columnName), + Preconditions.checkState(dictEnabledColumn || !originalConfig.getConfig(StandardIndexes.inverted()).isEnabled(), "Cannot create inverted index for raw index column: %s", columnName); - boolean forwardIndexDisabled = forwardIndexDisabledColumns.contains(columnName); + IndexType<ForwardIndexConfig, ?, ForwardIndexCreator> forwardIdx = StandardIndexes.forward(); + boolean forwardIndexDisabled = !originalConfig.getConfig(forwardIdx).isEnabled(); IndexCreationContext.Common context = IndexCreationContext.builder() .withIndexDir(_indexDir) - .withCardinality(columnIndexCreationInfo.getDistinctValueCount()) .withDictionary(dictEnabledColumn) .withFieldSpec(fieldSpec) .withTotalDocs(segmentIndexCreationInfo.getTotalDocs()) - .withMinValue((Comparable<?>) columnIndexCreationInfo.getMin()) - .withMaxValue((Comparable<?>) columnIndexCreationInfo.getMax()) - .withTotalNumberOfEntries(columnIndexCreationInfo.getTotalNumberOfEntries()) .withColumnIndexCreationInfo(columnIndexCreationInfo) - .sorted(columnIndexCreationInfo.isSorted()) + .withIsOptimizedDictionary(_config.isOptimizeDictionary() + || _config.isOptimizeDictionaryForMetrics() && fieldSpec.getFieldType() == FieldSpec.FieldType.METRIC) .onHeap(segmentCreationSpec.isOnHeap()) .withForwardIndexDisabled(forwardIndexDisabled) + .withFixedLength(columnIndexCreationInfo.isFixedLength()) + .withTextCommitOnClose(true) .build(); - // Initialize forward index creator - ChunkCompressionType chunkCompressionType = - dictEnabledColumn ? null : getColumnCompressionType(segmentCreationSpec, fieldSpec); - // Sorted columns treat the 'forwardIndexDisabled' flag as a no-op - _forwardIndexCreatorMap.put(columnName, (forwardIndexDisabled && !columnIndexCreationInfo.isSorted()) - ? null : _indexCreatorProvider.newForwardIndexCreator( - context.forForwardIndex(chunkCompressionType, segmentCreationSpec.getColumnProperties()))); - - // Initialize inverted index creator; skip creating inverted index if sorted - if (invertedIndexColumns.contains(columnName) && !columnIndexCreationInfo.isSorted()) { - _invertedIndexCreatorMap.put(columnName, - _indexCreatorProvider.newInvertedIndexCreator(context.forInvertedIndex())); - } + + FieldIndexConfigs config = adaptConfig(columnName, originalConfig, columnIndexCreationInfo, segmentCreationSpec); + if (dictEnabledColumn) { // Create dictionary-encoded index // Initialize dictionary creator // TODO: Dictionary creator holds all unique values on heap. Consider keeping dictionary instead of creator // which uses off-heap memory. - SegmentDictionaryCreator dictionaryCreator = - new SegmentDictionaryCreator(fieldSpec, _indexDir, columnIndexCreationInfo.isUseVarLengthDictionary()); - _dictionaryCreatorMap.put(columnName, dictionaryCreator); - // Create dictionary + + DictionaryIndexType dictIdx = DictionaryIndexType.INSTANCE; + // Index conf should be present if dictEnabledColumn is true. In case it doesn't, getConfig will throw an + // exception + DictionaryIndexConfig dictConfig = config.getConfig(dictIdx); + if (!dictConfig.isEnabled()) { + throw new IllegalArgumentException("Dictionary index should be enabled"); + } + SegmentDictionaryCreator creator = dictIdx.createIndexCreator(context, dictConfig); + try { - dictionaryCreator.build(columnIndexCreationInfo.getSortedUniqueElementsArray()); + creator.build(context.getSortedUniqueElementsArray()); } catch (Exception e) { LOGGER.error("Error building dictionary for field: {}, cardinality: {}, number of bytes per entry: {}", - fieldSpec.getName(), columnIndexCreationInfo.getDistinctValueCount(), - dictionaryCreator.getNumBytesPerEntry()); + context.getFieldSpec().getName(), context.getCardinality(), creator.getNumBytesPerEntry()); throw e; } - } - if (bloomFilterColumns.contains(columnName)) { - if (indexingConfig.getBloomFilterConfigs() != null - && indexingConfig.getBloomFilterConfigs().containsKey(columnName)) { - _bloomFilterCreatorMap.put(columnName, _indexCreatorProvider.newBloomFilterCreator( - context.forBloomFilter(indexingConfig.getBloomFilterConfigs().get(columnName)))); - } else { - _bloomFilterCreatorMap.put(columnName, _indexCreatorProvider.newBloomFilterCreator( - context.forBloomFilter(new BloomFilterConfig(BloomFilterConfig.DEFAULT_FPP, 0, false)))); - } + _dictionaryCreatorMap.put(columnName, creator); } - if (!columnIndexCreationInfo.isSorted() && rangeIndexColumns.contains(columnName)) { - _rangeIndexFilterCreatorMap.put(columnName, - _indexCreatorProvider.newRangeIndexCreator(context.forRangeIndex(rangeIndexVersion))); + Map<IndexType<?, ?, ?>, IndexCreator> creatorsByIndex = + Maps.newHashMapWithExpectedSize(IndexService.getInstance().getAllIndexes().size()); + for (IndexType<?, ?, ?> index : IndexService.getInstance().getAllIndexes()) { + if (skipIndexType(index)) { + continue; + } + if (config.getConfig(index).isEnabled()) { + tryCreateCreator(creatorsByIndex, index, context, config); + } } - - if (textIndexColumns.contains(columnName)) { - FSTType fstType = FSTType.LUCENE; - List<FieldConfig> fieldConfigList = _config.getTableConfig().getFieldConfigList(); - if (fieldConfigList != null) { - for (FieldConfig fieldConfig : fieldConfigList) { - if (fieldConfig.getName().equals(columnName)) { - Map<String, String> properties = fieldConfig.getProperties(); - if (TextIndexUtils.isFstTypeNative(properties)) { - fstType = FSTType.NATIVE; - } - } - } + // TODO: Remove this when values stored as ForwardIndex stop depending on TextIndex config + IndexCreator oldFwdCreator = creatorsByIndex.get(forwardIdx); + if (oldFwdCreator instanceof AbstractForwardIndexCreator) { // this implies that oldFwdCreator != null + Object fakeForwardValue = calculateAlternativeValue(dictEnabledColumn, config, fieldSpec); Review Comment: This is indeed quite different. There were some strange hacks in `SegmentCOlumnarIndexCreator` when text (and only text, excluding fst) indexes were enabled. > why call it fake? what's the fakeFwdValue used for? It seems that in some situations we decided to change the value stored in forward index if and only if a text index was there. We have been discussing about that and I think what we actually want to do with that is to do not have a forward index when there is a text index and some property is true. Meanwhile, what we do is to insert an invalid value in the forward index. This invalid value is the same for all rows. This is why this is called fake, because it is actually a fake value that should never be used. > is this about the _rawValueForTextIndex set in TextIndexConfig? Yes > It sounds like we are tracking raw value with the config object? Yes. TextIndexConfig contains the value of `_rawValueForTextIndex`. It can be directly specified in text index config as: ```js // in fieldConfigList { indexes: [ text: { rawValue: "new value" } ] } ``` but the older syntax that used `properies.rawValueForTextIndex` is still valid. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org