GXM2333 opened a new issue #8259: URL: https://github.com/apache/pinot/issues/8259
Hello, I'm a beginner with Pinot. I found that a segment is the smallest unit when data and index files are flushed to disk. I used the `SegmentDumpTool` class to parse one segment's `v3` directory, and I found that the `metadata.properties` file contains basic metrics about the table fields. The `index_map` file contains the offset and size of each field's indexes in the `columns.psf` file, such as the field's dictionary and forward_index. However, where are the other indexes stored, such as the inverted index, range index, bloom filter, etc.? In my table config, I configured `invertedIndexColumns`, `sortedColumn`, and `bloomFilterColumns`, but I only found dictionary and forward_index in `index_map` and `columns.psf`. Here is my table config. Would you mind explaining Pinot's file formats a little bit? Thanks a lot. ``` { "OFFLINE": { "tableName": "transcripts_OFFLINE", "tableType": "OFFLINE", "segmentsConfig": { "timeType": "MILLISECONDS", "schemaName": "transcripts", "retentionTimeUnit": "HOURS", "retentionTimeValue": "2", "replication": "1", "timeColumnName": "timestampInEpoch", "allowNullTimeValue": false, "replicaGroupStrategyConfig": { "partitionColumn": "studentID", "numInstancesPerPartition": 2 } }, "tenants": { "broker": "DefaultTenant", "server": "DefaultTenant" }, "tableIndexConfig": { "invertedIndexColumns": [ "lastName" ], "sortedColumn": [ "firstName" ], "bloomFilterColumns": [ "score" ], "loadMode": "MMAP", "enableDefaultStarTree": false, "enableDynamicStarTreeCreation": false, "segmentPartitionConfig": { "columnPartitionMap": { "studentID": { "functionName": "HashCode", "numPartitions": 4 } } }, "aggregateMetrics": false, "nullHandlingEnabled": false, "rangeIndexColumns": [ "score" ], "rangeIndexVersion": 1, "autoGeneratedInvertedIndex": false, "createInvertedIndexDuringSegmentGeneration": false }, "metadata": { "customConfigs": {} }, "task": { "taskTypeConfigsMap": { "MergeRollupTask": { "1hour.mergeType": "rollup", "1hour.bucketTimePeriod": "1h", "1hour.bufferTimePeriod": "3h", "1hour.maxNumRecordsPerSegment": 
"1000000", "1hour.maxNumRecordsPerTask": "5000000", "1hour.maxNumParallelBuckets": "5", "1day.mergeType": "rollup", "1day.bucketTimePeriod": "1d", "1day.bufferTimePeriod": "1d", "1day.roundBucketTimePeriod": "1d", "1day.maxNumRecordsPerSegment": "1000000", "1day.maxNumRecordsPerTask": "5000000", "score.aggregationType": "sum" } } }, "routing": { "segmentPrunerTypes": [ "partition" ], "instanceSelectorType": "replicaGroup" }, "isDimTable": false }, "REALTIME": { "tableName": "transcripts_REALTIME", "tableType": "REALTIME", "segmentsConfig": { "timeType": "MILLISECONDS", "schemaName": "transcripts", "timeColumnName": "timestampInEpoch", "allowNullTimeValue": false, "replicasPerPartition": "2", "replicaGroupStrategyConfig": { "partitionColumn": "studentID", "numInstancesPerPartition": 2 } }, "tenants": { "broker": "DefaultTenant", "server": "DefaultTenant" }, "tableIndexConfig": { "invertedIndexColumns": [ "lastName" ], "streamConfigs": { "streamType": "kafka", "stream.kafka.consumer.type": "lowlevel", "stream.kafka.topic.name": "transcript-topic_1", "stream.kafka.decoder.class.name": "org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder", "stream.kafka.consumer.factory.class.name": "org.apache.pinot.plugin.stream.kafka20.KafkaConsumerFactory", "stream.kafka.broker.list": "aptssd1.safe.lycc.qihoo.net:19092", "realtime.segment.flush.threshold.rows": "0", "realtime.segment.flush.threshold.time": "3m", "realtime.segment.flush.threshold.segment.size": "50M", "stream.kafka.consumer.prop.auto.offset.reset": "smallest" }, "sortedColumn": [ "firstName" ], "bloomFilterColumns": [ "score" ], "loadMode": "MMAP", "enableDefaultStarTree": false, "enableDynamicStarTreeCreation": false, "segmentPartitionConfig": { "columnPartitionMap": { "studentID": { "functionName": "HashCode", "numPartitions": 4 } } }, "aggregateMetrics": false, "nullHandlingEnabled": false, "rangeIndexColumns": [ "score" ], "rangeIndexVersion": 1, "autoGeneratedInvertedIndex": false, 
"createInvertedIndexDuringSegmentGeneration": false }, "metadata": { "customConfigs": {} }, "task": { "taskTypeConfigsMap": { "RealtimeToOfflineSegmentsTask": { "bucketTimePeriod": "6h", "bufferTimePeriod": "1d", "roundBucketTimePeriod": "1m", "mergeType": "rollup", "score.aggregationType": "max", "maxNumRecordsPerSegment": "100000" } } }, "routing": { "segmentPrunerTypes": [ "partition" ], "instanceSelectorType": "replicaGroup" }, "instanceAssignmentConfigMap": { "CONSUMING": { "tagPoolConfig": { "tag": "DefaultTenant_REALTIME", "poolBased": false, "numPools": 0 }, "replicaGroupPartitionConfig": { "replicaGroupBased": true, "numInstances": 0, "numReplicaGroups": 2, "numInstancesPerReplicaGroup": 2, "numPartitions": 0, "numInstancesPerPartition": 0 } } }, "isDimTable": false } } ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org