GXM2333 opened a new issue #8259:
URL: https://github.com/apache/pinot/issues/8259


   Hello, I'm a beginner with Pinot. I found that a segment is the smallest 
unit when data and index files are flushed to disk. I used the 
`SegmentDumpTool` class to parse one segment's `v3` directory. 
   
   
![](https://user-images.githubusercontent.com/24871202/155944810-f8919034-c9d0-446b-9b21-49f1303e3b31.png)
   
   I found that the `metadata.properties` file contains basic metrics about 
the table's fields, and that the `index_map` file contains the offset and size 
of each field's indexes in the `columns.psf` file, such as the field's 
dictionary and forward_index. However, where are the other indexes stored, 
such as the inverted index, range index, bloom filter, etc.? 
   
   In my table config, I configured `invertedIndexColumns`, `sortedColumn`, 
and `bloomFilterColumns`, but I only found the dictionary and forward_index in 
`index_map` and `columns.psf`.
   
   Here is my table config. Would you mind explaining Pinot's file formats a 
little bit? Thanks a lot.
   
   ```
   {
     "OFFLINE": {
       "tableName": "transcripts_OFFLINE",
       "tableType": "OFFLINE",
       "segmentsConfig": {
         "timeType": "MILLISECONDS",
         "schemaName": "transcripts",
         "retentionTimeUnit": "HOURS",
         "retentionTimeValue": "2",
         "replication": "1",
         "timeColumnName": "timestampInEpoch",
         "allowNullTimeValue": false,
         "replicaGroupStrategyConfig": {
           "partitionColumn": "studentID",
           "numInstancesPerPartition": 2
         }
       },
       "tenants": {
         "broker": "DefaultTenant",
         "server": "DefaultTenant"
       },
       "tableIndexConfig": {
         "invertedIndexColumns": [
           "lastName"
         ],
         "sortedColumn": [
           "firstName"
         ],
         "bloomFilterColumns": [
           "score"
         ],
         "loadMode": "MMAP",
         "enableDefaultStarTree": false,
         "enableDynamicStarTreeCreation": false,
         "segmentPartitionConfig": {
           "columnPartitionMap": {
             "studentID": {
               "functionName": "HashCode",
               "numPartitions": 4
             }
           }
         },
         "aggregateMetrics": false,
         "nullHandlingEnabled": false,
         "rangeIndexColumns": [
           "score"
         ],
         "rangeIndexVersion": 1,
         "autoGeneratedInvertedIndex": false,
         "createInvertedIndexDuringSegmentGeneration": false
       },
       "metadata": {
         "customConfigs": {}
       },
       "task": {
         "taskTypeConfigsMap": {
           "MergeRollupTask": {
             "1hour.mergeType": "rollup",
             "1hour.bucketTimePeriod": "1h",
             "1hour.bufferTimePeriod": "3h",
             "1hour.maxNumRecordsPerSegment": "1000000",
             "1hour.maxNumRecordsPerTask": "5000000",
             "1hour.maxNumParallelBuckets": "5",
             "1day.mergeType": "rollup",
             "1day.bucketTimePeriod": "1d",
             "1day.bufferTimePeriod": "1d",
             "1day.roundBucketTimePeriod": "1d",
             "1day.maxNumRecordsPerSegment": "1000000",
             "1day.maxNumRecordsPerTask": "5000000",
             "score.aggregationType": "sum"
           }
         }
       },
       "routing": {
         "segmentPrunerTypes": [
           "partition"
         ],
         "instanceSelectorType": "replicaGroup"
       },
       "isDimTable": false
     },
     "REALTIME": {
       "tableName": "transcripts_REALTIME",
       "tableType": "REALTIME",
       "segmentsConfig": {
         "timeType": "MILLISECONDS",
         "schemaName": "transcripts",
         "timeColumnName": "timestampInEpoch",
         "allowNullTimeValue": false,
         "replicasPerPartition": "2",
         "replicaGroupStrategyConfig": {
           "partitionColumn": "studentID",
           "numInstancesPerPartition": 2
         }
       },
       "tenants": {
         "broker": "DefaultTenant",
         "server": "DefaultTenant"
       },
       "tableIndexConfig": {
         "invertedIndexColumns": [
           "lastName"
         ],
         "streamConfigs": {
           "streamType": "kafka",
           "stream.kafka.consumer.type": "lowlevel",
           "stream.kafka.topic.name": "transcript-topic_1",
           "stream.kafka.decoder.class.name": 
"org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder",
           "stream.kafka.consumer.factory.class.name": 
"org.apache.pinot.plugin.stream.kafka20.KafkaConsumerFactory",
           "stream.kafka.broker.list": "aptssd1.safe.lycc.qihoo.net:19092",
           "realtime.segment.flush.threshold.rows": "0",
           "realtime.segment.flush.threshold.time": "3m",
           "realtime.segment.flush.threshold.segment.size": "50M",
           "stream.kafka.consumer.prop.auto.offset.reset": "smallest"
         },
         "sortedColumn": [
           "firstName"
         ],
         "bloomFilterColumns": [
           "score"
         ],
         "loadMode": "MMAP",
         "enableDefaultStarTree": false,
         "enableDynamicStarTreeCreation": false,
         "segmentPartitionConfig": {
           "columnPartitionMap": {
             "studentID": {
               "functionName": "HashCode",
               "numPartitions": 4
             }
           }
         },
         "aggregateMetrics": false,
         "nullHandlingEnabled": false,
         "rangeIndexColumns": [
           "score"
         ],
         "rangeIndexVersion": 1,
         "autoGeneratedInvertedIndex": false,
         "createInvertedIndexDuringSegmentGeneration": false
       },
       "metadata": {
         "customConfigs": {}
       },
       "task": {
         "taskTypeConfigsMap": {
           "RealtimeToOfflineSegmentsTask": {
             "bucketTimePeriod": "6h",
             "bufferTimePeriod": "1d",
             "roundBucketTimePeriod": "1m",
             "mergeType": "rollup",
             "score.aggregationType": "max",
             "maxNumRecordsPerSegment": "100000"
           }
         }
       },
       "routing": {
         "segmentPrunerTypes": [
           "partition"
         ],
         "instanceSelectorType": "replicaGroup"
       },
       "instanceAssignmentConfigMap": {
         "CONSUMING": {
           "tagPoolConfig": {
             "tag": "DefaultTenant_REALTIME",
             "poolBased": false,
             "numPools": 0
           },
           "replicaGroupPartitionConfig": {
             "replicaGroupBased": true,
             "numInstances": 0,
             "numReplicaGroups": 2,
             "numInstancesPerReplicaGroup": 2,
             "numPartitions": 0,
             "numInstancesPerPartition": 0
           }
         }
       },
       "isDimTable": false
     }
   }
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

Reply via email to