rohityadav1993 commented on code in PR #16727:
URL: https://github.com/apache/pinot/pull/16727#discussion_r2432546141
##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentIndexCreationDriverImpl.java:
##########
@@ -677,4 +706,63 @@ public int getSkippedRowsFound() {
public int getSanitizedRowsFound() {
return _sanitizedRowsFound;
}
+
+ /**
+ * Build segment using columnar approach.
+ * This method builds the segment by processing data column-wise instead of
row-wise.
+ *
+ * @throws Exception if segment building fails
+ */
+ public void buildColumnar() throws Exception {
+ if (!(_dataSource instanceof ColumnarSegmentCreationDataSource)) {
+ throw new IllegalStateException("buildColumnar() can only be called
after initColumnar()");
+ }
+
+ ColumnarSegmentCreationDataSource columnarDataSource =
(ColumnarSegmentCreationDataSource) _dataSource;
+ Map<String, ColumnReader> columnReaders =
columnarDataSource.getColumnReaders();
+
+ LOGGER.info("Starting columnar segment building for {} columns",
columnReaders.size());
+
+ // Reuse existing stats collection and index creation info logic
+ LOGGER.debug("Start building StatsCollector!");
+ collectStatsAndIndexCreationInfo();
+ LOGGER.info("Finished building StatsCollector!");
+ LOGGER.info("Collected stats for {} documents", _totalDocs);
+
+ if (_totalDocs == 0) {
+ LOGGER.warn("No documents found in data source");
+ handlePostCreation();
+ return;
+ }
+
+ try {
+ // Initialize the index creation using the per-column statistics
information
+ _indexCreator.init(_config, _segmentIndexCreationInfo,
_indexCreationInfoMap, _dataSchema, _tempIndexDir, null);
+
+ // Build the indexes column-wise (true column-major approach)
+ LOGGER.info("Start building Index using columnar approach");
+ long indexStartTime = System.nanoTime();
+
+ TreeSet<String> columns = _dataSchema.getPhysicalColumnNames();
+ for (String columnName : columns) {
+ LOGGER.debug("Indexing column: {}", columnName);
+ ColumnReader columnReader = columnReaders.get(columnName);
+ if (columnReader == null) {
+ throw new IllegalStateException("No column reader found for column:
" + columnName);
+ }
+
+ // Index each column independently using true column-major approach
+ // This is similar to how buildByColumn works but uses ColumnReader
instead of IndexSegment
+ ((SegmentColumnarIndexCreator) _indexCreator).indexColumn(columnName,
columnReader);
+ }
+
+ _totalIndexTimeNs = System.nanoTime() - indexStartTime;
+ } catch (Exception e) {
+ _indexCreator.close();
+ throw e;
+ }
Review Comment:
Makes sense. I added a close() method to ColumnarSegmentCreationDataSource and
invoke it here; it closes all of the columnReaders.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]