anuragrai16 commented on code in PR #17264:
URL: https://github.com/apache/pinot/pull/17264#discussion_r2567378681


##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/CrcUtils.java:
##########
@@ -38,47 +39,80 @@ public class CrcUtils {
   private static final Logger LOGGER = LoggerFactory.getLogger(CrcUtils.class);
   private static final int BUFFER_SIZE = 65536;
   private static final String CRC_FILE_EXTENSTION = ".crc";
+  private static final List<String> DATA_FILE_EXTENSIONS = 
Arrays.asList(".fwd", ".dict", ".inv");
+  private static final String METADATA_FILE = "metadata.properties";
 
   private final List<File> _files;
+  private final List<File> _dataFiles;
 
-  private CrcUtils(List<File> files) {
+  private CrcUtils(List<File> files, List<File> dataFiles) {
     _files = files;
+    _dataFiles = dataFiles;
   }
 
   public static CrcUtils forAllFilesInFolder(File dir) {
-    List<File> normalFiles = new ArrayList<>();
-    getAllNormalFiles(dir, normalFiles);
-    Collections.sort(normalFiles);
-    return new CrcUtils(normalFiles);
+    List<File> allNormalFiles = new ArrayList<>();
+    List<File> dataFiles = new ArrayList<>();
+    collectFiles(dir, allNormalFiles, dataFiles);
+    Collections.sort(allNormalFiles);
+    Collections.sort(dataFiles);
+    return new CrcUtils(allNormalFiles, dataFiles);
   }
 
   /**
-   * Helper method to get all normal (non-directory) files under a directory 
recursively.
+   * Helper method to get all files (normal and data files) in the directory 
to later compute CRC for them.
    * <p>NOTE: do not include the segment creation meta file.
    */
-  private static void getAllNormalFiles(File dir, List<File> normalFiles) {
+  private static void collectFiles(File dir, List<File> normalFiles, 
List<File> dataFiles) {
     File[] files = dir.listFiles();
     Preconditions.checkNotNull(files);
     for (File file : files) {
       if (file.isFile()) {
+        String fileName = file.getName();
         // Certain file systems, e.g. HDFS will create .crc files when perform 
data copy.
         // We should ignore both SEGMENT_CREATION_META and generated '.crc' 
files.
-        if (!file.getName().equals(V1Constants.SEGMENT_CREATION_META) && 
!file.getName()
+        if (!fileName.equals(V1Constants.SEGMENT_CREATION_META) && !fileName
             .endsWith(CRC_FILE_EXTENSTION)) {
+          // add all files to normal files
           normalFiles.add(file);
+          //include data extension files and metadata file to dataFiles
+          // Conditionally add to the data-only list
+          if (isDataFile(fileName)) {
+            dataFiles.add(file);
+          }
         }
       } else {
-        getAllNormalFiles(file, normalFiles);
+        collectFiles(file, normalFiles, dataFiles);
       }
     }
   }
 
-  public long computeCrc()
+  /**
+   * Determines if a file is considered a "Data File" (Metadata or one of 
".fwd", ".dict", ".inv" file types).
+   */
+  private static boolean isDataFile(String fileName) {

Review Comment:
   Agree with not including metadata.properties file, it's not needed. But the 
inverted index is indeed needed. When forward index is disabled for a column, 
we mandate adding both dictionary and inverted index for the column. The data 
for a column can be appropriately represented only using both the dictionary 
and the inverted index. Removing the inverted index file would mean we can no 
longer capture data changes for a column with forward index disabled. eg. Think 
of a case where two segments have the same data and differs only by one value 
in a column that has forward index disabled. The dictionary created for both 
segments is same (unique values are same) - only the inverted index is disabled.
   
   Lmk if you disagree. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to