fx19880617 commented on a change in pull request #5927: URL: https://github.com/apache/incubator-pinot/pull/5927#discussion_r477566730
########## File path: pinot-plugins/pinot-batch-ingestion/pinot-batch-ingestion-spark/src/main/java/org/apache/pinot/plugin/ingestion/batch/spark/SparkSegmentGenerationJobRunner.java ########## @@ -200,8 +199,30 @@ public void run() } List<String> pathAndIdxList = new ArrayList<>(); - for (int i = 0; i < filteredFiles.size(); i++) { - pathAndIdxList.add(String.format("%s %d", filteredFiles.get(i), i)); + String localDirectorySequenceIdString = _spec.getSegmentNameGeneratorSpec().getConfigs().get(LOCAL_DIRECTORY_SEQUENCE_ID); + boolean localDirectorySequenceId = false; + if (localDirectorySequenceIdString != null) { + localDirectorySequenceId = Boolean.parseBoolean(localDirectorySequenceIdString); + } + if (localDirectorySequenceId) { + Map<String, List<String>> localDirIndex = new HashMap<>(); + for (String filteredFile : filteredFiles) { + Path filteredParentPath = Paths.get(filteredFile).getParent(); + if (!localDirIndex.containsKey(filteredParentPath.toString())) { + localDirIndex.put(filteredParentPath.toString(), new ArrayList<>()); + } + localDirIndex.get(filteredParentPath.toString()).add(filteredFile); + } + for (String parentPath: localDirIndex.keySet()){ + List<String> siblingFiles = localDirIndex.get(parentPath); Review comment: Suggest sorting the siblingFiles list so that rerunning the segment creation job produces the same list of segments. ########## File path: pinot-plugins/pinot-batch-ingestion/pinot-batch-ingestion-spark/src/main/java/org/apache/pinot/plugin/ingestion/batch/spark/SparkSegmentGenerationJobRunner.java ########## @@ -29,12 +30,10 @@ import java.io.Serializable; import java.net.URI; import java.nio.file.FileSystems; +import java.nio.file.Path; import java.nio.file.PathMatcher; import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.UUID; +import java.util.*; Review comment: Please expand the wildcard import (java.util.*) into explicit imports. ---------------------------------------------------------------- This is an automated message from the 
Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org