zhaih commented on code in PR #11815: URL: https://github.com/apache/lucene/pull/11815#discussion_r1025807882
########## lucene/misc/src/java/org/apache/lucene/misc/index/BinaryDocValueSelector.java: ########## @@ -60,18 +60,58 @@ public BitSet getFilteredLiveDocs(CodecReader reader) throws IOException { return bits; } - public static List<IndexRearranger.DocumentSelector> createFromExistingIndex( + /** + * Create a selector for the deletes in an index, which can then be applied to a rearranged index + * + * @param field tells which {@link BinaryDocValues} are the unique key + * @param directory where the original index is present + * @return a deletes selector to be passed to {@link IndexRearranger} + */ + public static IndexRearranger.DocumentSelector createDeleteSelectorFromIndex( String field, Directory directory) throws IOException { - List<IndexRearranger.DocumentSelector> selectors = new ArrayList<>(); + + Set<String> keySet = new HashSet<>(); + try (IndexReader reader = DirectoryReader.open(directory)) { for (LeafReaderContext context : reader.leaves()) { - HashSet<String> keySet = new HashSet<>(); Bits liveDocs = context.reader().getLiveDocs(); BinaryDocValues binaryDocValues = context.reader().getBinaryDocValues(field); + for (int i = 0; i < context.reader().maxDoc(); i++) { - if (liveDocs != null && liveDocs.get(i) == false) { + if (liveDocs == null || liveDocs.get(i) == true) { Review Comment: move `liveDocs == null` before the loop so we can terminate earlier? ########## lucene/misc/src/java/org/apache/lucene/misc/index/BinaryDocValueSelector.java: ########## @@ -32,26 +33,25 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; -/** Select documents using binary doc values */ +/** + * Use this selector to rearrange an index where documents can be uniquely identified based on + * {@link BinaryDocValues} + */ public class BinaryDocValueSelector implements IndexRearranger.DocumentSelector, Serializable { private final String field; - private final HashSet<String> keySet; + private final Set<String> keySet; Review Comment: Since we're anyway rewriting this class quite a lot and is going to only put it under Lucene 10, let's change this `keySet` to `Set<BytesRef>`? Since the identifier doesn't need to be able to be converted to string. (That was my fault! Sorry!) ########## lucene/misc/src/java/org/apache/lucene/misc/index/BinaryDocValueSelector.java: ########## @@ -60,18 +60,58 @@ public BitSet getFilteredLiveDocs(CodecReader reader) throws IOException { return bits; } - public static List<IndexRearranger.DocumentSelector> createFromExistingIndex( + /** + * Create a selector for the deletes in an index, which can then be applied to a rearranged index + * + * @param field tells which {@link BinaryDocValues} are the unique key + * @param directory where the original index is present + * @return a deletes selector to be passed to {@link IndexRearranger} + */ + public static IndexRearranger.DocumentSelector createDeleteSelectorFromIndex( String field, Directory directory) throws IOException { - List<IndexRearranger.DocumentSelector> selectors = new ArrayList<>(); + + Set<String> keySet = new HashSet<>(); + try (IndexReader reader = DirectoryReader.open(directory)) { for (LeafReaderContext context : reader.leaves()) { - HashSet<String> keySet = new HashSet<>(); Bits liveDocs = context.reader().getLiveDocs(); BinaryDocValues binaryDocValues = context.reader().getBinaryDocValues(field); + for (int i = 0; i < context.reader().maxDoc(); i++) { - if (liveDocs != null && liveDocs.get(i) == false) { + if (liveDocs == null || liveDocs.get(i) == true) { continue; } + + if (binaryDocValues.advanceExact(i)) { + keySet.add(binaryDocValues.binaryValue().utf8ToString()); + } else { + throw new AssertionError("Document don't have selected key"); Review Comment: We can put in the lucene doc id here to make it easier to debug? ########## lucene/misc/src/java/org/apache/lucene/misc/index/IndexRearranger.java: ########## @@ -54,59 +68,83 @@ public class IndexRearranger { protected final Directory input, output; protected final IndexWriterConfig config; - protected final List<DocumentSelector> documentSelectors; + + // Each of these selectors will produce a segment in the rearranged index. + // The segments will appear in the index in the order of the selectors that produced them. + protected final List<DocumentSelector> segmentSelectors; + + // Documents selected here will be marked for deletion in the rearranged index, but not merged + // away. + protected final DocumentSelector deletedDocsSelector; /** - * Constructor + * All args constructor * * @param input input dir * @param output output dir * @param config index writer config - * @param documentSelectors specify what document is desired in the rearranged index segments, - * each selector correspond to one segment + * @param segmentSelectors specify what document is desired in the rearranged index segments, each + * selector correspond to one segment + * @param deletedDocsSelector specify which documents are to be marked for deletion in the Review Comment: Let's add a sentence indicating this selector needs to be thread safe? ########## lucene/misc/src/java/org/apache/lucene/misc/index/IndexRearranger.java: ########## @@ -139,6 +206,48 @@ private static void addOneSegment( writer.addIndexes(readers); } + private static void applyDeletes( + IndexWriter writer, IndexReader reader, DocumentSelector selector) + throws ExecutionException, InterruptedException { + if (selector == null) { + // There are no deletes to be applied + return; + } + + ExecutorService executor = Review Comment: Let's reuse the executor? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org