stefanvodita commented on code in PR #11815: URL: https://github.com/apache/lucene/pull/11815#discussion_r1027768094
########## lucene/misc/src/java/org/apache/lucene/misc/index/BinaryDocValueSelector.java: ########## @@ -30,52 +31,94 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.util.BitSet; import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.FixedBitSet; -/** Select documents using binary doc values */ +/** + * Use this selector to rearrange an index where documents can be uniquely identified based on + * {@link BinaryDocValues} + */ public class BinaryDocValueSelector implements IndexRearranger.DocumentSelector, Serializable { private final String field; - private final HashSet<String> keySet; + private final Set<BytesRef> keySet; - public BinaryDocValueSelector(String field, HashSet<String> keySet) { + public BinaryDocValueSelector(String field, Set<BytesRef> keySet) { this.field = field; this.keySet = keySet; } @Override - public BitSet getFilteredLiveDocs(CodecReader reader) throws IOException { + public BitSet getFilteredDocs(CodecReader reader) throws IOException { BinaryDocValues binaryDocValues = reader.getBinaryDocValues(field); - Bits oldLiveDocs = reader.getLiveDocs(); FixedBitSet bits = new FixedBitSet(reader.maxDoc()); - for (int i = 0; i < reader.maxDoc(); i++) { - if (oldLiveDocs != null && oldLiveDocs.get(i) == false) { - continue; - } - if (binaryDocValues.advanceExact(i) - && keySet.contains(binaryDocValues.binaryValue().utf8ToString())) { - bits.set(i); + for (int docid = 0; docid < reader.maxDoc(); docid++) { + if (binaryDocValues.advanceExact(docid) && keySet.contains(binaryDocValues.binaryValue())) { + bits.set(docid); } } return bits; } - public static List<IndexRearranger.DocumentSelector> createFromExistingIndex( + /** + * Create a selector for the deletes in an index, which can then be applied to a rearranged index + * + * @param field tells which {@link BinaryDocValues} are the unique key + * @param directory where the original index is present + * @return a deletes selector to be passed to {@link IndexRearranger} + */ + public static IndexRearranger.DocumentSelector createDeleteSelectorFromIndex( String field, Directory directory) throws IOException { - List<IndexRearranger.DocumentSelector> selectors = new ArrayList<>(); + + Set<BytesRef> keySet = new HashSet<>(); + try (IndexReader reader = DirectoryReader.open(directory)) { for (LeafReaderContext context : reader.leaves()) { - HashSet<String> keySet = new HashSet<>(); Bits liveDocs = context.reader().getLiveDocs(); + if (liveDocs == null) { + continue; + } + BinaryDocValues binaryDocValues = context.reader().getBinaryDocValues(field); - for (int i = 0; i < context.reader().maxDoc(); i++) { - if (liveDocs != null && liveDocs.get(i) == false) { + for (int docid = 0; docid < context.reader().maxDoc(); docid++) { + if (liveDocs.get(docid) == true) { continue; } - if (binaryDocValues.advanceExact(i)) { - keySet.add(binaryDocValues.binaryValue().utf8ToString()); + + if (binaryDocValues.advanceExact(docid)) { + keySet.add(new BytesRef(binaryDocValues.binaryValue().bytes.clone())); + } else { + throw new AssertionError("Document " + docid + " doesn't have key " + field); + } + } + } + } + return new BinaryDocValueSelector(field, keySet); + } + + /** + * Create a list of selectors that will reproduce the index geometry when used with {@link + * IndexRearranger} + * + * @param field tells which {@link BinaryDocValues} are the unique key + * @param directory where the original index is present + * @return a list of selectors to be passed to {@link IndexRearranger} + */ + public static List<IndexRearranger.DocumentSelector> createLiveSelectorsFromIndex( + String field, Directory directory) throws IOException { + + List<IndexRearranger.DocumentSelector> selectors = new ArrayList<>(); + + try (IndexReader reader = DirectoryReader.open(directory)) { + for (LeafReaderContext context : reader.leaves()) { + Set<BytesRef> keySet = new HashSet<>(); + BinaryDocValues binaryDocValues = context.reader().getBinaryDocValues(field); + + for (int docid = 0; docid < context.reader().maxDoc(); docid++) { + if (binaryDocValues.advanceExact(docid)) { + keySet.add(new BytesRef(binaryDocValues.binaryValue().bytes.clone())); Review Comment: Oh, I see. Changed in latest commit. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org