zhaih commented on code in PR #11815:
URL: https://github.com/apache/lucene/pull/11815#discussion_r1027612271


##########
lucene/misc/src/java/org/apache/lucene/misc/index/BinaryDocValueSelector.java:
##########
@@ -30,52 +31,94 @@
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BitSet;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
 
-/** Select documents using binary doc values */
+/**
+ * Use this selector to rearrange an index where documents can be uniquely 
identified based on
+ * {@link BinaryDocValues}
+ */
 public class BinaryDocValueSelector implements 
IndexRearranger.DocumentSelector, Serializable {
 
   private final String field;
-  private final HashSet<String> keySet;
+  private final Set<BytesRef> keySet;
 
-  public BinaryDocValueSelector(String field, HashSet<String> keySet) {
+  public BinaryDocValueSelector(String field, Set<BytesRef> keySet) {
     this.field = field;
     this.keySet = keySet;
   }
 
   @Override
-  public BitSet getFilteredLiveDocs(CodecReader reader) throws IOException {
+  public BitSet getFilteredDocs(CodecReader reader) throws IOException {
     BinaryDocValues binaryDocValues = reader.getBinaryDocValues(field);
-    Bits oldLiveDocs = reader.getLiveDocs();
     FixedBitSet bits = new FixedBitSet(reader.maxDoc());
-    for (int i = 0; i < reader.maxDoc(); i++) {
-      if (oldLiveDocs != null && oldLiveDocs.get(i) == false) {
-        continue;
-      }
-      if (binaryDocValues.advanceExact(i)
-          && keySet.contains(binaryDocValues.binaryValue().utf8ToString())) {
-        bits.set(i);
+    for (int docid = 0; docid < reader.maxDoc(); docid++) {
+      if (binaryDocValues.advanceExact(docid) && 
keySet.contains(binaryDocValues.binaryValue())) {
+        bits.set(docid);
       }
     }
     return bits;
   }
 
-  public static List<IndexRearranger.DocumentSelector> createFromExistingIndex(
+  /**
+   * Create a selector for the deletes in an index, which can then be applied 
to a rearranged index
+   *
+   * @param field tells which {@link BinaryDocValues} are the unique key
+   * @param directory where the original index is present
+   * @return a deletes selector to be passed to {@link IndexRearranger}
+   */
+  public static IndexRearranger.DocumentSelector createDeleteSelectorFromIndex(
       String field, Directory directory) throws IOException {
-    List<IndexRearranger.DocumentSelector> selectors = new ArrayList<>();
+
+    Set<BytesRef> keySet = new HashSet<>();
+
     try (IndexReader reader = DirectoryReader.open(directory)) {
       for (LeafReaderContext context : reader.leaves()) {
-        HashSet<String> keySet = new HashSet<>();
         Bits liveDocs = context.reader().getLiveDocs();
+        if (liveDocs == null) {
+          continue;
+        }
+
         BinaryDocValues binaryDocValues = 
context.reader().getBinaryDocValues(field);
-        for (int i = 0; i < context.reader().maxDoc(); i++) {
-          if (liveDocs != null && liveDocs.get(i) == false) {
+        for (int docid = 0; docid < context.reader().maxDoc(); docid++) {
+          if (liveDocs.get(docid) == true) {
             continue;
           }
-          if (binaryDocValues.advanceExact(i)) {
-            keySet.add(binaryDocValues.binaryValue().utf8ToString());
+
+          if (binaryDocValues.advanceExact(docid)) {
+            keySet.add(new 
BytesRef(binaryDocValues.binaryValue().bytes.clone()));

Review Comment:
   See comment below :)



##########
lucene/misc/src/java/org/apache/lucene/misc/index/BinaryDocValueSelector.java:
##########
@@ -30,52 +31,94 @@
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BitSet;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
 
-/** Select documents using binary doc values */
+/**
+ * Use this selector to rearrange an index where documents can be uniquely 
identified based on
+ * {@link BinaryDocValues}
+ */
 public class BinaryDocValueSelector implements 
IndexRearranger.DocumentSelector, Serializable {
 
   private final String field;
-  private final HashSet<String> keySet;
+  private final Set<BytesRef> keySet;
 
-  public BinaryDocValueSelector(String field, HashSet<String> keySet) {
+  public BinaryDocValueSelector(String field, Set<BytesRef> keySet) {
     this.field = field;
     this.keySet = keySet;
   }
 
   @Override
-  public BitSet getFilteredLiveDocs(CodecReader reader) throws IOException {
+  public BitSet getFilteredDocs(CodecReader reader) throws IOException {
     BinaryDocValues binaryDocValues = reader.getBinaryDocValues(field);
-    Bits oldLiveDocs = reader.getLiveDocs();
     FixedBitSet bits = new FixedBitSet(reader.maxDoc());
-    for (int i = 0; i < reader.maxDoc(); i++) {
-      if (oldLiveDocs != null && oldLiveDocs.get(i) == false) {
-        continue;
-      }
-      if (binaryDocValues.advanceExact(i)
-          && keySet.contains(binaryDocValues.binaryValue().utf8ToString())) {
-        bits.set(i);
+    for (int docid = 0; docid < reader.maxDoc(); docid++) {
+      if (binaryDocValues.advanceExact(docid) && 
keySet.contains(binaryDocValues.binaryValue())) {
+        bits.set(docid);
       }
     }
     return bits;
   }
 
-  public static List<IndexRearranger.DocumentSelector> createFromExistingIndex(
+  /**
+   * Create a selector for the deletes in an index, which can then be applied 
to a rearranged index
+   *
+   * @param field tells which {@link BinaryDocValues} are the unique key
+   * @param directory where the original index is present
+   * @return a deletes selector to be passed to {@link IndexRearranger}
+   */
+  public static IndexRearranger.DocumentSelector createDeleteSelectorFromIndex(
       String field, Directory directory) throws IOException {
-    List<IndexRearranger.DocumentSelector> selectors = new ArrayList<>();
+
+    Set<BytesRef> keySet = new HashSet<>();
+
     try (IndexReader reader = DirectoryReader.open(directory)) {
       for (LeafReaderContext context : reader.leaves()) {
-        HashSet<String> keySet = new HashSet<>();
         Bits liveDocs = context.reader().getLiveDocs();
+        if (liveDocs == null) {
+          continue;
+        }
+
         BinaryDocValues binaryDocValues = 
context.reader().getBinaryDocValues(field);
-        for (int i = 0; i < context.reader().maxDoc(); i++) {
-          if (liveDocs != null && liveDocs.get(i) == false) {
+        for (int docid = 0; docid < context.reader().maxDoc(); docid++) {
+          if (liveDocs.get(docid) == true) {
             continue;
           }
-          if (binaryDocValues.advanceExact(i)) {
-            keySet.add(binaryDocValues.binaryValue().utf8ToString());
+
+          if (binaryDocValues.advanceExact(docid)) {
+            keySet.add(new 
BytesRef(binaryDocValues.binaryValue().bytes.clone()));
+          } else {
+            throw new AssertionError("Document " + docid + " doesn't have key 
" + field);
+          }
+        }
+      }
+    }
+    return new BinaryDocValueSelector(field, keySet);
+  }
+
+  /**
+   * Create a list of selectors that will reproduce the index geometry when 
used with {@link
+   * IndexRearranger}
+   *
+   * @param field tells which {@link BinaryDocValues} are the unique key
+   * @param directory where the original index is present
+   * @return a list of selectors to be passed to {@link IndexRearranger}
+   */
+  public static List<IndexRearranger.DocumentSelector> 
createLiveSelectorsFromIndex(
+      String field, Directory directory) throws IOException {
+
+    List<IndexRearranger.DocumentSelector> selectors = new ArrayList<>();
+
+    try (IndexReader reader = DirectoryReader.open(directory)) {
+      for (LeafReaderContext context : reader.leaves()) {
+        Set<BytesRef> keySet = new HashSet<>();
+        BinaryDocValues binaryDocValues = 
context.reader().getBinaryDocValues(field);
+
+        for (int docid = 0; docid < context.reader().maxDoc(); docid++) {
+          if (binaryDocValues.advanceExact(docid)) {
+            keySet.add(new 
BytesRef(binaryDocValues.binaryValue().bytes.clone()));

Review Comment:
   use `BytesRef.deepCopyOf(binaryDocValues.binaryValue())` instead?
   Directly copy underlying bytes array may cause some error when `offset` is 
not 0.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to