mikemccand commented on code in PR #12829:
URL: https://github.com/apache/lucene/pull/12829#discussion_r1415586966

##########
lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java:
##########
@@ -1678,6 +1678,51 @@ public void testIllegalIndexSortChange2() throws Exception {
     IOUtils.close(r1, dir1, w2, dir2);
   }
 
+  public void testIllegalIndexSortChange3() throws Exception {
+    Directory dir1 = newDirectory();
+    IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random()));
+    iwc1.setIndexSort(new Sort("foobar", new SortField("foo", SortField.Type.INT)));
+
+    RandomIndexWriter w1 = new RandomIndexWriter(random(), dir1, iwc1);
+    Document parent = new Document();
+    w1.addDocuments(Arrays.asList(new Document(), new Document(), parent));
+    w1.commit();
+    w1.addDocuments(Arrays.asList(new Document(), new Document(), parent));
+    w1.commit();
+    // so the index sort is in fact burned into the index:
+    w1.forceMerge(1);
+    w1.close();
+
+    Directory dir2 = newDirectory();
+    IndexWriterConfig iwc2 = newIndexWriterConfig(new MockAnalyzer(random()));
+    iwc2.setIndexSort(new Sort(new SortField("foo", SortField.Type.INT)));
+    RandomIndexWriter w2 = new RandomIndexWriter(random(), dir2, iwc2);
+
+    IndexReader r1 = DirectoryReader.open(dir1);
+    String message =
+        expectThrows(
+                IllegalArgumentException.class,
+                () -> {
+                  w2.addIndexes((SegmentReader) getOnlyLeafReader(r1));
+                })
+            .getMessage();
+    assertEquals(
+        "cannot change index sort from parent field: foobar <int: \"foo\"> to <int: \"foo\">",
+        message);
+
+    message =
+        expectThrows(
+                IllegalArgumentException.class,
+                () -> {
+                  w2.addIndexes(dir1);

Review Comment:
   Can we also test that if you `.addIndexes` with the correct configuration (same parent sort field) things are good?


##########
lucene/core/src/java/org/apache/lucene/index/CheckIndex.java:
##########
@@ -1198,33 +1198,42 @@ public static Status.IndexSortStatus testSort(
       comparators[i] = fields[i].getComparator(1, Pruning.NONE).getLeafComparator(readerContext);
     }
 
-    int maxDoc = reader.maxDoc();
-    try {
-
-      for (int docID = 1; docID < maxDoc; docID++) {
-
+    LeafMetaData metaData = reader.getMetaData();
+    if (metaData.hasBlocks()
+        && sort.getParentField() == null
+        && metaData.getCreatedVersionMajor() >= Version.LUCENE_10_0_0.major) {
+      throw new IllegalStateException(
+          "parent field is not set but the index has block and was created with version: "

Review Comment:
   `has block` -> `has document blocks`?
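
To make the `.addIndexes` suggestion above concrete, a happy-path test could look roughly like the sketch below. The test name is made up, it reuses the scaffolding of `testIllegalIndexSortChange3` quoted above, and the only real assertion is that neither `addIndexes` flavor throws when the parent sort field matches on both sides:

```java
public void testMatchingParentFieldIndexSort() throws Exception {
  // hypothetical test: same parent field ("foobar") and same sort on source and destination
  Directory dir1 = newDirectory();
  IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random()));
  iwc1.setIndexSort(new Sort("foobar", new SortField("foo", SortField.Type.INT)));
  RandomIndexWriter w1 = new RandomIndexWriter(random(), dir1, iwc1);
  Document parent = new Document();
  w1.addDocuments(Arrays.asList(new Document(), new Document(), parent));
  w1.commit();
  // so the index sort is in fact burned into the index:
  w1.forceMerge(1);
  w1.close();

  Directory dir2 = newDirectory();
  IndexWriterConfig iwc2 = newIndexWriterConfig(new MockAnalyzer(random()));
  iwc2.setIndexSort(new Sort("foobar", new SortField("foo", SortField.Type.INT)));
  RandomIndexWriter w2 = new RandomIndexWriter(random(), dir2, iwc2);

  // both addIndexes flavors should be accepted when the parent sort field is identical
  w2.addIndexes(dir1);
  IndexReader r1 = DirectoryReader.open(dir1);
  w2.addIndexes((SegmentReader) getOnlyLeafReader(r1));
  w2.forceMerge(1);

  IOUtils.close(r1, w2, dir2, dir1);
}
```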

##########
lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBackwardsCompatibility.java:
##########
@@ -2164,6 +2166,83 @@ public void testSortedIndex() throws Exception {
     }
   }
 
+  public void testSortedIndexAddDocBlocks() throws Exception {
+    for (String name : oldSortedNames) {
+      Path path = createTempDir("sorted");
+      InputStream resource = TestBackwardsCompatibility.class.getResourceAsStream(name + ".zip");
+      assertNotNull("Sorted index index " + name + " not found", resource);
+      TestUtil.unzip(resource, path);
+
+      try (Directory dir = newFSDirectory(path)) {
+        final Sort sort;
+        try (DirectoryReader reader = DirectoryReader.open(dir)) {
+          assertEquals(1, reader.leaves().size());
+          sort = reader.leaves().get(0).reader().getMetaData().getSort();
+          assertNotNull(sort);
+          searchExampleIndex(reader);
+        }
+        // open writer
+        try (IndexWriter writer =
+            new IndexWriter(
+                dir,
+                newIndexWriterConfig(new MockAnalyzer(random()))
+                    .setOpenMode(OpenMode.APPEND)
+                    .setIndexSort(sort)
+                    .setMergePolicy(newLogMergePolicy()))) {
+          // add 10 docs
+          for (int i = 0; i < 10; i++) {
+            Document child = new Document();
+            child.add(new StringField("relation", "child", Field.Store.NO));
+            child.add(new StringField("bid", "" + i, Field.Store.NO));
+            child.add(new NumericDocValuesField("dateDV", i));
+            Document parent = new Document();
+            parent.add(new StringField("relation", "parent", Field.Store.NO));
+            parent.add(new StringField("bid", "" + i, Field.Store.NO));
+            parent.add(new NumericDocValuesField("dateDV", i));
+            writer.addDocuments(Arrays.asList(child, child, parent));
+            if (random().nextBoolean()) {
+              writer.flush();
+            }
+          }
+          if (random().nextBoolean()) {
+            writer.forceMerge(1);
+          }
+          writer.commit();
+          try (IndexReader reader = DirectoryReader.open(dir)) {
+            IndexSearcher searcher = new IndexSearcher(reader);
+            for (int i = 0; i < 10; i++) {
+              TopDocs children =
+                  searcher.search(
+                      new BooleanQuery.Builder()
+                          .add(
+                              new TermQuery(new Term("relation", "child")),
+                              BooleanClause.Occur.MUST)
+                          .add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST)
+                          .build(),
+                      2);
+              TopDocs parents =
+                  searcher.search(
+                      new BooleanQuery.Builder()
+                          .add(
+                              new TermQuery(new Term("relation", "parent")),
+                              BooleanClause.Occur.MUST)
+                          .add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST)
+                          .build(),
+                      2);
+              assertEquals(2, children.totalHits.value);
+              assertEquals(1, parents.totalHits.value);
+              // make sure it's sorted
+              assertEquals(children.scoreDocs[0].doc + 1, children.scoreDocs[1].doc);
+              assertEquals(children.scoreDocs[1].doc + 1, parents.scoreDocs[0].doc);
+            }
+          }
+        }
+        // This will confirm the docs are really sorted
+        TestUtil.checkIndex(dir);

Review Comment:
   Hmm can we fix `CheckIndex` to confirm the blocks are also intact?

   Edit: yay! It looks like you did that already :) Or rather, you fixed `CheckIndex` so that when it validates the sort it only checks that the parent docs are in the right order.
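
On the "only checks that the parent docs are in the right order" point: with a parent field the sort only has to hold between blocks, so each docID can be mapped to its enclosing block's parent before comparing. A self-contained sketch of that idea (the helper name is invented, and the PR's actual CheckIndex/Sorter code may differ in details):

```java
import java.io.IOException;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexSorter;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.Sort;
import org.apache.lucene.util.BitSet;

final class ParentAwareSortCheck {
  private ParentAwareSortCheck() {}

  /** Wraps a per-field comparator so that it compares the parents of the two docs. */
  static IndexSorter.DocComparator wrapForBlocks(
      LeafReader reader, Sort sort, IndexSorter.DocComparator in) throws IOException {
    NumericDocValues parentValues = reader.getNumericDocValues(sort.getParentField());
    if (parentValues == null) {
      throw new CorruptIndexException(
          "parent field [" + sort.getParentField() + "] has no doc values", "sort check");
    }
    // one set bit per parent doc, i.e. per last document of each block
    BitSet parents = BitSet.of(parentValues, reader.maxDoc());
    // child docs of the same block map to the same parent and therefore compare as equal
    return (docID1, docID2) ->
        in.compare(parents.nextSetBit(docID1), parents.nextSetBit(docID2));
  }
}
```

With a wrapper like that, the check only requires the parent docs to be non-decreasing under the sort; the relative order of children inside a block is left to the block itself.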

##########
lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBackwardsCompatibility.java:
##########
@@ -2164,6 +2166,83 @@ public void testSortedIndex() throws Exception {
     }
   }
 
+  public void testSortedIndexAddDocBlocks() throws Exception {
+    for (String name : oldSortedNames) {
+      Path path = createTempDir("sorted");
+      InputStream resource = TestBackwardsCompatibility.class.getResourceAsStream(name + ".zip");
+      assertNotNull("Sorted index index " + name + " not found", resource);
+      TestUtil.unzip(resource, path);
+
+      try (Directory dir = newFSDirectory(path)) {
+        final Sort sort;
+        try (DirectoryReader reader = DirectoryReader.open(dir)) {
+          assertEquals(1, reader.leaves().size());
+          sort = reader.leaves().get(0).reader().getMetaData().getSort();
+          assertNotNull(sort);
+          searchExampleIndex(reader);
+        }
+        // open writer
+        try (IndexWriter writer =
+            new IndexWriter(
+                dir,
+                newIndexWriterConfig(new MockAnalyzer(random()))
+                    .setOpenMode(OpenMode.APPEND)
+                    .setIndexSort(sort)
+                    .setMergePolicy(newLogMergePolicy()))) {
+          // add 10 docs
+          for (int i = 0; i < 10; i++) {

Review Comment:
   This is testing indexing doc blocks into an old (pre-10.0) index right? In this case does IW still add the DV field to the parent doc? The test here seems not to be configuring a parent field, so I guess no? Only indices created 10.0+ will do this?


##########
lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java:
##########
@@ -262,6 +290,44 @@ long updateDocuments(
     }
   }
 
+  private Iterable<? extends IndexableField> filterParentDocField(
+      Iterable<? extends IndexableField> doc,
+      String parentFieldName,
+      NumericDocValuesField parentDocMarker) {
+    return () -> {
+      final Iterator<? extends IndexableField> first = doc.iterator();
+      return new Iterator<>() {

Review Comment:
   I'm concerned about the performance impact of re-iterating / checking every Iterable the caller already passed in. Should we test the performance impact of blocks indexing? `luceneutil` indexes doc blocks for grouping ... we could test the impact there? Once this PR gets closer let me know and I'm happy to test.


##########
lucene/core/src/java/org/apache/lucene/index/IndexingChain.java:
##########
@@ -219,15 +222,33 @@ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOExcepti
     }
 
     LeafReader docValuesReader = getDocValuesLeafReader();
-
+    Function<IndexSorter.DocComparator, IndexSorter.DocComparator> comparatorWrapper = in -> in;
+
+    if (state.segmentInfo.getHasBlocks() && indexSort.getParentField() != null) {
+      final DocIdSetIterator readerValues =

Review Comment:
   Check that this is non-null and throw `CorruptIndexException` or so if it is null?


##########
lucene/core/src/java/org/apache/lucene/search/Sort.java:
##########
@@ -59,10 +61,28 @@ public Sort() {
    * is still a tie after all SortFields are checked, the internal Lucene docid is used to break it.
    */
   public Sort(SortField... fields) {
+    this(null, fields);
+  }
+
+  /**
+   * Sets the sort to the given criteria in succession: the first SortField is checked first, but if
+   * it produces a tie, then the second SortField is used to break the tie, etc. Finally, if there
+   * is still a tie after all SortFields are checked, the internal Lucene docid is used to break it.
+   *
+   * @param parentField the name of a numeric doc values field that marks the last document of a

Review Comment:
   I would rather not change `Sort` if possible.

   It's weird that it needs to know about a `parentField`, and IW should take care during static sort time (flush, merge) to validate that what it (IWC) knows as the parent field is consistent with the index-time sort?


##########
lucene/core/src/java/org/apache/lucene/index/Sorter.java:
##########
@@ -206,13 +209,28 @@ DocMap sort(LeafReader reader) throws IOException {
     SortField[] fields = sort.getSort();
     final IndexSorter.DocComparator[] comparators = new IndexSorter.DocComparator[fields.length];
+    Function<IndexSorter.DocComparator, IndexSorter.DocComparator> comparatorWrapper = in -> in;
+    LeafMetaData metaData = reader.getMetaData();
+    if (metaData.hasBlocks() && sort.getParentField() != null) {
+      BitSet parents =
+          BitSet.of(reader.getNumericDocValues(sort.getParentField()), reader.maxDoc());
+      comparatorWrapper =
+          in ->
+              (docID1, docID2) ->
+                  in.compare(parents.nextSetBit(docID1), parents.nextSetBit(docID2));
+    }
+    assert metaData.hasBlocks() == false

Review Comment:
   Real `if` -> `CorruptIndexException`?


##########
lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java:
##########
@@ -231,7 +236,29 @@ long updateDocuments(
       final int docsInRamBefore = numDocsInRAM;
       boolean allDocsIndexed = false;
       try {
-        for (Iterable<? extends IndexableField> doc : docs) {
+        final Iterator<? extends Iterable<? extends IndexableField>> iterator = docs.iterator();
+        while (iterator.hasNext()) {
+          Iterable<? extends IndexableField> doc = iterator.next();
+          if (segmentInfo.getIndexSort() != null) {
+            String parentFieldName = segmentInfo.getIndexSort().getParentField();
+            if (parentFieldName != null) {
+              final NumericDocValuesField parentField;
+              if (iterator.hasNext() == false) {
+                int numChildren = numDocsInRAM - docsInRamBefore;
+                parentField = new NumericDocValuesField(parentFieldName, numChildren);
+              } else {
+                parentField = null;
+              }
+              doc = filterParentDocField(doc, parentFieldName, parentField);
+            } else if (iterator.hasNext() && indexVersionCreated >= Version.LUCENE_10_0_0.major) {
+              // sort is configured but parent field is missing, yet we have a doc-block

Review Comment:
   So I think this means if you add a doc block, with only one document, it's OK to not have a `parentFieldName` configured, even if the index is created 10.0+? Maybe clarify that in the comment (why the `iterator.hasNext()` is being checked). It's sort of a degenerate usage of `addDocuments` that behaves like `addDocument`?

   Hmm, except, it's allowed to add a parent with no children I think? And then we'll fail to catch the misuse?


##########
lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java:
##########
@@ -231,7 +236,29 @@ long updateDocuments(
       final int docsInRamBefore = numDocsInRAM;
       boolean allDocsIndexed = false;
       try {
-        for (Iterable<? extends IndexableField> doc : docs) {
+        final Iterator<? extends Iterable<? extends IndexableField>> iterator = docs.iterator();
+        while (iterator.hasNext()) {
+          Iterable<? extends IndexableField> doc = iterator.next();
+          if (segmentInfo.getIndexSort() != null) {
+            String parentFieldName = segmentInfo.getIndexSort().getParentField();
+            if (parentFieldName != null) {
+              final NumericDocValuesField parentField;
+              if (iterator.hasNext() == false) {
+                int numChildren = numDocsInRAM - docsInRamBefore;
+                parentField = new NumericDocValuesField(parentFieldName, numChildren);

Review Comment:
   Aha! You are recording `numChildren`, yay.
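
Side note on the recorded value: per the quoted DWPT code, the marker appended to the last document of a block is a numeric doc values field whose value is the child count, so a reader could in principle recover block boundaries and sizes from it. A rough, hypothetical sketch (the helper, and the assumption that the configured parent field name is known to the caller, are mine, not something this PR promises):

```java
import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;

// Hypothetical helper, purely to illustrate what the parent marker encodes at flush time.
final class BlockSizes {
  private BlockSizes() {}

  static void print(Directory dir, String parentFieldName) throws IOException {
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
      for (LeafReaderContext ctx : reader.leaves()) {
        NumericDocValues marker = ctx.reader().getNumericDocValues(parentFieldName);
        if (marker == null) {
          continue; // this segment has no parent markers
        }
        int parentDoc;
        while ((parentDoc = marker.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          long numChildren = marker.longValue(); // value recorded by DWPT for this block
          System.out.println(
              "block ending at doc " + (ctx.docBase + parentDoc) + " has " + numChildren + " children");
        }
      }
    }
  }
}
```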

##########
lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java:
##########
@@ -231,7 +236,29 @@ long updateDocuments(
      final int docsInRamBefore = numDocsInRAM;
      boolean allDocsIndexed = false;
      try {
-        for (Iterable<? extends IndexableField> doc : docs) {
+        final Iterator<? extends Iterable<? extends IndexableField>> iterator = docs.iterator();
+        while (iterator.hasNext()) {
+          Iterable<? extends IndexableField> doc = iterator.next();
+          if (segmentInfo.getIndexSort() != null) {
+            String parentFieldName = segmentInfo.getIndexSort().getParentField();
+            if (parentFieldName != null) {
+              final NumericDocValuesField parentField;
+              if (iterator.hasNext() == false) {
+                int numChildren = numDocsInRAM - docsInRamBefore;
+                parentField = new NumericDocValuesField(parentFieldName, numChildren);
+              } else {
+                parentField = null;
+              }
+              doc = filterParentDocField(doc, parentFieldName, parentField);

Review Comment:
   OK so we are checking all child docs as well (that they do not illegally contain the parent field), as well as the parent doc (same, but also appending the parent field).


##########
lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java:
##########
@@ -262,6 +290,44 @@ long updateDocuments(
     }
   }
 
+  private Iterable<? extends IndexableField> filterParentDocField(

Review Comment:
   Can we rename this to `addParentDocField`? And as a side effect it is also validating that this field was not also added by the user? Maybe add that side effect above as a comment?

   Edit: I see we call this for child docs too ... maybe rename to `maybeAddParentDocField`? `filter` sounds like it's gonna remove stuff.


##########
lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java:
##########
@@ -262,6 +290,44 @@ long updateDocuments(
     }
   }
 
+  private Iterable<? extends IndexableField> filterParentDocField(
+      Iterable<? extends IndexableField> doc,
+      String parentFieldName,
+      NumericDocValuesField parentDocMarker) {
+    return () -> {
+      final Iterator<? extends IndexableField> first = doc.iterator();
+      return new Iterator<>() {
+        NumericDocValuesField additionalField = parentDocMarker;
+
+        @Override
+        public boolean hasNext() {
+          return first.hasNext() || additionalField != null;
+        }
+
+        @Override
+        public IndexableField next() {
+          if (first.hasNext()) {
+            IndexableField field = first.next();
+            if (parentFieldName.equals(field.name())
+                && DocValuesType.NUMERIC == field.fieldType().docValuesType()) {
+              throw new IllegalArgumentException(
+                  "documents must not contain the parent doc values field \""
+                      + parentFieldName
+                      + "\"");
+            }
+            return field;
+          }
+          if (additionalField != null) {
+            IndexableField field = additionalField;
+            additionalField = null;
+            return field;
+          }
+          throw new NoSuchElementException();

Review Comment:
   This might be hot-spotish? Can we allocate this once and always throw the same instance?
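
On the "allocate this once" idea, one possible shape is a single shared, stack-trace-free instance, roughly as below (the holder class and field name are made up; the trade-off is that a reused exception carries no per-call stack trace, which is exactly why it is cheap):

```java
import java.util.NoSuchElementException;

// Sketch only: one shared instance that skips stack-trace capture, so the hot iterator
// can do "throw IteratorExceptions.NO_MORE_FIELDS;" without allocating on every exhausted call.
final class IteratorExceptions {
  static final NoSuchElementException NO_MORE_FIELDS =
      new NoSuchElementException("no more fields in this document") {
        @Override
        public synchronized Throwable fillInStackTrace() {
          return this; // shared instance: do not record an allocation-site stack trace
        }
      };

  private IteratorExceptions() {}
}
```

Whether that is worth it probably depends on whether well-behaved callers ever drive the iterator past its end; if `hasNext()` is always consulted first, the throw should stay cold anyway.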

##########
lucene/core/src/java/org/apache/lucene/index/IndexingChain.java:
##########
@@ -219,15 +222,33 @@ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOExcepti
     }
 
     LeafReader docValuesReader = getDocValuesLeafReader();
-
+    Function<IndexSorter.DocComparator, IndexSorter.DocComparator> comparatorWrapper = in -> in;
+
+    if (state.segmentInfo.getHasBlocks() && indexSort.getParentField() != null) {
+      final DocIdSetIterator readerValues =
+          docValuesReader.getNumericDocValues(indexSort.getParentField());
+      BitSet parents = BitSet.of(readerValues, state.segmentInfo.maxDoc());
+      comparatorWrapper =
+          in ->
+              (docID1, docID2) ->
+                  in.compare(parents.nextSetBit(docID1), parents.nextSetBit(docID2));
+    }
+    assert state.segmentInfo.getHasBlocks() == false

Review Comment:
   Maybe a real `if` --> `CorruptIndexException`?


##########
lucene/core/src/java/org/apache/lucene/index/IndexingChain.java:
##########
@@ -219,15 +222,33 @@ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOExcepti
     }
 
    LeafReader docValuesReader = getDocValuesLeafReader();
-
+    Function<IndexSorter.DocComparator, IndexSorter.DocComparator> comparatorWrapper = in -> in;
+
+    if (state.segmentInfo.getHasBlocks() && indexSort.getParentField() != null) {
+      final DocIdSetIterator readerValues =
+          docValuesReader.getNumericDocValues(indexSort.getParentField());
+      BitSet parents = BitSet.of(readerValues, state.segmentInfo.maxDoc());
+      comparatorWrapper =
+          in ->
+              (docID1, docID2) ->
+                  in.compare(parents.nextSetBit(docID1), parents.nextSetBit(docID2));
+    }
+    assert state.segmentInfo.getHasBlocks() == false
+            || indexSort.getParentField() != null
+            || indexCreatedVersionMajor < Version.LUCENE_10_0_0.major
+        : "parent field is not set but the index has blocks. indexCreatedVersionMajor: "
+            + indexCreatedVersionMajor;
     List<IndexSorter.DocComparator> comparators = new ArrayList<>();
     for (int i = 0; i < indexSort.getSort().length; i++) {
       SortField sortField = indexSort.getSort()[i];
       IndexSorter sorter = sortField.getIndexSorter();
       if (sorter == null) {
         throw new UnsupportedOperationException("Cannot sort index using sort field " + sortField);
       }
-      comparators.add(sorter.getDocComparator(docValuesReader, state.segmentInfo.maxDoc()));
+
+      IndexSorter.DocComparator docComparator =

Review Comment:
   It looks like we are now relying on index sorting always being stable, because the child docs of one parent will always compare as equal to one another since we compare their parents? Do we have any tests/confirmation all of Lucene's (static) Sort options are stable?


--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org