mikemccand commented on code in PR #12829:
URL: https://github.com/apache/lucene/pull/12829#discussion_r1415586966

##########
lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java:
##########
@@ -1678,6 +1678,51 @@ public void testIllegalIndexSortChange2() throws Exception {
     IOUtils.close(r1, dir1, w2, dir2);
   }
 
+  public void testIllegalIndexSortChange3() throws Exception {
+    Directory dir1 = newDirectory();
+    IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random()));
+    iwc1.setIndexSort(new Sort("foobar", new SortField("foo", SortField.Type.INT)));
+
+    RandomIndexWriter w1 = new RandomIndexWriter(random(), dir1, iwc1);
+    Document parent = new Document();
+    w1.addDocuments(Arrays.asList(new Document(), new Document(), parent));
+    w1.commit();
+    w1.addDocuments(Arrays.asList(new Document(), new Document(), parent));
+    w1.commit();
+    // so the index sort is in fact burned into the index:
+    w1.forceMerge(1);
+    w1.close();
+
+    Directory dir2 = newDirectory();
+    IndexWriterConfig iwc2 = newIndexWriterConfig(new MockAnalyzer(random()));
+    iwc2.setIndexSort(new Sort(new SortField("foo", SortField.Type.INT)));
+    RandomIndexWriter w2 = new RandomIndexWriter(random(), dir2, iwc2);
+
+    IndexReader r1 = DirectoryReader.open(dir1);
+    String message =
+        expectThrows(
+                IllegalArgumentException.class,
+                () -> {
+                  w2.addIndexes((SegmentReader) getOnlyLeafReader(r1));
+                })
+            .getMessage();
+    assertEquals(
+        "cannot change index sort from parent field: foobar <int: \"foo\"> to <int: \"foo\">",
+        message);
+
+    message =
+        expectThrows(
+                IllegalArgumentException.class,
+                () -> {
+                  w2.addIndexes(dir1);

Review Comment:
   Can we also test that if you `.addIndexes` with the correct configuration (same parent sort field) things are good?


##########
lucene/core/src/java/org/apache/lucene/index/CheckIndex.java:
##########
@@ -1198,33 +1198,42 @@ public static Status.IndexSortStatus testSort(
       comparators[i] = fields[i].getComparator(1, Pruning.NONE).getLeafComparator(readerContext);
     }
 
-    int maxDoc = reader.maxDoc();
-    try {
-
-      for (int docID = 1; docID < maxDoc; docID++) {
-
+    LeafMetaData metaData = reader.getMetaData();
+    if (metaData.hasBlocks()
+        && sort.getParentField() == null
+        && metaData.getCreatedVersionMajor() >= Version.LUCENE_10_0_0.major) {
+      throw new IllegalStateException(
+          "parent field is not set but the index has block and was created with version: "

Review Comment:
   `has block` -> `has document blocks`?
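
To make the `.addIndexes` suggestion above concrete, a happy-path test could look roughly like the sketch below. The test name is made up, it reuses the scaffolding of `testIllegalIndexSortChange3` quoted above, and the only real assertion is that neither `addIndexes` flavor throws when the parent sort field matches on both sides:

```java
public void testMatchingParentFieldIndexSort() throws Exception {
  // hypothetical test: same parent field ("foobar") and same sort on source and destination
  Directory dir1 = newDirectory();
  IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random()));
  iwc1.setIndexSort(new Sort("foobar", new SortField("foo", SortField.Type.INT)));
  RandomIndexWriter w1 = new RandomIndexWriter(random(), dir1, iwc1);
  Document parent = new Document();
  w1.addDocuments(Arrays.asList(new Document(), new Document(), parent));
  w1.commit();
  // so the index sort is in fact burned into the index:
  w1.forceMerge(1);
  w1.close();

  Directory dir2 = newDirectory();
  IndexWriterConfig iwc2 = newIndexWriterConfig(new MockAnalyzer(random()));
  iwc2.setIndexSort(new Sort("foobar", new SortField("foo", SortField.Type.INT)));
  RandomIndexWriter w2 = new RandomIndexWriter(random(), dir2, iwc2);

  // both addIndexes flavors should be accepted when the parent sort field is identical
  w2.addIndexes(dir1);
  IndexReader r1 = DirectoryReader.open(dir1);
  w2.addIndexes((SegmentReader) getOnlyLeafReader(r1));
  w2.forceMerge(1);

  IOUtils.close(r1, w2, dir2, dir1);
}
```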

##########
lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBackwardsCompatibility.java:
##########
@@ -2164,6 +2166,83 @@ public void testSortedIndex() throws Exception {
     }
   }
 
+  public void testSortedIndexAddDocBlocks() throws Exception {
+    for (String name : oldSortedNames) {
+      Path path = createTempDir("sorted");
+      InputStream resource = TestBackwardsCompatibility.class.getResourceAsStream(name + ".zip");
+      assertNotNull("Sorted index index " + name + " not found", resource);
+      TestUtil.unzip(resource, path);
+
+      try (Directory dir = newFSDirectory(path)) {
+        final Sort sort;
+        try (DirectoryReader reader = DirectoryReader.open(dir)) {
+          assertEquals(1, reader.leaves().size());
+          sort = reader.leaves().get(0).reader().getMetaData().getSort();
+          assertNotNull(sort);
+          searchExampleIndex(reader);
+        }
+        // open writer
+        try (IndexWriter writer =
+            new IndexWriter(
+                dir,
+                newIndexWriterConfig(new MockAnalyzer(random()))
+                    .setOpenMode(OpenMode.APPEND)
+                    .setIndexSort(sort)
+                    .setMergePolicy(newLogMergePolicy()))) {
+          // add 10 docs
+          for (int i = 0; i < 10; i++) {
+            Document child = new Document();
+            child.add(new StringField("relation", "child", Field.Store.NO));
+            child.add(new StringField("bid", "" + i, Field.Store.NO));
+            child.add(new NumericDocValuesField("dateDV", i));
+            Document parent = new Document();
+            parent.add(new StringField("relation", "parent", Field.Store.NO));
+            parent.add(new StringField("bid", "" + i, Field.Store.NO));
+            parent.add(new NumericDocValuesField("dateDV", i));
+            writer.addDocuments(Arrays.asList(child, child, parent));
+            if (random().nextBoolean()) {
+              writer.flush();
+            }
+          }
+          if (random().nextBoolean()) {
+            writer.forceMerge(1);
+          }
+          writer.commit();
+          try (IndexReader reader = DirectoryReader.open(dir)) {
+            IndexSearcher searcher = new IndexSearcher(reader);
+            for (int i = 0; i < 10; i++) {
+              TopDocs children =
+                  searcher.search(
+                      new BooleanQuery.Builder()
+                          .add(
+                              new TermQuery(new Term("relation", "child")),
+                              BooleanClause.Occur.MUST)
+                          .add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST)
+                          .build(),
+                      2);
+              TopDocs parents =
+                  searcher.search(
+                      new BooleanQuery.Builder()
+                          .add(
+                              new TermQuery(new Term("relation", "parent")),
+                              BooleanClause.Occur.MUST)
+                          .add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST)
+                          .build(),
+                      2);
+              assertEquals(2, children.totalHits.value);
+              assertEquals(1, parents.totalHits.value);
+              // make sure it's sorted
+              assertEquals(children.scoreDocs[0].doc + 1, children.scoreDocs[1].doc);
+              assertEquals(children.scoreDocs[1].doc + 1, parents.scoreDocs[0].doc);
+            }
+          }
+        }
+        // This will confirm the docs are really sorted
+        TestUtil.checkIndex(dir);

Review Comment:
   Hmm can we fix `CheckIndex` to confirm the blocks are also intact?

   Edit: yay! It looks like you did that already :) Or rather, you fixed `CheckIndex` so that when it validates the sort it only checks that the parent docs are in the right order.
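
On the "only checks that the parent docs are in the right order" point: with a parent field the sort only has to hold between blocks, so each docID can be mapped to its enclosing block's parent before comparing. A self-contained sketch of that idea (the helper name is invented, and the PR's actual CheckIndex/Sorter code may differ in details):

```java
import java.io.IOException;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexSorter;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.Sort;
import org.apache.lucene.util.BitSet;

final class ParentAwareSortCheck {
  private ParentAwareSortCheck() {}

  /** Wraps a per-field comparator so that it compares the parents of the two docs. */
  static IndexSorter.DocComparator wrapForBlocks(
      LeafReader reader, Sort sort, IndexSorter.DocComparator in) throws IOException {
    NumericDocValues parentValues = reader.getNumericDocValues(sort.getParentField());
    if (parentValues == null) {
      throw new CorruptIndexException(
          "parent field [" + sort.getParentField() + "] has no doc values", "sort check");
    }
    // one set bit per parent doc, i.e. per last document of each block
    BitSet parents = BitSet.of(parentValues, reader.maxDoc());
    // child docs of the same block map to the same parent and therefore compare as equal
    return (docID1, docID2) ->
        in.compare(parents.nextSetBit(docID1), parents.nextSetBit(docID2));
  }
}
```

With a wrapper like that, the check only requires the parent docs to be non-decreasing under the sort; the relative order of children inside a block is left to the block itself.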

##########
lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBackwardsCompatibility.java:
##########
@@ -2164,6 +2166,83 @@ public void testSortedIndex() throws Exception {
     }
   }
 
+  public void testSortedIndexAddDocBlocks() throws Exception {
+    for (String name : oldSortedNames) {
+      Path path = createTempDir("sorted");
+      InputStream resource = TestBackwardsCompatibility.class.getResourceAsStream(name + ".zip");
+      assertNotNull("Sorted index index " + name + " not found", resource);
+      TestUtil.unzip(resource, path);
+
+      try (Directory dir = newFSDirectory(path)) {
+        final Sort sort;
+        try (DirectoryReader reader = DirectoryReader.open(dir)) {
+          assertEquals(1, reader.leaves().size());
+          sort = reader.leaves().get(0).reader().getMetaData().getSort();
+          assertNotNull(sort);
+          searchExampleIndex(reader);
+        }
+        // open writer
+        try (IndexWriter writer =
+            new IndexWriter(
+                dir,
+                newIndexWriterConfig(new MockAnalyzer(random()))
+                    .setOpenMode(OpenMode.APPEND)
+                    .setIndexSort(sort)
+                    .setMergePolicy(newLogMergePolicy()))) {
+          // add 10 docs
+          for (int i = 0; i < 10; i++) {

Review Comment:
   This is testing indexing doc blocks into an old (pre-10.0) index right? In this case does IW still add the DV field to the parent doc? The test here seems not to be configuring a parent field, so I guess no? Only indices created 10.0+ will do this?


##########
lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java:
##########
@@ -262,6 +290,44 @@ long updateDocuments(
     }
   }
 
+  private Iterable<? extends IndexableField> filterParentDocField(
+      Iterable<? extends IndexableField> doc,
+      String parentFieldName,
+      NumericDocValuesField parentDocMarker) {
+    return () -> {
+      final Iterator<? extends IndexableField> first = doc.iterator();
+      return new Iterator<>() {

Review Comment:
   I'm concerned about the performance impact of re-iterating / checking every Iterable the caller already passed in. Should we test the performance impact of blocks indexing? `luceneutil` indexes doc blocks for grouping ... we could test the impact there? Once this PR gets closer let me know and I'm happy to test.


##########
lucene/core/src/java/org/apache/lucene/index/IndexingChain.java:
##########
@@ -219,15 +222,33 @@ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOExcepti
     }
 
     LeafReader docValuesReader = getDocValuesLeafReader();
-
+    Function<IndexSorter.DocComparator, IndexSorter.DocComparator> comparatorWrapper = in -> in;
+
+    if (state.segmentInfo.getHasBlocks() && indexSort.getParentField() != null) {
+      final DocIdSetIterator readerValues =

Review Comment:
   Check that this is non-null and throw `CorruptIndexException` or so if it is null?


##########
lucene/core/src/java/org/apache/lucene/search/Sort.java:
##########
@@ -59,10 +61,28 @@ public Sort() {
    * is still a tie after all SortFields are checked, the internal Lucene docid is used to break it.
    */
   public Sort(SortField... fields) {
+    this(null, fields);
+  }
+
+  /**
+   * Sets the sort to the given criteria in succession: the first SortField is checked first, but if
+   * it produces a tie, then the second SortField is used to break the tie, etc. Finally, if there
+   * is still a tie after all SortFields are checked, the internal Lucene docid is used to break it.
+   *
+   * @param parentField the name of a numeric doc values field that marks the last document of a

Review Comment:
   I would rather not change `Sort` if possible.

   It's weird that it needs to know about a `parentField`, and IW should take care during static sort time (flush, merge) to validate that what it (IWC) knows as the parent field is consistent with the index-time sort?


##########
lucene/core/src/java/org/apache/lucene/index/Sorter.java:
##########
@@ -206,13 +209,28 @@ DocMap sort(LeafReader reader) throws IOException {
     SortField[] fields = sort.getSort();
     final IndexSorter.DocComparator[] comparators = new IndexSorter.DocComparator[fields.length];
+    Function<IndexSorter.DocComparator, IndexSorter.DocComparator> comparatorWrapper = in -> in;
+    LeafMetaData metaData = reader.getMetaData();
+    if (metaData.hasBlocks() && sort.getParentField() != null) {
+      BitSet parents =
+          BitSet.of(reader.getNumericDocValues(sort.getParentField()), reader.maxDoc());
+      comparatorWrapper =
+          in ->
+              (docID1, docID2) ->
+                  in.compare(parents.nextSetBit(docID1), parents.nextSetBit(docID2));
+    }
+    assert metaData.hasBlocks() == false

Review Comment:
   Real `if` -> `CorruptIndexException`?


##########
lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java:
##########
@@ -231,7 +236,29 @@ long updateDocuments(
       final int docsInRamBefore = numDocsInRAM;
       boolean allDocsIndexed = false;
       try {
-        for (Iterable<? extends IndexableField> doc : docs) {
+        final Iterator<? extends Iterable<? extends IndexableField>> iterator = docs.iterator();
+        while (iterator.hasNext()) {
+          Iterable<? extends IndexableField> doc = iterator.next();
+          if (segmentInfo.getIndexSort() != null) {
+            String parentFieldName = segmentInfo.getIndexSort().getParentField();
+            if (parentFieldName != null) {
+              final NumericDocValuesField parentField;
+              if (iterator.hasNext() == false) {
+                int numChildren = numDocsInRAM - docsInRamBefore;
+                parentField = new NumericDocValuesField(parentFieldName, numChildren);
+              } else {
+                parentField = null;
+              }
+              doc = filterParentDocField(doc, parentFieldName, parentField);
+            } else if (iterator.hasNext() && indexVersionCreated >= Version.LUCENE_10_0_0.major) {
+              // sort is configured but parent field is missing, yet we have a doc-block

Review Comment:
   So I think this means if you add a doc block, with only one document, it's OK to not have a `parentFieldName` configured, even if the index is created 10.0+? Maybe clarify that in the comment (why the `iterator.hasNext()` is being checked). It's sort of a degenerate usage of `addDocuments` that behaves like `addDocument`?

   Hmm, except, it's allowed to add a parent with no children I think? And then we'll fail to catch the misuse?


##########
lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java:
##########
@@ -231,7 +236,29 @@ long updateDocuments(
       final int docsInRamBefore = numDocsInRAM;
       boolean allDocsIndexed = false;
       try {
-        for (Iterable<? extends IndexableField> doc : docs) {
+        final Iterator<? extends Iterable<? extends IndexableField>> iterator = docs.iterator();
+        while (iterator.hasNext()) {
+          Iterable<? extends IndexableField> doc = iterator.next();
+          if (segmentInfo.getIndexSort() != null) {
+            String parentFieldName = segmentInfo.getIndexSort().getParentField();
+            if (parentFieldName != null) {
+              final NumericDocValuesField parentField;
+              if (iterator.hasNext() == false) {
+                int numChildren = numDocsInRAM - docsInRamBefore;
+                parentField = new NumericDocValuesField(parentFieldName, numChildren);

Review Comment:
   Aha! You are recording `numChildren`, yay.
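
Side note on the recorded value: per the quoted DWPT code, the marker appended to the last document of a block is a numeric doc values field whose value is the child count, so a reader could in principle recover block boundaries and sizes from it. A rough, hypothetical sketch (the helper, and the assumption that the configured parent field name is known to the caller, are mine, not something this PR promises):

```java
import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;

// Hypothetical helper, purely to illustrate what the parent marker encodes at flush time.
final class BlockSizes {
  private BlockSizes() {}

  static void print(Directory dir, String parentFieldName) throws IOException {
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
      for (LeafReaderContext ctx : reader.leaves()) {
        NumericDocValues marker = ctx.reader().getNumericDocValues(parentFieldName);
        if (marker == null) {
          continue; // this segment has no parent markers
        }
        int parentDoc;
        while ((parentDoc = marker.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          long numChildren = marker.longValue(); // value recorded by DWPT for this block
          System.out.println(
              "block ending at doc " + (ctx.docBase + parentDoc) + " has " + numChildren + " children");
        }
      }
    }
  }
}
```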

##########
lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java:
##########
@@ -231,7 +236,29 @@ long updateDocuments(
      final int docsInRamBefore = numDocsInRAM;
      boolean allDocsIndexed = false;
      try {
-        for (Iterable<? extends IndexableField> doc : docs) {
+        final Iterator<? extends Iterable<? extends IndexableField>> iterator = docs.iterator();
+        while (iterator.hasNext()) {
+          Iterable<? extends IndexableField> doc = iterator.next();
+          if (segmentInfo.getIndexSort() != null) {
+            String parentFieldName = segmentInfo.getIndexSort().getParentField();
+            if (parentFieldName != null) {
+              final NumericDocValuesField parentField;
+              if (iterator.hasNext() == false) {
+                int numChildren = numDocsInRAM - docsInRamBefore;
+                parentField = new NumericDocValuesField(parentFieldName, numChildren);
+              } else {
+                parentField = null;
+              }
+              doc = filterParentDocField(doc, parentFieldName, parentField);

Review Comment:
   OK so we are checking all child docs as well (that they do not illegally contain the parent field), as well as the parent doc (same, but also appending the parent field).


##########
lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java:
##########
@@ -262,6 +290,44 @@ long updateDocuments(
     }
   }
 
+  private Iterable<? extends IndexableField> filterParentDocField(

Review Comment:
   Can we rename this to `addParentDocField`? And as a side effect it is also validating that this field was not also added by the user? Maybe add that side effect above as a comment?

   Edit: I see we call this for child docs too ... maybe rename to `maybeAddParentDocField`? `filter` sounds like it's gonna remove stuff.


##########
lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java:
##########
@@ -262,6 +290,44 @@ long updateDocuments(
     }
   }
 
+  private Iterable<? extends IndexableField> filterParentDocField(
+      Iterable<? extends IndexableField> doc,
+      String parentFieldName,
+      NumericDocValuesField parentDocMarker) {
+    return () -> {
+      final Iterator<? extends IndexableField> first = doc.iterator();
+      return new Iterator<>() {
+        NumericDocValuesField additionalField = parentDocMarker;
+
+        @Override
+        public boolean hasNext() {
+          return first.hasNext() || additionalField != null;
+        }
+
+        @Override
+        public IndexableField next() {
+          if (first.hasNext()) {
+            IndexableField field = first.next();
+            if (parentFieldName.equals(field.name())
+                && DocValuesType.NUMERIC == field.fieldType().docValuesType()) {
+              throw new IllegalArgumentException(
+                  "documents must not contain the parent doc values field \""
+                      + parentFieldName
+                      + "\"");
+            }
+            return field;
+          }
+          if (additionalField != null) {
+            IndexableField field = additionalField;
+            additionalField = null;
+            return field;
+          }
+          throw new NoSuchElementException();

Review Comment:
   This might be hot-spotish? Can we allocate this once and always throw the same instance?
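
On the "allocate this once" idea, one possible shape is a single shared, stack-trace-free instance, roughly as below (the holder class and field name are made up; the trade-off is that a reused exception carries no per-call stack trace, which is exactly why it is cheap):

```java
import java.util.NoSuchElementException;

// Sketch only: one shared instance that skips stack-trace capture, so the hot iterator
// can do "throw IteratorExceptions.NO_MORE_FIELDS;" without allocating on every exhausted call.
final class IteratorExceptions {
  static final NoSuchElementException NO_MORE_FIELDS =
      new NoSuchElementException("no more fields in this document") {
        @Override
        public synchronized Throwable fillInStackTrace() {
          return this; // shared instance: do not record an allocation-site stack trace
        }
      };

  private IteratorExceptions() {}
}
```

Whether that is worth it probably depends on whether well-behaved callers ever drive the iterator past its end; if `hasNext()` is always consulted first, the throw should stay cold anyway.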

##########
lucene/core/src/java/org/apache/lucene/index/IndexingChain.java:
##########
@@ -219,15 +222,33 @@ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOExcepti
     }
 
     LeafReader docValuesReader = getDocValuesLeafReader();
-
+    Function<IndexSorter.DocComparator, IndexSorter.DocComparator> comparatorWrapper = in -> in;
+
+    if (state.segmentInfo.getHasBlocks() && indexSort.getParentField() != null) {
+      final DocIdSetIterator readerValues =
+          docValuesReader.getNumericDocValues(indexSort.getParentField());
+      BitSet parents = BitSet.of(readerValues, state.segmentInfo.maxDoc());
+      comparatorWrapper =
+          in ->
+              (docID1, docID2) ->
+                  in.compare(parents.nextSetBit(docID1), parents.nextSetBit(docID2));
+    }
+    assert state.segmentInfo.getHasBlocks() == false

Review Comment:
   Maybe a real `if` --> `CorruptIndexException`?


##########
lucene/core/src/java/org/apache/lucene/index/IndexingChain.java:
##########
@@ -219,15 +222,33 @@ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOExcepti
     }
 
    LeafReader docValuesReader = getDocValuesLeafReader();
-
+    Function<IndexSorter.DocComparator, IndexSorter.DocComparator> comparatorWrapper = in -> in;
+
+    if (state.segmentInfo.getHasBlocks() && indexSort.getParentField() != null) {
+      final DocIdSetIterator readerValues =
+          docValuesReader.getNumericDocValues(indexSort.getParentField());
+      BitSet parents = BitSet.of(readerValues, state.segmentInfo.maxDoc());
+      comparatorWrapper =
+          in ->
+              (docID1, docID2) ->
+                  in.compare(parents.nextSetBit(docID1), parents.nextSetBit(docID2));
+    }
+    assert state.segmentInfo.getHasBlocks() == false
+            || indexSort.getParentField() != null
+            || indexCreatedVersionMajor < Version.LUCENE_10_0_0.major
+        : "parent field is not set but the index has blocks. indexCreatedVersionMajor: "
+            + indexCreatedVersionMajor;
     List<IndexSorter.DocComparator> comparators = new ArrayList<>();
     for (int i = 0; i < indexSort.getSort().length; i++) {
       SortField sortField = indexSort.getSort()[i];
       IndexSorter sorter = sortField.getIndexSorter();
       if (sorter == null) {
         throw new UnsupportedOperationException("Cannot sort index using sort field " + sortField);
       }
-      comparators.add(sorter.getDocComparator(docValuesReader, state.segmentInfo.maxDoc()));
+
+      IndexSorter.DocComparator docComparator =

Review Comment:
   It looks like we are now relying on index sorting always being stable, because the child docs of one parent will always compare as equal to one another since we compare their parents? Do we have any tests/confirmation all of Lucene's (static) Sort options are stable?


--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org