mikemccand commented on a change in pull request #443: URL: https://github.com/apache/lucene/pull/443#discussion_r751373424
########## File path: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/OrdinalMappingLeafReader.java ########## @@ -107,6 +113,64 @@ public BytesRef binaryValue() { } } + private class OrdinalMappingSortedNumericDocValues extends FilterSortedNumericDocValues { + private final IntArrayList currentValues; + private int currIndex; + + OrdinalMappingSortedNumericDocValues(SortedNumericDocValues in) { + super(in); + currentValues = new IntArrayList(32); + } + + @Override + public boolean advanceExact(int target) throws IOException { + boolean result = in.advanceExact(target); + if (result) { + reloadValues(); + } + return result; + } + + @Override + public int advance(int target) throws IOException { + int result = in.advance(target); + if (result != DocIdSetIterator.NO_MORE_DOCS) { + reloadValues(); + } + return result; + } + + @Override + public int nextDoc() throws IOException { + int result = in.nextDoc(); + if (result != DocIdSetIterator.NO_MORE_DOCS) { + reloadValues(); + } + return result; + } + + @Override + public int docValueCount() { + return currentValues.elementsCount; + } + + private void reloadValues() throws IOException { + currIndex = 0; + currentValues.clear(); + for (int i = 0; i < in.docValueCount(); i++) { + currentValues.add(ordinalMap[(int) in.nextValue()]); Review comment: Maybe (paranoia) `Math.toIntExact(...)` instead of straight `(int)` cast? ########## File path: lucene/facet/src/java/org/apache/lucene/facet/FacetUtils.java ########## @@ -81,4 +82,17 @@ public long cost() { } }; } + + /** + * Determine whether-or-not an index segment is using the older-style binary format or the newer + * NumericDocValues format for storing taxonomy faceting ordinals (for the specified field). + * + * @deprecated Please do not rely on this method. It is added as a temporary measure for providing + * index backwards-compatibility with Lucene 8 and earlier indexes, and will be removed in + * Lucene 10. + */ + @Deprecated + public static boolean usesOlderBinaryOrdinals(LeafReader reader) { + return reader.getMetaData().getCreatedVersionMajor() <= 8; Review comment: This is checking if the particular segment was created with version 8 or earlier? Hmm, or, is this method `.getCreatedVersionMajor()` referring to the whole index, even though it is this one segment that stored it? ########## File path: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/OrdinalMappingLeafReader.java ########## @@ -107,6 +113,64 @@ public BytesRef binaryValue() { } } + private class OrdinalMappingSortedNumericDocValues extends FilterSortedNumericDocValues { + private final IntArrayList currentValues; + private int currIndex; + + OrdinalMappingSortedNumericDocValues(SortedNumericDocValues in) { + super(in); + currentValues = new IntArrayList(32); + } + + @Override + public boolean advanceExact(int target) throws IOException { + boolean result = in.advanceExact(target); + if (result) { + reloadValues(); + } + return result; + } + + @Override + public int advance(int target) throws IOException { + int result = in.advance(target); + if (result != DocIdSetIterator.NO_MORE_DOCS) { + reloadValues(); + } + return result; + } + + @Override + public int nextDoc() throws IOException { + int result = in.nextDoc(); + if (result != DocIdSetIterator.NO_MORE_DOCS) { + reloadValues(); + } + return result; + } + + @Override + public int docValueCount() { + return currentValues.elementsCount; + } + + private void reloadValues() throws IOException { + currIndex = 0; + currentValues.clear(); + for (int i = 0; i < in.docValueCount(); i++) { + currentValues.add(ordinalMap[(int) in.nextValue()]); + } + Arrays.sort(currentValues.buffer, 0, currentValues.elementsCount); + } + + @Override + public long nextValue() { + int actual = currentValues.get(currIndex); Review comment: Maybe `assert currIndex < currentValues.elementCount`? ########## File path: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/OrdinalMappingLeafReader.java ########## @@ -107,6 +113,64 @@ public BytesRef binaryValue() { } } + private class OrdinalMappingSortedNumericDocValues extends FilterSortedNumericDocValues { + private final IntArrayList currentValues; + private int currIndex; + + OrdinalMappingSortedNumericDocValues(SortedNumericDocValues in) { + super(in); + currentValues = new IntArrayList(32); + } + + @Override + public boolean advanceExact(int target) throws IOException { + boolean result = in.advanceExact(target); + if (result) { + reloadValues(); + } + return result; + } + + @Override + public int advance(int target) throws IOException { + int result = in.advance(target); + if (result != DocIdSetIterator.NO_MORE_DOCS) { + reloadValues(); + } + return result; + } + + @Override + public int nextDoc() throws IOException { + int result = in.nextDoc(); + if (result != DocIdSetIterator.NO_MORE_DOCS) { + reloadValues(); + } + return result; + } + + @Override + public int docValueCount() { + return currentValues.elementsCount; + } + + private void reloadValues() throws IOException { + currIndex = 0; + currentValues.clear(); + for (int i = 0; i < in.docValueCount(); i++) { + currentValues.add(ordinalMap[(int) in.nextValue()]); + } + Arrays.sort(currentValues.buffer, 0, currentValues.elementsCount); Review comment: I assume we would never expect to see dups here right? The `OrdinalMap` should always be one to one? I wonder if anything enforces that. ########## File path: lucene/facet/src/java/org/apache/lucene/facet/FacetsConfig.java ########## @@ -409,9 +410,26 @@ private void processFacetFields( indexDrillDownTerms(doc, indexFieldName, dimConfig, facetLabel); } - // Facet counts: - // DocValues are considered stored fields: - doc.add(new BinaryDocValuesField(indexFieldName, dedupAndEncode(ordinals.get()))); + // Store the taxonomy ordinals associated with each doc. Prefer to use SortedNumericDocValues + // but "fall back" to a custom binary format to maintain backwards compatibility with Lucene 8 + // indexes. + if (taxoWriter.useNumericDocValuesForOrdinals()) { + // Dedupe and encode the ordinals. It's not important that we sort here + // (SortedNumericDocValuesField will handle this internally), but we + // sort to identify dups (since SNDVF doesn't dedupe): + IntsRef ords = ordinals.get(); Review comment: Minor: you could pull this line out and then use `ords` below and in the `else` clause. ########## File path: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java ########## @@ -69,31 +69,34 @@ public FastTaxonomyFacetCounts( } private final void count(List<MatchingDocs> matchingDocs) throws IOException { + if (matchingDocs.isEmpty()) { + return; + } + + boolean useBinaryDv = FacetUtils.usesOlderBinaryOrdinals(matchingDocs.get(0).context.reader()); Review comment: Maybe rename to `useOlderBinaryDv` to make it clear this is back-compat logic? ########## File path: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CachedOrdinalsReader.java ########## @@ -45,7 +45,11 @@ * * <p><b>NOTE:</b> create one instance of this and re-use it for all facet implementations (the * cache is per-instance, not static). + * + * @deprecated Custom binary encodings for taxonomy ordinals are no longer supported starting with Review comment: Maybe we should add a `MIGRATE` entry indicating that carrying forward a Lucene 8 index will continue to use the older/slower format even for newly indexed documents. To get the new way, you must create a new index with Lucene 9? In fact, we should generalize this entry for the other changes (moving off of stored fields for ord->label, moving to doc values for parent ords, etc.). This is a sort of dark side / trappiness to doing back compat by index created version. ########## File path: lucene/facet/src/java/org/apache/lucene/facet/FacetsConfig.java ########## @@ -409,9 +410,26 @@ private void processFacetFields( indexDrillDownTerms(doc, indexFieldName, dimConfig, facetLabel); } - // Facet counts: - // DocValues are considered stored fields: - doc.add(new BinaryDocValuesField(indexFieldName, dedupAndEncode(ordinals.get()))); + // Store the taxonomy ordinals associated with each doc. Prefer to use SortedNumericDocValues + // but "fall back" to a custom binary format to maintain backwards compatibility with Lucene 8 + // indexes. + if (taxoWriter.useNumericDocValuesForOrdinals()) { + // Dedupe and encode the ordinals. It's not important that we sort here + // (SortedNumericDocValuesField will handle this internally), but we Review comment: I wonder whether `SNDVF`'s internal sort is efficient when it's given already pre-sorted input? ########## File path: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetLabels.java ########## @@ -62,7 +62,16 @@ public TaxonomyFacetLabels(TaxonomyReader taxoReader, String indexFieldName) thr * @throws IOException when a low-level IO issue occurs */ public FacetLabelReader getFacetLabelReader(LeafReaderContext readerContext) throws IOException { - return new FacetLabelReader(ordsReader, readerContext); + // Support older binary format by leveraging an OrdinalsReader, which can still support both + // formats for now: + if (FacetUtils.usesOlderBinaryOrdinals(readerContext.reader())) { + OrdinalsReader ordsReader = new DocValuesOrdinalsReader(indexFieldName); + return new FacetLabelReader(ordsReader, readerContext); + } + + SortedNumericDocValues ordinalValues = Review comment: Can we `assert ordinalValues != null`? ########## File path: lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java ########## @@ -138,4 +331,51 @@ private Path getIndexDir() { path != null); return Paths.get(path); } + + private static class DummyDoubleValuesSource extends DoubleValuesSource { Review comment: You could maybe use `DoubleValuesSource.constant(1d)`? ########## File path: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java ########## @@ -69,31 +69,34 @@ public FastTaxonomyFacetCounts( } private final void count(List<MatchingDocs> matchingDocs) throws IOException { + if (matchingDocs.isEmpty()) { + return; + } + + boolean useBinaryDv = FacetUtils.usesOlderBinaryOrdinals(matchingDocs.get(0).context.reader()); + for (MatchingDocs hits : matchingDocs) { - BinaryDocValues dv = hits.context.reader().getBinaryDocValues(indexFieldName); - if (dv == null) { // this reader does not have DocValues for the requested category list + assert useBinaryDv == FacetUtils.usesOlderBinaryOrdinals(hits.context.reader()); Review comment: OK it must indeed be that that per-segment-seeming metadata is in fact global to the index. Maybe improve its javadocs :) ########## File path: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/BackCompatSortedNumericDocValues.java ########## @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.facet.taxonomy; + +import java.io.IOException; +import java.util.function.BiConsumer; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; + +/** + * Wraps a {@link BinaryDocValues} instance, providing a {@link SortedNumericDocValues} interface Review comment: Whoa, I like this approach! Pushes the back compat conversion "down low". ########## File path: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/OrdinalMappingLeafReader.java ########## @@ -107,6 +113,64 @@ public BytesRef binaryValue() { } } + private class OrdinalMappingSortedNumericDocValues extends FilterSortedNumericDocValues { + private final IntArrayList currentValues; + private int currIndex; + + OrdinalMappingSortedNumericDocValues(SortedNumericDocValues in) { + super(in); + currentValues = new IntArrayList(32); + } + + @Override + public boolean advanceExact(int target) throws IOException { + boolean result = in.advanceExact(target); + if (result) { + reloadValues(); + } + return result; + } + + @Override + public int advance(int target) throws IOException { + int result = in.advance(target); + if (result != DocIdSetIterator.NO_MORE_DOCS) { + reloadValues(); + } + return result; + } + + @Override + public int nextDoc() throws IOException { + int result = in.nextDoc(); + if (result != DocIdSetIterator.NO_MORE_DOCS) { + reloadValues(); + } + return result; + } + + @Override + public int docValueCount() { + return currentValues.elementsCount; + } + + private void reloadValues() throws IOException { + currIndex = 0; + currentValues.clear(); + for (int i = 0; i < in.docValueCount(); i++) { + currentValues.add(ordinalMap[(int) in.nextValue()]); + } + Arrays.sort(currentValues.buffer, 0, currentValues.elementsCount); + } + + @Override + public long nextValue() { + int actual = currentValues.get(currIndex); + currIndex++; Review comment: Thank you for pulling the increment out to its own line :) ########## File path: lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java ########## @@ -50,43 +74,192 @@ // Then move the zip file to your trunk checkout and use it in your test cases public static final String oldTaxonomyIndexName = "taxonomy.8.10.0-cfs"; + private static final String OLD_INDEX_NAME = "index.8.10.0-cfs"; Review comment: Ooooh excellent -- so we are also testing a full Lucene index with the old encoding. Great! ########## File path: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetLabels.java ########## @@ -168,24 +234,61 @@ public FacetLabel nextFacetLabel(int docId, String facetDimension) throws IOExce throw new IllegalArgumentException( "docs out of order: previous docId=" + currentDocId + " current docId=" + docId); } - ordinalsSegmentReader.get(docId, decodedOrds); - currentPos = decodedOrds.offset; currentDocId = docId; - } - if (parents == null) { - parents = taxoReader.getParallelTaxonomyArrays().parents(); + if (ordinalsSegmentReader != null) { + ordinalsSegmentReader.get(docId, decodedOrds); + currentPos = decodedOrds.offset; + } else { + currentDocHasValues = ordinalValues.advanceExact(docId); + if (currentDocHasValues) { Review comment: Interesting how the "new way" can reliably say that "this doc had no ordinals", but the old way would just say "this doc has 0 ordinals" :) ########## File path: lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestBackCompatSortedNumericDocValues.java ########## @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.facet.taxonomy; + +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import org.apache.lucene.document.BinaryDocValuesField; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.StoredField; +import org.apache.lucene.facet.FacetsConfig; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.TopFieldDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.LuceneTestCase; + +public class TestBackCompatSortedNumericDocValues extends LuceneTestCase { + + private static class FacetsConfigWrapper extends FacetsConfig { + public BytesRef encodeValues(IntsRef values) { + return dedupAndEncode(values); + } + } + + public void testRandom() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + + // sorta big scratch so we don't have to think about reallocating: + IntsRef scratch = new IntsRef(100); + + // used to access default binary encoding easily: + FacetsConfigWrapper facetsConfig = new FacetsConfigWrapper(); + + // keep track of the values we expect to see for each doc: + Map<String, List<Integer>> expectedValues = new HashMap<>(); + + int numDocs = atLeast(100); + for (int i = 0; i < numDocs; i++) { + int numValues = RandomNumbers.randomIntBetween(random(), 1, 50); + scratch.length = 0; + scratch.offset = 0; + Set<Integer> values = new HashSet<>(); + for (int j = 0; j < numValues; j++) { + int value = random().nextInt(Integer.MAX_VALUE); + values.add(value); + // we might have dups in here, which is fine (encoding takes care of deduping and sorting): + scratch.ints[j] = value; + scratch.length++; + } + // we expect to get sorted and deduped values back out: + expectedValues.put(String.valueOf(i), values.stream().sorted().collect(Collectors.toList())); + + Document doc = new Document(); + doc.add(new StoredField("id", String.valueOf(i))); + doc.add(new BinaryDocValuesField("bdv", facetsConfig.encodeValues(scratch))); + writer.addDocument(doc); + } + + writer.forceMerge(1); + writer.commit(); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = newSearcher(reader); + writer.close(); + + assert reader.leaves().size() == 1; + BinaryDocValues binaryDocValues = reader.leaves().get(0).reader().getBinaryDocValues("bdv"); + assertNotNull(binaryDocValues); + SortedNumericDocValues docValues = BackCompatSortedNumericDocValues.wrap(binaryDocValues); Review comment: It's also nice how easily/separately testable this is too -- no need to make zip files, etc. > -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org