[GitHub] [lucene] mikemccand commented on a change in pull request #443: LUCENE-10062: Switch to numeric doc values for encoding taxonomy ordinals

GitBox Wed, 17 Nov 2021 08:20:35 -0800


mikemccand commented on a change in pull request #443:
URL: https://github.com/apache/lucene/pull/443#discussion_r751373424




##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/OrdinalMappingLeafReader.java
##########
@@ -107,6 +113,64 @@ public BytesRef binaryValue() {
     }
   }
 
+  private class OrdinalMappingSortedNumericDocValues extends 
FilterSortedNumericDocValues {
+    private final IntArrayList currentValues;
+    private int currIndex;
+
+    OrdinalMappingSortedNumericDocValues(SortedNumericDocValues in) {
+      super(in);
+      currentValues = new IntArrayList(32);
+    }
+
+    @Override
+    public boolean advanceExact(int target) throws IOException {
+      boolean result = in.advanceExact(target);
+      if (result) {
+        reloadValues();
+      }
+      return result;
+    }
+
+    @Override
+    public int advance(int target) throws IOException {
+      int result = in.advance(target);
+      if (result != DocIdSetIterator.NO_MORE_DOCS) {
+        reloadValues();
+      }
+      return result;
+    }
+
+    @Override
+    public int nextDoc() throws IOException {
+      int result = in.nextDoc();
+      if (result != DocIdSetIterator.NO_MORE_DOCS) {
+        reloadValues();
+      }
+      return result;
+    }
+
+    @Override
+    public int docValueCount() {
+      return currentValues.elementsCount;
+    }
+
+    private void reloadValues() throws IOException {
+      currIndex = 0;
+      currentValues.clear();
+      for (int i = 0; i < in.docValueCount(); i++) {
+        currentValues.add(ordinalMap[(int) in.nextValue()]);

Review comment:
       Maybe (paranoia) `Math.toIntExact(...)` instead of straight `(int)` cast?

##########
File path: lucene/facet/src/java/org/apache/lucene/facet/FacetUtils.java
##########
@@ -81,4 +82,17 @@ public long cost() {
       }
     };
   }
+
+  /**
+   * Determine whether-or-not an index segment is using the older-style binary 
format or the newer
+   * NumericDocValues format for storing taxonomy faceting ordinals (for the 
specified field).
+   *
+   * @deprecated Please do not rely on this method. It is added as a temporary 
measure for providing
+   *     index backwards-compatibility with Lucene 8 and earlier indexes, and 
will be removed in
+   *     Lucene 10.
+   */
+  @Deprecated
+  public static boolean usesOlderBinaryOrdinals(LeafReader reader) {
+    return reader.getMetaData().getCreatedVersionMajor() <= 8;

Review comment:
       This is checking if the particular segment was created with version 8 or 
earlier?  Hmm, or, is this method `.getCreatedVersionMajor()` referring to the 
whole index, even though it is this one segment that stored it?

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/OrdinalMappingLeafReader.java
##########
@@ -107,6 +113,64 @@ public BytesRef binaryValue() {
     }
   }
 
+  private class OrdinalMappingSortedNumericDocValues extends 
FilterSortedNumericDocValues {
+    private final IntArrayList currentValues;
+    private int currIndex;
+
+    OrdinalMappingSortedNumericDocValues(SortedNumericDocValues in) {
+      super(in);
+      currentValues = new IntArrayList(32);
+    }
+
+    @Override
+    public boolean advanceExact(int target) throws IOException {
+      boolean result = in.advanceExact(target);
+      if (result) {
+        reloadValues();
+      }
+      return result;
+    }
+
+    @Override
+    public int advance(int target) throws IOException {
+      int result = in.advance(target);
+      if (result != DocIdSetIterator.NO_MORE_DOCS) {
+        reloadValues();
+      }
+      return result;
+    }
+
+    @Override
+    public int nextDoc() throws IOException {
+      int result = in.nextDoc();
+      if (result != DocIdSetIterator.NO_MORE_DOCS) {
+        reloadValues();
+      }
+      return result;
+    }
+
+    @Override
+    public int docValueCount() {
+      return currentValues.elementsCount;
+    }
+
+    private void reloadValues() throws IOException {
+      currIndex = 0;
+      currentValues.clear();
+      for (int i = 0; i < in.docValueCount(); i++) {
+        currentValues.add(ordinalMap[(int) in.nextValue()]);
+      }
+      Arrays.sort(currentValues.buffer, 0, currentValues.elementsCount);
+    }
+
+    @Override
+    public long nextValue() {
+      int actual = currentValues.get(currIndex);

Review comment:
       Maybe `assert currIndex < currentValues.elementCount`?

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/OrdinalMappingLeafReader.java
##########
@@ -107,6 +113,64 @@ public BytesRef binaryValue() {
     }
   }
 
+  private class OrdinalMappingSortedNumericDocValues extends 
FilterSortedNumericDocValues {
+    private final IntArrayList currentValues;
+    private int currIndex;
+
+    OrdinalMappingSortedNumericDocValues(SortedNumericDocValues in) {
+      super(in);
+      currentValues = new IntArrayList(32);
+    }
+
+    @Override
+    public boolean advanceExact(int target) throws IOException {
+      boolean result = in.advanceExact(target);
+      if (result) {
+        reloadValues();
+      }
+      return result;
+    }
+
+    @Override
+    public int advance(int target) throws IOException {
+      int result = in.advance(target);
+      if (result != DocIdSetIterator.NO_MORE_DOCS) {
+        reloadValues();
+      }
+      return result;
+    }
+
+    @Override
+    public int nextDoc() throws IOException {
+      int result = in.nextDoc();
+      if (result != DocIdSetIterator.NO_MORE_DOCS) {
+        reloadValues();
+      }
+      return result;
+    }
+
+    @Override
+    public int docValueCount() {
+      return currentValues.elementsCount;
+    }
+
+    private void reloadValues() throws IOException {
+      currIndex = 0;
+      currentValues.clear();
+      for (int i = 0; i < in.docValueCount(); i++) {
+        currentValues.add(ordinalMap[(int) in.nextValue()]);
+      }
+      Arrays.sort(currentValues.buffer, 0, currentValues.elementsCount);

Review comment:
       I assume we would never expect to see dups here right?  The `OrdinalMap` 
should always be one to one?  I wonder if anything enforces that.

##########
File path: lucene/facet/src/java/org/apache/lucene/facet/FacetsConfig.java
##########
@@ -409,9 +410,26 @@ private void processFacetFields(
         indexDrillDownTerms(doc, indexFieldName, dimConfig, facetLabel);
       }
 
-      // Facet counts:
-      // DocValues are considered stored fields:
-      doc.add(new BinaryDocValuesField(indexFieldName, 
dedupAndEncode(ordinals.get())));
+      // Store the taxonomy ordinals associated with each doc. Prefer to use 
SortedNumericDocValues
+      // but "fall back" to a custom binary format to maintain backwards 
compatibility with Lucene 8
+      // indexes.
+      if (taxoWriter.useNumericDocValuesForOrdinals()) {
+        // Dedupe and encode the ordinals. It's not important that we sort here
+        // (SortedNumericDocValuesField will handle this internally), but we
+        // sort to identify dups (since SNDVF doesn't dedupe):
+        IntsRef ords = ordinals.get();

Review comment:
       Minor: you could pull this line out and then use `ords` below and in the 
`else` clause.

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java
##########
@@ -69,31 +69,34 @@ public FastTaxonomyFacetCounts(
   }
 
   private final void count(List<MatchingDocs> matchingDocs) throws IOException 
{
+    if (matchingDocs.isEmpty()) {
+      return;
+    }
+
+    boolean useBinaryDv = 
FacetUtils.usesOlderBinaryOrdinals(matchingDocs.get(0).context.reader());

Review comment:
       Maybe rename to `useOlderBinaryDv` to make it clear this is back-compat 
logic?

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CachedOrdinalsReader.java
##########
@@ -45,7 +45,11 @@
  *
  * <p><b>NOTE:</b> create one instance of this and re-use it for all facet 
implementations (the
  * cache is per-instance, not static).
+ *
+ * @deprecated Custom binary encodings for taxonomy ordinals are no longer 
supported starting with

Review comment:
       Maybe we should add a `MIGRATE` entry indicating that carrying forward a 
Lucene 8 index will continue to use the older/slower format even for newly 
indexed documents.  To get the new way, you must create a new index with Lucene 
9?  In fact, we should generalize this entry for the other changes (moving off 
of stored fields for ord->label, moving to doc values for parent ords, etc.).  
This is a sort of dark side / trappiness to doing back compat by index created 
version.

##########
File path: lucene/facet/src/java/org/apache/lucene/facet/FacetsConfig.java
##########
@@ -409,9 +410,26 @@ private void processFacetFields(
         indexDrillDownTerms(doc, indexFieldName, dimConfig, facetLabel);
       }
 
-      // Facet counts:
-      // DocValues are considered stored fields:
-      doc.add(new BinaryDocValuesField(indexFieldName, 
dedupAndEncode(ordinals.get())));
+      // Store the taxonomy ordinals associated with each doc. Prefer to use 
SortedNumericDocValues
+      // but "fall back" to a custom binary format to maintain backwards 
compatibility with Lucene 8
+      // indexes.
+      if (taxoWriter.useNumericDocValuesForOrdinals()) {
+        // Dedupe and encode the ordinals. It's not important that we sort here
+        // (SortedNumericDocValuesField will handle this internally), but we

Review comment:
       I wonder whether `SNDVF`'s internal sort is efficient when it's given 
already pre-sorted input?

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetLabels.java
##########
@@ -62,7 +62,16 @@ public TaxonomyFacetLabels(TaxonomyReader taxoReader, String 
indexFieldName) thr
    * @throws IOException when a low-level IO issue occurs
    */
   public FacetLabelReader getFacetLabelReader(LeafReaderContext readerContext) 
throws IOException {
-    return new FacetLabelReader(ordsReader, readerContext);
+    // Support older binary format by leveraging an OrdinalsReader, which can 
still support both
+    // formats for now:
+    if (FacetUtils.usesOlderBinaryOrdinals(readerContext.reader())) {
+      OrdinalsReader ordsReader = new DocValuesOrdinalsReader(indexFieldName);
+      return new FacetLabelReader(ordsReader, readerContext);
+    }
+
+    SortedNumericDocValues ordinalValues =

Review comment:
       Can we `assert ordinalValues != null`?

##########
File path: 
lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java
##########
@@ -138,4 +331,51 @@ private Path getIndexDir() {
         path != null);
     return Paths.get(path);
   }
+
+  private static class DummyDoubleValuesSource extends DoubleValuesSource {

Review comment:
       You could maybe use `DoubleValuesSource.constant(1d)`?

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java
##########
@@ -69,31 +69,34 @@ public FastTaxonomyFacetCounts(
   }
 
   private final void count(List<MatchingDocs> matchingDocs) throws IOException 
{
+    if (matchingDocs.isEmpty()) {
+      return;
+    }
+
+    boolean useBinaryDv = 
FacetUtils.usesOlderBinaryOrdinals(matchingDocs.get(0).context.reader());
+
     for (MatchingDocs hits : matchingDocs) {
-      BinaryDocValues dv = 
hits.context.reader().getBinaryDocValues(indexFieldName);
-      if (dv == null) { // this reader does not have DocValues for the 
requested category list
+      assert useBinaryDv == 
FacetUtils.usesOlderBinaryOrdinals(hits.context.reader());

Review comment:
       OK it must indeed be that that per-segment-seeming metadata is in fact 
global to the index.  Maybe improve its javadocs :)

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/BackCompatSortedNumericDocValues.java
##########
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.facet.taxonomy;
+
+import java.io.IOException;
+import java.util.function.BiConsumer;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.SortedNumericDocValues;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+
+/**
+ * Wraps a {@link BinaryDocValues} instance, providing a {@link 
SortedNumericDocValues} interface

Review comment:
       Whoa, I like this approach!  Pushes the back compat conversion "down 
low".

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/OrdinalMappingLeafReader.java
##########
@@ -107,6 +113,64 @@ public BytesRef binaryValue() {
     }
   }
 
+  private class OrdinalMappingSortedNumericDocValues extends 
FilterSortedNumericDocValues {
+    private final IntArrayList currentValues;
+    private int currIndex;
+
+    OrdinalMappingSortedNumericDocValues(SortedNumericDocValues in) {
+      super(in);
+      currentValues = new IntArrayList(32);
+    }
+
+    @Override
+    public boolean advanceExact(int target) throws IOException {
+      boolean result = in.advanceExact(target);
+      if (result) {
+        reloadValues();
+      }
+      return result;
+    }
+
+    @Override
+    public int advance(int target) throws IOException {
+      int result = in.advance(target);
+      if (result != DocIdSetIterator.NO_MORE_DOCS) {
+        reloadValues();
+      }
+      return result;
+    }
+
+    @Override
+    public int nextDoc() throws IOException {
+      int result = in.nextDoc();
+      if (result != DocIdSetIterator.NO_MORE_DOCS) {
+        reloadValues();
+      }
+      return result;
+    }
+
+    @Override
+    public int docValueCount() {
+      return currentValues.elementsCount;
+    }
+
+    private void reloadValues() throws IOException {
+      currIndex = 0;
+      currentValues.clear();
+      for (int i = 0; i < in.docValueCount(); i++) {
+        currentValues.add(ordinalMap[(int) in.nextValue()]);
+      }
+      Arrays.sort(currentValues.buffer, 0, currentValues.elementsCount);
+    }
+
+    @Override
+    public long nextValue() {
+      int actual = currentValues.get(currIndex);
+      currIndex++;

Review comment:
       Thank you for pulling the increment out to its own line :)

##########
File path: 
lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java
##########
@@ -50,43 +74,192 @@
   // Then move the zip file to your trunk checkout and use it in your test 
cases
 
   public static final String oldTaxonomyIndexName = "taxonomy.8.10.0-cfs";
+  private static final String OLD_INDEX_NAME = "index.8.10.0-cfs";

Review comment:
       Ooooh excellent -- so we are also testing a full Lucene index with the 
old encoding.  Great!

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetLabels.java
##########
@@ -168,24 +234,61 @@ public FacetLabel nextFacetLabel(int docId, String 
facetDimension) throws IOExce
           throw new IllegalArgumentException(
               "docs out of order: previous docId=" + currentDocId + " current 
docId=" + docId);
         }
-        ordinalsSegmentReader.get(docId, decodedOrds);
-        currentPos = decodedOrds.offset;
         currentDocId = docId;
-      }
 
-      if (parents == null) {
-        parents = taxoReader.getParallelTaxonomyArrays().parents();
+        if (ordinalsSegmentReader != null) {
+          ordinalsSegmentReader.get(docId, decodedOrds);
+          currentPos = decodedOrds.offset;
+        } else {
+          currentDocHasValues = ordinalValues.advanceExact(docId);
+          if (currentDocHasValues) {

Review comment:
       Interesting how the "new way" can reliably say that "this doc had no 
ordinals", but the old way would just say "this doc has 0 ordinals" :)

##########
File path: 
lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestBackCompatSortedNumericDocValues.java
##########
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.facet.taxonomy;
+
+import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+import org.apache.lucene.document.BinaryDocValuesField;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.StoredField;
+import org.apache.lucene.facet.FacetsConfig;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.SortedNumericDocValues;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.TopFieldDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestBackCompatSortedNumericDocValues extends LuceneTestCase {
+
+  private static class FacetsConfigWrapper extends FacetsConfig {
+    public BytesRef encodeValues(IntsRef values) {
+      return dedupAndEncode(values);
+    }
+  }
+
+  public void testRandom() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+
+    // sorta big scratch so we don't have to think about reallocating:
+    IntsRef scratch = new IntsRef(100);
+
+    // used to access default binary encoding easily:
+    FacetsConfigWrapper facetsConfig = new FacetsConfigWrapper();
+
+    // keep track of the values we expect to see for each doc:
+    Map<String, List<Integer>> expectedValues = new HashMap<>();
+
+    int numDocs = atLeast(100);
+    for (int i = 0; i < numDocs; i++) {
+      int numValues = RandomNumbers.randomIntBetween(random(), 1, 50);
+      scratch.length = 0;
+      scratch.offset = 0;
+      Set<Integer> values = new HashSet<>();
+      for (int j = 0; j < numValues; j++) {
+        int value = random().nextInt(Integer.MAX_VALUE);
+        values.add(value);
+        // we might have dups in here, which is fine (encoding takes care of 
deduping and sorting):
+        scratch.ints[j] = value;
+        scratch.length++;
+      }
+      // we expect to get sorted and deduped values back out:
+      expectedValues.put(String.valueOf(i), 
values.stream().sorted().collect(Collectors.toList()));
+
+      Document doc = new Document();
+      doc.add(new StoredField("id", String.valueOf(i)));
+      doc.add(new BinaryDocValuesField("bdv", 
facetsConfig.encodeValues(scratch)));
+      writer.addDocument(doc);
+    }
+
+    writer.forceMerge(1);
+    writer.commit();
+
+    IndexReader reader = writer.getReader();
+    IndexSearcher searcher = newSearcher(reader);
+    writer.close();
+
+    assert reader.leaves().size() == 1;
+    BinaryDocValues binaryDocValues = 
reader.leaves().get(0).reader().getBinaryDocValues("bdv");
+    assertNotNull(binaryDocValues);
+    SortedNumericDocValues docValues = 
BackCompatSortedNumericDocValues.wrap(binaryDocValues);

Review comment:
       It's also nice how easily/separately testable this is too -- no need to 
make zip files, etc.
   
   > 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

[GitHub] [lucene] mikemccand commented on a change in pull request #443: LUCENE-10062: Switch to numeric doc values for encoding taxonomy ordinals

Reply via email to