Re: [PR] Compute facets while collecting [lucene]

via GitHub Fri, 09 Aug 2024 05:12:59 -0700


epotyom commented on code in PR #13568:
URL: https://github.com/apache/lucene/pull/13568#discussion_r1711353146



##########
lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/SandboxFacetTestCase.java:
##########
@@ -0,0 +1,407 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.sandbox.facet;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.lucene.facet.FacetResult;
+import org.apache.lucene.facet.FacetsCollector;
+import org.apache.lucene.facet.FacetsCollector.MatchingDocs;
+import org.apache.lucene.facet.FacetsConfig;
+import org.apache.lucene.facet.LabelAndValue;
+import org.apache.lucene.facet.taxonomy.FacetLabel;
+import org.apache.lucene.facet.taxonomy.TaxonomyFacetLabels;
+import org.apache.lucene.facet.taxonomy.TaxonomyFacetLabels.FacetLabelReader;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.sandbox.facet.abstracts.OrdLabelBiMap;
+import org.apache.lucene.sandbox.facet.abstracts.OrdToComparable;
+import org.apache.lucene.sandbox.facet.abstracts.OrdinalIterator;
+import org.apache.lucene.sandbox.facet.ordinal_iterators.TopnOrdinalIterator;
+import org.apache.lucene.sandbox.facet.recorders.CountFacetRecorder;
+import 
org.apache.lucene.sandbox.facet.taxonomy.TaxonomyChildrenOrdinalIterator;
+import org.apache.lucene.sandbox.facet.taxonomy.TaxonomyOrdLabelBiMap;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.TestUtil;
+import org.apache.lucene.util.BytesRef;
+
+public abstract class SandboxFacetTestCase extends LuceneTestCase {
+  // we don't have access to overall count for all facets from count recorder,
+  // and we can't compute it as a SUM of values for each facet ID because we 
need to respect cases
+  // where
+  // the same doc belongs to multiple facets (e.g. overlapping ranges and
+  // multi value fields). We can add an extra range that includes everything,
+  // or consider supporting overall count in CountFacetRecorder. But it is not 
exactly the value
+  // we can get now, as this value wouldn't respect top-n cutoff. Is this 
value a must have facets
+  // feature?
+  static final int VALUE_CANT_BE_COMPUTED = -5;
+
+  /**
+   * Utility method that uses {@link FacetLabelReader} to get facet labels for 
each hit in {@link
+   * MatchingDocs}. The method returns {@code List<List<FacetLabel>>} where 
outer list has one entry
+   * per document and inner list has all {@link FacetLabel} entries that 
belong to a document. The
+   * inner list may be empty if no {@link FacetLabel} are found for a hit.
+   *
+   * @param taxoReader {@link TaxonomyReader} used to read taxonomy during 
search. This instance is
+   *     expected to be open for reading.
+   * @param fc {@link FacetsCollector} A collector with matching hits.
+   * @param dimension facet dimension for which labels are requested. A null 
value fetches labels
+   *     for all dimensions.
+   * @return {@code List<List<FacetLabel>>} where outer list has one non-null 
entry per document, and
+   *     inner list contains all {@link FacetLabel} entries that belong to a 
document.
+   * @throws IOException when a low-level IO issue occurs.
+   */
+  public List<List<FacetLabel>> getAllTaxonomyFacetLabels(
+      String dimension, TaxonomyReader taxoReader, FacetsCollector fc) throws 
IOException {
+    List<List<FacetLabel>> actualLabels = new ArrayList<>();
+    TaxonomyFacetLabels taxoLabels =
+        new TaxonomyFacetLabels(taxoReader, 
FacetsConfig.DEFAULT_INDEX_FIELD_NAME);
+    for (MatchingDocs m : fc.getMatchingDocs()) {
+      FacetLabelReader facetLabelReader = 
taxoLabels.getFacetLabelReader(m.context);
+      DocIdSetIterator disi = m.bits.iterator();
+      while (disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+        actualLabels.add(allFacetLabels(disi.docID(), dimension, 
facetLabelReader));
+      }
+    }
+    return actualLabels;
+  }
+
+  /**
+   * Utility method to get all facet labels for an input docId and dimension 
using the supplied
+   * {@link FacetLabelReader}.
+   *
+   * @param docId docId for which facet labels are needed.
+   * @param dimension Retain facet labels for supplied dimension only. A null 
value fetches all
+   *     facet labels.
+   * @param facetLabelReader {@link FacetLabelReader} instance used to get facet 
labels for input docId.
+   * @return {@code List<FacetLabel>} containing matching facet labels.
+   * @throws IOException when a low-level IO issue occurs while reading facet 
labels.
+   */
+  List<FacetLabel> allFacetLabels(int docId, String dimension, 
FacetLabelReader facetLabelReader)
+      throws IOException {
+    List<FacetLabel> facetLabels = new ArrayList<>();
+    FacetLabel facetLabel;
+    if (dimension != null) {
+      for (facetLabel = facetLabelReader.nextFacetLabel(docId, dimension); 
facetLabel != null; ) {
+        facetLabels.add(facetLabel);
+        facetLabel = facetLabelReader.nextFacetLabel(docId, dimension);
+      }
+    } else {
+      for (facetLabel = facetLabelReader.nextFacetLabel(docId); facetLabel != 
null; ) {
+        facetLabels.add(facetLabel);
+        facetLabel = facetLabelReader.nextFacetLabel(docId);
+      }
+    }
+    return facetLabels;
+  }
+
+  protected String[] getRandomTokens(int count) {
+    String[] tokens = new String[count];
+    for (int i = 0; i < tokens.length; i++) {
+      tokens[i] = TestUtil.randomRealisticUnicodeString(random(), 1, 10);
+      // tokens[i] = _TestUtil.randomSimpleString(random(), 1, 10);
+    }
+    return tokens;
+  }
+
+  protected String pickToken(String[] tokens) {

Review Comment:
   I removed the method - thanks!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Re: [PR] Compute facets while collecting [lucene]

Reply via email to