[GitHub] [lucene] mdmarshmallow commented on a change in pull request #509: LUCENE-10250: Add support for arbitrary length hierarchical SSDV facets

GitBox Fri, 07 Jan 2022 13:32:11 -0800


mdmarshmallow commented on a change in pull request #509:
URL: https://github.com/apache/lucene/pull/509#discussion_r779974107




##########
File path: lucene/facet/src/java/org/apache/lucene/facet/FacetsConfig.java
##########
@@ -471,19 +471,35 @@ private void indexDrillDownTerms(
 
   private void processSSDVFacetFields(
       Map<String, List<SortedSetDocValuesFacetField>> byField, Document doc) {
+
     for (Map.Entry<String, List<SortedSetDocValuesFacetField>> ent : 
byField.entrySet()) {
 
       String indexFieldName = ent.getKey();
 
       for (SortedSetDocValuesFacetField facetField : ent.getValue()) {
-        FacetLabel facetLabel = new FacetLabel(facetField.dim, 
facetField.label);
-        String fullPath = pathToString(facetLabel.components, 
facetLabel.length);
+        FacetLabel facetLabel = new FacetLabel(facetField.dim, 
facetField.path);
+        if (getDimConfig(facetField.dim).hierarchical) {
+          for (int i = 0; i < facetLabel.length; i++) {
+            String fullPath = pathToString(facetLabel.components, i + 1);
+            // For facet counts:
+            doc.add(new SortedSetDocValuesField(indexFieldName, new 
BytesRef(fullPath)));
+
+            // For drill-down:
+            indexDrillDownTerms(doc, indexFieldName, 
getDimConfig(facetField.dim), facetLabel);
+          }
+        } else {
+          if (facetLabel.length != 2) {
+            throw new IllegalArgumentException(

Review comment:
       Changed error message to match taxo-facets.

##########
File path: lucene/facet/src/java/org/apache/lucene/facet/FacetsConfig.java
##########
@@ -471,19 +471,35 @@ private void indexDrillDownTerms(
 
   private void processSSDVFacetFields(
       Map<String, List<SortedSetDocValuesFacetField>> byField, Document doc) {
+
     for (Map.Entry<String, List<SortedSetDocValuesFacetField>> ent : 
byField.entrySet()) {
 
       String indexFieldName = ent.getKey();
 
       for (SortedSetDocValuesFacetField facetField : ent.getValue()) {
-        FacetLabel facetLabel = new FacetLabel(facetField.dim, 
facetField.label);
-        String fullPath = pathToString(facetLabel.components, 
facetLabel.length);
+        FacetLabel facetLabel = new FacetLabel(facetField.dim, 
facetField.path);
+        if (getDimConfig(facetField.dim).hierarchical) {
+          for (int i = 0; i < facetLabel.length; i++) {
+            String fullPath = pathToString(facetLabel.components, i + 1);
+            // For facet counts:
+            doc.add(new SortedSetDocValuesField(indexFieldName, new 
BytesRef(fullPath)));
+
+            // For drill-down:
+            indexDrillDownTerms(doc, indexFieldName, 
getDimConfig(facetField.dim), facetLabel);

Review comment:
       Yeah that is correct, that should not be in the loop.

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/DefaultSortedSetDocValuesReaderState.java
##########
@@ -51,20 +54,42 @@
 
   private final Map<String, OrdinalMap> cachedOrdMaps = new HashMap<>();
 
+  private final FacetsConfig config;
+
+  /*** Used for hierarchical dims ***/

Review comment:
       Oh I think I was copying and pasting that from a comment that was trying 
to signify a section or something, but yeah I'll make those normal haha.

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/DefaultSortedSetDocValuesReaderState.java
##########
@@ -79,38 +104,160 @@ public DefaultSortedSetDocValuesReaderState(IndexReader 
reader, String field) th
     }
     valueCount = (int) dv.getValueCount();
 
-    // TODO: we can make this more efficient if eg we can be
-    // "involved" when OrdinalMap is being created?  Ie see
-    // each term/ord it's assigning as it goes...
-    String lastDim = null;
-    int startOrd = -1;
-
-    // TODO: this approach can work for full hierarchy?;
-    // TaxoReader can't do this since ords are not in
-    // "sorted order" ... but we should generalize this to
-    // support arbitrary hierarchy:
-    for (int ord = 0; ord < valueCount; ord++) {
-      final BytesRef term = dv.lookupOrd(ord);
+    int ord = 0;
+    while (ord != valueCount) {
+      BytesRef term = dv.lookupOrd(ord);
       String[] components = FacetsConfig.stringToPath(term.utf8ToString());
-      if (components.length != 2) {
-        throw new IllegalArgumentException(
-            "this class can only handle 2 level hierarchy (dim/value); got: "
-                + Arrays.toString(components)
-                + " "
-                + term.utf8ToString());
+      String dim = components[0];
+      if (config != null && config.getDimConfig(dim).hierarchical) {
+        ord = createOneHierarchicalFacetDimState(dv, ord) + 1;
+      } else {
+        ord = createOneFlatFacetDimState(dv, ord) + 1;
+      }
+    }
+  }
+
+  // returns last ord of dimension
+  private int createOneHierarchicalFacetDimState(SortedSetDocValues dv, int 
dimStartOrd)
+      throws IOException {
+    List<Boolean> hasChildren = new ArrayList<>();
+    List<Integer> siblings = new ArrayList<>();
+
+    String dim = null;
+
+    // stack of paths with unfulfilled siblings
+    Stack<OrdAndComponent> siblingStack = new Stack<>();
+
+    String[] nextComponents = null;
+    int dimEndOrd = dimStartOrd;
+    while (true) {
+      String[] components;
+
+      int ord = dimEndOrd - dimStartOrd;
+
+      if (nextComponents == null) {

Review comment:
       Initialized this before loop.

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
##########
@@ -95,17 +96,34 @@ public FacetResult getTopChildren(int topN, String dim, 
String... path) throws I
     if (topN <= 0) {
       throw new IllegalArgumentException("topN must be > 0 (got: " + topN + 
")");
     }
-    if (path.length > 0) {
-      throw new IllegalArgumentException("path should be 0 length");
-    }
-    OrdRange ordRange = state.getOrdRange(dim);
-    if (ordRange == null) {
-      return null; // means dimension was never indexed
+
+    if (state.isHierarchicalDim(dim)) {
+      int pathOrd = (int) dv.lookupTerm(new 
BytesRef(FacetsConfig.pathToString(dim, path)));
+      if (pathOrd == -1) {
+        // path was never indexed
+        return null;
+      }
+      DimTree dimTree = state.getDimTree(dim);
+      if (dimTree == null) {

Review comment:
       That is true, removed null check.

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesReaderState.java
##########
@@ -62,15 +159,27 @@ protected SortedSetDocValuesReaderState() {}
   /** Indexed field we are reading. */
   public abstract String getField();
 
+  /** Returns top-level index reader. */
+  public abstract IndexReader getReader();
+
+  /** Number of unique labels. */
+  public abstract int getSize();
+
+  /** Returns if dim is configured as hierarchical ** */
+  public abstract boolean isHierarchicalDim(String dim);
+
+  /*** Only used for flat facets (dim/value) ***/
+
   /** Returns the {@link OrdRange} for this dimension. */
   public abstract OrdRange getOrdRange(String dim);
 
   /** Returns mapping from prefix to {@link OrdRange}. */
   public abstract Map<String, OrdRange> getPrefixToOrdRange();
 
-  /** Returns top-level index reader. */
-  public abstract IndexReader getReader();
+  /*** Only used for hierarchical facets ***/
 
-  /** Number of unique labels. */
-  public abstract int getSize();
+  public abstract DimTree getDimTree(String dim);
+
+  /** Returns a list of all dimensions and their respective ordinals */

Review comment:
       Yes correct I left that there on accident from a previous revision.

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
##########
@@ -317,10 +343,19 @@ public Number getSpecificValue(String dim, String... 
path) throws IOException {
   public List<FacetResult> getAllDims(int topN) throws IOException {
 
     List<FacetResult> results = new ArrayList<>();
-    for (Map.Entry<String, OrdRange> ent : 
state.getPrefixToOrdRange().entrySet()) {
-      FacetResult fr = getDim(ent.getKey(), ent.getValue(), topN);
-      if (fr != null) {
-        results.add(fr);
+    for (String dim : state.getDims()) {
+      if (state.isHierarchicalDim(dim)) {
+        DimTree dimTree = state.getDimTree(dim);
+        FacetResult fr = getDim(dim, new String[0], dimTree.dimStartOrd, 
dimTree.iterator(), topN);

Review comment:
       If we pass `-1` in the code would just use `dimCount` for the value, 
which could be inaccurate in the multi-valued case. Given that, I think it 
would be better to pass in `dimStartOrd` so we can guarantee that hierarchical 
SSDV facets have accurate multi-valued counts, given that we are already doing 
ancestral indexing. What do you think?
   ```
   if (pathOrd == -1) {
       // not hierarchical facet
       return new FacetResult(dim, emptyPath, dimCount, labelValues, 
childCount);
   } else {
       // hierarchical facet
       return new FacetResult(dim, path, counts[pathOrd], labelValues, 
childCount);
   }
   ```

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesReaderState.java
##########
@@ -62,15 +159,27 @@ protected SortedSetDocValuesReaderState() {}
   /** Indexed field we are reading. */
   public abstract String getField();
 
+  /** Returns top-level index reader. */
+  public abstract IndexReader getReader();
+
+  /** Number of unique labels. */
+  public abstract int getSize();
+
+  /** Returns if dim is configured as hierarchical ** */
+  public abstract boolean isHierarchicalDim(String dim);

Review comment:
       Yeah that makes sense.

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
##########
@@ -95,17 +96,34 @@ public FacetResult getTopChildren(int topN, String dim, 
String... path) throws I
     if (topN <= 0) {
       throw new IllegalArgumentException("topN must be > 0 (got: " + topN + 
")");
     }
-    if (path.length > 0) {
-      throw new IllegalArgumentException("path should be 0 length");
-    }
-    OrdRange ordRange = state.getOrdRange(dim);
-    if (ordRange == null) {
-      return null; // means dimension was never indexed
+
+    if (state.isHierarchicalDim(dim)) {
+      int pathOrd = (int) dv.lookupTerm(new 
BytesRef(FacetsConfig.pathToString(dim, path)));
+      if (pathOrd == -1) {

Review comment:
       I added a unit test and it actually still did return `null` in `getDims` 
due to the dim tree iterator not iterating over anything. With that being said, 
I did change this check to just be a negative check (and added a unit test).

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/DefaultSortedSetDocValuesReaderState.java
##########
@@ -79,38 +104,160 @@ public DefaultSortedSetDocValuesReaderState(IndexReader 
reader, String field) th
     }
     valueCount = (int) dv.getValueCount();
 
-    // TODO: we can make this more efficient if eg we can be
-    // "involved" when OrdinalMap is being created?  Ie see
-    // each term/ord it's assigning as it goes...
-    String lastDim = null;
-    int startOrd = -1;
-
-    // TODO: this approach can work for full hierarchy?;
-    // TaxoReader can't do this since ords are not in
-    // "sorted order" ... but we should generalize this to
-    // support arbitrary hierarchy:
-    for (int ord = 0; ord < valueCount; ord++) {
-      final BytesRef term = dv.lookupOrd(ord);
+    int ord = 0;
+    while (ord != valueCount) {
+      BytesRef term = dv.lookupOrd(ord);
       String[] components = FacetsConfig.stringToPath(term.utf8ToString());
-      if (components.length != 2) {
-        throw new IllegalArgumentException(
-            "this class can only handle 2 level hierarchy (dim/value); got: "
-                + Arrays.toString(components)
-                + " "
-                + term.utf8ToString());
+      String dim = components[0];
+      if (config != null && config.getDimConfig(dim).hierarchical) {
+        ord = createOneHierarchicalFacetDimState(dv, ord) + 1;
+      } else {
+        ord = createOneFlatFacetDimState(dv, ord) + 1;
+      }
+    }
+  }
+
+  // returns last ord of dimension
+  private int createOneHierarchicalFacetDimState(SortedSetDocValues dv, int 
dimStartOrd)
+      throws IOException {
+    List<Boolean> hasChildren = new ArrayList<>();
+    List<Integer> siblings = new ArrayList<>();
+
+    String dim = null;
+
+    // stack of paths with unfulfilled siblings
+    Stack<OrdAndComponent> siblingStack = new Stack<>();
+
+    String[] nextComponents = null;
+    int dimEndOrd = dimStartOrd;
+    while (true) {
+      String[] components;
+
+      int ord = dimEndOrd - dimStartOrd;
+
+      if (nextComponents == null) {
+        BytesRef term = dv.lookupOrd(dimEndOrd);
+        components = FacetsConfig.stringToPath(term.utf8ToString());
+        dim = components[0];
+      } else {
+        components = nextComponents;
       }
-      if (!components[0].equals(lastDim)) {
-        if (lastDim != null) {
-          prefixToOrdRange.put(lastDim, new OrdRange(startOrd, ord - 1));
+
+      while (siblingStack.empty() == false
+          && siblingStack.peek().component.length >= components.length) {
+        OrdAndComponent possibleSibling = siblingStack.pop();
+        if (possibleSibling.component.length == components.length) {
+          // lengths are equal
+          boolean isSibling = true;
+          for (int i = 0; i < possibleSibling.component.length - 1; i++) {
+            if (possibleSibling.component[i].equals(components[i]) == false) {
+              isSibling = false;
+              break;
+            }
+          }
+          if (isSibling) {
+            siblings.set(possibleSibling.ord, ord);
+          }
         }
-        startOrd = ord;
-        lastDim = components[0];
       }
+
+      if (dimEndOrd + 1 == valueCount) {
+        // current ord needs to be added, can't have children or siblings
+        siblings.add(-1);
+        hasChildren.add(false);
+        break;
+      }
+
+      BytesRef nextTerm = dv.lookupOrd(dimEndOrd + 1);
+      nextComponents = FacetsConfig.stringToPath(nextTerm.utf8ToString());
+
+      if (nextComponents[0].equals(components[0]) == false) {
+        // current ord needs to be added, can't have children or siblings
+        siblings.add(-1);
+        hasChildren.add(false);
+        break;
+      }
+
+      if (components.length < nextComponents.length) {
+        // next ord must be a direct child of current ord, this is because we 
are indexing all
+        // ancestral paths
+        hasChildren.add(ord, true);
+        // we don't know if this ord has a sibling or where it's sibling could 
be yet
+        siblingStack.push(new OrdAndComponent(ord, components));
+        // we still add INVALID_ORDINAL, which will be replaced if a valid 
sibling is found
+        siblings.add(ord, INVALID_ORDINAL);
+      } else if (components.length == nextComponents.length) {
+        // next ord must be a sibling of current and there are no direct 
children or current, this
+        // is because we
+        // are indexing all ancestral paths
+        siblings.add(ord, ord + 1);
+        hasChildren.add(ord, false);
+      } else {
+        // components.length > nextComponents.length
+        // next ord is neither sibling nor child
+        siblings.add(ord, INVALID_ORDINAL);
+        hasChildren.add(ord, false);
+      }
+
+      dimEndOrd++;
     }
 
-    if (lastDim != null) {
-      prefixToOrdRange.put(lastDim, new OrdRange(startOrd, valueCount - 1));
+    prefixToDimTree.put(dim, new DimTree(dimStartOrd, siblings, hasChildren));
+
+    return dimEndOrd;
+  }
+
+  // returns last ord of dimension
+  private int createOneFlatFacetDimState(SortedSetDocValues dv, int 
dimStartOrd)
+      throws IOException {
+
+    String dim = null;
+
+    String[] nextComponents = null;
+    int dimEndOrd = dimStartOrd;
+
+    while (true) {
+      String[] components;
+
+      if (nextComponents == null) {

Review comment:
       Yeah, I moved the initialization of this out of the loop (as with the 
other case of this you mentioned).

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/DefaultSortedSetDocValuesReaderState.java
##########
@@ -79,38 +104,160 @@ public DefaultSortedSetDocValuesReaderState(IndexReader 
reader, String field) th
     }
     valueCount = (int) dv.getValueCount();
 
-    // TODO: we can make this more efficient if eg we can be
-    // "involved" when OrdinalMap is being created?  Ie see
-    // each term/ord it's assigning as it goes...
-    String lastDim = null;
-    int startOrd = -1;
-
-    // TODO: this approach can work for full hierarchy?;
-    // TaxoReader can't do this since ords are not in
-    // "sorted order" ... but we should generalize this to
-    // support arbitrary hierarchy:
-    for (int ord = 0; ord < valueCount; ord++) {
-      final BytesRef term = dv.lookupOrd(ord);
+    int ord = 0;
+    while (ord != valueCount) {
+      BytesRef term = dv.lookupOrd(ord);
       String[] components = FacetsConfig.stringToPath(term.utf8ToString());
-      if (components.length != 2) {
-        throw new IllegalArgumentException(
-            "this class can only handle 2 level hierarchy (dim/value); got: "
-                + Arrays.toString(components)
-                + " "
-                + term.utf8ToString());
+      String dim = components[0];
+      if (config != null && config.getDimConfig(dim).hierarchical) {
+        ord = createOneHierarchicalFacetDimState(dv, ord) + 1;
+      } else {
+        ord = createOneFlatFacetDimState(dv, ord) + 1;
+      }
+    }
+  }
+
+  // returns last ord of dimension
+  private int createOneHierarchicalFacetDimState(SortedSetDocValues dv, int 
dimStartOrd)
+      throws IOException {
+    List<Boolean> hasChildren = new ArrayList<>();
+    List<Integer> siblings = new ArrayList<>();
+
+    String dim = null;
+
+    // stack of paths with unfulfilled siblings
+    Stack<OrdAndComponent> siblingStack = new Stack<>();
+
+    String[] nextComponents = null;
+    int dimEndOrd = dimStartOrd;
+    while (true) {
+      String[] components;
+
+      int ord = dimEndOrd - dimStartOrd;
+
+      if (nextComponents == null) {
+        BytesRef term = dv.lookupOrd(dimEndOrd);
+        components = FacetsConfig.stringToPath(term.utf8ToString());
+        dim = components[0];
+      } else {
+        components = nextComponents;
       }
-      if (!components[0].equals(lastDim)) {
-        if (lastDim != null) {
-          prefixToOrdRange.put(lastDim, new OrdRange(startOrd, ord - 1));
+
+      while (siblingStack.empty() == false
+          && siblingStack.peek().component.length >= components.length) {
+        OrdAndComponent possibleSibling = siblingStack.pop();
+        if (possibleSibling.component.length == components.length) {
+          // lengths are equal
+          boolean isSibling = true;
+          for (int i = 0; i < possibleSibling.component.length - 1; i++) {
+            if (possibleSibling.component[i].equals(components[i]) == false) {

Review comment:
       Yeah I thought about it for a bit and you're correct. Thanks! That 
should make the state construction at least a bit faster.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

[GitHub] [lucene] mdmarshmallow commented on a change in pull request #509: LUCENE-10250: Add support for arbitrary length hierarchical SSDV facets

Reply via email to