Shradha26 commented on code in PR #12966: URL: https://github.com/apache/lucene/pull/12966#discussion_r1449034518
########## lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java: ########## @@ -17,20 +17,44 @@ package org.apache.lucene.facet.taxonomy; +import com.carrotsearch.hppc.IntArrayList; +import com.carrotsearch.hppc.IntIntHashMap; +import com.carrotsearch.hppc.cursors.IntIntCursor; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.Comparator; +import java.util.HashMap; import java.util.List; import java.util.Locale; +import java.util.Map; import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.Facets; import org.apache.lucene.facet.FacetsCollector; import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.facet.FacetsConfig.DimConfig; +import org.apache.lucene.facet.LabelAndValue; +import org.apache.lucene.facet.TopOrdAndIntQueue; +import org.apache.lucene.facet.TopOrdAndNumberQueue; +import org.apache.lucene.util.PriorityQueue; /** Base class for all taxonomy-based facets impls. */ abstract class TaxonomyFacets extends Facets { + /** Intermediate result to store top children for a given path before resolving labels, etc. */ + record TopChildrenForPath(Number pathValue, int childCount, TopOrdAndNumberQueue childQueue) {} + + private static class DimValue { Review Comment: [nit] should we call this just `Dim` and `String dimPath` instead of `String dim`? I see later that we've used `int dimValue` and this is getting quickly overloaded? ########## lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java: ########## @@ -76,6 +111,78 @@ public int compare(FacetResult a, FacetResult b) { this.config = config; this.fc = fc; parents = taxoReader.getParallelTaxonomyArrays().parents(); + valueComparator = Comparator.comparingInt((x) -> (int) x); + } + + /** Return true if a sparse hash table should be used for counting, instead of a dense int[]. 
*/ + private boolean useHashTable(FacetsCollector fc, TaxonomyReader taxoReader) { + if (taxoReader.getSize() < 1024) { + // small number of unique values: use an array + return false; + } + + if (fc == null) { + // counting all docs: use an array + return false; + } + + int maxDoc = 0; + int sumTotalHits = 0; + for (FacetsCollector.MatchingDocs docs : fc.getMatchingDocs()) { + sumTotalHits += docs.totalHits; + maxDoc += docs.context.reader().maxDoc(); + } + + // if our result set is < 10% of the index, we collect sparsely (use hash map): + return sumTotalHits < maxDoc / 10; + } + + protected void initializeValueCounters() { + if (initialized) { + return; + } + initialized = true; + assert sparseCounts == null && counts == null; + if (useHashTable(fc, taxoReader)) { + sparseCounts = new IntIntHashMap(); + } else { + counts = new int[taxoReader.getSize()]; + } + } + + /** Set the count for this ordinal to {@code newValue}. */ + protected void setCount(int ordinal, int newValue) { + if (sparseCounts != null) { + sparseCounts.put(ordinal, newValue); + } else { + counts[ordinal] = newValue; + } + } + + /** Get the count for this ordinal. */ + protected int getCount(int ordinal) { + if (sparseCounts != null) { + return sparseCounts.get(ordinal); + } else { + return counts[ordinal]; + } + } + + /** Get the aggregation value for this ordinal. */ + protected Number getAggregationValue(int ordinal) { + // By default, this is just the count. + return getCount(ordinal); + } + + /** Apply an aggregation to the two values and return the result. */ + protected Number aggregate(Number existingVal, Number newVal) { + // By default, we are computing counts, so the values are interpreted as integers and summed. + return (int) existingVal + (int) newVal; Review Comment: Can we use the concept of an aggregation function while combining in this method? 
(In line with my previous comment about making the logic for `IntTaxonomyFacets` and `FloatTaxonomyFacets` the default) ########## lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java: ########## @@ -76,6 +111,78 @@ public int compare(FacetResult a, FacetResult b) { this.config = config; this.fc = fc; parents = taxoReader.getParallelTaxonomyArrays().parents(); + valueComparator = Comparator.comparingInt((x) -> (int) x); + } + + /** Return true if a sparse hash table should be used for counting, instead of a dense int[]. */ + private boolean useHashTable(FacetsCollector fc, TaxonomyReader taxoReader) { + if (taxoReader.getSize() < 1024) { + // small number of unique values: use an array + return false; + } + + if (fc == null) { + // counting all docs: use an array + return false; + } + + int maxDoc = 0; + int sumTotalHits = 0; + for (FacetsCollector.MatchingDocs docs : fc.getMatchingDocs()) { + sumTotalHits += docs.totalHits; + maxDoc += docs.context.reader().maxDoc(); + } + + // if our result set is < 10% of the index, we collect sparsely (use hash map): + return sumTotalHits < maxDoc / 10; + } + + protected void initializeValueCounters() { + if (initialized) { + return; + } + initialized = true; + assert sparseCounts == null && counts == null; + if (useHashTable(fc, taxoReader)) { + sparseCounts = new IntIntHashMap(); + } else { + counts = new int[taxoReader.getSize()]; + } + } + + /** Set the count for this ordinal to {@code newValue}. */ + protected void setCount(int ordinal, int newValue) { + if (sparseCounts != null) { + sparseCounts.put(ordinal, newValue); + } else { + counts[ordinal] = newValue; + } + } + + /** Get the count for this ordinal. */ + protected int getCount(int ordinal) { + if (sparseCounts != null) { + return sparseCounts.get(ordinal); + } else { + return counts[ordinal]; + } + } + + /** Get the aggregation value for this ordinal. 
*/ + protected Number getAggregationValue(int ordinal) { + // By default, this is just the count. Review Comment: Can the default implementation of this method and `getValue` be the same as that in `IntTaxonomyFacets` and `FloatTaxonomyFacets`, to reduce duplication further? `FastTaxonomyFacetCounts` can either extend from `IntTaxonomyFacets` or apply this sort of count-based customisation to these methods. ########## lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java: ########## @@ -142,6 +249,301 @@ DimConfig verifyDim(String dim) { return dimConfig; } + /** + * Roll-up the aggregation values from {@code childOrdinal} to {@code ordinal}. Overrides should + * probably call this to update the counts. Overriding allows us to work with primitive types for + * the aggregation values, keeping aggregation efficient. + */ + protected void updateValueFromRollup(int ordinal, int childOrdinal) throws IOException { + setCount(ordinal, getCount(ordinal) + rollup(childOrdinal)); Review Comment: Shall we assume an aggregationFunction is passed in this parent class and implement this method similar to `IntTaxonomyFacets` and `FloatTaxonomyFacets`, since this bit seems to be duplicated in both? Further, `FastTaxonomyFacetCounts` can either override this and do a count-based `updateValueFromRollup`, since it doesn't use an aggregation function, or even continue to extend from `IntTaxonomyFacets`. ########## lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java: ########## @@ -67,6 +91,17 @@ public int compare(FacetResult a, FacetResult b) { /** Maps an ordinal to its parent, or -1 if there is no parent (root node). */ final int[] parents; + /** Dense ordinal counts. */ + int[] counts; Review Comment: Can we make this `Number[] values` so that `IntTaxonomyFacets` and `FloatTaxonomyFacets` don't need to define their own `values` data structure and this class is generic? 
########## lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetFloatAssociations.java: ########## @@ -188,8 +190,10 @@ private void aggregateValues( offset += 4; float value = (float) BitUtil.VH_BE_FLOAT.get(bytes, offset); offset += 4; - float newValue = aggregationFunction.aggregate(values[ord], value); - values[ord] = newValue; + float currentValue = getValue(ord); + float newValue = aggregationFunction.aggregate(currentValue, value); + setValue(ord, newValue); + setCount(ord, getCount(ord) + 1); Review Comment: Why do we want to always track counts too? ########## lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java: ########## @@ -142,6 +249,301 @@ DimConfig verifyDim(String dim) { return dimConfig; } + /** + * Roll-up the aggregation values from {@code childOrdinal} to {@code ordinal}. Overrides should + * probably call this to update the counts. Overriding allows us to work with primitive types for + * the aggregation values, keeping aggregation efficient. + */ + protected void updateValueFromRollup(int ordinal, int childOrdinal) throws IOException { + setCount(ordinal, getCount(ordinal) + rollup(childOrdinal)); + } + + /** + * Return a {@link TopOrdAndNumberQueue} of the appropriate type, i.e. a {@link TopOrdAndIntQueue} + * or a {@link org.apache.lucene.facet.TopOrdAndFloatQueue}. + */ + protected TopOrdAndNumberQueue makeTopOrdAndNumberQueue(int topN) { + return new TopOrdAndIntQueue(Math.min(taxoReader.getSize(), topN)); + } + + // TODO: We don't need this if we're okay with having an integer -1 in the results even for float + // aggregations. + /** Return the value for a missing aggregation, i.e. {@code -1} or {@code -1f}. */ + protected Number missingAggregationValue() { + return -1; + } + + /** Rolls up any single-valued hierarchical dimensions. 
*/ + void rollup() throws IOException { + if (initialized == false) { + return; + } + + // Rollup any necessary dims: + int[] children = null; + for (Map.Entry<String, FacetsConfig.DimConfig> ent : config.getDimConfigs().entrySet()) { + String dim = ent.getKey(); + FacetsConfig.DimConfig ft = ent.getValue(); + if (ft.hierarchical && ft.multiValued == false) { + int dimRootOrd = taxoReader.getOrdinal(new FacetLabel(dim)); + // It can be -1 if this field was declared in the + // config but never indexed: + if (dimRootOrd > 0) { + if (children == null) { + // lazy init + children = getChildren(); + } + updateValueFromRollup(dimRootOrd, children[dimRootOrd]); + } + } + } + } + + private int rollup(int ord) throws IOException { + int[] children = getChildren(); + int[] siblings = getSiblings(); + int aggregatedValue = 0; + while (ord != TaxonomyReader.INVALID_ORDINAL) { + int currentValue = getCount(ord); + int newValue = currentValue + rollup(children[ord]); + setCount(ord, newValue); + aggregatedValue += getCount(ord); + ord = siblings[ord]; + } + return aggregatedValue; + } + + /** + * Create a FacetResult for the provided dim + path and intermediate results. Does the extra work + * of resolving ordinals -> labels, etc. Will return null if there are no children. + */ + private FacetResult createFacetResult( + TopChildrenForPath topChildrenForPath, String dim, String... 
path) throws IOException { + // If the intermediate result is null or there are no children, we return null: + if (topChildrenForPath == null || topChildrenForPath.childCount == 0) { + return null; + } + + TopOrdAndNumberQueue q = topChildrenForPath.childQueue; + assert q != null; + + LabelAndValue[] labelValues = new LabelAndValue[q.size()]; + int[] ordinals = new int[labelValues.length]; + Number[] values = new Number[labelValues.length]; + + for (int i = labelValues.length - 1; i >= 0; i--) { + TopOrdAndNumberQueue.OrdAndValue ordAndValue = q.pop(); + assert ordAndValue != null; + ordinals[i] = ordAndValue.ord; + values[i] = ordAndValue.value; + } + + FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals); + // The path component we're interested in is the one immediately after the provided path. We + // add 1 here to also account for the dim: + int childComponentIdx = path.length + 1; + for (int i = 0; i < labelValues.length; i++) { + labelValues[i] = new LabelAndValue(bulkPath[i].components[childComponentIdx], values[i]); + } + + return new FacetResult( + dim, path, topChildrenForPath.pathValue, labelValues, topChildrenForPath.childCount); + } + + @Override + public FacetResult getAllChildren(String dim, String... 
path) throws IOException { + DimConfig dimConfig = verifyDim(dim); + FacetLabel cp = new FacetLabel(dim, path); + int dimOrd = taxoReader.getOrdinal(cp); + if (dimOrd == -1) { + return null; + } + + if (initialized == false) { + return null; + } + + Number aggregatedValue = 0; + int aggregatedCount = 0; + + IntArrayList ordinals = new IntArrayList(); + List<Number> ordValues = new ArrayList<>(); + + if (sparseCounts != null) { + for (IntIntCursor ordAndCount : sparseCounts) { + int ord = ordAndCount.key; + int count = ordAndCount.value; + Number value = getAggregationValue(ord); + if (parents[ord] == dimOrd && count > 0) { + aggregatedCount += count; + aggregatedValue = aggregate(aggregatedValue, value); + ordinals.add(ord); + ordValues.add(value); + } + } + } else { + int[] children = getChildren(); + int[] siblings = getSiblings(); + int ord = children[dimOrd]; + while (ord != TaxonomyReader.INVALID_ORDINAL) { + int count = counts[ord]; + Number value = getAggregationValue(ord); + if (count > 0) { + aggregatedCount += count; + aggregatedValue = aggregate(aggregatedValue, value); + ordinals.add(ord); + ordValues.add(value); + } + ord = siblings[ord]; + } + } + + if (aggregatedCount == 0) { + return null; + } + + if (dimConfig.multiValued) { + if (dimConfig.requireDimCount) { + aggregatedValue = getAggregationValue(dimOrd); + } else { + // Our aggregated value is not correct, in general: + aggregatedValue = missingAggregationValue(); + } + } else { + // Our aggregateddim value is accurate, so we keep it + } + + // TODO: It would be nice if TaxonomyReader let us pass in a buffer + size so we didn't have to + // do an array copy here: + FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals.toArray()); + + LabelAndValue[] labelValues = new LabelAndValue[ordValues.size()]; + for (int i = 0; i < ordValues.size(); i++) { + labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], ordValues.get(i)); + } + return new FacetResult(dim, path, aggregatedValue, 
labelValues, ordinals.size()); + } + + private TopOrdAndNumberQueue.OrdAndValue insertIntoQueue( + TopOrdAndNumberQueue q, + int topN, + TopOrdAndNumberQueue.OrdAndValue bottomOrdAndValue, + TopOrdAndNumberQueue.OrdAndValue incomingOrdAndValue, + int ord, + Number value) { + if (incomingOrdAndValue == null) { + incomingOrdAndValue = new TopOrdAndNumberQueue.OrdAndValue(); + } + incomingOrdAndValue.ord = ord; + incomingOrdAndValue.value = value; + + if (q.size() < topN || q.lessThan(bottomOrdAndValue, incomingOrdAndValue)) { + incomingOrdAndValue = q.insertWithOverflow(incomingOrdAndValue); + bottomOrdAndValue.ord = q.top().ord; + bottomOrdAndValue.value = q.top().value; + } + return incomingOrdAndValue; + } + + /** + * Determine the top-n children for a specified dimension + path. Results are in an intermediate + * form. + */ + protected TopChildrenForPath getTopChildrenForPath(DimConfig dimConfig, int pathOrd, int topN) Review Comment: Let's add an abstract signature for this method to the `Facets` class? ########## lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java: ########## @@ -142,6 +249,301 @@ DimConfig verifyDim(String dim) { return dimConfig; } + /** + * Roll-up the aggregation values from {@code childOrdinal} to {@code ordinal}. Overrides should + * probably call this to update the counts. Overriding allows us to work with primitive types for + * the aggregation values, keeping aggregation efficient. + */ + protected void updateValueFromRollup(int ordinal, int childOrdinal) throws IOException { + setCount(ordinal, getCount(ordinal) + rollup(childOrdinal)); + } + + /** + * Return a {@link TopOrdAndNumberQueue} of the appropriate type, i.e. a {@link TopOrdAndIntQueue} + * or a {@link org.apache.lucene.facet.TopOrdAndFloatQueue}. 
+ */ + protected TopOrdAndNumberQueue makeTopOrdAndNumberQueue(int topN) { + return new TopOrdAndIntQueue(Math.min(taxoReader.getSize(), topN)); + } + + // TODO: We don't need this if we're okay with having an integer -1 in the results even for float + // aggregations. + /** Return the value for a missing aggregation, i.e. {@code -1} or {@code -1f}. */ + protected Number missingAggregationValue() { + return -1; + } + + /** Rolls up any single-valued hierarchical dimensions. */ + void rollup() throws IOException { + if (initialized == false) { + return; + } + + // Rollup any necessary dims: + int[] children = null; + for (Map.Entry<String, FacetsConfig.DimConfig> ent : config.getDimConfigs().entrySet()) { + String dim = ent.getKey(); + FacetsConfig.DimConfig ft = ent.getValue(); + if (ft.hierarchical && ft.multiValued == false) { + int dimRootOrd = taxoReader.getOrdinal(new FacetLabel(dim)); + // It can be -1 if this field was declared in the + // config but never indexed: + if (dimRootOrd > 0) { + if (children == null) { + // lazy init + children = getChildren(); + } + updateValueFromRollup(dimRootOrd, children[dimRootOrd]); + } + } + } + } + + private int rollup(int ord) throws IOException { + int[] children = getChildren(); + int[] siblings = getSiblings(); + int aggregatedValue = 0; + while (ord != TaxonomyReader.INVALID_ORDINAL) { + int currentValue = getCount(ord); + int newValue = currentValue + rollup(children[ord]); + setCount(ord, newValue); + aggregatedValue += getCount(ord); + ord = siblings[ord]; + } + return aggregatedValue; + } + + /** + * Create a FacetResult for the provided dim + path and intermediate results. Does the extra work + * of resolving ordinals -> labels, etc. Will return null if there are no children. + */ + private FacetResult createFacetResult( + TopChildrenForPath topChildrenForPath, String dim, String... 
path) throws IOException { + // If the intermediate result is null or there are no children, we return null: + if (topChildrenForPath == null || topChildrenForPath.childCount == 0) { + return null; + } + + TopOrdAndNumberQueue q = topChildrenForPath.childQueue; + assert q != null; + + LabelAndValue[] labelValues = new LabelAndValue[q.size()]; + int[] ordinals = new int[labelValues.length]; + Number[] values = new Number[labelValues.length]; + + for (int i = labelValues.length - 1; i >= 0; i--) { + TopOrdAndNumberQueue.OrdAndValue ordAndValue = q.pop(); + assert ordAndValue != null; + ordinals[i] = ordAndValue.ord; + values[i] = ordAndValue.value; + } + + FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals); + // The path component we're interested in is the one immediately after the provided path. We + // add 1 here to also account for the dim: + int childComponentIdx = path.length + 1; + for (int i = 0; i < labelValues.length; i++) { + labelValues[i] = new LabelAndValue(bulkPath[i].components[childComponentIdx], values[i]); + } + + return new FacetResult( + dim, path, topChildrenForPath.pathValue, labelValues, topChildrenForPath.childCount); + } + + @Override + public FacetResult getAllChildren(String dim, String... 
path) throws IOException { + DimConfig dimConfig = verifyDim(dim); + FacetLabel cp = new FacetLabel(dim, path); + int dimOrd = taxoReader.getOrdinal(cp); + if (dimOrd == -1) { + return null; + } + + if (initialized == false) { + return null; + } + + Number aggregatedValue = 0; + int aggregatedCount = 0; + + IntArrayList ordinals = new IntArrayList(); + List<Number> ordValues = new ArrayList<>(); + + if (sparseCounts != null) { + for (IntIntCursor ordAndCount : sparseCounts) { + int ord = ordAndCount.key; + int count = ordAndCount.value; + Number value = getAggregationValue(ord); + if (parents[ord] == dimOrd && count > 0) { + aggregatedCount += count; + aggregatedValue = aggregate(aggregatedValue, value); + ordinals.add(ord); + ordValues.add(value); + } + } + } else { + int[] children = getChildren(); + int[] siblings = getSiblings(); + int ord = children[dimOrd]; + while (ord != TaxonomyReader.INVALID_ORDINAL) { + int count = counts[ord]; + Number value = getAggregationValue(ord); + if (count > 0) { + aggregatedCount += count; + aggregatedValue = aggregate(aggregatedValue, value); + ordinals.add(ord); + ordValues.add(value); + } + ord = siblings[ord]; + } + } + + if (aggregatedCount == 0) { + return null; + } + + if (dimConfig.multiValued) { + if (dimConfig.requireDimCount) { + aggregatedValue = getAggregationValue(dimOrd); + } else { + // Our aggregated value is not correct, in general: + aggregatedValue = missingAggregationValue(); + } + } else { + // Our aggregateddim value is accurate, so we keep it + } + + // TODO: It would be nice if TaxonomyReader let us pass in a buffer + size so we didn't have to + // do an array copy here: + FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals.toArray()); + + LabelAndValue[] labelValues = new LabelAndValue[ordValues.size()]; + for (int i = 0; i < ordValues.size(); i++) { + labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], ordValues.get(i)); + } + return new FacetResult(dim, path, aggregatedValue, 
labelValues, ordinals.size()); + } + + private TopOrdAndNumberQueue.OrdAndValue insertIntoQueue( Review Comment: This is great! This bit was often duplicated. Can we make this a utility method, or maybe even an `insert*`-style method on the queue, so that `StringValueFacetCounts` and `AbstractSortedSetDocValue` can use it too? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org