gsmiller commented on a change in pull request #133: URL: https://github.com/apache/lucene/pull/133#discussion_r633752063
########## File path: lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java ########## @@ -0,0 +1,379 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.facet; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.MultiDocValues; +import org.apache.lucene.index.OrdinalMap; +import org.apache.lucene.index.ReaderUtil; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.search.ConjunctionDISI; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LongValues; + +/** + * Compute facet counts from a previously indexed {@link SortedSetDocValues} or {@link + * org.apache.lucene.index.SortedDocValues} field. This approach will execute facet counting against + * the string values found in the specified field, with no assumptions on their format. Unlike + * {@link org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts}, no assumption is made + * about a "dimension" path component being indexed. Because of this, the field itself is + * effectively treated as the "dimension", and counts for all unique string values are produced. + * This approach is meant to complement {@link LongValueFacetCounts} in that they both provide facet + * counting on a doc value field with no assumptions of content. + * + * <p>This implementation is useful if you want to dynamically count against any string doc value + * field without relying on {@link FacetField} and {@link FacetsConfig}. The disadvantage is that a + * separate field is required for each "dimension". If you want to pack multiple dimensions into the + * same doc values field, you probably want one of {@link + * org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts} or {@link + * org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts}. + * + * <p>Note that there is an added cost on every {@link IndexReader} open to create a new {@link + * StringDocValuesReaderState}. Also note that this class should be instantiated and used from a + * single thread, because it holds a thread-private instance of {@link SortedSetDocValues}. + * + * <p>Also note that counting does not use a sparse data structure, so heap memory cost scales with + * the number of unique ordinals for the field being counting. For high-cardinality fields, this + * could be costly. + * + * @lucene.experimental + */ +// TODO: Add a concurrent version much like ConcurrentSortedSetDocValuesFacetCounts? +public class StringValueFacetCounts extends Facets { + + private final IndexReader reader; + private final String field; + private final OrdinalMap ordinalMap; + private final SortedSetDocValues docValues; + + // TODO: There's an optimization opportunity here to use a sparse counting structure in some + // cases, + // much like what IntTaxonomyFacetCounts does. + /** Dense counting array indexed by ordinal. */ + private final int[] counts; + + private int totalDocCount; + + /** + * Returns all facet counts for the field, same result as searching on {@link MatchAllDocsQuery} + * but faster. + */ + public StringValueFacetCounts(StringDocValuesReaderState state) throws IOException { + this(state, null); + } + + /** Counts facets across the provided hits. */ + public StringValueFacetCounts(StringDocValuesReaderState state, FacetsCollector facetsCollector) + throws IOException { + reader = state.reader; + field = state.field; + ordinalMap = state.ordinalMap; + docValues = getDocValues(); + + // Since we accumulate counts in an array, we need to ensure the number of unique ordinals + // doesn't overflow an integer: + if (docValues.getValueCount() > Integer.MAX_VALUE) { + throw new IllegalArgumentException( + "can only handle valueCount < Integer.MAX_VALUE; got " + docValues.getValueCount()); + } + + counts = new int[(int) docValues.getValueCount()]; + + if (facetsCollector == null) { + countAll(); + } else { + count(facetsCollector); + } + } + + @Override + public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException { + if (topN <= 0) { + throw new IllegalArgumentException("topN must be > 0 (got: " + topN + ")"); + } + if (dim.equals(field) == false) { + throw new IllegalArgumentException( + "invalid dim \"" + dim + "\"; should be \"" + field + "\""); + } + if (path.length != 0) { + throw new IllegalArgumentException("path.length should be 0"); + } + + topN = Math.min(topN, counts.length); + TopOrdAndIntQueue q = null; + TopOrdAndIntQueue.OrdAndValue reuse = null; + int bottomCount = 0; + int childCount = 0; // total number of labels with non-zero count + + for (int i = 0; i < counts.length; i++) { + int count = counts[i]; + if (count != 0) { + childCount++; + if (count > bottomCount) { + if (reuse == null) { + reuse = new TopOrdAndIntQueue.OrdAndValue(); + } + reuse.ord = i; + reuse.value = count; + if (q == null) { + // Lazy init for sparse case: + q = new TopOrdAndIntQueue(topN); + } + reuse = q.insertWithOverflow(reuse); + if (q.size() == topN) { + bottomCount = q.top().value; + } + } + } + } + + int resultCount = q == null ? 0 : q.size(); + LabelAndValue[] labelValues = new LabelAndValue[resultCount]; + for (int i = labelValues.length - 1; i >= 0; i--) { + TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); + final BytesRef term = docValues.lookupOrd(ordAndValue.ord); + labelValues[i] = new LabelAndValue(term.utf8ToString(), ordAndValue.value); + } + + return new FacetResult(field, new String[0], totalDocCount, labelValues, childCount); + } + + @Override + public Number getSpecificValue(String dim, String... path) throws IOException { + if (dim.equals(field) == false) { + throw new IllegalArgumentException( + "invalid dim \"" + dim + "\"; should be \"" + field + "\""); + } + if (path.length != 1) { + throw new IllegalArgumentException("path must be length=1"); + } + int ord = (int) docValues.lookupTerm(new BytesRef(path[0])); + if (ord < 0) { + return -1; + } + + return counts[ord]; + } + + @Override + public List<FacetResult> getAllDims(int topN) throws IOException { + return Collections.singletonList(getTopChildren(topN, field)); + } + + private SortedSetDocValues getDocValues() throws IOException { + + List<LeafReaderContext> leaves = reader.leaves(); + int leafCount = leaves.size(); + + if (leafCount == 0) { + return DocValues.emptySortedSet(); + } + + if (leafCount == 1) { + return DocValues.getSortedSet(leaves.get(0).reader(), field); + } + + // A good bit of this logic is forked from MultiDocValues so we can re-use an ordinal map + SortedSetDocValues[] docValues = new SortedSetDocValues[leafCount]; + int[] starts = new int[leafCount + 1]; + long cost = 0; + for (int i = 0; i < leafCount; i++) { + LeafReaderContext context = leaves.get(i); + SortedSetDocValues dv = DocValues.getSortedSet(context.reader(), field); + docValues[i] = dv; + starts[i] = context.docBase; + cost += dv.cost(); + } + starts[leafCount] = reader.maxDoc(); + + return new MultiDocValues.MultiSortedSetDocValues(docValues, starts, ordinalMap, cost); + } + + private void count(FacetsCollector facetsCollector) throws IOException { + + List<FacetsCollector.MatchingDocs> matchingDocs = facetsCollector.getMatchingDocs(); + + if (matchingDocs.isEmpty()) { + return; + } + + if (matchingDocs.size() == 1) { + + FacetsCollector.MatchingDocs hits = matchingDocs.get(0); + + // Validate state before doing anything else: + validateState(hits.context); + + // Assuming the state is valid, ordinalMap should be null since we have one segment: + assert ordinalMap == null; + + countOneSegment(docValues, hits.context.ord, hits); + } else { + for (int i = 0; i < matchingDocs.size(); i++) { + + FacetsCollector.MatchingDocs hits = matchingDocs.get(i); + + // Validate state before doing anything else: + validateState(hits.context); Review comment: Thanks @mikemccand! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org