mikemccand commented on a change in pull request #133: URL: https://github.com/apache/lucene/pull/133#discussion_r632584798
########## File path: lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java ########## @@ -0,0 +1,371 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.facet; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.MultiDocValues; +import org.apache.lucene.index.OrdinalMap; +import org.apache.lucene.index.ReaderUtil; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.search.ConjunctionDISI; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LongValues; + +/** + * Compute facet counts from a previously indexed {@link SortedSetDocValues} or {@link + * org.apache.lucene.index.SortedDocValues} field. This approach will execute facet counting against + * the string values found in the specified field, with no assumptions on their format. 
Unlike + * {@link org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts}, no assumption is made + * about a "dimension" path component being indexed. Because of this, the field itself is + * effectively treated as the "dimension", and counts for all unique string values are produced. + * This approach is meant to compliment {@link LongValueFacetCounts} in that they both provide facet + * counting on a doc value field with no assumptions of content. + * + * <p>This implementation is useful if you want to dynamically count against any string doc value + * field without relying on {@link FacetField} and {@link FacetsConfig}. The disadvantage is that a + * separate field is required for each "dimension". If you want to pack multiple dimensions into the + * same doc values field, you probably want one of {@link + * org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts} or {@link + * org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts}. + * + * <p>Note that there is an added cost on every {@link IndexReader} open to create a new {@link + * StringDocValuesReaderState}. Also note that this class should be instantiated and used from a + * single thread, because it holds a thread-private instance of {@link SortedSetDocValues}. + * + * @lucene.experimental + */ +// TODO: Add a concurrent version much like ConcurrentSortedSetDocValuesFacetCounts? +public class StringValueFacetCounts extends Facets { + + private final IndexReader reader; + private final String field; + private final OrdinalMap ordinalMap; + private final SortedSetDocValues docValues; + + private final int[] counts; + + private int totalDocCount = 0; Review comment: You don't need the `= 0` -- it's Java's default already. ########## File path: lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java ########## @@ -0,0 +1,371 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.facet; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.MultiDocValues; +import org.apache.lucene.index.OrdinalMap; +import org.apache.lucene.index.ReaderUtil; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.search.ConjunctionDISI; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LongValues; + +/** + * Compute facet counts from a previously indexed {@link SortedSetDocValues} or {@link + * org.apache.lucene.index.SortedDocValues} field. This approach will execute facet counting against + * the string values found in the specified field, with no assumptions on their format. Unlike + * {@link org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts}, no assumption is made + * about a "dimension" path component being indexed. Because of this, the field itself is + * effectively treated as the "dimension", and counts for all unique string values are produced. 
+ * This approach is meant to compliment {@link LongValueFacetCounts} in that they both provide facet + * counting on a doc value field with no assumptions of content. + * + * <p>This implementation is useful if you want to dynamically count against any string doc value Review comment: Maybe call out that the counting is not sparse, so callers are aware of the heap cost for high-cardinality fields? ########## File path: lucene/facet/src/java/org/apache/lucene/facet/StringDocValuesReaderState.java ########## @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.facet; + +import java.io.IOException; +import java.util.List; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.OrdinalMap; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.packed.PackedInts; + +/** + * Stores an {@link OrdinalMap} created for a specific {@link IndexReader} ({@code reader}) + {@code + * field}. Enables re-use of the {@code ordinalMap} once created since creation is costly. 
+ * + * <p>Note: It's important that callers confirm the ordinal map is still valid for their cases. + * Specifically, callers should confirm that the reader used to create the map ({@code reader}) + * matches their use-case. + */ +class StringDocValuesReaderState { Review comment: Hmm, shouldn't this class be `public`, since it is part of the `public` API in `StringValueFacetCounts`? I've long wondered why we don't make unit tests outside of the packages we are testing, for this reason :) It'd catch accidentally missing `public` ... though I think our javadoc linters may also catch this? Of course, there are also really important reasons to put unit tests *inside* the same package, so the tests can access internal things for validation ... ########## File path: lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java ########## @@ -0,0 +1,371 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.facet; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.MultiDocValues; +import org.apache.lucene.index.OrdinalMap; +import org.apache.lucene.index.ReaderUtil; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.search.ConjunctionDISI; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LongValues; + +/** + * Compute facet counts from a previously indexed {@link SortedSetDocValues} or {@link + * org.apache.lucene.index.SortedDocValues} field. This approach will execute facet counting against + * the string values found in the specified field, with no assumptions on their format. Unlike + * {@link org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts}, no assumption is made + * about a "dimension" path component being indexed. Because of this, the field itself is + * effectively treated as the "dimension", and counts for all unique string values are produced. + * This approach is meant to compliment {@link LongValueFacetCounts} in that they both provide facet + * counting on a doc value field with no assumptions of content. + * + * <p>This implementation is useful if you want to dynamically count against any string doc value + * field without relying on {@link FacetField} and {@link FacetsConfig}. The disadvantage is that a + * separate field is required for each "dimension". If you want to pack multiple dimensions into the + * same doc values field, you probably want one of {@link + * org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts} or {@link + * org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts}. 
+ * + * <p>Note that there is an added cost on every {@link IndexReader} open to create a new {@link + * StringDocValuesReaderState}. Also note that this class should be instantiated and used from a + * single thread, because it holds a thread-private instance of {@link SortedSetDocValues}. + * + * @lucene.experimental + */ +// TODO: Add a concurrent version much like ConcurrentSortedSetDocValuesFacetCounts? +public class StringValueFacetCounts extends Facets { + + private final IndexReader reader; + private final String field; + private final OrdinalMap ordinalMap; + private final SortedSetDocValues docValues; + + private final int[] counts; Review comment: Hmm maybe a comment explaining what this array is? I think it is non-sparse, indexed by `SSDV` ordinal? We might want to (later optimization) better handle the (likely more common?) sparse case, e.g. using `IntIntScatterMap` or so from `HPPC`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org