romseygeek commented on code in PR #12881: URL: https://github.com/apache/lucene/pull/12881#discussion_r1417479284
########## lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java: ########## @@ -199,34 +343,95 @@ public void highlightDocument( LeafReaderContext leafReaderContext, int contextDocId, FieldValueProvider doc, - Predicate<String> acceptField, + ToIntFunction<String> maxHitsPerField, Map<String, List<OffsetRange>> outputHighlights) throws IOException { Matches matches = weight.matches(leafReaderContext, contextDocId); if (matches == null) { return; } - for (String field : affectedFields) { - if (acceptField.test(field)) { - MatchesIterator matchesIterator = matches.getMatches(field); - if (matchesIterator == null) { - // No matches on this field, even though the field was part of the query. This may be - // possible - // with complex queries that source non-text fields (have no "hit regions" in any textual - // representation). Skip. - } else { - OffsetsRetrievalStrategy offsetStrategy = offsetStrategies.get(field); - if (offsetStrategy == null) { - throw new IOException( - "Non-empty matches but no offset retrieval strategy for field: " + field); - } - List<OffsetRange> ranges = offsetStrategy.get(matchesIterator, doc); - if (!ranges.isEmpty()) { - outputHighlights.put(field, ranges); - } + for (String field : queryAffectedHighlightedFields) { + MatchesIterator matchesIterator = matches.getMatches(field); + if (matchesIterator == null) { + // No matches on this field, even though the field was part of the query. This may be + // possible + // with complex queries that source non-text fields (have no "hit regions" in any textual + // representation). Skip. + } else { + OffsetsRetrievalStrategy offsetStrategy = offsetStrategies.get(field); + if (offsetStrategy == null) { + throw new IOException( + "Non-empty matches but no offset retrieval strategy for field: " + field); } + var delegate = offsetStrategy; + + // Limit the number of hits so that we're not extracting dozens just to trim them to a few + // in the end. + final int maxHits = maxHitsPerField.applyAsInt(field); + if (maxHits != Integer.MAX_VALUE) { + offsetStrategy = + (matchesIterator1, doc1) -> + delegate.get(new MatchesIteratorWithLimit(matchesIterator1, maxHits), doc1); + } + + List<OffsetRange> ranges = offsetStrategy.get(matchesIterator, doc); + if (!ranges.isEmpty()) { + outputHighlights.put(field, ranges); + } + } + } + } + + private static class MatchesIteratorWithLimit implements MatchesIterator { Review Comment: You can use FilterMatchesIterator here to save some typing ########## lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java: ########## @@ -53,99 +58,173 @@ public class MatchRegionRetriever { private final List<LeafReaderContext> leaves; private final Weight weight; - private final TreeSet<String> affectedFields; private final Map<String, OffsetsRetrievalStrategy> offsetStrategies; - private final Set<String> preloadFields; + private final TreeSet<String> queryAffectedHighlightedFields; + private final Predicate<String> shouldLoadStoredField; + private final IndexSearcher searcher; /** - * A callback for accepting a single document (and its associated leaf reader, leaf document ID) - * and its match offset ranges, as indicated by the {@link Matches} interface retrieved for the - * query. + * A callback invoked for each document selected by the query. The callback receives a list of hit + * ranges, document field value accessor, the leaf reader and document ID of the document. */ @FunctionalInterface public interface MatchOffsetsConsumer { + /** + * @param docId Document id (global). + * @param leafReader Document's {@link LeafReader}. + * @param leafDocId Document id (within the {@link LeafReader}). + * @param fieldValueProvider Access to preloaded document fields. See the {@link + * MatchRegionRetriever#MatchRegionRetriever(IndexSearcher, Query, + * OffsetsRetrievalStrategySupplier, Predicate, Predicate)} constructor's documentation for + * guidelines on which fields are available through this interface. + * @param hits A map of field names and offset ranges with query hits. + */ void accept( - int docId, LeafReader leafReader, int leafDocId, Map<String, List<OffsetRange>> hits) + int docId, + LeafReader leafReader, + int leafDocId, + FieldValueProvider fieldValueProvider, + Map<String, List<OffsetRange>> hits) throws IOException; } /** - * An abstraction that provides document values for a given field. The default implementation just - * reaches to a preloaded {@link Document}. It is possible to write a more efficient - * implementation on top of a reusable character buffer (that reuses the buffer while retrieving - * hit regions for documents). + * Access to field values of the highlighted document. See the {@link + * MatchRegionRetriever#MatchRegionRetriever(IndexSearcher, Query, + * OffsetsRetrievalStrategySupplier, Predicate, Predicate)} constructor's documentation for + * guidelines on which fields are available through this interface. */ - @FunctionalInterface - public interface FieldValueProvider { - List<CharSequence> getValues(String field) throws IOException; + public interface FieldValueProvider extends Iterable<String> { + /** + * @return Return a list of values for the provided field name or {@code null} if the field is + * not loaded or does not exist for the field. + */ + List<String> getValues(String field); } /** - * A constructor with the default offset strategy supplier. + * This constructor uses the default offset strategy supplier from {@link + * #computeOffsetRetrievalStrategies(IndexReader, Analyzer)}. * + * @param searcher The {@link IndexSearcher} to use for executing the query. + * @param query The query for which highlights should be returned. * @param analyzer An analyzer that may be used to reprocess (retokenize) document fields in the * absence of position offsets in the index. Note that the analyzer must return tokens * (positions and offsets) identical to the ones stored in the index. + * @param fieldsToLoadUnconditionally A custom predicate that should return {@code true} for any + * field that should be preloaded and made available through {@link FieldValueProvider}, + * regardless of whether the query affected the field or not. This predicate can be used to + * load additional fields during field highlighting, making them available to {@link + * MatchOffsetsConsumer}s. + * @param fieldsToLoadIfWithHits A custom predicate that should return {@code true} for fields + * that should be highlighted. Typically, this would always return {@code true} indicating any + * field affected by the query should be highlighted. However, sometimes highlights may not be + * needed: for example, if they affect fields that are only used for filtering purposes. + * Returning {@code false} for such fields saves the costs of loading those fields into memory + * and scanning through field matches. */ - public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer) + public MatchRegionRetriever( + IndexSearcher searcher, + Query query, + Analyzer analyzer, + Predicate<String> fieldsToLoadUnconditionally, + Predicate<String> fieldsToLoadIfWithHits) throws IOException { - this(searcher, query, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer)); + this( + searcher, + query, + computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer), + fieldsToLoadUnconditionally, + fieldsToLoadIfWithHits); } /** - * @param searcher Index searcher to be used for retrieving matches. + * @param searcher The {@link IndexSearcher} to use for executing the query. * @param query The query for which matches should be retrieved. The query should be rewritten * against the provided searcher. * @param fieldOffsetStrategySupplier A custom supplier of per-field {@link * OffsetsRetrievalStrategy} instances. + * @param fieldsToLoadUnconditionally A custom predicate that should return {@code true} for any + * field that should be preloaded and made available through {@link FieldValueProvider}, + * regardless of whether the query affected the field or not. This predicate can be used to + * load additional fields during field highlighting, making them available to {@link + * MatchOffsetsConsumer}s. + * @param fieldsToLoadIfWithHits A custom predicate that should return {@code true} for fields + * that should be highlighted. Typically, this would always return {@code true} indicating any + * field affected by the query should be highlighted. However, sometimes highlights may not be + * needed: for example, if they affect fields that are only used for filtering purposes. + * Returning {@code false} for such fields saves the costs of loading those fields into memory + * and scanning through field matches. */ public MatchRegionRetriever( IndexSearcher searcher, Query query, - OffsetsRetrievalStrategySupplier fieldOffsetStrategySupplier) + OffsetsRetrievalStrategySupplier fieldOffsetStrategySupplier, + Predicate<String> fieldsToLoadUnconditionally, + Predicate<String> fieldsToLoadIfWithHits) throws IOException { + this.searcher = searcher; leaves = searcher.getIndexReader().leaves(); assert checkOrderConsistency(leaves); // We need full scoring mode so that we can receive matches from all sub-clauses // (no optimizations in Boolean queries take place). weight = searcher.createWeight(query, ScoreMode.COMPLETE, 0); - // Compute the subset of fields affected by this query so that we don't load or scan - // fields that are irrelevant. - affectedFields = new TreeSet<>(); + // Compute a subset of fields affected by this query and for which highlights should be + // returned, so that we don't load or scan fields that are irrelevant. + queryAffectedHighlightedFields = new TreeSet<>(); query.visit( new QueryVisitor() { @Override public boolean acceptField(String field) { - affectedFields.add(field); + if (fieldsToLoadIfWithHits.test(field)) { + queryAffectedHighlightedFields.add(field); + } return false; } }); // Compute value offset retrieval strategy for all affected fields. offsetStrategies = new HashMap<>(); - for (String field : affectedFields) { + for (String field : queryAffectedHighlightedFields) { offsetStrategies.put(field, fieldOffsetStrategySupplier.apply(field)); } - // Ask offset strategies if they'll need field values. - preloadFields = new HashSet<>(); - offsetStrategies.forEach( - (field, strategy) -> { - preloadFields.add(field); - }); - - // Only preload those field values that can be affected by the query and are required - // by strategies. - preloadFields.retainAll(affectedFields); + shouldLoadStoredField = + (field) -> { + return fieldsToLoadUnconditionally.test(field) + || queryAffectedHighlightedFields.contains(field); + }; } public void highlightDocuments(TopDocs topDocs, MatchOffsetsConsumer consumer) throws IOException { highlightDocuments( Arrays.stream(topDocs.scoreDocs).mapToInt(scoreDoc -> scoreDoc.doc).sorted().iterator(), - consumer); + consumer, + field -> Integer.MAX_VALUE); + } + + static class DocHighlightData { Review Comment: If this is for 10 only you can use a `record` here I think? ########## lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java: ########## @@ -53,99 +58,173 @@ public class MatchRegionRetriever { private final List<LeafReaderContext> leaves; private final Weight weight; - private final TreeSet<String> affectedFields; private final Map<String, OffsetsRetrievalStrategy> offsetStrategies; - private final Set<String> preloadFields; + private final TreeSet<String> queryAffectedHighlightedFields; + private final Predicate<String> shouldLoadStoredField; + private final IndexSearcher searcher; /** - * A callback for accepting a single document (and its associated leaf reader, leaf document ID) - * and its match offset ranges, as indicated by the {@link Matches} interface retrieved for the - * query. + * A callback invoked for each document selected by the query. The callback receives a list of hit + * ranges, document field value accessor, the leaf reader and document ID of the document. */ @FunctionalInterface public interface MatchOffsetsConsumer { + /** + * @param docId Document id (global). + * @param leafReader Document's {@link LeafReader}. + * @param leafDocId Document id (within the {@link LeafReader}). + * @param fieldValueProvider Access to preloaded document fields. See the {@link + * MatchRegionRetriever#MatchRegionRetriever(IndexSearcher, Query, + * OffsetsRetrievalStrategySupplier, Predicate, Predicate)} constructor's documentation for + * guidelines on which fields are available through this interface. + * @param hits A map of field names and offset ranges with query hits. + */ void accept( - int docId, LeafReader leafReader, int leafDocId, Map<String, List<OffsetRange>> hits) + int docId, + LeafReader leafReader, + int leafDocId, + FieldValueProvider fieldValueProvider, + Map<String, List<OffsetRange>> hits) throws IOException; } /** - * An abstraction that provides document values for a given field. The default implementation just - * reaches to a preloaded {@link Document}. It is possible to write a more efficient - * implementation on top of a reusable character buffer (that reuses the buffer while retrieving - * hit regions for documents). + * Access to field values of the highlighted document. See the {@link + * MatchRegionRetriever#MatchRegionRetriever(IndexSearcher, Query, + * OffsetsRetrievalStrategySupplier, Predicate, Predicate)} constructor's documentation for + * guidelines on which fields are available through this interface. */ - @FunctionalInterface - public interface FieldValueProvider { - List<CharSequence> getValues(String field) throws IOException; + public interface FieldValueProvider extends Iterable<String> { + /** + * @return Return a list of values for the provided field name or {@code null} if the field is + * not loaded or does not exist for the field. + */ + List<String> getValues(String field); } /** - * A constructor with the default offset strategy supplier. + * This constructor uses the default offset strategy supplier from {@link + * #computeOffsetRetrievalStrategies(IndexReader, Analyzer)}. * + * @param searcher The {@link IndexSearcher} to use for executing the query. Review Comment: I wonder if its worth calling out that we'll use the IndexSearcher's task executor to try and parallelize the work here? It may be that in some applications you don't want highlighting to use threads that could otherwise be running simple queries, in which case clients should use different IndexSearcher instances for querying and highlighting. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org