dsmiley commented on a change in pull request #412: URL: https://github.com/apache/lucene/pull/412#discussion_r762251761
########## File path: lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java ########## @@ -278,8 +274,10 @@ public void testCustomB() throws Exception { iw.close(); IndexSearcher searcher = newSearcher(ir); + UnifiedHighlighter.Builder concreteBuilder = Review comment: lets call this `uhBuilder` ########## File path: lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java ########## @@ -330,8 +328,11 @@ public void testCustomK1() throws Exception { iw.close(); IndexSearcher searcher = newSearcher(ir); + + UnifiedHighlighter.Builder concreteBuilder = Review comment: again, maybe `uhBuilder` ########## File path: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java ########## @@ -113,118 +112,239 @@ protected static final LabelledCharArrayMatcher[] ZERO_LEN_AUTOMATA_ARRAY = new LabelledCharArrayMatcher[0]; - protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher + protected final IndexSearcher searcher; protected final Analyzer indexAnalyzer; - private boolean defaultHandleMtq = true; // e.g. wildcards + private final int maxLength; - private boolean defaultHighlightPhrasesStrictly = true; // AKA "accuracy" or "query debugging" + private final Supplier<BreakIterator> defaultBreakIterator; - // For analysis, prefer MemoryIndexOffsetStrategy - private boolean defaultPassageRelevancyOverSpeed = true; + private final Predicate<String> defaultFieldMatcher; - private int maxLength = DEFAULT_MAX_LENGTH; + private final PassageScorer defaultScorer; - // BreakIterator is stateful so we use a Supplier factory method - private Supplier<BreakIterator> defaultBreakIterator = - () -> BreakIterator.getSentenceInstance(Locale.ROOT); + private final PassageFormatter defaultFormatter; - private Predicate<String> defaultFieldMatcher; + private final int defaultMaxNoHighlightPassages; - private PassageScorer defaultScorer = new PassageScorer(); + // lazy initialized with double-check locking; protected so subclass can init + protected volatile FieldInfos fieldInfos; - private PassageFormatter defaultFormatter = new DefaultPassageFormatter(); + private final int cacheFieldValCharsThreshold; - private int defaultMaxNoHighlightPassages = -1; + private final Set<HighlightFlag> flags; - // lazy initialized with double-check locking; protected so subclass can init - protected volatile FieldInfos fieldInfos; + /** Builder for UnifiedHighlighter. */ + public static class Builder { + /** If null, can only use highlightWithoutSearcher. */ + private IndexSearcher searcher; - private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD; + private Analyzer indexAnalyzer; + private boolean handleMultiTermQuery = true; + private boolean highlightPhrasesStrictly = true; + private boolean passageRelevancyOverSpeed = true; + private boolean weightMatches = true; + private int maxLength = DEFAULT_MAX_LENGTH; - /** Extracts matching terms after rewriting against an empty index */ - protected static Set<Term> extractTerms(Query query) throws IOException { - Set<Term> queryTerms = new HashSet<>(); - EMPTY_INDEXSEARCHER.rewrite(query).visit(QueryVisitor.termCollector(queryTerms)); - return queryTerms; - } + /** BreakIterator is stateful so we use a Supplier factory method. */ + private Supplier<BreakIterator> breakIterator = + () -> BreakIterator.getSentenceInstance(Locale.ROOT); - /** - * Constructs the highlighter with the given index searcher and analyzer. - * - * @param indexSearcher Usually required, unless {@link #highlightWithoutSearcher(String, Query, - * String, int)} is used, in which case this needs to be null. - * @param indexAnalyzer Required, even if in some circumstances it isn't used. - */ - public UnifiedHighlighter(IndexSearcher indexSearcher, Analyzer indexAnalyzer) { - this.searcher = indexSearcher; // TODO: make non nullable - this.indexAnalyzer = - Objects.requireNonNull( - indexAnalyzer, - "indexAnalyzer is required" + " (even if in some circumstances it isn't used)"); - } + private Predicate<String> fieldMatcher; + private PassageScorer scorer = new PassageScorer(); + private PassageFormatter formatter = new DefaultPassageFormatter(); + private int maxNoHighlightPassages = -1; + private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD; + private Set<HighlightFlag> flags; - public void setHandleMultiTermQuery(boolean handleMtq) { - this.defaultHandleMtq = handleMtq; - } + /** + * Usually required, unless {@link #highlightWithoutSearcher(String, Query, String, int)} is + * used, in which case this needs to be null. + */ + public Builder withSearcher(IndexSearcher value) { Review comment: Given that the searcher is often required (albeit not always), it would be a good candidate for putting in the builder(IndexSearcher) param. Same with the Analyzer. I've seen this pattern before for builders. This way the caller is forced to think of it, but would otherwise easily forget many of the "with" methods on the builder. Since there are two primary scenarios -- with and without a searcher, we could have two builder methods: build(IndexSearcher, Analyzer) and buildWithoutSearcher(Analyzer) which thus makes them explicit when reading code. ########## File path: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java ########## @@ -823,24 +943,13 @@ public void visitLeaf(Query query) { return filteredTerms.toArray(new BytesRef[filteredTerms.size()]); } - /** Customize the highlighting flags to use by field. */ + /** + * Customize the highlighting flags to use by field. Here the user can either specify the set of + * {@link HighlightFlag}s to be applied or use the boolean flags to populate final list of {@link Review comment: I think this is incorrect now. The boolean flags only exist in the builder. ########## File path: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java ########## @@ -113,118 +112,239 @@ protected static final LabelledCharArrayMatcher[] ZERO_LEN_AUTOMATA_ARRAY = new LabelledCharArrayMatcher[0]; - protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher + protected final IndexSearcher searcher; protected final Analyzer indexAnalyzer; - private boolean defaultHandleMtq = true; // e.g. wildcards + private final int maxLength; - private boolean defaultHighlightPhrasesStrictly = true; // AKA "accuracy" or "query debugging" + private final Supplier<BreakIterator> defaultBreakIterator; - // For analysis, prefer MemoryIndexOffsetStrategy - private boolean defaultPassageRelevancyOverSpeed = true; + private final Predicate<String> defaultFieldMatcher; - private int maxLength = DEFAULT_MAX_LENGTH; + private final PassageScorer defaultScorer; - // BreakIterator is stateful so we use a Supplier factory method - private Supplier<BreakIterator> defaultBreakIterator = - () -> BreakIterator.getSentenceInstance(Locale.ROOT); + private final PassageFormatter defaultFormatter; - private Predicate<String> defaultFieldMatcher; + private final int defaultMaxNoHighlightPassages; - private PassageScorer defaultScorer = new PassageScorer(); + // lazy initialized with double-check locking; protected so subclass can init + protected volatile FieldInfos fieldInfos; - private PassageFormatter defaultFormatter = new DefaultPassageFormatter(); + private final int cacheFieldValCharsThreshold; - private int defaultMaxNoHighlightPassages = -1; + private final Set<HighlightFlag> flags; - // lazy initialized with double-check locking; protected so subclass can init - protected volatile FieldInfos fieldInfos; + /** Builder for UnifiedHighlighter. */ + public static class Builder { + /** If null, can only use highlightWithoutSearcher. */ + private IndexSearcher searcher; - private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD; + private Analyzer indexAnalyzer; + private boolean handleMultiTermQuery = true; + private boolean highlightPhrasesStrictly = true; + private boolean passageRelevancyOverSpeed = true; + private boolean weightMatches = true; + private int maxLength = DEFAULT_MAX_LENGTH; - /** Extracts matching terms after rewriting against an empty index */ - protected static Set<Term> extractTerms(Query query) throws IOException { - Set<Term> queryTerms = new HashSet<>(); - EMPTY_INDEXSEARCHER.rewrite(query).visit(QueryVisitor.termCollector(queryTerms)); - return queryTerms; - } + /** BreakIterator is stateful so we use a Supplier factory method. */ + private Supplier<BreakIterator> breakIterator = + () -> BreakIterator.getSentenceInstance(Locale.ROOT); - /** - * Constructs the highlighter with the given index searcher and analyzer. - * - * @param indexSearcher Usually required, unless {@link #highlightWithoutSearcher(String, Query, - * String, int)} is used, in which case this needs to be null. - * @param indexAnalyzer Required, even if in some circumstances it isn't used. - */ - public UnifiedHighlighter(IndexSearcher indexSearcher, Analyzer indexAnalyzer) { - this.searcher = indexSearcher; // TODO: make non nullable - this.indexAnalyzer = - Objects.requireNonNull( - indexAnalyzer, - "indexAnalyzer is required" + " (even if in some circumstances it isn't used)"); - } + private Predicate<String> fieldMatcher; + private PassageScorer scorer = new PassageScorer(); + private PassageFormatter formatter = new DefaultPassageFormatter(); + private int maxNoHighlightPassages = -1; + private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD; + private Set<HighlightFlag> flags; - public void setHandleMultiTermQuery(boolean handleMtq) { - this.defaultHandleMtq = handleMtq; - } + /** + * Usually required, unless {@link #highlightWithoutSearcher(String, Query, String, int)} is + * used, in which case this needs to be null. + */ + public Builder withSearcher(IndexSearcher value) { + this.searcher = value; + return self(); + } - public void setHighlightPhrasesStrictly(boolean highlightPhrasesStrictly) { - this.defaultHighlightPhrasesStrictly = highlightPhrasesStrictly; - } + /** + * This method sets the analyzer for the UH object. Required, even if in some circumstances it + * isn' used. The null check is performed in the constructor. + */ + public Builder withIndexAnalyzer(Analyzer value) { + this.indexAnalyzer = value; + return self(); + } - public void setMaxLength(int maxLength) { - if (maxLength < 0 || maxLength == Integer.MAX_VALUE) { - // two reasons: no overflow problems in BreakIterator.preceding(offset+1), - // our sentinel in the offsets queue uses this value to terminate. - throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE"); + /** + * User-defined set of {@link HighlightFlag} values which will override the flags set by {@link + * #withHandleMultiTermQuery(boolean)}, {@link #withHighlightPhrasesStrictly(boolean)}, {@link + * #withPassageRelevancyOverSpeed(boolean)} and {@link #withWeightMatches(boolean)}. + * + * @param values - set of {@link HighlightFlag} values. + */ + public Builder withFlags(Set<HighlightFlag> values) { + this.flags = values; + return self(); } - this.maxLength = maxLength; - } - public void setBreakIterator(Supplier<BreakIterator> breakIterator) { - this.defaultBreakIterator = breakIterator; - } + /** + * Here position sensitive queries (e.g. phrases and {@link SpanQuery}ies) are highlighted + * strictly based on query matches (slower) versus any/all occurrences of the underlying terms. + * By default it's enabled, but there's no overhead if such queries aren't used. + */ + public Builder withHighlightPhrasesStrictly(boolean value) { + this.highlightPhrasesStrictly = value; + return self(); + } - public void setScorer(PassageScorer scorer) { - this.defaultScorer = scorer; - } + /** + * Here {@link org.apache.lucene.search.MultiTermQuery} derivatives will be highlighted. By + * default it's enabled. MTQ highlighting can be expensive, particularly when using offsets in + * postings. + */ + public Builder withHandleMultiTermQuery(boolean value) { + this.handleMultiTermQuery = value; + return self(); + } - public void setFormatter(PassageFormatter formatter) { - this.defaultFormatter = formatter; - } + /** Passage relevancy is more important than speed. True by default. */ + public Builder withPassageRelevancyOverSpeed(boolean value) { + this.passageRelevancyOverSpeed = value; + return self(); + } - public void setMaxNoHighlightPassages(int defaultMaxNoHighlightPassages) { - this.defaultMaxNoHighlightPassages = defaultMaxNoHighlightPassages; - } + /** + * Internally use the {@link Weight#matches(LeafReaderContext, int)} API for highlighting. It's + * more accurate to the query, and the snippets can be a little different for phrases because + * the whole phrase is marked up instead of each word. The passage relevancy calculation can be + * different (maybe worse?) and it's slower when highlighting many fields. Use of this flag + * requires {@link HighlightFlag#MULTI_TERM_QUERY} and {@link HighlightFlag#PHRASES} and {@link + * HighlightFlag#PASSAGE_RELEVANCY_OVER_SPEED}. True by default because those booleans are true + * by default. + */ + public Builder withWeightMatches(boolean value) { + this.weightMatches = value; + return self(); + } + + public Builder withMaxLength(int value) { Review comment: Suggested javadoc: > The text to be highlight is effectively truncated by this length. ########## File path: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java ########## @@ -113,118 +112,239 @@ protected static final LabelledCharArrayMatcher[] ZERO_LEN_AUTOMATA_ARRAY = new LabelledCharArrayMatcher[0]; - protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher + protected final IndexSearcher searcher; protected final Analyzer indexAnalyzer; - private boolean defaultHandleMtq = true; // e.g. wildcards + private final int maxLength; - private boolean defaultHighlightPhrasesStrictly = true; // AKA "accuracy" or "query debugging" + private final Supplier<BreakIterator> defaultBreakIterator; Review comment: I suggest we remove the `default` prefix from the fields. They are all effectively defaults that are often over-rideable. ########## File path: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java ########## @@ -113,118 +112,239 @@ protected static final LabelledCharArrayMatcher[] ZERO_LEN_AUTOMATA_ARRAY = new LabelledCharArrayMatcher[0]; - protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher + protected final IndexSearcher searcher; protected final Analyzer indexAnalyzer; - private boolean defaultHandleMtq = true; // e.g. wildcards + private final int maxLength; - private boolean defaultHighlightPhrasesStrictly = true; // AKA "accuracy" or "query debugging" + private final Supplier<BreakIterator> defaultBreakIterator; - // For analysis, prefer MemoryIndexOffsetStrategy - private boolean defaultPassageRelevancyOverSpeed = true; + private final Predicate<String> defaultFieldMatcher; - private int maxLength = DEFAULT_MAX_LENGTH; + private final PassageScorer defaultScorer; - // BreakIterator is stateful so we use a Supplier factory method - private Supplier<BreakIterator> defaultBreakIterator = - () -> BreakIterator.getSentenceInstance(Locale.ROOT); + private final PassageFormatter defaultFormatter; - private Predicate<String> defaultFieldMatcher; + private final int defaultMaxNoHighlightPassages; - private PassageScorer defaultScorer = new PassageScorer(); + // lazy initialized with double-check locking; protected so subclass can init + protected volatile FieldInfos fieldInfos; - private PassageFormatter defaultFormatter = new DefaultPassageFormatter(); + private final int cacheFieldValCharsThreshold; - private int defaultMaxNoHighlightPassages = -1; + private final Set<HighlightFlag> flags; - // lazy initialized with double-check locking; protected so subclass can init - protected volatile FieldInfos fieldInfos; + /** Builder for UnifiedHighlighter. */ + public static class Builder { + /** If null, can only use highlightWithoutSearcher. */ + private IndexSearcher searcher; - private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD; + private Analyzer indexAnalyzer; + private boolean handleMultiTermQuery = true; + private boolean highlightPhrasesStrictly = true; + private boolean passageRelevancyOverSpeed = true; + private boolean weightMatches = true; + private int maxLength = DEFAULT_MAX_LENGTH; - /** Extracts matching terms after rewriting against an empty index */ - protected static Set<Term> extractTerms(Query query) throws IOException { - Set<Term> queryTerms = new HashSet<>(); - EMPTY_INDEXSEARCHER.rewrite(query).visit(QueryVisitor.termCollector(queryTerms)); - return queryTerms; - } + /** BreakIterator is stateful so we use a Supplier factory method. */ + private Supplier<BreakIterator> breakIterator = + () -> BreakIterator.getSentenceInstance(Locale.ROOT); - /** - * Constructs the highlighter with the given index searcher and analyzer. - * - * @param indexSearcher Usually required, unless {@link #highlightWithoutSearcher(String, Query, - * String, int)} is used, in which case this needs to be null. - * @param indexAnalyzer Required, even if in some circumstances it isn't used. - */ - public UnifiedHighlighter(IndexSearcher indexSearcher, Analyzer indexAnalyzer) { - this.searcher = indexSearcher; // TODO: make non nullable - this.indexAnalyzer = - Objects.requireNonNull( - indexAnalyzer, - "indexAnalyzer is required" + " (even if in some circumstances it isn't used)"); - } + private Predicate<String> fieldMatcher; + private PassageScorer scorer = new PassageScorer(); + private PassageFormatter formatter = new DefaultPassageFormatter(); + private int maxNoHighlightPassages = -1; + private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD; + private Set<HighlightFlag> flags; - public void setHandleMultiTermQuery(boolean handleMtq) { - this.defaultHandleMtq = handleMtq; - } + /** + * Usually required, unless {@link #highlightWithoutSearcher(String, Query, String, int)} is + * used, in which case this needs to be null. + */ + public Builder withSearcher(IndexSearcher value) { + this.searcher = value; + return self(); + } - public void setHighlightPhrasesStrictly(boolean highlightPhrasesStrictly) { - this.defaultHighlightPhrasesStrictly = highlightPhrasesStrictly; - } + /** + * This method sets the analyzer for the UH object. Required, even if in some circumstances it Review comment: BTW in Javadoc style, don't start javadocs with "this method" or "this class" -- instead get right to the point and succinctly. ########## File path: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java ########## @@ -1163,23 +1272,16 @@ public CacheHelper getReaderCacheHelper() { /** Flags for controlling highlighting behavior. */ public enum HighlightFlag { - /** @see UnifiedHighlighter#setHighlightPhrasesStrictly(boolean) */ + /** Ref: {@link Builder#withHighlightPhrasesStrictly(boolean)} */ Review comment: What's wrong with `@see` ? ########## File path: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java ########## @@ -113,118 +112,239 @@ protected static final LabelledCharArrayMatcher[] ZERO_LEN_AUTOMATA_ARRAY = new LabelledCharArrayMatcher[0]; - protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher + protected final IndexSearcher searcher; Review comment: the comment you removed was useful. Oh I see you moved it to the builder. I think internal comments are best kept here. ########## File path: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java ########## @@ -113,118 +112,239 @@ protected static final LabelledCharArrayMatcher[] ZERO_LEN_AUTOMATA_ARRAY = new LabelledCharArrayMatcher[0]; - protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher + protected final IndexSearcher searcher; protected final Analyzer indexAnalyzer; - private boolean defaultHandleMtq = true; // e.g. wildcards + private final int maxLength; - private boolean defaultHighlightPhrasesStrictly = true; // AKA "accuracy" or "query debugging" + private final Supplier<BreakIterator> defaultBreakIterator; - // For analysis, prefer MemoryIndexOffsetStrategy - private boolean defaultPassageRelevancyOverSpeed = true; + private final Predicate<String> defaultFieldMatcher; - private int maxLength = DEFAULT_MAX_LENGTH; + private final PassageScorer defaultScorer; - // BreakIterator is stateful so we use a Supplier factory method - private Supplier<BreakIterator> defaultBreakIterator = - () -> BreakIterator.getSentenceInstance(Locale.ROOT); + private final PassageFormatter defaultFormatter; - private Predicate<String> defaultFieldMatcher; + private final int defaultMaxNoHighlightPassages; - private PassageScorer defaultScorer = new PassageScorer(); + // lazy initialized with double-check locking; protected so subclass can init + protected volatile FieldInfos fieldInfos; - private PassageFormatter defaultFormatter = new DefaultPassageFormatter(); + private final int cacheFieldValCharsThreshold; - private int defaultMaxNoHighlightPassages = -1; + private final Set<HighlightFlag> flags; - // lazy initialized with double-check locking; protected so subclass can init - protected volatile FieldInfos fieldInfos; + /** Builder for UnifiedHighlighter. */ + public static class Builder { + /** If null, can only use highlightWithoutSearcher. */ + private IndexSearcher searcher; - private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD; + private Analyzer indexAnalyzer; + private boolean handleMultiTermQuery = true; + private boolean highlightPhrasesStrictly = true; + private boolean passageRelevancyOverSpeed = true; + private boolean weightMatches = true; + private int maxLength = DEFAULT_MAX_LENGTH; - /** Extracts matching terms after rewriting against an empty index */ - protected static Set<Term> extractTerms(Query query) throws IOException { - Set<Term> queryTerms = new HashSet<>(); - EMPTY_INDEXSEARCHER.rewrite(query).visit(QueryVisitor.termCollector(queryTerms)); - return queryTerms; - } + /** BreakIterator is stateful so we use a Supplier factory method. */ + private Supplier<BreakIterator> breakIterator = + () -> BreakIterator.getSentenceInstance(Locale.ROOT); - /** - * Constructs the highlighter with the given index searcher and analyzer. - * - * @param indexSearcher Usually required, unless {@link #highlightWithoutSearcher(String, Query, - * String, int)} is used, in which case this needs to be null. - * @param indexAnalyzer Required, even if in some circumstances it isn't used. - */ - public UnifiedHighlighter(IndexSearcher indexSearcher, Analyzer indexAnalyzer) { - this.searcher = indexSearcher; // TODO: make non nullable - this.indexAnalyzer = - Objects.requireNonNull( - indexAnalyzer, - "indexAnalyzer is required" + " (even if in some circumstances it isn't used)"); - } + private Predicate<String> fieldMatcher; + private PassageScorer scorer = new PassageScorer(); + private PassageFormatter formatter = new DefaultPassageFormatter(); + private int maxNoHighlightPassages = -1; + private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD; + private Set<HighlightFlag> flags; - public void setHandleMultiTermQuery(boolean handleMtq) { - this.defaultHandleMtq = handleMtq; - } + /** + * Usually required, unless {@link #highlightWithoutSearcher(String, Query, String, int)} is + * used, in which case this needs to be null. + */ + public Builder withSearcher(IndexSearcher value) { + this.searcher = value; + return self(); + } - public void setHighlightPhrasesStrictly(boolean highlightPhrasesStrictly) { - this.defaultHighlightPhrasesStrictly = highlightPhrasesStrictly; - } + /** + * This method sets the analyzer for the UH object. Required, even if in some circumstances it + * isn' used. The null check is performed in the constructor. + */ + public Builder withIndexAnalyzer(Analyzer value) { + this.indexAnalyzer = value; + return self(); + } - public void setMaxLength(int maxLength) { - if (maxLength < 0 || maxLength == Integer.MAX_VALUE) { - // two reasons: no overflow problems in BreakIterator.preceding(offset+1), - // our sentinel in the offsets queue uses this value to terminate. - throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE"); + /** + * User-defined set of {@link HighlightFlag} values which will override the flags set by {@link + * #withHandleMultiTermQuery(boolean)}, {@link #withHighlightPhrasesStrictly(boolean)}, {@link + * #withPassageRelevancyOverSpeed(boolean)} and {@link #withWeightMatches(boolean)}. + * + * @param values - set of {@link HighlightFlag} values. + */ + public Builder withFlags(Set<HighlightFlag> values) { + this.flags = values; + return self(); } - this.maxLength = maxLength; - } - public void setBreakIterator(Supplier<BreakIterator> breakIterator) { - this.defaultBreakIterator = breakIterator; - } + /** + * Here position sensitive queries (e.g. phrases and {@link SpanQuery}ies) are highlighted + * strictly based on query matches (slower) versus any/all occurrences of the underlying terms. + * By default it's enabled, but there's no overhead if such queries aren't used. + */ + public Builder withHighlightPhrasesStrictly(boolean value) { + this.highlightPhrasesStrictly = value; + return self(); + } - public void setScorer(PassageScorer scorer) { - this.defaultScorer = scorer; - } + /** + * Here {@link org.apache.lucene.search.MultiTermQuery} derivatives will be highlighted. By + * default it's enabled. MTQ highlighting can be expensive, particularly when using offsets in + * postings. + */ + public Builder withHandleMultiTermQuery(boolean value) { + this.handleMultiTermQuery = value; + return self(); + } - public void setFormatter(PassageFormatter formatter) { - this.defaultFormatter = formatter; - } + /** Passage relevancy is more important than speed. True by default. */ + public Builder withPassageRelevancyOverSpeed(boolean value) { + this.passageRelevancyOverSpeed = value; + return self(); + } - public void setMaxNoHighlightPassages(int defaultMaxNoHighlightPassages) { - this.defaultMaxNoHighlightPassages = defaultMaxNoHighlightPassages; - } + /** + * Internally use the {@link Weight#matches(LeafReaderContext, int)} API for highlighting. It's + * more accurate to the query, and the snippets can be a little different for phrases because + * the whole phrase is marked up instead of each word. The passage relevancy calculation can be + * different (maybe worse?) and it's slower when highlighting many fields. Use of this flag + * requires {@link HighlightFlag#MULTI_TERM_QUERY} and {@link HighlightFlag#PHRASES} and {@link + * HighlightFlag#PASSAGE_RELEVANCY_OVER_SPEED}. True by default because those booleans are true + * by default. + */ + public Builder withWeightMatches(boolean value) { + this.weightMatches = value; + return self(); + } + + public Builder withMaxLength(int value) { + if (value < 0 || value == Integer.MAX_VALUE) { + // two reasons: no overflow problems in BreakIterator.preceding(offset+1), + // our sentinel in the offsets queue uses this value to terminate. + throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE"); + } + this.maxLength = value; + return self(); + } + + public Builder withBreakIterator(Supplier<BreakIterator> value) { + this.breakIterator = value; + return self(); + } - public void setCacheFieldValCharsThreshold(int cacheFieldValCharsThreshold) { - this.cacheFieldValCharsThreshold = cacheFieldValCharsThreshold; + public Builder withFieldMatcher(Predicate<String> value) { + this.fieldMatcher = value; + return self(); + } + + public Builder withScorer(PassageScorer value) { + this.scorer = value; + return self(); + } + + public Builder withFormatter(PassageFormatter value) { + this.formatter = value; + return self(); + } + + public Builder withMaxNoHighlightPassages(int value) { + this.maxNoHighlightPassages = value; + return self(); + } + + public Builder withCacheFieldValCharsThreshold(int value) { + this.cacheFieldValCharsThreshold = value; + return self(); + } + + /** + * This method returns the set of of {@link HighlightFlag}s, which will be applied to the UH + * object. The output depends on the values provided to {@link + * #withHandleMultiTermQuery(boolean)}, {@link #withHighlightPhrasesStrictly(boolean)}, {@link + * #withPassageRelevancyOverSpeed(boolean)} and {@link #withWeightMatches(boolean)}. + * + * @return a set of {@link HighlightFlag}s. + */ + protected Set<HighlightFlag> evaluateFlags() { + if (Objects.nonNull(flags) && !flags.isEmpty()) { + return flags; + } + + Set<HighlightFlag> highlightFlags = EnumSet.noneOf(HighlightFlag.class); + if (handleMultiTermQuery) { + highlightFlags.add(HighlightFlag.MULTI_TERM_QUERY); + } + if (highlightPhrasesStrictly) { + highlightFlags.add(HighlightFlag.PHRASES); + } + if (passageRelevancyOverSpeed) { + highlightFlags.add(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED); + } + + // Evaluate if WEIGHT_MATCHES can be added as a flag. + final boolean applyWeightMatches = + highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY) + && highlightFlags.contains(HighlightFlag.PHRASES) + && highlightFlags.contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED) + // User can also opt-out of WEIGHT_MATCHES. + && weightMatches; + + if (applyWeightMatches) { + highlightFlags.add(HighlightFlag.WEIGHT_MATCHES); + } + return highlightFlags; + } + + protected Builder self() { + return this; + } + + public UnifiedHighlighter build() { + return new UnifiedHighlighter(this); + } } - public void setFieldMatcher(Predicate<String> predicate) { - this.defaultFieldMatcher = predicate; + public static Builder builder() { + return new Builder(); } - /** - * Returns whether {@link MultiTermQuery} derivatives will be highlighted. By default it's - * enabled. MTQ highlighting can be expensive, particularly when using offsets in postings. - */ - protected boolean shouldHandleMultiTermQuery(String field) { - return defaultHandleMtq; + /** Extracts matching terms after rewriting against an empty index */ + protected static Set<Term> extractTerms(Query query) throws IOException { + Set<Term> queryTerms = new HashSet<>(); + EMPTY_INDEXSEARCHER.rewrite(query).visit(QueryVisitor.termCollector(queryTerms)); + return queryTerms; } /** - * Returns whether position sensitive queries (e.g. phrases and {@link SpanQuery}ies) should be - * highlighted strictly based on query matches (slower) versus any/all occurrences of the - * underlying terms. By default it's enabled, but there's no overhead if such queries aren't used. + * Constructs the highlighter with the given the {@link Builder}. Review comment: ```suggestion * Constructs the highlighter with the given {@link Builder}. ``` ########## File path: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java ########## @@ -143,6 +143,91 @@ private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD; + /** Builder for UnifiedHighlighter. */ + public static class Builder { + private final IndexSearcher searcher; + private final Analyzer indexAnalyzer; + private boolean handleMultiTermQuery = true; + private boolean highlightPhrasesStrictly = true; + private boolean passageRelevancyOverSpeed = true; + private int maxLength = DEFAULT_MAX_LENGTH; + private Supplier<BreakIterator> breakIterator = + () -> BreakIterator.getSentenceInstance(Locale.ROOT); + private Predicate<String> fieldMatcher; + private PassageScorer scorer = new PassageScorer(); + private PassageFormatter formatter = new DefaultPassageFormatter(); + private int maxNoHighlightPassages = -1; + private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD; + + public Builder(IndexSearcher searcher, Analyzer indexAnalyzer) { + this.searcher = Objects.requireNonNull(searcher); + this.indexAnalyzer = Objects.requireNonNull(indexAnalyzer); + } + + public Builder withHandleMultiTermQuery(boolean value) { + this.handleMultiTermQuery = value; + return self(); + } + + public Builder withHighlightPhrasesStrictly(boolean value) { + this.highlightPhrasesStrictly = value; + return self(); + } + + public Builder withPassageRelevancyOverSpeed(boolean value) { + this.passageRelevancyOverSpeed = value; + return self(); + } + + public Builder withMaxLength(int value) { + if (value < 0 || value == Integer.MAX_VALUE) { + // two reasons: no overflow problems in BreakIterator.preceding(offset+1), + // our sentinel in the offsets queue uses this value to terminate. + throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE"); + } + this.maxLength = value; + return self(); + } + + public Builder withBreakIterator(Supplier<BreakIterator> value) { + this.breakIterator = value; + return self(); + } + + public Builder withFieldMatcher(Predicate<String> value) { + this.fieldMatcher = value; + return self(); + } + + public Builder withScorer(PassageScorer value) { + this.scorer = value; + return self(); + } + + public Builder withFormatter(PassageFormatter value) { + this.formatter = value; + return self(); + } + + public Builder withMaxNoHighlightPassages(int value) { + this.maxNoHighlightPassages = value; + return self(); + } + + public Builder withCacheFieldValCharsThreshold(int value) { + this.cacheFieldValCharsThreshold = value; + return self(); + } + + protected Builder self() { Review comment: Seems unnecessary now? ########## File path: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java ########## @@ -113,118 +112,239 @@ protected static final LabelledCharArrayMatcher[] ZERO_LEN_AUTOMATA_ARRAY = new LabelledCharArrayMatcher[0]; - protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher + protected final IndexSearcher searcher; protected final Analyzer indexAnalyzer; - private boolean defaultHandleMtq = true; // e.g. wildcards + private final int maxLength; - private boolean defaultHighlightPhrasesStrictly = true; // AKA "accuracy" or "query debugging" + private final Supplier<BreakIterator> defaultBreakIterator; - // For analysis, prefer MemoryIndexOffsetStrategy - private boolean defaultPassageRelevancyOverSpeed = true; + private final Predicate<String> defaultFieldMatcher; - private int maxLength = DEFAULT_MAX_LENGTH; + private final PassageScorer defaultScorer; - // BreakIterator is stateful so we use a Supplier factory method - private Supplier<BreakIterator> defaultBreakIterator = - () -> BreakIterator.getSentenceInstance(Locale.ROOT); + private final PassageFormatter defaultFormatter; - private Predicate<String> defaultFieldMatcher; + private final int defaultMaxNoHighlightPassages; - private PassageScorer defaultScorer = new PassageScorer(); + // lazy initialized with double-check locking; protected so subclass can init + protected volatile FieldInfos fieldInfos; - private PassageFormatter defaultFormatter = new DefaultPassageFormatter(); + private final int cacheFieldValCharsThreshold; - private int defaultMaxNoHighlightPassages = -1; + private final Set<HighlightFlag> flags; - // lazy initialized with double-check locking; protected so subclass can init - protected volatile FieldInfos fieldInfos; + /** Builder for UnifiedHighlighter. */ + public static class Builder { + /** If null, can only use highlightWithoutSearcher. */ + private IndexSearcher searcher; - private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD; + private Analyzer indexAnalyzer; + private boolean handleMultiTermQuery = true; + private boolean highlightPhrasesStrictly = true; + private boolean passageRelevancyOverSpeed = true; + private boolean weightMatches = true; + private int maxLength = DEFAULT_MAX_LENGTH; - /** Extracts matching terms after rewriting against an empty index */ - protected static Set<Term> extractTerms(Query query) throws IOException { - Set<Term> queryTerms = new HashSet<>(); - EMPTY_INDEXSEARCHER.rewrite(query).visit(QueryVisitor.termCollector(queryTerms)); - return queryTerms; - } + /** BreakIterator is stateful so we use a Supplier factory method. */ + private Supplier<BreakIterator> breakIterator = + () -> BreakIterator.getSentenceInstance(Locale.ROOT); - /** - * Constructs the highlighter with the given index searcher and analyzer. - * - * @param indexSearcher Usually required, unless {@link #highlightWithoutSearcher(String, Query, - * String, int)} is used, in which case this needs to be null. - * @param indexAnalyzer Required, even if in some circumstances it isn't used. - */ - public UnifiedHighlighter(IndexSearcher indexSearcher, Analyzer indexAnalyzer) { - this.searcher = indexSearcher; // TODO: make non nullable - this.indexAnalyzer = - Objects.requireNonNull( - indexAnalyzer, - "indexAnalyzer is required" + " (even if in some circumstances it isn't used)"); - } + private Predicate<String> fieldMatcher; + private PassageScorer scorer = new PassageScorer(); + private PassageFormatter formatter = new DefaultPassageFormatter(); + private int maxNoHighlightPassages = -1; + private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD; + private Set<HighlightFlag> flags; - public void setHandleMultiTermQuery(boolean handleMtq) { - this.defaultHandleMtq = handleMtq; - } + /** + * Usually required, unless {@link #highlightWithoutSearcher(String, Query, String, int)} is + * used, in which case this needs to be null. + */ + public Builder withSearcher(IndexSearcher value) { + this.searcher = value; + return self(); + } - public void setHighlightPhrasesStrictly(boolean highlightPhrasesStrictly) { - this.defaultHighlightPhrasesStrictly = highlightPhrasesStrictly; - } + /** + * This method sets the analyzer for the UH object. Required, even if in some circumstances it + * isn' used. The null check is performed in the constructor. + */ + public Builder withIndexAnalyzer(Analyzer value) { + this.indexAnalyzer = value; + return self(); + } - public void setMaxLength(int maxLength) { - if (maxLength < 0 || maxLength == Integer.MAX_VALUE) { - // two reasons: no overflow problems in BreakIterator.preceding(offset+1), - // our sentinel in the offsets queue uses this value to terminate. - throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE"); + /** + * User-defined set of {@link HighlightFlag} values which will override the flags set by {@link + * #withHandleMultiTermQuery(boolean)}, {@link #withHighlightPhrasesStrictly(boolean)}, {@link + * #withPassageRelevancyOverSpeed(boolean)} and {@link #withWeightMatches(boolean)}. + * + * @param values - set of {@link HighlightFlag} values. + */ + public Builder withFlags(Set<HighlightFlag> values) { + this.flags = values; + return self(); } - this.maxLength = maxLength; - } - public void setBreakIterator(Supplier<BreakIterator> breakIterator) { - this.defaultBreakIterator = breakIterator; - } + /** + * Here position sensitive queries (e.g. phrases and {@link SpanQuery}ies) are highlighted + * strictly based on query matches (slower) versus any/all occurrences of the underlying terms. + * By default it's enabled, but there's no overhead if such queries aren't used. + */ + public Builder withHighlightPhrasesStrictly(boolean value) { + this.highlightPhrasesStrictly = value; + return self(); + } - public void setScorer(PassageScorer scorer) { - this.defaultScorer = scorer; - } + /** + * Here {@link org.apache.lucene.search.MultiTermQuery} derivatives will be highlighted. By + * default it's enabled. MTQ highlighting can be expensive, particularly when using offsets in + * postings. + */ + public Builder withHandleMultiTermQuery(boolean value) { + this.handleMultiTermQuery = value; + return self(); + } - public void setFormatter(PassageFormatter formatter) { - this.defaultFormatter = formatter; - } + /** Passage relevancy is more important than speed. True by default. */ + public Builder withPassageRelevancyOverSpeed(boolean value) { + this.passageRelevancyOverSpeed = value; + return self(); + } - public void setMaxNoHighlightPassages(int defaultMaxNoHighlightPassages) { - this.defaultMaxNoHighlightPassages = defaultMaxNoHighlightPassages; - } + /** + * Internally use the {@link Weight#matches(LeafReaderContext, int)} API for highlighting. It's + * more accurate to the query, and the snippets can be a little different for phrases because + * the whole phrase is marked up instead of each word. The passage relevancy calculation can be + * different (maybe worse?) and it's slower when highlighting many fields. Use of this flag + * requires {@link HighlightFlag#MULTI_TERM_QUERY} and {@link HighlightFlag#PHRASES} and {@link + * HighlightFlag#PASSAGE_RELEVANCY_OVER_SPEED}. True by default because those booleans are true + * by default. + */ + public Builder withWeightMatches(boolean value) { + this.weightMatches = value; + return self(); + } + + public Builder withMaxLength(int value) { + if (value < 0 || value == Integer.MAX_VALUE) { + // two reasons: no overflow problems in BreakIterator.preceding(offset+1), + // our sentinel in the offsets queue uses this value to terminate. + throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE"); + } + this.maxLength = value; + return self(); + } + + public Builder withBreakIterator(Supplier<BreakIterator> value) { + this.breakIterator = value; + return self(); + } - public void setCacheFieldValCharsThreshold(int cacheFieldValCharsThreshold) { - this.cacheFieldValCharsThreshold = cacheFieldValCharsThreshold; + public Builder withFieldMatcher(Predicate<String> value) { + this.fieldMatcher = value; + return self(); + } + + public Builder withScorer(PassageScorer value) { + this.scorer = value; + return self(); + } + + public Builder withFormatter(PassageFormatter value) { + this.formatter = value; + return self(); + } + + public Builder withMaxNoHighlightPassages(int value) { + this.maxNoHighlightPassages = value; + return self(); + } + + public Builder withCacheFieldValCharsThreshold(int value) { + this.cacheFieldValCharsThreshold = value; + return self(); + } + + /** + * This method returns the set of of {@link HighlightFlag}s, which will be applied to the UH + * object. The output depends on the values provided to {@link + * #withHandleMultiTermQuery(boolean)}, {@link #withHighlightPhrasesStrictly(boolean)}, {@link + * #withPassageRelevancyOverSpeed(boolean)} and {@link #withWeightMatches(boolean)}. + * + * @return a set of {@link HighlightFlag}s. + */ + protected Set<HighlightFlag> evaluateFlags() { + if (Objects.nonNull(flags) && !flags.isEmpty()) { Review comment: Isn't empty valid? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org