chenboat commented on code in PR #12680: URL: https://github.com/apache/pinot/pull/12680#discussion_r1538229272
########## pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtils.java: ########## @@ -0,0 +1,72 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.utils; + +import java.util.ArrayList; +import org.apache.lucene.queries.spans.SpanMultiTermQueryWrapper; +import org.apache.lucene.queries.spans.SpanNearQuery; +import org.apache.lucene.queries.spans.SpanQuery; +import org.apache.lucene.queries.spans.SpanTermQuery; +import org.apache.lucene.search.AutomatonQuery; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.WildcardQuery; +import org.slf4j.LoggerFactory; + + +public class LuceneTextIndexUtils { + private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(LuceneTextIndexUtils.class); + + private LuceneTextIndexUtils() { + } + + // Convert a boolean query with literal matching for each clause to a span query which enables prefix and suffix + // matching for terms with .* prefix or suffix. + public static Query convertToMultiTermSpanQuery(Query query) { + if (!(query instanceof BooleanQuery)) { + return query; + } + LOGGER.info("Perform rewriting for the phrase query {}.", query); + ArrayList<SpanQuery> spanQueryLst = new ArrayList<>(); + boolean prefixOrSuffixQueryFound = false; + for (BooleanClause clause : ((BooleanQuery) query).clauses()) { + Query q = clause.getQuery(); + if (q instanceof WildcardQuery || q instanceof PrefixQuery) { + prefixOrSuffixQueryFound = true; + spanQueryLst.add(new SpanMultiTermQueryWrapper<>((AutomatonQuery) q)); + } else if (q instanceof TermQuery) { + spanQueryLst.add(new SpanTermQuery(((TermQuery) q).getTerm())); + } else { + LOGGER.info("query can not be handled currently {} ", q); + return query; + } + } + if (!prefixOrSuffixQueryFound) { + return query; + } + SpanNearQuery spanNearQuery = new SpanNearQuery(spanQueryLst.toArray(new SpanQuery[0]), 0, true); + IndexSearcher.setMaxClauseCount(Integer.MAX_VALUE); Review Comment: Remove this call for now. For most queries, the 1024 default limit should be enough. Those requiring an increase will risk search performance issue. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org