[GitHub] [lucene-solr] bruno-roustant commented on a change in pull request #1270: LUCENE-9237: Faster UniformSplit IntersectBlockReader.

GitBox Fri, 28 Feb 2020 00:52:41 -0800

bruno-roustant commented on a change in pull request #1270: LUCENE-9237: Faster 
UniformSplit IntersectBlockReader.
URL: https://github.com/apache/lucene-solr/pull/1270#discussion_r385573506


 ##########
 File path: 
lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/IntersectBlockReader.java
 ##########
 @@ -18,260 +18,337 @@
 package org.apache.lucene.codecs.uniformsplit;
 
 import java.io.IOException;
-import java.util.Objects;
+import java.util.Arrays;
 
 import org.apache.lucene.codecs.PostingsReaderBase;
 import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.ByteRunAutomaton;
 import org.apache.lucene.util.automaton.CompiledAutomaton;
-import org.apache.lucene.util.automaton.Operations;
 import org.apache.lucene.util.automaton.Transition;
 
 /**
  * The "intersect" {@link TermsEnum} response to {@link 
UniformSplitTerms#intersect(CompiledAutomaton, BytesRef)},
  * intersecting the terms with an automaton.
+ * <p>
+ * By design of the UniformSplit block keys, it is less efficient than
+ * {@code org.apache.lucene.codecs.blocktree.IntersectTermsEnum} for {@link 
org.apache.lucene.search.FuzzyQuery} (-37%).
+ * It is slightly slower for {@link org.apache.lucene.search.WildcardQuery} 
(-5%) and slightly faster for
+ * {@link org.apache.lucene.search.PrefixQuery} (+5%).
+ *
+ * @lucene.experimental
  */
 public class IntersectBlockReader extends BlockReader {
 
-  protected final AutomatonNextTermCalculator nextStringCalculator;
-  protected final ByteRunAutomaton runAutomaton;
-  protected final BytesRef commonSuffixRef; // maybe null
-  protected final BytesRef commonPrefixRef;
-  protected final BytesRef startTerm; // maybe null
+  /**
+   * Block iteration order. Whether to move next block, jump to a block away, 
or end the iteration.
+   */
+  protected enum BlockIteration {NEXT, SEEK, END}
 
-  /** Set this when our current mode is seeking to this term.  Set to null 
after. */
-  protected BytesRef seekTerm;
+  /**
+   * Threshold that controls when to attempt to jump to a block away.
+   * <p>
+   * This counter is 0 when entering a block. It is incremented each time a 
term is rejected by the automaton.
+   * When the counter is greater than or equal to this threshold, then we 
compute the next term accepted by
+   * the automaton, with {@link AutomatonNextTermCalculator}, and we jump to a 
block away if the next term
+   * accepted is greater than the immediate next term in the block.
+   * <p>
+   * A low value, for example 1, improves the performance of automatons 
requiring many jumps, for example
+   * {@link org.apache.lucene.search.FuzzyQuery} and most {@link 
org.apache.lucene.search.WildcardQuery}.
+   * A higher value improves the performance of automatons with less or no 
jump, for example
+   * {@link org.apache.lucene.search.PrefixQuery}.
+   * A threshold of 4 seems to be a good balance.
+   */
+  protected final int NUM_CONSECUTIVELY_REJECTED_TERMS_THRESHOLD = 4;
 
-  protected int blockPrefixRunAutomatonState;
-  protected int blockPrefixLen;
+  protected final Automaton automaton;
+  protected final ByteRunAutomaton runAutomaton;
+  protected final boolean finite;
+  protected final BytesRef commonSuffix; // maybe null
+  protected final int minTermLength;
+  protected final AutomatonNextTermCalculator nextStringCalculator;
 
   /**
-   * Number of bytes accepted by the last call to {@link 
#runAutomatonForState}.
+   * Set this when our current mode is seeking to this term.  Set to null 
after.
+   */
+  protected BytesRef seekTerm;
+  /**
+   * Number of bytes accepted by the automaton when validating the current 
term.
+   */
+  protected int numMatchedBytes;
+  /**
+   * Automaton states reached when validating the current term, from 0 to 
{@link #numMatchedBytes} - 1.
+   */
+  protected int[] states;
+  /**
+   * Block iteration order determined when scanning the terms in the current 
block.
    */
-  protected int numBytesAccepted;
+  protected BlockIteration blockIteration;
   /**
-   * Whether the current term is beyond the automaton common prefix.
-   * If true this means the enumeration should stop immediately.
+   * Counter of the number of consecutively rejected terms.
+   * Depending on {@link #NUM_CONSECUTIVELY_REJECTED_TERMS_THRESHOLD}, this 
may trigger a jump to a block away.
    */
-  protected boolean beyondCommonPrefix;
+  protected int numConsecutivelyRejectedTerms;
 
-  public IntersectBlockReader(CompiledAutomaton compiled, BytesRef startTerm,
-                              IndexDictionary.BrowserSupplier 
dictionaryBrowserSupplier, IndexInput blockInput, PostingsReaderBase 
postingsReader,
-                              FieldMetadata fieldMetadata, BlockDecoder 
blockDecoder) throws IOException {
+  protected IntersectBlockReader(CompiledAutomaton compiled, BytesRef 
startTerm,
+                                 IndexDictionary.BrowserSupplier 
dictionaryBrowserSupplier, IndexInput blockInput,
+                                 PostingsReaderBase postingsReader, 
FieldMetadata fieldMetadata,
+                                 BlockDecoder blockDecoder) throws IOException 
{
     super(dictionaryBrowserSupplier, blockInput, postingsReader, 
fieldMetadata, blockDecoder);
-    this.nextStringCalculator = new AutomatonNextTermCalculator(compiled);
-    Automaton automaton = Objects.requireNonNull(compiled.automaton);
-    this.runAutomaton = Objects.requireNonNull(compiled.runAutomaton);
-    this.commonSuffixRef = compiled.commonSuffixRef; // maybe null
-    this.commonPrefixRef = Operations.getCommonPrefixBytesRef(automaton); // 
never null
-
-    this.startTerm = startTerm;
-    assert startTerm == null || StringHelper.startsWith(startTerm, 
commonPrefixRef);
-    // it is thus also true that startTerm >= commonPrefixRef
-
-    this.seekTerm = startTerm != null ? startTerm : commonPrefixRef;
+    checkIntersectAutomatonType(compiled);
+    automaton = compiled.automaton;
+    runAutomaton = compiled.runAutomaton;
+    finite = compiled.finite;
+    commonSuffix = compiled.commonSuffixRef;
+    minTermLength = getMinTermLength();
+    nextStringCalculator = new AutomatonNextTermCalculator(compiled);
+    seekTerm = startTerm;
   }
 
-  @Override
-  public BytesRef next() throws IOException {
-    clearTermState();
-
-    if (blockHeader == null) { // initial state
-      // note: don't call super.seekCeil here; we have our own logic
-
-      // Set the browser position to the block having the seek term.
-      // Even if -1, it's okay since we'll soon call nextKey().
-      long blockStartFP = getOrCreateDictionaryBrowser().seekBlock(seekTerm);
-      if (isBeyondLastTerm(seekTerm, blockStartFP)) {
-        return null; // EOF
-      }
-
-      // Starting at this block find and load the next matching block.
-      //   note: Since seekBlock was just called, we actually consider the 
current block as "next".
-      if (nextBlockMatchingPrefix() == false) { // note: starts at seek'ed 
block, which may have a match
-        return null; // EOF
-      }
+  protected void checkIntersectAutomatonType(CompiledAutomaton compiled) {
+    if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
+      throw new IllegalArgumentException("please use 
CompiledAutomaton.getTermsEnum instead");
     }
-
-    do {
-
-      // look in the rest of this block.
-      BytesRef term = nextTermInBlockMatching();
-      if (term != null) {
-        return term;
-      }
-
-      // next term dict matching prefix
-    } while (nextBlockMatchingPrefix());
-
-    return null; // EOF
   }
 
   /**
-   * Find the next block that appears to contain terms that could match the 
automata.
-   * The prefix is the primary clue.  Returns true if at one, or false for no 
more (EOF).
+   * Computes the minimal length of the terms accepted by the automaton.
+   * This speeds up the term scanning for automatons accepting a finite 
language.
    */
-  protected boolean nextBlockMatchingPrefix() throws IOException {
-    if (beyondCommonPrefix) {
-      return false; // EOF
+  protected int getMinTermLength() {
 
 Review comment:
   Actually a test specific to this method is not easy at all. I don't see a 
way to randomize and assert the min length without using a code similar to the 
code itself.
   I'll let it as it is today: I leverage the multiple random tests for 
automatons, regex and fuzzy queries, and I verify that the min term length 
works (no false negative) with the assertions already present in the 
nextTermInBlockMatching() method. For each term, we verify with 
runAutomaton.run() that the acceptance/rejection is correct.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

[GitHub] [lucene-solr] bruno-roustant commented on a change in pull request #1270: LUCENE-9237: Faster UniformSplit IntersectBlockReader.

Reply via email to