Re: [PR] Optimize outputs accumulating for SegmentTermsEnum and IntersectTermsEnum [lucene]

via GitHub Wed, 22 Nov 2023 04:47:58 -0800


mikemccand commented on code in PR #12699:
URL: https://github.com/apache/lucene/pull/12699#discussion_r1400933055



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java:
##########
@@ -1190,4 +1176,63 @@ public void seekExact(long ord) {
   public long ord() {
     throw new UnsupportedOperationException();
   }
+
+  static class OutputAccumulator extends DataInput {
+
+    BytesRef[] outputs = new BytesRef[16];
+    BytesRef current;
+    int num;
+    int outputIndex;
+    int index;
+
+    void push(BytesRef output) {
+      if (output != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
+        outputs = ArrayUtil.grow(outputs, num + 1);
+        outputs[num++] = output;
+      }
+    }
+
+    void pop() {
+      this.num--;

Review Comment:
   Maybe `assert num > 0` above this line?
   
   Also you don't need the `this.` prefix when accessing the members in these 
methods.



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java:
##########
@@ -142,12 +138,20 @@ public void setState(int state) {
   }
 
   void load(BytesRef frameIndexData) throws IOException {
-    if (frameIndexData != null) {
-      floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, 
frameIndexData.length);
-      // Skip first long -- has redundant fp, hasTerms
-      // flag, isFloor flag
-      final long code = ite.fr.readVLongOutput(floorDataReader);
-      if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
+    floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, 
frameIndexData.length);
+    load(ite.fr.readVLongOutput(floorDataReader));
+  }
+
+  void load(SegmentTermsEnum.OutputAccumulator outputAccumulator) throws 
IOException {
+    outputAccumulator.prepareRead();
+    final long code = ite.fr.readVLongOutput(outputAccumulator);

Review Comment:
   Remove this silly `final`?



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java:
##########
@@ -373,9 +380,6 @@ public boolean seekExact(BytesRef target) throws 
IOException {
 
       int cmp = 0;
 
-      // TODO: reverse vLong byte order for better FST

Review Comment:
   Yay, stale `TODO` garbage collection!



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java:
##########
@@ -1190,4 +1176,63 @@ public void seekExact(long ord) {
   public long ord() {
     throw new UnsupportedOperationException();
   }
+
+  static class OutputAccumulator extends DataInput {
+
+    BytesRef[] outputs = new BytesRef[16];
+    BytesRef current;
+    int num;
+    int outputIndex;
+    int index;
+
+    void push(BytesRef output) {
+      if (output != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
+        outputs = ArrayUtil.grow(outputs, num + 1);
+        outputs[num++] = output;
+      }
+    }
+
+    void pop() {
+      this.num--;
+    }
+
+    void reset() {
+      this.num = 0;
+    }
+
+    void prepareRead() {
+      this.index = 0;
+      this.outputIndex = 0;
+      this.current = outputs[0];
+    }
+
+    void setFloorData(ByteArrayDataInput floorData) {

Review Comment:
   Could you add a comment that, while this reads/decodes bytes into 
`floorData`, it does not alter the read position?



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java:
##########
@@ -142,12 +138,20 @@ public void setState(int state) {
   }
 
   void load(BytesRef frameIndexData) throws IOException {
-    if (frameIndexData != null) {
-      floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, 
frameIndexData.length);
-      // Skip first long -- has redundant fp, hasTerms
-      // flag, isFloor flag
-      final long code = ite.fr.readVLongOutput(floorDataReader);
-      if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
+    floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, 
frameIndexData.length);
+    load(ite.fr.readVLongOutput(floorDataReader));
+  }
+
+  void load(SegmentTermsEnum.OutputAccumulator outputAccumulator) throws 
IOException {
+    outputAccumulator.prepareRead();
+    final long code = ite.fr.readVLongOutput(outputAccumulator);
+    outputAccumulator.setFloorData(floorDataReader);
+    load(code);
+  }
+
+  void load(Long boxCode) throws IOException {

Review Comment:
   Hmm rename to `blockCode`?  Not sure why it's a box :)



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java:
##########
@@ -1190,4 +1176,63 @@ public void seekExact(long ord) {
   public long ord() {
     throw new UnsupportedOperationException();
   }
+
+  static class OutputAccumulator extends DataInput {
+
+    BytesRef[] outputs = new BytesRef[16];
+    BytesRef current;
+    int num;
+    int outputIndex;
+    int index;
+
+    void push(BytesRef output) {
+      if (output != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
+        outputs = ArrayUtil.grow(outputs, num + 1);
+        outputs[num++] = output;
+      }
+    }
+
+    void pop() {
+      this.num--;
+    }
+
+    void reset() {
+      this.num = 0;
+    }
+
+    void prepareRead() {
+      this.index = 0;
+      this.outputIndex = 0;
+      this.current = outputs[0];
+    }
+
+    void setFloorData(ByteArrayDataInput floorData) {
+      assert outputIndex == num - 1

Review Comment:
   Could we also `assert index == 0` in this assert?  I.e. we are at the start 
of the last `BytesRef`?



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java:
##########
@@ -247,7 +243,7 @@ void rewind() {
     nextEnt = -1;
     hasTerms = hasTermsOrig;
     if (isFloor) {
-      floorDataReader.rewind();
+      floorDataReader.setPosition(rewindPos);

Review Comment:
   I wonder if this is still needed, given that `setFloorData` doesn't change 
the position?  Or might other operations before the `reset()` have altered the 
read position?  It's hard to reason about.



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java:
##########
@@ -142,12 +138,20 @@ public void setState(int state) {
   }
 
   void load(BytesRef frameIndexData) throws IOException {
-    if (frameIndexData != null) {
-      floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, 
frameIndexData.length);
-      // Skip first long -- has redundant fp, hasTerms
-      // flag, isFloor flag
-      final long code = ite.fr.readVLongOutput(floorDataReader);
-      if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
+    floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, 
frameIndexData.length);
+    load(ite.fr.readVLongOutput(floorDataReader));
+  }
+
+  void load(SegmentTermsEnum.OutputAccumulator outputAccumulator) throws 
IOException {
+    outputAccumulator.prepareRead();
+    final long code = ite.fr.readVLongOutput(outputAccumulator);
+    outputAccumulator.setFloorData(floorDataReader);
+    load(code);
+  }
+
+  void load(Long boxCode) throws IOException {
+    if (boxCode != null) {

Review Comment:
   Maybe add a comment that this means this block is the first one in a 
possible sequence of floor blocks corresponding to a single seek point from 
the FST terms index?



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java:
##########
@@ -1190,4 +1176,63 @@ public void seekExact(long ord) {
   public long ord() {
     throw new UnsupportedOperationException();
   }
+
+  static class OutputAccumulator extends DataInput {
+
+    BytesRef[] outputs = new BytesRef[16];
+    BytesRef current;
+    int num;
+    int outputIndex;
+    int index;
+
+    void push(BytesRef output) {
+      if (output != Lucene90BlockTreeTermsReader.NO_OUTPUT) {

Review Comment:
   Might be better to just check `if (output.length > 0) {`? It'd handle cases 
(not sure this happens) where the caller is passing an empty `BytesRef` that 
is not `NO_OUTPUT`.



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java:
##########
@@ -104,13 +104,9 @@ public SegmentTermsEnumFrame(SegmentTermsEnum ste, int 
ord) throws IOException {
     suffixLengthsReader = new ByteArrayDataInput();
   }
 
-  public void setFloorData(ByteArrayDataInput in, BytesRef source) {
-    final int numBytes = source.length - (in.getPosition() - source.offset);
-    if (numBytes > floorData.length) {
-      floorData = new byte[ArrayUtil.oversize(numBytes, 1)];
-    }
-    System.arraycopy(source.bytes, source.offset + in.getPosition(), 
floorData, 0, numBytes);
-    floorDataReader.reset(floorData, 0, numBytes);
+  public void setFloorData(SegmentTermsEnum.OutputAccumulator 
outputAccumulator) {
+    outputAccumulator.setFloorData(floorDataReader);
+    rewindPos = floorDataReader.getPosition();

Review Comment:
   Can we move this above line 107?  Putting it after smells a bit -- it 
requires knowing that `setFloorData` doesn't change the read position.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Re: [PR] Optimize outputs accumulating for SegmentTermsEnum and IntersectTermsEnum [lucene]

Reply via email to