Re: [PR] Random access term dictionary [lucene]

via GitHub Fri, 03 Nov 2023 05:19:21 -0700


mikemccand commented on code in PR #12688:
URL: https://github.com/apache/lucene/pull/12688#discussion_r1381580022



##########
lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java:
##########
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.sandbox.codecs.lucene90.randomaccess;
+
+import java.util.Objects;
+import 
org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState;
+
+/**
+ * TermType holds the classification of a term, based on how its postings are 
written.
+ *
+ * <p>It captures -- 1) if a term has a singleton docid (i.e. only one doc 
contains this term). 2)
+ * if the term has skip data. 3) if the term as an VINT encoded position block.

Review Comment:
   `term as a` -> `term has a`?



##########
lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java:
##########
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.sandbox.codecs.lucene90.randomaccess;
+
+import 
org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState;
+import 
org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPacker;
+import 
org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpacker;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.util.BytesRef;
+
+final class TermStateCodecImpl implements TermStateCodec {
+  private final TermStateCodecComponent[] components;
+  private final int metadataBytesLength;
+
+  private static int getMetadataLength(TermStateCodecComponent component) {
+    // 1 byte for bitWidth; optionally 8 byte more for the reference value
+    return 1 + (component.isMonotonicallyIncreasing() ? 8 : 0);
+  }
+
+  public TermStateCodecImpl(TermStateCodecComponent[] components) {
+    assert components.length > 0;
+
+    this.components = components;
+    int metadataBytesLength = 0;
+    for (var component : components) {
+      metadataBytesLength += getMetadataLength(component);
+    }
+    this.metadataBytesLength = metadataBytesLength;
+  }
+
+  @Override
+  public byte[] encodeBlock(IntBlockTermState[] inputs, BitPacker bitPacker) {
+    Metadata[] metadataPerComponent = getMetadataPerComponent(inputs);
+    byte[] metadataBytes = serializeMetadata(metadataPerComponent);
+
+    // Encode inputs via the bitpacker
+    for (var termState : inputs) {
+      encodeOne(bitPacker, termState, metadataPerComponent);
+    }
+
+    return metadataBytes;
+  }
+
+  private Metadata[] getMetadataPerComponent(IntBlockTermState[] inputs) {
+    Metadata[] metadataPerComponent = new Metadata[components.length];
+    for (int i = 0; i < components.length; i++) {
+      var component = components[i];
+      byte bitWidth = TermStateCodecComponent.getBitWidth(inputs, component);
+      long referenceValue =
+          component.isMonotonicallyIncreasing() ? 
component.getTargetValue(inputs[0]) : 0L;
+      metadataPerComponent[i] = new Metadata(bitWidth, referenceValue);
+    }
+    return metadataPerComponent;
+  }
+
+  private byte[] serializeMetadata(Metadata[] metadataPerComponent) {
+    byte[] metadataBytes = new byte[this.metadataBytesLength];
+    ByteArrayDataOutput dataOut = new ByteArrayDataOutput(metadataBytes);
+
+    for (int i = 0; i < components.length; i++) {
+      var metadata = metadataPerComponent[i];
+      dataOut.writeByte(metadata.bitWidth);
+      if (components[i].isMonotonicallyIncreasing()) {
+        dataOut.writeLong(metadata.referenceValue);
+      }
+    }
+    return metadataBytes;
+  }
+
+  private void encodeOne(

Review Comment:
   OK it looks like it is not column stride -- each term is packing all of its 
metadata into one `byte[]` and then these `byte[]` are concatenated for all the 
terms.



##########
lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java:
##########
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.sandbox.codecs.lucene90.randomaccess;
+
+import java.io.IOException;
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.PostingsReaderBase;
+import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90PostingsReader;
+import org.apache.lucene.codecs.lucene90.Lucene90PostingsWriter;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * Similar to {@link Lucene90PostingsFormat} but with a different term 
dictionary implementation.

Review Comment:
   It would be nice to spell out some high level details about how the format 
works :)  E.g. single FST holding full terms, with `long` pointer (packed term 
type + up to 62 bit ordinal) to where full term metadata is packed on-disk.  
And the 8 different term type "groupings".
   
   BTW I think FST will pack better if you put the 3 "term type" bits at the 
end (lsb) of the `long` instead of msb, because then prefixes can be shared 
across all term types, instead of just within each term type.  But, this may 
not be a win because it may take more vLong bytes to write each delta, not sure 
:)



##########
lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java:
##########
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.sandbox.codecs.lucene90.randomaccess;
+
+import java.util.Objects;
+import 
org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState;
+
+/**
+ * TermType holds the classification of a term, based on how its postings are 
written.
+ *
+ * <p>It captures -- 1) if a term has a singleton docid (i.e. only one doc 
contains this term). 2)
+ * if the term has skip data. 3) if the term as an VINT encoded position block.
+ */
+final class TermType {
+  private static final byte SINGLETON_DOC_MASK = (byte) 1;
+
+  private static final byte HAS_SKIP_DATA_MASK = (byte) 1 << 1;
+
+  private static final byte HAS_VINT_POSITION_BLOCK_MASK = (byte) 1 << 2;
+
+  public static final int NUM_TOTAL_TYPES = 8;

Review Comment:
   8 because this is all the combinations/permutations of Lucene's different 
indexing options?



##########
lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java:
##########
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.sandbox.codecs.lucene90.randomaccess;
+
+import 
org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState;
+
+abstract class TermStateCodecComponent {
+
+  static byte getBitWidth(IntBlockTermState[] termStates, 
TermStateCodecComponent component) {
+    assert termStates.length > 0;
+
+    long maxValSeen = -1;
+    long referenceValue =
+        component.isMonotonicallyIncreasing() ? 
component.getTargetValue(termStates[0]) : 0;
+
+    for (var termState : termStates) {
+      maxValSeen = Math.max(maxValSeen, component.getTargetValue(termState) - 
referenceValue);
+    }
+    return (byte) (64 - Long.numberOfLeadingZeros(maxValSeen));
+  }
+
+  abstract boolean isMonotonicallyIncreasing();
+
+  abstract long getTargetValue(IntBlockTermState termState);
+
+  abstract void setTargetValue(IntBlockTermState termState, long value);
+
+  /** Below are the relevant IntBlockTermState components * */
+  static final class SingletonDocId extends TermStateCodecComponent {
+    public static SingletonDocId INSTANCE = new SingletonDocId();
+
+    private SingletonDocId() {}
+
+    @Override
+    public boolean isMonotonicallyIncreasing() {
+      return false;
+    }
+
+    @Override
+    public long getTargetValue(IntBlockTermState termState) {
+      return termState.singletonDocID;
+    }
+
+    @Override
+    public void setTargetValue(IntBlockTermState termState, long value) {
+      assert value <= Integer.MAX_VALUE;
+      // A correct codec implementation does not change the value,
+      // after the encode/decode round-trip it should still be a valid int
+      termState.singletonDocID = (int) value;
+    }
+  }
+
+  static final class DocFreq extends TermStateCodecComponent {
+    public static DocFreq INSTANCE = new DocFreq();
+
+    private DocFreq() {}
+
+    @Override
+    public boolean isMonotonicallyIncreasing() {
+      return false;
+    }
+
+    @Override
+    public long getTargetValue(IntBlockTermState termState) {
+      return termState.docFreq;
+    }
+
+    @Override
+    public void setTargetValue(IntBlockTermState termState, long value) {
+      assert value <= Integer.MAX_VALUE;
+      // A correct codec implementation does not change the value,
+      // after the encode/decode round-trip it should still be a valid int
+      termState.docFreq = (int) value;
+    }
+  }
+
+  static final class TotalTermFreq extends TermStateCodecComponent {
+    public static TotalTermFreq INSTANCE = new TotalTermFreq();
+
+    private TotalTermFreq() {}
+
+    @Override
+    public boolean isMonotonicallyIncreasing() {
+      return false;
+    }
+
+    @Override
+    public long getTargetValue(IntBlockTermState termState) {
+      return termState.totalTermFreq;
+    }
+
+    @Override
+    public void setTargetValue(IntBlockTermState termState, long value) {
+      termState.totalTermFreq = value;
+    }
+  }
+
+  static final class DocStartFP extends TermStateCodecComponent {
+    public static DocStartFP INSTANCE = new DocStartFP();
+
+    private DocStartFP() {}
+
+    @Override
+    public boolean isMonotonicallyIncreasing() {
+      return true;
+    }
+
+    @Override
+    public long getTargetValue(IntBlockTermState termState) {
+      return termState.docStartFP;
+    }
+
+    @Override
+    public void setTargetValue(IntBlockTermState termState, long value) {
+      termState.docStartFP = value;
+    }
+  }
+
+  static final class PositionStartFP extends TermStateCodecComponent {
+    public static PositionStartFP INSTANCE = new PositionStartFP();
+
+    private PositionStartFP() {}
+
+    @Override
+    public boolean isMonotonicallyIncreasing() {
+      return true;
+    }
+
+    @Override
+    public long getTargetValue(IntBlockTermState termState) {
+      return termState.posStartFP;
+    }
+
+    @Override
+    public void setTargetValue(IntBlockTermState termState, long value) {
+      termState.posStartFP = value;
+    }
+  }
+
+  static final class PayloadStartFP extends TermStateCodecComponent {
+    public static PayloadStartFP INSTANCE = new PayloadStartFP();
+
+    private PayloadStartFP() {}
+
+    @Override
+    public boolean isMonotonicallyIncreasing() {
+      return true;
+    }
+
+    @Override
+    public long getTargetValue(IntBlockTermState termState) {
+      return termState.payStartFP;
+    }
+
+    @Override
+    public void setTargetValue(IntBlockTermState termState, long value) {
+      termState.payStartFP = value;
+    }
+  }
+
+  static final class SkipOffset extends TermStateCodecComponent {
+    public static SkipOffset INSTANCE = new SkipOffset();
+
+    private SkipOffset() {}
+
+    @Override
+    public boolean isMonotonicallyIncreasing() {
+      return false;
+    }
+
+    @Override
+    public long getTargetValue(IntBlockTermState termState) {
+      return termState.skipOffset;
+    }
+
+    @Override
+    public void setTargetValue(IntBlockTermState termState, long value) {
+      termState.skipOffset = value;
+    }
+  }
+
+  static final class LastPositionBlockOffset extends TermStateCodecComponent {

Review Comment:
   Are these per-term metadata stored column-stride in block of terms?  I.e. 
all `LastPositionBlockOffset` for N terms in one block are encoded as a 
contiguous `byte[]`?  (And same for `skipOffset`, `totalTermFreq`, etc.)?



##########
lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java:
##########
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.sandbox.codecs.lucene90.randomaccess;
+
+import java.io.IOException;
+import java.util.Arrays;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRefBuilder;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.FSTCompiler;
+import org.apache.lucene.util.fst.PositiveIntOutputs;
+import org.apache.lucene.util.fst.Util;
+
+/**
+ * Builds a term index for a given field. Logically this is a map: term -> 
(type, ord) where the
+ * ordinals are scoped to type (not global).
+ */
+final class TermsIndexBuilder {
+  private static long MAX_ORD = (1L << 60) - 1;

Review Comment:
   Because we peel off the three high bits of the `long` to encode the term 
type?



##########
lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java:
##########
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.sandbox.codecs.lucene90.randomaccess;
+
+import java.io.IOException;
+import java.util.Arrays;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRefBuilder;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.FSTCompiler;
+import org.apache.lucene.util.fst.PositiveIntOutputs;
+import org.apache.lucene.util.fst.Util;
+
+/**
+ * Builds a term index for a given field. Logically this is a map: term -> 
(type, ord) where the
+ * ordinals are scoped to type (not global).
+ */
+final class TermsIndexBuilder {
+  private static long MAX_ORD = (1L << 60) - 1;
+
+  private final long[] countPerType = new long[TermType.NUM_TOTAL_TYPES];
+  private final FSTCompiler<Long> fstCompiler =
+      new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, 
PositiveIntOutputs.getSingleton());
+
+  TermsIndexBuilder() {
+    Arrays.fill(countPerType, -1);
+  }
+
+  public void addTerm(BytesRef term, TermType termType) throws IOException {
+    IntsRefBuilder scratchInts = new IntsRefBuilder();
+    long ord = ++countPerType[termType.getId()];
+    fstCompiler.add(Util.toIntsRef(term, scratchInts), encode(ord, termType));
+  }
+
+  public TermsIndex build() throws IOException {
+    return new TermsIndex(fstCompiler.compile());
+  }
+
+  private long encode(long ord, TermType termType) {
+    // use a single long to encode `ord` and `termType`
+    // also consider the special value of `PositiveIntOutputs.NO_OUTPUT == 0`
+    // so it looks like this |...  ord ...| termType| ... hasOutput  ...|
+    // where termType takes 3 bit and hasOutput takes the lowest bit. The rest 
is taken by ord
+    if (ord < 0) {
+      throw new IllegalArgumentException("can't encode negative ord");
+    }
+    if (ord > MAX_ORD) {
+      throw new IllegalArgumentException(
+          "Input ord is too large for TermType: "

Review Comment:
   More user friendly message here too?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Re: [PR] Random access term dictionary [lucene]

Reply via email to