richardstartin commented on a change in pull request #7405: URL: https://github.com/apache/pinot/pull/7405#discussion_r717349854
########## File path: pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/nativefst/builders/FSTBuilder.java ########## @@ -0,0 +1,565 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.utils.nativefst.builders; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; +import org.apache.pinot.segment.local.utils.nativefst.ConstantArcSizeFST; +import org.apache.pinot.segment.local.utils.nativefst.FST; + + +/** + * Fast, memory-conservative finite state transducer builder, returning an + * in-memory {@link FST} that is a tradeoff between construction speed and + * memory consumption. Use serializers to compress the returned automaton into + * more compact form. + * + * @see FSTSerializer + */ +public final class FSTBuilder { + /** + * A comparator comparing full byte arrays. Unsigned byte comparisons ('C'-locale). + */ + public static final Comparator<byte[]> LEXICAL_ORDERING = new Comparator<byte[]>() { + public int compare(byte[] o1, byte[] o2) { + return FSTBuilder.compare(o1, 0, o1.length, o2, 0, o2.length); + } + }; + /** A megabyte. */ + private final static int MB = 1024 * 1024; + + /** + * Internal serialized FST buffer expand ratio. + */ + private final static int BUFFER_GROWTH_SIZE = 5 * MB; + + /** + * Maximum number of labels from a single state. + */ + private final static int MAX_LABELS = 256; + /** + * Internal serialized FST buffer expand ratio. + */ + private final int _bufferGrowthSize; + private byte[] _serialized = new byte[0]; + private Map<Integer, Integer> _outputSymbols = new HashMap<>(); + + /** + * Number of bytes already taken in {@link #_serialized}. Start from 1 to keep + * 0 a sentinel value (for the hash set and final state). + */ + private int _size; + /** + * States on the "active path" (still mutable). Values are addresses of each + * state's first arc. + */ + private int[] _activePath = new int[0]; + /** + * Current length of the active path. + */ + private int _activePathLen; + /** + * The next offset at which an arc will be added to the given state on + * {@link #_activePath}. + */ + private int[] _nextArcOffset = new int[0]; + /** + * Root state. If negative, the automaton has been built already and cannot be + * extended. + */ + private int _root; + /** + * An epsilon state. The first and only arc of this state points either to the + * root or to the terminal state, indicating an empty automaton. + */ + private int _epsilon; + /** + * Hash set of state addresses in {@link #_serialized}, hashed by + * {@link #hash(int, int)}. Zero reserved for an unoccupied slot. + */ + private int[] _hashSet = new int[2]; + /** + * Number of entries currently stored in {@link #_hashSet}. + */ + private int _hashSize = 0; + /** + * Previous sequence added to the automaton in {@link #add(byte[], int, int, int)}. + * Used in assertions only. + */ + private byte[] _previous; + /** + * Information about the automaton and its compilation. + */ + private TreeMap<InfoEntry, Object> _info; + /** + * {@link #_previous} sequence's length, used in assertions only. + */ + private int _previousLength; + /** Number of serialization buffer reallocations. */ + private int _serializationBufferReallocations; + + /** */ + public FSTBuilder() { + this(BUFFER_GROWTH_SIZE); + } + + /** + * @param bufferGrowthSize Buffer growth size (in bytes) when constructing the automaton. + */ + public FSTBuilder(int bufferGrowthSize) { + _bufferGrowthSize = Math.max(bufferGrowthSize, ConstantArcSizeFST.ARC_SIZE * MAX_LABELS); + + // Allocate epsilon state. + _epsilon = allocateState(1); + _serialized[_epsilon + ConstantArcSizeFST.FLAGS_OFFSET] |= ConstantArcSizeFST.BIT_ARC_LAST; + + // Allocate root, with an initial empty set of output arcs. + expandActivePath(1); + _root = _activePath[0]; + } + + public static FST buildFST(SortedMap<String, Integer> input) { + + FSTBuilder fstbuilder = new FSTBuilder(); + + for (Map.Entry<String, Integer> entry : input.entrySet()) { + fstbuilder.add(entry.getKey().getBytes(), 0, entry.getKey().length(), entry.getValue().intValue()); + } + + return fstbuilder.complete(); + } + + /** + * Build a minimal, deterministic automaton from a sorted list of byte + * sequences. + * + * @param input Input sequences to build automaton from. + * @return Returns the automaton encoding all input sequences. + */ + public static FST build(byte[][] input, int[] outputSymbols) { + final FSTBuilder builder = new FSTBuilder(); + + int i = 0; + for (byte[] chs : input) { + builder.add(chs, 0, chs.length, i < outputSymbols.length ? outputSymbols[i] : -1); + ++i; + } + + return builder.complete(); + } + + /** + * Build a minimal, deterministic automaton from an iterable list of byte + * sequences. + * + * @param input Input sequences to build automaton from. + * @return Returns the automaton encoding all input sequences. + */ + public static FST build(Iterable<byte[]> input, int[] outputSymbols) { + final FSTBuilder builder = new FSTBuilder(); + + int i = 0; + + for (byte[] chs : input) { + builder.add(chs, 0, chs.length, i < outputSymbols.length ? outputSymbols[i] : -1); + ++i; + } + + return builder.complete(); + } + + /** + * Lexicographic order of input sequences. By default, consistent with the "C" + * sort (absolute value of bytes, 0-255). + */ + private static int compare(byte[] s1, int start1, int lens1, byte[] s2, int start2, int lens2) { + final int max = Math.min(lens1, lens2); + + for (int i = 0; i < max; i++) { + final byte c1 = s1[start1++]; + final byte c2 = s2[start2++]; + if (c1 != c2) { + return (c1 & 0xff) - (c2 & 0xff); + } + } + + return lens1 - lens2; + } Review comment: If you rebase on master you can replace this with `ByteArray.compare`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org