richardstartin commented on a change in pull request #7405:
URL: https://github.com/apache/pinot/pull/7405#discussion_r716505530



##########
File path: 
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/nativefst/automaton/Automaton.java
##########
@@ -0,0 +1,653 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.pinot.segment.local.utils.nativefst.automaton;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
+
+/**
+ * Finite-state automaton with regular expression operations.
+ * <p>
+ * Class invariants:
+ * <ul>
+ * <li> An automaton is either represented explicitly (with {@link State} and 
{@link Transition} objects)
+ *      or with a singleton string ({@link #expandSingleton()}) in case
+ *      the automaton is known to accept exactly one string.
+ *      (Implicitly, all states and transitions of an automaton are reachable 
from its initial state.)
+ * <li> Automata are always reduced (see {@link #reduce()}) 
+ *      and have no transitions to dead states (see {@link 
#removeDeadTransitions()}).
+ * <li> Automata provided as input to operations are generally assumed to be 
disjoint.
+ * </ul>
+ * <p>
+ */
+public class Automaton implements Serializable, Cloneable {
+
+  /**
+   * Minimize using Huffman's O(n<sup>2</sup>) algorithm.
+   * This is the standard text-book algorithm.
+   */
+  public static final int MINIMIZE_HUFFMAN = 0;
+  /**
+   * Minimize using Brzozowski's O(2<sup>n</sup>) algorithm.
+   * This algorithm uses the reverse-determinize-reverse-determinize trick, 
which has a bad
+   * worst-case behavior but often works very well in practice
+   * (even better than Hopcroft's!).
+   */
+  public static final int MINIMIZE_BRZOZOWSKI = 1;
+  /**
+   * Minimize using Hopcroft's O(n log n) algorithm.
+   */
+  public static final int MINIMIZE_HOPCROFT = 2;
+  /**
+   * Minimize using Valmari's O(n + m log m) algorithm.
+   */
+  public static final int MINIMIZE_VALMARI = 3;
+
+  /** Minimize always flag. */
+  public static boolean _minimizeAlways = false;
+
+  /** Selects whether operations may modify the input automata (default: 
<code>false</code>). */
+  public static boolean _allowMutation = false;
+
+  /** Selects minimization algorithm (default: 
<code>MINIMIZE_HOPCROFT</code>). */
+  public static int _minimization = MINIMIZE_HOPCROFT;
+
+  /** Initial state of this automaton. */
+  State _initial;
+
+  /** If true, then this automaton is definitely deterministic
+   (i.e., there are no choices for any run, but a run may crash). */
+  boolean _deterministic;
+
+  /** Hash code. Recomputed by {@link #minimize()}. */
+  int _hashCode;
+
+  /** Singleton string. Null if not applicable. */
+  String _singleton;
+
+  /**
+   * Constructs a new automaton that accepts the empty language.
+   * Using this constructor, automata can be constructed manually from
+   * {@link State} and {@link Transition} objects.
+   * @see #setInitialState(State)
+   * @see State
+   * @see Transition
+   */
+  public Automaton() {
+    _initial = new State();
+    _deterministic = true;
+    _singleton = null;
+  }
+
+  /**
+   * Sets or resets allow mutate flag.
+   * If this flag is set, then all automata operations may modify automata 
given as input;
+   * otherwise, operations will always leave input automata languages 
unmodified.
+   * By default, the flag is not set.
+   * @param flag if true, the flag is set
+   * @return previous value of the flag
+   */
+  static public boolean setAllowMutate(boolean flag) {
+    boolean b = _allowMutation;
+    _allowMutation = flag;
+    return b;
+  }
+
+  /**
+   * Assigns consecutive numbers to the given states.
+   */
+  static void setStateNumbers(Set<State> states) {
+    if (states.size() == Integer.MAX_VALUE) {
+      throw new IllegalArgumentException("number of states exceeded 
Integer.MAX_VALUE");
+    }
+    int number = 0;
+    for (State s : states) {
+      s._number = number++;
+    }
+  }
+
+  /**
+   * Returns a sorted array of transitions for each state (and sets state 
numbers).
+   */
+  static Transition[][] getSortedTransitions(Set<State> states) {
+    setStateNumbers(states);
+    Transition[][] transitions = new Transition[states.size()][];
+    for (State s : states) {
+      transitions[s._number] = s.getSortedTransitionArray(false);
+    }
+    return transitions;
+  }
+
+  /**
+   * See {@link MinimizationOperations#minimize(Automaton)}.
+   * Returns the automaton being given as argument.
+   */
+  public static Automaton minimize(Automaton a) {
+    a.minimize();
+    return a;
+  }
+
+  void checkMinimizeAlways() {
+    if (_minimizeAlways) {
+      minimize();
+    }
+  }
+
+  boolean isSingleton() {
+    return _singleton != null;
+  }
+
+  /**
+   * Gets initial state.
+   * @return state
+   */
+  public State getInitialState() {
+    expandSingleton();
+    return _initial;
+  }
+
+  /**
+   * Sets initial state.
+   * @param s state
+   */
+  public void setInitialState(State s) {
+    _initial = s;
+    _singleton = null;
+  }
+
+  /**
+   * Returns the set of states that are reachable from the initial state.
+   * @return set of {@link State} objects
+   */
+  public Set<State> getStates() {
+    expandSingleton();
+    Set<State> visited;
+
+    visited = new HashSet<>();
+    LinkedList<State> worklist = new LinkedList<State>();
+    worklist.add(_initial);
+    visited.add(_initial);
+    while (!worklist.isEmpty()) {
+      State s = worklist.removeFirst();
+      Collection<Transition> tr;
+
+      tr = s._transitionSet;
+      for (Transition t : tr) {
+        if (!visited.contains(t._to)) {
+          visited.add(t._to);
+          worklist.add(t._to);
+        }
+      }
+    }
+    return visited;
+  }
+
+  /**
+   * Returns the set of reachable accept states.
+   * @return set of {@link State} objects
+   */
+  public Set<State> getAcceptStates() {
+    expandSingleton();
+    HashSet<State> accepts = new HashSet<State>();
+    BitSet visited = new BitSet();
+    LinkedList<State> worklist = new LinkedList<State>();
+    worklist.add(_initial);
+    visited.set(_initial._id);

Review comment:
       Why `_id` and not `_number`? `_id` is global and `_number` is assigned 
locally to the automaton?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

Reply via email to