rmuir commented on a change in pull request #513: URL: https://github.com/apache/lucene/pull/513#discussion_r763022968
########## File path: lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java ########## @@ -556,165 +538,84 @@ static RegExp newLeafNode( * toAutomaton(null)</code> (empty automaton map). */ public Automaton toAutomaton() { - return toAutomaton(null, null, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); - } - - /** - * Constructs new <code>Automaton</code> from this <code>RegExp</code>. The constructed automaton - * is minimal and deterministic and has no transitions to dead states. - * - * @param determinizeWorkLimit maximum effort to spend while determinizing the automata. If - * determinizing the automata would require more than this effort, - * TooComplexToDeterminizeException is thrown. Higher numbers require more space but can - * process more complex regexes. Use {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} as a - * decent default if you don't otherwise know what to specify. - * @exception IllegalArgumentException if this regular expression uses a named identifier that is - * not available from the automaton provider - * @exception TooComplexToDeterminizeException if determinizing this regexp requires more effort - * than determinizeWorkLimit states - */ - public Automaton toAutomaton(int determinizeWorkLimit) - throws IllegalArgumentException, TooComplexToDeterminizeException { - return toAutomaton(null, null, determinizeWorkLimit); + return toAutomaton(null, null); } /** - * Constructs new <code>Automaton</code> from this <code>RegExp</code>. The constructed automaton - * is minimal and deterministic and has no transitions to dead states. + * Constructs new <code>Automaton</code> from this <code>RegExp</code>. * * @param automaton_provider provider of automata for named identifiers - * @param determinizeWorkLimit maximum effort to spend while determinizing the automata. If - * determinizing the automata would require more than this effort, - * TooComplexToDeterminizeException is thrown. Higher numbers require more space but can - * process more complex regexes. Use {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} as a - * decent default if you don't otherwise know what to specify. * @exception IllegalArgumentException if this regular expression uses a named identifier that is * not available from the automaton provider - * @exception TooComplexToDeterminizeException if determinizing this regexp requires more effort - * than determinizeWorkLimit states */ - public Automaton toAutomaton(AutomatonProvider automaton_provider, int determinizeWorkLimit) + public Automaton toAutomaton(AutomatonProvider automaton_provider) throws IllegalArgumentException, TooComplexToDeterminizeException { - return toAutomaton(null, automaton_provider, determinizeWorkLimit); + return toAutomaton(null, automaton_provider); } /** - * Constructs new <code>Automaton</code> from this <code>RegExp</code>. The constructed automaton - * is minimal and deterministic and has no transitions to dead states. + * Constructs new <code>Automaton</code> from this <code>RegExp</code>. * * @param automata a map from automaton identifiers to automata (of type <code>Automaton</code>). - * @param determinizeWorkLimit maximum effort to spend while determinizing the automata. If - * determinizing the automata would require more than this effort, - * TooComplexToDeterminizeException is thrown. Higher numbers require more space but can - * process more complex regexes. * @exception IllegalArgumentException if this regular expression uses a named identifier that * does not occur in the automaton map - * @exception TooComplexToDeterminizeException if determinizing this regexp requires more effort - * than determinizeWorkLimit states */ - public Automaton toAutomaton(Map<String, Automaton> automata, int determinizeWorkLimit) + public Automaton toAutomaton(Map<String, Automaton> automata) throws IllegalArgumentException, TooComplexToDeterminizeException { - return toAutomaton(automata, null, determinizeWorkLimit); + return toAutomaton(automata, null); } private Automaton toAutomaton( - Map<String, Automaton> automata, - AutomatonProvider automaton_provider, - int determinizeWorkLimit) - throws IllegalArgumentException, TooComplexToDeterminizeException { - try { - return toAutomatonInternal(automata, automaton_provider, determinizeWorkLimit); - } catch (TooComplexToDeterminizeException e) { - throw new TooComplexToDeterminizeException(this, e); - } - } - - private Automaton toAutomatonInternal( - Map<String, Automaton> automata, - AutomatonProvider automaton_provider, - int determinizeWorkLimit) + Map<String, Automaton> automata, AutomatonProvider automaton_provider) throws IllegalArgumentException { List<Automaton> list; Automaton a = null; switch (kind) { case REGEXP_PRE_CLASS: RegExp expanded = expandPredefined(); - a = expanded.toAutomatonInternal(automata, automaton_provider, determinizeWorkLimit); + a = expanded.toAutomaton(automata, automaton_provider); break; case REGEXP_UNION: list = new ArrayList<>(); - findLeaves( - exp1, Kind.REGEXP_UNION, list, automata, automaton_provider, determinizeWorkLimit); - findLeaves( - exp2, Kind.REGEXP_UNION, list, automata, automaton_provider, determinizeWorkLimit); + findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider); + findLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider); a = Operations.union(list); - a = MinimizationOperations.minimize(a, determinizeWorkLimit); break; case REGEXP_CONCATENATION: list = new ArrayList<>(); - findLeaves( - exp1, - Kind.REGEXP_CONCATENATION, - list, - automata, - automaton_provider, - determinizeWorkLimit); - findLeaves( - exp2, - Kind.REGEXP_CONCATENATION, - list, - automata, - automaton_provider, - determinizeWorkLimit); + findLeaves(exp1, Kind.REGEXP_CONCATENATION, list, automata, automaton_provider); + findLeaves(exp2, Kind.REGEXP_CONCATENATION, list, automata, automaton_provider); a = Operations.concatenate(list); - a = MinimizationOperations.minimize(a, determinizeWorkLimit); break; case REGEXP_INTERSECTION: a = Operations.intersection( - exp1.toAutomatonInternal(automata, automaton_provider, determinizeWorkLimit), - exp2.toAutomatonInternal(automata, automaton_provider, determinizeWorkLimit)); - a = MinimizationOperations.minimize(a, determinizeWorkLimit); + exp1.toAutomaton(automata, automaton_provider), + exp2.toAutomaton(automata, automaton_provider)); break; case REGEXP_OPTIONAL: - a = - Operations.optional( - exp1.toAutomatonInternal(automata, automaton_provider, determinizeWorkLimit)); - a = MinimizationOperations.minimize(a, determinizeWorkLimit); + a = Operations.optional(exp1.toAutomaton(automata, automaton_provider)); break; case REGEXP_REPEAT: - a = - Operations.repeat( - exp1.toAutomatonInternal(automata, automaton_provider, determinizeWorkLimit)); - a = MinimizationOperations.minimize(a, determinizeWorkLimit); Review comment: Yes, but I think the behavior is now a good thing. Previously, `minimize()` (and hence `determinize()`, too), could be called many times during parsing. The user may have set a work limit of 10000, but it could be exceeded in some cases, where it is called many times, e.g. 5000 + 5000 + 5000 + 5000 + 5000 + 5000. So I think it is better to let the parser just be a parser and let the caller decide on what to do in a single place (`minimize()`, `determinize()`, or maybe nothing at all and use an NFA execution). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org