This is an automated email from the ASF dual-hosted git repository.

ggregory pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-codec.git
The following commit(s) were added to refs/heads/master by this push: new 489a3ca6 Precompile regular expressions in Lang.loadFromResource(String, Languages) 489a3ca6 is described below commit 489a3ca6b82c9839ffa4b51a5999a06887acd4e1 Author: Gary Gregory <garydgreg...@gmail.com> AuthorDate: Mon May 19 10:33:18 2025 -0400 Precompile regular expressions in Lang.loadFromResource(String, Languages) - Precompile regular expressions in PhoneticEngine.encode(String, LanguageSet) - Precompile regular expressions in org.apache.commons.codec.language.bm.Rule.parse*(*) --- src/changes/changes.xml | 7 +++++-- src/main/java/org/apache/commons/codec/language/bm/Lang.java | 6 ++++-- .../org/apache/commons/codec/language/bm/PhoneticEngine.java | 7 +++++-- .../apache/commons/codec/language/bm/ResourceConstants.java | 4 ++++ src/main/java/org/apache/commons/codec/language/bm/Rule.java | 10 +++++++--- 5 files changed, 25 insertions(+), 9 deletions(-) diff --git a/src/changes/changes.xml b/src/changes/changes.xml index e8854a9e..eea6bb2a 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -48,8 +48,11 @@ The <action> type attribute can be add,update,fix,remove. <action type="fix" dev="ggregory" due-to="Gary Gregory">Remove -nouses directive from maven-bundle-plugin. 
OSGi package imports now state 'uses' definitions for package imports, this doesn't affect JPMS (from org.apache.commons:commons-parent:80).</action> <action type="fix" dev="ggregory" due-to="Gary Gregory">Refactor DigestUtils.updateDigest(MessageDigest, File) to use NIO.</action> <action type="fix" dev="ggregory" due-to="Gary Gregory" issue="CODEC-328" >Clarify Javadoc for org.apache.commons.codec.digest.UnixCrypt.crypt(byte[],String).</action> - <action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile and resuse x3 regular expression in DaitchMokotoffSoundex.Rule.</action> - <action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile and resuse regular expressions in DaitchMokotoffSoundex.parseRules(Scanner, String, Map, Map).</action> + <action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile regular expressions in DaitchMokotoffSoundex.Rule.</action> + <action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile regular expressions in DaitchMokotoffSoundex.parseRules(Scanner, String, Map, Map).</action> + <action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile regular expressions in Lang.loadFromResource(String, Languages).</action> + <action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile regular expressions in PhoneticEngine.encode(String, LanguageSet).</action> + <action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile regular expressions in org.apache.commons.codec.language.bm.Rule.parse*(*).</action> <!-- ADD --> <action type="add" dev="ggregory" due-to="Gary Gregory">Add HmacUtils.hmac(Path).</action> <action type="add" dev="ggregory" due-to="Gary Gregory">Add HmacUtils.hmacHex(Path).</action> diff --git a/src/main/java/org/apache/commons/codec/language/bm/Lang.java b/src/main/java/org/apache/commons/codec/language/bm/Lang.java index d92e37c4..6aacea1a 100644 --- a/src/main/java/org/apache/commons/codec/language/bm/Lang.java +++ b/src/main/java/org/apache/commons/codec/language/bm/Lang.java 
@@ -103,6 +103,8 @@ public class Lang { private static final String LANGUAGE_RULES_RN = "/org/apache/commons/codec/language/bm/%s_lang.txt"; + private static final Pattern PLUS = Pattern.compile("\\+"); + static { for (final NameType s : NameType.values()) { LANGS.put(s, loadFromResource(String.format(LANGUAGE_RULES_RN, s.getName()), Languages.getInstance(s))); @@ -163,7 +165,7 @@ public class Lang { } // split it up - final String[] parts = line.split("\\s+"); + final String[] parts = ResourceConstants.SPACES.split(line); if (parts.length != 3) { throw new IllegalArgumentException("Malformed line '" + rawLine + @@ -171,7 +173,7 @@ public class Lang { } final Pattern pattern = Pattern.compile(parts[0]); - final String[] langs = parts[1].split("\\+"); + final String[] langs = PLUS.split(parts[1]); final boolean accept = parts[2].equals("true"); rules.add(new LangRule(pattern, new HashSet<>(Arrays.asList(langs)), accept)); diff --git a/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java b/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java index 482d6ffd..6b426d7a 100644 --- a/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java +++ b/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java @@ -29,6 +29,7 @@ import java.util.Map; import java.util.Objects; import java.util.Set; import java.util.TreeMap; +import java.util.regex.Pattern; import java.util.stream.Collectors; import org.apache.commons.codec.language.bm.Languages.LanguageSet; @@ -227,6 +228,8 @@ public class PhoneticEngine { private static final Map<NameType, Set<String>> NAME_PREFIXES = new EnumMap<>(NameType.class); + private static final Pattern QUOTE = Pattern.compile("'"); + static { NAME_PREFIXES.put(NameType.ASHKENAZI, Collections.unmodifiableSet( @@ -401,14 +404,14 @@ public class PhoneticEngine { } } - final List<String> words = Arrays.asList(input.split("\\s+")); + final List<String> words = 
Arrays.asList(ResourceConstants.SPACES.split(input)); final List<String> words2 = new ArrayList<>(); // special-case handling of word prefixes based upon the name type switch (this.nameType) { case SEPHARDIC: words.forEach(aWord -> { - final String[] parts = aWord.split("'", -1); + final String[] parts = QUOTE.split(aWord, -1); words2.add(parts[parts.length - 1]); }); words2.removeAll(NAME_PREFIXES.get(this.nameType)); diff --git a/src/main/java/org/apache/commons/codec/language/bm/ResourceConstants.java b/src/main/java/org/apache/commons/codec/language/bm/ResourceConstants.java index cee46f59..9b0d4a7a 100644 --- a/src/main/java/org/apache/commons/codec/language/bm/ResourceConstants.java +++ b/src/main/java/org/apache/commons/codec/language/bm/ResourceConstants.java @@ -17,6 +17,8 @@ package org.apache.commons.codec.language.bm; +import java.util.regex.Pattern; + import org.apache.commons.codec.CharEncoding; /** @@ -32,5 +34,7 @@ final class ResourceConstants { static final String ENCODING = CharEncoding.UTF_8; static final String EXT_CMT_END = "*/"; static final String EXT_CMT_START = "/*"; + static final Pattern SPACES = Pattern.compile("\\s+"); + } diff --git a/src/main/java/org/apache/commons/codec/language/bm/Rule.java b/src/main/java/org/apache/commons/codec/language/bm/Rule.java index 871af388..5e38c610 100644 --- a/src/main/java/org/apache/commons/codec/language/bm/Rule.java +++ b/src/main/java/org/apache/commons/codec/language/bm/Rule.java @@ -300,6 +300,10 @@ public class Rule { private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length(); + private static final Pattern AROUND_PLUS = Pattern.compile("[+]"); + + private static final Pattern AROUND_PIPE = Pattern.compile("[|]"); + private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES = new EnumMap<>(NameType.class); @@ -452,7 +456,7 @@ public class Rule { } final String before = ph.substring(0, open); final String in = ph.substring(open + 1, ph.length() - 1); 
- final Set<String> langs = new HashSet<>(Arrays.asList(in.split("[+]"))); + final Set<String> langs = new HashSet<>(Arrays.asList(AROUND_PLUS.split(in))); return new Phoneme(before, Languages.LanguageSet.from(langs)); } @@ -467,7 +471,7 @@ public class Rule { final List<Phoneme> phs = new ArrayList<>(); final String body = ph.substring(1, ph.length() - 1); - for (final String part : body.split("[|]")) { + for (final String part : AROUND_PIPE.split(body)) { phs.add(parsePhoneme(part)); } if (body.startsWith("|") || body.endsWith("|")) { @@ -521,7 +525,7 @@ public class Rule { } } else { // rule - final String[] parts = line.split("\\s+"); + final String[] parts = ResourceConstants.SPACES.split(line); if (parts.length != 4) { throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + " parts: " + rawLine + " in " + location);