This is an automated email from the ASF dual-hosted git repository.

ggregory pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-codec.git


The following commit(s) were added to refs/heads/master by this push:
     new 489a3ca6 Precompile regular expressions in Lang.loadFromResource(String, Languages)
489a3ca6 is described below

commit 489a3ca6b82c9839ffa4b51a5999a06887acd4e1
Author: Gary Gregory <garydgreg...@gmail.com>
AuthorDate: Mon May 19 10:33:18 2025 -0400

    Precompile regular expressions in Lang.loadFromResource(String,
    Languages)
    
    - Precompile regular expressions in PhoneticEngine.encode(String,
    LanguageSet)
    - Precompile regular expressions in
    org.apache.commons.codec.language.bm.Rule.parse*(*)
---
 src/changes/changes.xml                                        |  7 +++++--
 src/main/java/org/apache/commons/codec/language/bm/Lang.java   |  6 ++++--
 .../org/apache/commons/codec/language/bm/PhoneticEngine.java   |  7 +++++--
 .../apache/commons/codec/language/bm/ResourceConstants.java    |  4 ++++
 src/main/java/org/apache/commons/codec/language/bm/Rule.java   | 10 +++++++---
 5 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index e8854a9e..eea6bb2a 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -48,8 +48,11 @@ The <action> type attribute can be add,update,fix,remove.
       <action type="fix" dev="ggregory" due-to="Gary Gregory">Remove -nouses 
directive from maven-bundle-plugin. OSGi package imports now state 'uses' 
definitions for package imports, this doesn't affect JPMS (from 
org.apache.commons:commons-parent:80).</action>
       <action type="fix" dev="ggregory" due-to="Gary Gregory">Refactor 
DigestUtils.updateDigest(MessageDigest, File) to use NIO.</action>
       <action type="fix" dev="ggregory" due-to="Gary Gregory" 
issue="CODEC-328" >Clarify Javadoc for 
org.apache.commons.codec.digest.UnixCrypt.crypt(byte[],String).</action>
-      <action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile and 
resuse x3 regular expression in DaitchMokotoffSoundex.Rule.</action>
-      <action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile and 
resuse regular expressions in DaitchMokotoffSoundex.parseRules(Scanner, String, 
Map, Map).</action>
+      <action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile 
regular expressions in DaitchMokotoffSoundex.Rule.</action>
+      <action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile 
regular expressions in DaitchMokotoffSoundex.parseRules(Scanner, String, Map, 
Map).</action>
+      <action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile 
regular expressions in Lang.loadFromResource(String, Languages).</action>
+      <action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile 
regular expressions in PhoneticEngine.encode(String, LanguageSet).</action>
+      <action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile 
regular expressions in 
org.apache.commons.codec.language.bm.Rule.parse*(*).</action>
       <!-- ADD -->
       <action type="add" dev="ggregory" due-to="Gary Gregory">Add 
HmacUtils.hmac(Path).</action>      
       <action type="add" dev="ggregory" due-to="Gary Gregory">Add 
HmacUtils.hmacHex(Path).</action>      
diff --git a/src/main/java/org/apache/commons/codec/language/bm/Lang.java b/src/main/java/org/apache/commons/codec/language/bm/Lang.java
index d92e37c4..6aacea1a 100644
--- a/src/main/java/org/apache/commons/codec/language/bm/Lang.java
+++ b/src/main/java/org/apache/commons/codec/language/bm/Lang.java
@@ -103,6 +103,8 @@ public class Lang {
 
     private static final String LANGUAGE_RULES_RN = 
"/org/apache/commons/codec/language/bm/%s_lang.txt";
 
+    private static final Pattern PLUS = Pattern.compile("\\+");
+
     static {
         for (final NameType s : NameType.values()) {
             LANGS.put(s, loadFromResource(String.format(LANGUAGE_RULES_RN, 
s.getName()), Languages.getInstance(s)));
@@ -163,7 +165,7 @@ public class Lang {
                     }
 
                     // split it up
-                    final String[] parts = line.split("\\s+");
+                    final String[] parts = 
ResourceConstants.SPACES.split(line);
 
                     if (parts.length != 3) {
                         throw new IllegalArgumentException("Malformed line '" 
+ rawLine +
@@ -171,7 +173,7 @@ public class Lang {
                     }
 
                     final Pattern pattern = Pattern.compile(parts[0]);
-                    final String[] langs = parts[1].split("\\+");
+                    final String[] langs = PLUS.split(parts[1]);
                     final boolean accept = parts[2].equals("true");
 
                     rules.add(new LangRule(pattern, new 
HashSet<>(Arrays.asList(langs)), accept));
diff --git a/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java b/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
index 482d6ffd..6b426d7a 100644
--- a/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
+++ b/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
@@ -29,6 +29,7 @@ import java.util.Map;
 import java.util.Objects;
 import java.util.Set;
 import java.util.TreeMap;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 import org.apache.commons.codec.language.bm.Languages.LanguageSet;
@@ -227,6 +228,8 @@ public class PhoneticEngine {
 
     private static final Map<NameType, Set<String>> NAME_PREFIXES = new 
EnumMap<>(NameType.class);
 
+    private static final Pattern QUOTE = Pattern.compile("'");
+
     static {
         NAME_PREFIXES.put(NameType.ASHKENAZI,
                 Collections.unmodifiableSet(
@@ -401,14 +404,14 @@ public class PhoneticEngine {
             }
         }
 
-        final List<String> words = Arrays.asList(input.split("\\s+"));
+        final List<String> words = 
Arrays.asList(ResourceConstants.SPACES.split(input));
         final List<String> words2 = new ArrayList<>();
 
         // special-case handling of word prefixes based upon the name type
         switch (this.nameType) {
         case SEPHARDIC:
             words.forEach(aWord -> {
-                final String[] parts = aWord.split("'", -1);
+                final String[] parts = QUOTE.split(aWord, -1);
                 words2.add(parts[parts.length - 1]);
             });
             words2.removeAll(NAME_PREFIXES.get(this.nameType));
diff --git a/src/main/java/org/apache/commons/codec/language/bm/ResourceConstants.java b/src/main/java/org/apache/commons/codec/language/bm/ResourceConstants.java
index cee46f59..9b0d4a7a 100644
--- a/src/main/java/org/apache/commons/codec/language/bm/ResourceConstants.java
+++ b/src/main/java/org/apache/commons/codec/language/bm/ResourceConstants.java
@@ -17,6 +17,8 @@
 
 package org.apache.commons.codec.language.bm;
 
+import java.util.regex.Pattern;
+
 import org.apache.commons.codec.CharEncoding;
 
 /**
@@ -32,5 +34,7 @@ final class ResourceConstants {
     static final String ENCODING = CharEncoding.UTF_8;
     static final String EXT_CMT_END = "*/";
     static final String EXT_CMT_START = "/*";
+    static final Pattern SPACES = Pattern.compile("\\s+");
+
 
 }
diff --git a/src/main/java/org/apache/commons/codec/language/bm/Rule.java b/src/main/java/org/apache/commons/codec/language/bm/Rule.java
index 871af388..5e38c610 100644
--- a/src/main/java/org/apache/commons/codec/language/bm/Rule.java
+++ b/src/main/java/org/apache/commons/codec/language/bm/Rule.java
@@ -300,6 +300,10 @@ public class Rule {
 
     private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length();
 
+    private static final Pattern AROUND_PLUS = Pattern.compile("[+]");
+
+    private static final Pattern AROUND_PIPE = Pattern.compile("[|]");
+
     private static final Map<NameType, Map<RuleType, Map<String, Map<String, 
List<Rule>>>>> RULES =
             new EnumMap<>(NameType.class);
 
@@ -452,7 +456,7 @@ public class Rule {
             }
             final String before = ph.substring(0, open);
             final String in = ph.substring(open + 1, ph.length() - 1);
-            final Set<String> langs = new 
HashSet<>(Arrays.asList(in.split("[+]")));
+            final Set<String> langs = new 
HashSet<>(Arrays.asList(AROUND_PLUS.split(in)));
 
             return new Phoneme(before, Languages.LanguageSet.from(langs));
         }
@@ -467,7 +471,7 @@ public class Rule {
 
             final List<Phoneme> phs = new ArrayList<>();
             final String body = ph.substring(1, ph.length() - 1);
-            for (final String part : body.split("[|]")) {
+            for (final String part : AROUND_PIPE.split(body)) {
                 phs.add(parsePhoneme(part));
             }
             if (body.startsWith("|") || body.endsWith("|")) {
@@ -521,7 +525,7 @@ public class Rule {
                     }
                 } else {
                     // rule
-                    final String[] parts = line.split("\\s+");
+                    final String[] parts = 
ResourceConstants.SPACES.split(line);
                     if (parts.length != 4) {
                         throw new IllegalArgumentException("Malformed rule 
statement split into " + parts.length +
                                                            " parts: " + 
rawLine + " in " + location);

Reply via email to