This is an automated email from the ASF dual-hosted git repository.

ggregory pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-text.git

commit 145b85161e6f62a02866ee065631704433ffe300
Author: Gary Gregory <[email protected]>
AuthorDate: Sat Jan 14 08:49:04 2023 -0500

    Add and use a package-private singleton for RegexTokenizer
---
 src/changes/changes.xml                                     |  1 +
 .../org/apache/commons/text/similarity/CosineDistance.java  |  9 ++-------
 .../org/apache/commons/text/similarity/RegexTokenizer.java  | 13 ++++++++++---
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 8465d66b..6b5bf7b9 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -50,6 +50,7 @@ The <action> type attribute can be add,update,fix,remove.
     <action                  type="fix" dev="aherbert" due-to="James Nord">Fix javadoc for StringEscapeUtils.escapeHtml4 #382</action>
     <action                  type="fix" dev="ggregory" due-to="Pavel Belousov, Gary Gregory">TextStringBuidler#hashCode() allocates a String on each call #387.</action>
     <action issue="TEXT-221" type="fix" dev="aherbert" due-to="Remco Riswick">Fix Bundle-SymbolicName to use the package name org.apache.commons.text</action>
+    <action                  type="fix" dev="ggregory" due-to="Gary Gregory">Add and use a package-private singleton for RegexTokenizer.</action>
     <!-- ADD -->
     <!-- UPDATE -->
     <action                  type="update" dev="ggregory" due-to="Dependabot">Bump actions/cache from 3.0.8 to 3.0.10 #361, #365.</action>
diff --git a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
index 5c9a24c6..1544018b 100644
--- a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
+++ b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java
@@ -35,11 +35,6 @@ import java.util.Map;
  */
 public class CosineDistance implements EditDistance<Double> {
 
-    /**
-     * Tokenizer used to convert the character sequence into a vector.
-     */
-    private final Tokenizer<CharSequence> tokenizer = new RegexTokenizer();
-
     /**
      * Cosine similarity.
      */
@@ -47,8 +42,8 @@ public class CosineDistance implements EditDistance<Double> {
 
     @Override
     public Double apply(final CharSequence left, final CharSequence right) {
-        final CharSequence[] leftTokens = tokenizer.tokenize(left);
-        final CharSequence[] rightTokens = tokenizer.tokenize(right);
+        final CharSequence[] leftTokens = RegexTokenizer.INSTANCE.tokenize(left);
+        final CharSequence[] rightTokens = RegexTokenizer.INSTANCE.tokenize(right);
 
         final Map<CharSequence, Integer> leftVector = Counter.of(leftTokens);
         final Map<CharSequence, Integer> rightVector = Counter.of(rightTokens);
diff --git a/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java b/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java
index 213e01c7..be64f849 100644
--- a/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java
+++ b/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java
@@ -26,9 +26,11 @@ import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.lang3.Validate;
 
 /**
- * A simple word tokenizer that utilizes regex to find words. It applies a regex
- * {@code (\w)+} over the input text to extract words from a given character
- * sequence.
+ * A simple word {@link Tokenizer} that utilizes a regex to find words. It applies a regex {@code (\w)+} over the input text to extract words from a given
+ * character sequence.
+ * <p>
+ * Instances of this class are immutable and are safe for use by multiple concurrent threads.
+ * </p>
  *
  * @since 1.0
  */
@@ -37,6 +39,11 @@ final class RegexTokenizer implements Tokenizer<CharSequence> {
     /** The whitespace pattern. */
     private static final Pattern PATTERN = Pattern.compile("(\\w)+");
 
+    /**
+     * Singleton instance.
+     */
+    static final RegexTokenizer INSTANCE = new RegexTokenizer();
+
     /**
      * {@inheritDoc}
      *

Reply via email to