This is an automated email from the ASF dual-hosted git repository. ggregory pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-text.git
commit 145b85161e6f62a02866ee065631704433ffe300 Author: Gary Gregory <[email protected]> AuthorDate: Sat Jan 14 08:49:04 2023 -0500 Add and use a package-private singleton for RegexTokenizer --- src/changes/changes.xml | 1 + .../org/apache/commons/text/similarity/CosineDistance.java | 9 ++------- .../org/apache/commons/text/similarity/RegexTokenizer.java | 13 ++++++++++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 8465d66b..6b5bf7b9 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -50,6 +50,7 @@ The <action> type attribute can be add,update,fix,remove. <action type="fix" dev="aherbert" due-to="James Nord">Fix javadoc for StringEscapeUtils.escapeHtml4 #382</action> <action type="fix" dev="ggregory" due-to="Pavel Belousov, Gary Gregory">TextStringBuidler#hashCode() allocates a String on each call #387.</action> <action issue="TEXT-221" type="fix" dev="aherbert" due-to="Remco Riswick">Fix Bundle-SymbolicName to use the package name org.apache.commons.text</action> + <action type="fix" dev="ggregory" due-to="Gary Gregory">Add and use a package-private singleton for RegexTokenizer.</action> <!-- ADD --> <!-- UPDATE --> <action type="update" dev="ggregory" due-to="Dependabot">Bump actions/cache from 3.0.8 to 3.0.10 #361, #365.</action> diff --git a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java index 5c9a24c6..1544018b 100644 --- a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java +++ b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java @@ -35,11 +35,6 @@ import java.util.Map; */ public class CosineDistance implements EditDistance<Double> { - /** - * Tokenizer used to convert the character sequence into a vector. - */ - private final Tokenizer<CharSequence> tokenizer = new RegexTokenizer(); - /** * Cosine similarity. */ @@ -47,8 +42,8 @@ public class CosineDistance implements EditDistance<Double> { @Override public Double apply(final CharSequence left, final CharSequence right) { - final CharSequence[] leftTokens = tokenizer.tokenize(left); - final CharSequence[] rightTokens = tokenizer.tokenize(right); + final CharSequence[] leftTokens = RegexTokenizer.INSTANCE.tokenize(left); + final CharSequence[] rightTokens = RegexTokenizer.INSTANCE.tokenize(right); final Map<CharSequence, Integer> leftVector = Counter.of(leftTokens); final Map<CharSequence, Integer> rightVector = Counter.of(rightTokens); diff --git a/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java b/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java index 213e01c7..be64f849 100644 --- a/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java +++ b/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java @@ -26,9 +26,11 @@ import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.Validate; /** - * A simple word tokenizer that utilizes regex to find words. It applies a regex - * {@code (\w)+} over the input text to extract words from a given character - * sequence. + * A simple word {@link Tokenizer} that utilizes a regex to find words. It applies a regex {@code (\w)+} over the input text to extract words from a given + * character sequence. + * <p> + * Instances of this class are immutable and are safe for use by multiple concurrent threads. + * </p> * * @since 1.0 */ @@ -37,6 +39,11 @@ final class RegexTokenizer implements Tokenizer<CharSequence> { /** The whitespace pattern. */ private static final Pattern PATTERN = Pattern.compile("(\\w)+"); + /** + * Singleton instance. + */ + static final RegexTokenizer INSTANCE = new RegexTokenizer(); + /** * {@inheritDoc} *
