This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-text.git
commit f40607a1689b96831212e73a3588187778e6dc2a
Author: Alex Herbert <aherb...@apache.org>
AuthorDate: Thu Mar 7 23:13:49 2019 +0000

    TEXT-156: Fix the RegexTokenizer to use a static Pattern.

    Remove the use of CharSequence.toString() to pass to the
    matcher(CharSequence) method.

    Fix the javadoc header @code tag.
---
 .../java/org/apache/commons/text/similarity/RegexTokenizer.java | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java b/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java
index cc009ef..f650c0c 100644
--- a/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java
+++ b/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java
@@ -26,12 +26,14 @@ import org.apache.commons.lang3.Validate;
 
 /**
  * A simple word tokenizer that utilizes regex to find words. It applies a regex
- * {@code}(\w)+{@code} over the input text to extract words from a given character
+ * {@code (\w)+} over the input text to extract words from a given character
  * sequence.
  *
  * @since 1.0
  */
 class RegexTokenizer implements Tokenizer<CharSequence> {
+    /** The whitespace pattern. */
+    private static final Pattern PATTERN = Pattern.compile("(\\w)+");
 
     /**
      * {@inheritDoc}
@@ -41,8 +43,7 @@ class RegexTokenizer implements Tokenizer<CharSequence> {
     @Override
     public CharSequence[] tokenize(final CharSequence text) {
         Validate.isTrue(StringUtils.isNotBlank(text), "Invalid text");
-        final Pattern pattern = Pattern.compile("(\\w)+");
-        final Matcher matcher = pattern.matcher(text.toString());
+        final Matcher matcher = PATTERN.matcher(text);
         final List<String> tokens = new ArrayList<>();
         while (matcher.find()) {
             tokens.add(matcher.group(0));