This is an automated email from the ASF dual-hosted git repository. atri pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push: new 84aa659b71 Fixes SQL wildcard escaping in LIKE queries (#10897) 84aa659b71 is described below commit 84aa659b713c3970da41a43925427708bc72a8c2 Author: Aravind Suresh <arvindsuresh2...@gmail.com> AuthorDate: Fri Jun 23 00:25:29 2023 +0530 Fixes SQL wildcard escaping in LIKE queries (#10897) * Fixes SQL wildcard escaping in LIKE queries * Updated unit tests * Fixed lint issues * Minor refactor * Addressed reviewer comments * Fixed lint * Addressed reviewer comments * Minor fixes * Minor refactor --- .../common/utils/RegexpPatternConverterUtils.java | 87 +++++++++++++++------- .../utils/RegexpPatternConverterUtilsTest.java | 37 +++++++++ 2 files changed, 97 insertions(+), 27 deletions(-) diff --git a/pinot-common/src/main/java/org/apache/pinot/common/utils/RegexpPatternConverterUtils.java b/pinot-common/src/main/java/org/apache/pinot/common/utils/RegexpPatternConverterUtils.java index c62cf86f23..31a0d59d95 100644 --- a/pinot-common/src/main/java/org/apache/pinot/common/utils/RegexpPatternConverterUtils.java +++ b/pinot-common/src/main/java/org/apache/pinot/common/utils/RegexpPatternConverterUtils.java @@ -18,6 +18,9 @@ */ package org.apache.pinot.common.utils; + +import com.google.common.primitives.Chars; + /** * Utility for converting regex patterns. */ @@ -25,9 +28,14 @@ public class RegexpPatternConverterUtils { private RegexpPatternConverterUtils() { } - /* Represents all metacharacters to be processed */ - public static final String[] REGEXP_METACHARACTERS = - {"\\", "^", "$", ".", "{", "}", "[", "]", "(", ")", "*", "+", "?", "|", "<", ">", "-", "&", "/"}; + /* + * Represents all metacharacters to be processed. + * This excludes the \ (back slash) character as that doubles up as an escape character as well. + * So it is handled separately in the conversion logic. + */ + public static final char[] REGEXP_METACHARACTERS = new char[]{ + '^', '$', '.', '{', '}', '[', ']', '(', ')', '*', '+', '?', '|', '<', '>', '-', '&', '/'}; + public static final char BACK_SLASH = '\\'; /** * Converts a LIKE pattern into REGEXP_LIKE pattern. @@ -64,24 +72,61 @@ public class RegexpPatternConverterUtils { break; } - String escaped = escapeMetaCharacters(likePattern.substring(start, end)); - StringBuilder sb = new StringBuilder(escaped.length() + 2); - sb.append(prefix); - sb.append(escaped); - sb.append(suffix); + likePattern = likePattern.substring(start, end); + return escapeMetaCharsAndWildcards(likePattern, prefix, suffix); + } + /** + * Escapes the provided pattern by considering the following constraints: + * <ul> + * <li> SQL wildcards escaping is handled (_, %) </li> + * <li> Regex meta characters escaping is handled </li> + * </ul> + * @param input the provided input string + * @param prefix the prefix to be added to the output string + * @param suffix the suffix to be added to the output string + * @return the final output string + */ + private static String escapeMetaCharsAndWildcards(String input, String prefix, String suffix) { + StringBuilder sb = new StringBuilder(); + sb.append(prefix); + // handling SQL wildcards (_, %) by replacing them with corresponding regex equivalents + // we ignore them if the SQL wildcards are escaped int i = 0; - while (i < sb.length()) { - char c = sb.charAt(i); - if (c == '_') { - sb.replace(i, i + 1, "."); - } else if (c == '%') { - sb.replace(i, i + 1, ".*"); - i++; + int len = input.length(); + boolean isPrevCharBackSlash = false; + while (i < len) { + char c = input.charAt(i); + switch (c) { + case '_': + sb.append(isPrevCharBackSlash ? c : "."); + break; + case '%': + sb.append(isPrevCharBackSlash ? c : ".*"); + break; + default: + // either the current character is a meta-character + // OR + // this means the previous character is a \ + // but it was not used for escaping SQL wildcards + // so let's escape this \ in the output + // this case is separately handled outside the meta characters list + if (Chars.indexOf(REGEXP_METACHARACTERS, c) >= 0 || isPrevCharBackSlash) { + sb.append(BACK_SLASH); + } + sb.append(c); + break; } + isPrevCharBackSlash = (c == BACK_SLASH); i++; } + // handle trailing \ + if (isPrevCharBackSlash) { + sb.append(BACK_SLASH); + } + + sb.append(suffix); return sb.toString(); } @@ -103,18 +148,6 @@ public class RegexpPatternConverterUtils { return -1; } - /** - * Add escape characters before special characters - */ - private static String escapeMetaCharacters(String pattern) { - for (String metaCharacter : REGEXP_METACHARACTERS) { - if (pattern.contains(metaCharacter)) { - pattern = pattern.replace(metaCharacter, "\\" + metaCharacter); - } - } - return pattern; - } - /** * Converts a REGEXP_LIKE pattern into Lucene REGEXP pattern. */ diff --git a/pinot-common/src/test/java/org/apache/pinot/common/utils/RegexpPatternConverterUtilsTest.java b/pinot-common/src/test/java/org/apache/pinot/common/utils/RegexpPatternConverterUtilsTest.java index 8c94cb4886..0341ee7606 100644 --- a/pinot-common/src/test/java/org/apache/pinot/common/utils/RegexpPatternConverterUtilsTest.java +++ b/pinot-common/src/test/java/org/apache/pinot/common/utils/RegexpPatternConverterUtilsTest.java @@ -125,4 +125,41 @@ public class RegexpPatternConverterUtilsTest { String regexpLikePattern = RegexpPatternConverterUtils.likeToRegexpLike("z%"); assertEquals(regexpLikePattern, "^z"); } + + @Test + public void testEscapedWildcard1() { + // the first underscore (_ in _b) is escaped, so it is meant to match an actual "_b" string in the provided + // string + // the second underscore (_ in b_) is not escaped, so it is a SQL wildcard that is used to match a single + // character, which in the regex space is "." + String regexpLikePattern = RegexpPatternConverterUtils.likeToRegexpLike("a\\_b_\\"); + assertEquals(regexpLikePattern, "^a\\_b.\\\\$"); + String luceneRegExpPattern = RegexpPatternConverterUtils.regexpLikeToLuceneRegExp(regexpLikePattern); + assertEquals(luceneRegExpPattern, "a\\_b.\\\\"); + } + + @Test + public void testEscapedWildcard2() { + // the % (% in %b) is escaped, so it is meant to match an actual "%b" string in the provided + // string + // the "\" before c is a normal "\", so it is meant to match an actual "\" string in the provided + // string, this is done because "c" is not a SQL wildcard - hence the "\" before that is used as-is + // and is not used for escaping "c" + // so, this "\" is escaped in the output as it is a regex metacharacter and the converted regex + // will match "a%b\cde" in the provided string + String regexpLikePattern = RegexpPatternConverterUtils.likeToRegexpLike("a\\%b\\cde"); + assertEquals(regexpLikePattern, "^a\\%b\\\\cde$"); + String luceneRegExpPattern = RegexpPatternConverterUtils.regexpLikeToLuceneRegExp(regexpLikePattern); + assertEquals(luceneRegExpPattern, "a\\%b\\\\cde"); + } + @Test + public void testEscapedWildcard3() { + // here the "\" character is used to escape _, so _ here is not treated as a SQL wildcard + // but it is meant to actually match "_" in the provided string + // so the corresponding regex doesn't convert the "_" to "." + String regexpLikePattern = RegexpPatternConverterUtils.likeToRegexpLike("%2\\_2%"); + assertEquals(regexpLikePattern, "2\\_2"); + String luceneRegExpPattern = RegexpPatternConverterUtils.regexpLikeToLuceneRegExp(regexpLikePattern); + assertEquals(luceneRegExpPattern, ".*2\\_2.*"); + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org