atris commented on code in PR #10897: URL: https://github.com/apache/pinot/pull/10897#discussion_r1228567126
########## pinot-common/src/main/java/org/apache/pinot/common/utils/RegexpPatternConverterUtils.java: ########## @@ -18,16 +18,23 @@ */ package org.apache.pinot.common.utils; +import com.google.common.collect.ImmutableSet; +import java.util.Set; + /** * Utility for converting regex patterns. */ public class RegexpPatternConverterUtils { private RegexpPatternConverterUtils() { } - /* Represents all metacharacters to be processed */ - public static final String[] REGEXP_METACHARACTERS = - {"\\", "^", "$", ".", "{", "}", "[", "]", "(", ")", "*", "+", "?", "|", "<", ">", "-", "&", "/"}; + /* + * Represents all metacharacters to be processed. + * This excludes the \ (back slash) character as that doubles up as an escape character as well. + * So it is handled separately in the conversion logic. + */ + public static final Set<String> REGEXP_METACHARACTERS = ImmutableSet.of( Review Comment: Why the move to a set? Since the number of characters is small, isn't an array more space efficient? ########## pinot-common/src/main/java/org/apache/pinot/common/utils/RegexpPatternConverterUtils.java: ########## @@ -64,24 +71,42 @@ public static String likeToRegexpLike(String likePattern) { break; } - String escaped = escapeMetaCharacters(likePattern.substring(start, end)); - StringBuilder sb = new StringBuilder(escaped.length() + 2); + likePattern = likePattern.substring(start, end); + StringBuilder sb = new StringBuilder(); sb.append(prefix); - sb.append(escaped); - sb.append(suffix); + // handling SQL wildcards by replacing them with corresponding regex equivalents + // we ignore them if the SQL wildcards are escaped int i = 0; - while (i < sb.length()) { - char c = sb.charAt(i); + boolean isPrevCharBackSlash = false; + while (i < likePattern.length()) { + char c = likePattern.charAt(i); if (c == '_') { - sb.replace(i, i + 1, "."); + sb.append(isPrevCharBackSlash ? c : "."); } else if (c == '%') { - sb.replace(i, i + 1, ".*"); - i++; + sb.append(isPrevCharBackSlash ? c : ".*"); Review Comment: Why was this logic moved out of escapeMetaCharacters? ########## pinot-common/src/main/java/org/apache/pinot/common/utils/RegexpPatternConverterUtils.java: ########## @@ -64,24 +71,42 @@ public static String likeToRegexpLike(String likePattern) { break; } - String escaped = escapeMetaCharacters(likePattern.substring(start, end)); - StringBuilder sb = new StringBuilder(escaped.length() + 2); + likePattern = likePattern.substring(start, end); + StringBuilder sb = new StringBuilder(); sb.append(prefix); - sb.append(escaped); - sb.append(suffix); + // handling SQL wildcards by replacing them with corresponding regex equivalents + // we ignore them if the SQL wildcards are escaped int i = 0; - while (i < sb.length()) { - char c = sb.charAt(i); + boolean isPrevCharBackSlash = false; + while (i < likePattern.length()) { + char c = likePattern.charAt(i); if (c == '_') { - sb.replace(i, i + 1, "."); + sb.append(isPrevCharBackSlash ? c : "."); } else if (c == '%') { - sb.replace(i, i + 1, ".*"); - i++; + sb.append(isPrevCharBackSlash ? c : ".*"); + } else if (REGEXP_METACHARACTERS.contains(String.valueOf(c))) { + sb.append('\\').append(c); + } else { + if (isPrevCharBackSlash) { + // this means the previous character is a \ + // but it was not used for escaping SQL wildcards + // so let's escape this \ in the output + // this case is separately handled outside of the meta characters list + sb.append('\\'); Review Comment: Why can this be not merged with line 89? ########## pinot-common/src/main/java/org/apache/pinot/common/utils/RegexpPatternConverterUtils.java: ########## @@ -64,24 +71,42 @@ public static String likeToRegexpLike(String likePattern) { break; } - String escaped = escapeMetaCharacters(likePattern.substring(start, end)); - StringBuilder sb = new StringBuilder(escaped.length() + 2); + likePattern = likePattern.substring(start, end); + StringBuilder sb = new StringBuilder(); sb.append(prefix); - sb.append(escaped); - sb.append(suffix); + // handling SQL wildcards by replacing them with corresponding regex equivalents + // we ignore them if the SQL wildcards are escaped int i = 0; - while (i < sb.length()) { - char c = sb.charAt(i); + boolean isPrevCharBackSlash = false; + while (i < likePattern.length()) { + char c = likePattern.charAt(i); if (c == '_') { - sb.replace(i, i + 1, "."); Review Comment: NIT: Lets use constant named variables instead of symbols here -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org