Re: [PR] Cleanup and fix EscapeQuerySyntaxImpl [lucene]

via GitHub Mon, 08 Jan 2024 03:26:26 -0800


sabi0 commented on code in PR #12973:
URL: https://github.com/apache/lucene/pull/12973#discussion_r1444477534



##########
lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/parser/EscapeQuerySyntaxImpl.java:
##########
@@ -40,105 +40,109 @@ public class EscapeQuerySyntaxImpl implements 
EscapeQuerySyntax {
     "AND", "OR", "NOT", "TO", "WITHIN", "SENTENCE", "PARAGRAPH", "INORDER"
   };
 
-  private static final CharSequence escapeChar(CharSequence str, Locale 
locale) {
-    if (str == null || str.length() == 0) return str;
+  private static CharSequence escapeChar(CharSequence str, Locale locale) {
+    if (str == null || str.isEmpty()) return str;
 
     CharSequence buffer = str;
 
-    // regular escapable Char for terms
-    for (int i = 0; i < escapableTermChars.length; i++) {
-      buffer = replaceIgnoreCase(buffer, 
escapableTermChars[i].toLowerCase(locale), "\\", locale);
+    // regular escapable char for terms
+    for (String escapableTermChar : escapableTermChars) {
+      buffer = escapeIgnoringCase(buffer, 
escapableTermChar.toLowerCase(locale), "\\", locale);
     }
 
-    // First Character of a term as more escaping chars
-    for (int i = 0; i < escapableTermExtraFirstChars.length; i++) {
-      if (buffer.charAt(0) == escapableTermExtraFirstChars[i].charAt(0)) {
-        buffer = "\\" + buffer.charAt(0) + buffer.subSequence(1, 
buffer.length());
+    // first char of a term as more escaping chars
+    for (String escapableTermExtraFirstChar : escapableTermExtraFirstChars) {
+      if (buffer.charAt(0) == escapableTermExtraFirstChar.charAt(0)) {
+        buffer = "\\" + buffer;
         break;
       }
     }
 
     return buffer;
   }
 
-  private final CharSequence escapeQuoted(CharSequence str, Locale locale) {
-    if (str == null || str.length() == 0) return str;
+  private static CharSequence escapeQuoted(CharSequence str, Locale locale) {
+    if (str == null || str.isEmpty()) return str;
 
     CharSequence buffer = str;
 
-    for (int i = 0; i < escapableQuotedChars.length; i++) {
-      buffer = replaceIgnoreCase(buffer, 
escapableTermChars[i].toLowerCase(locale), "\\", locale);
+    for (String escapableQuotedChar : escapableQuotedChars) {
+      buffer = escapeIgnoringCase(buffer, 
escapableQuotedChar.toLowerCase(locale), "\\", locale);
     }
     return buffer;
   }
 
-  private static final CharSequence escapeTerm(CharSequence term, Locale 
locale) {
-    if (term == null) return term;
+  private static CharSequence escapeTerm(CharSequence term, Locale locale) {
+    if (term == null || term.isEmpty()) return term;
 
-    // Escape single Chars
+    // escape single chars
     term = escapeChar(term, locale);
     term = escapeWhiteChar(term, locale);
 
-    // Escape Parser Words
-    for (int i = 0; i < escapableWordTokens.length; i++) {
-      if (escapableWordTokens[i].equalsIgnoreCase(term.toString())) return 
"\\" + term;
+    // escape parser words
+    for (String escapableWordToken : escapableWordTokens) {
+      if (escapableWordToken.equalsIgnoreCase(term.toString())) return "\\" + 
term;
     }
     return term;
   }
 
   /**
-   * replace with ignore case
+   * Prepend every case-insensitive occurrence of the {@code sequence1} in the 
{@code string} with
+   * the {@code escapeChar}. When the {@code sequence1} is empty, every 
character in the {@code
+   * string} is escaped.
    *
-   * @param string string to get replaced
+   * @param string string to apply escaping to
    * @param sequence1 the old character sequence in lowercase
-   * @param escapeChar the new character to prefix sequence1 in return string.
-   * @return the new String
+   * @param escapeChar the escape character to prefix sequence1 in the 
returned string
+   * @return CharSequence with every occurrence of {@code sequence1} prepended 
with {@code
+   *     escapeChar}
    */
-  private static CharSequence replaceIgnoreCase(
+  private static CharSequence escapeIgnoringCase(
       CharSequence string, CharSequence sequence1, CharSequence escapeChar, 
Locale locale) {
     if (escapeChar == null || sequence1 == null || string == null) throw new 
NullPointerException();
 
-    // empty string case
     int count = string.length();
     int sequence1Length = sequence1.length();
+
+    // empty search string - escape every character
     if (sequence1Length == 0) {
-      StringBuilder result = new StringBuilder((count + 1) * 
escapeChar.length());
-      result.append(escapeChar);
+      StringBuilder result = new StringBuilder(count * (1 + 
escapeChar.length()));
       for (int i = 0; i < count; i++) {
-        result.append(string.charAt(i));

Review Comment:
   The `escapeIgnoringCase` method is `private`. It is called in three places, 
all looking like this:
   ```
       for (String escapableQuotedChar : escapableQuotedChars) {
         buffer = escapeIgnoringCase(buffer, 
escapableQuotedChar.toLowerCase(locale), "\\", locale);
       }
   ```
   
   I.e. the value passed for the search-string parameter `sequence1` always 
comes from one of the constant arrays below and is never an empty string:
   ```
     private static final String[] escapableTermChars = {
       "\"", "<", ">", "=", "!", "(", ")", "^", "[", "{", ":", "]", "}", "~", 
"/"
     };
   
     private static final String[] escapableQuotedChars = {"\""};
   
     private static final String[] escapableWhiteChars = {" ", "\t", "\n", 
"\r", "\f", "\b", "\u3000"};
   ```
   (unless some weird locale drops one of those characters when converting to 
lower case)
   
   I wonder if this whole "empty search string" block should be replaced with 
a check that throws an `IllegalArgumentException`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Re: [PR] Cleanup and fix EscapeQuerySyntaxImpl [lucene]

Reply via email to