This is an automated email from the ASF dual-hosted git repository.

ddanielr pushed a commit to branch 2.1
in repository https://gitbox.apache.org/repos/asf/accumulo.git


The following commit(s) were added to refs/heads/2.1 by this push:
     new 92331ea113 Throw error when non-standard chars exist (#4348)
92331ea113 is described below

commit 92331ea113b12c0a833a0a5ea714294832c6d822
Author: Daniel Roberts <ddani...@gmail.com>
AuthorDate: Tue Mar 12 11:12:53 2024 -0400

    Throw error when non-standard chars exist (#4348)
    
    * Throw error when non-standard chars exist
    
    Adds an explicit error message when base64 coding should have been used.
    Removes specical handling of `\` character and replaces it with an
    allowed character set.
    
    ---------
    
    Co-authored-by: Keith Turner <ktur...@apache.org>
---
 .../accumulo/core/file/rfile/GenerateSplits.java   | 32 +++++++++--
 .../core/file/rfile/GenerateSplitsTest.java        | 64 +++++++++++++++++++---
 2 files changed, 82 insertions(+), 14 deletions(-)

diff --git 
a/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java 
b/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java
index 1812358a59..865210a970 100644
--- a/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java
+++ b/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java
@@ -30,6 +30,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Base64;
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
@@ -72,6 +73,10 @@ import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
 public class GenerateSplits implements KeywordExecutable {
   private static final Logger log = 
LoggerFactory.getLogger(GenerateSplits.class);
 
+  private static final Set<Character> allowedChars = new HashSet<>();
+
+  private static final String encodeFlag = "-b64";
+
   static class Opts extends ConfigOpts {
     @Parameter(names = {"-n", "--num"},
         description = "The number of split points to generate. Can be used to 
create n+1 tablets. Cannot use with the split size option.")
@@ -81,7 +86,8 @@ public class GenerateSplits implements KeywordExecutable {
         description = "The minimum split size in uncompressed bytes. Cannot 
use with num splits option.")
     public long splitSize = 0;
 
-    @Parameter(names = {"-b64", "--base64encoded"}, description = "Base 64 
encode the split points")
+    @Parameter(names = {encodeFlag, "--base64encoded"},
+        description = "Base 64 encode the split points")
     public boolean base64encode = false;
 
     @Parameter(names = {"-sf", "--splits-file"}, description = "Output the 
splits to a file")
@@ -89,6 +95,7 @@ public class GenerateSplits implements KeywordExecutable {
 
     @Parameter(description = "<file|directory>[ <file|directory>...] -n <num> 
| -ss <split_size>")
     public List<String> files = new ArrayList<>();
+
   }
 
   @Override
@@ -145,6 +152,20 @@ public class GenerateSplits implements KeywordExecutable {
       log.trace("Found the following files: {}", filePaths);
     }
 
+    if (!encode) {
+      // Generate the allowed Character set
+      for (int i = 0; i < 10; i++) {
+        // 0-9
+        allowedChars.add((char) (i + 48));
+      }
+      for (int i = 0; i < 26; i++) {
+        // Uppercase A-Z
+        allowedChars.add((char) (i + 65));
+        // Lowercase a-z
+        allowedChars.add((char) (i + 97));
+      }
+    }
+
     // if no size specified look at indexed keys first
     if (opts.splitSize == 0) {
       splits = getIndexKeys(siteConf, hadoopConf, fs, filePaths, 
requestedNumSplits, encode,
@@ -256,16 +277,15 @@ public class GenerateSplits implements KeywordExecutable {
     if (encode) {
       return Base64.getEncoder().encodeToString(bytes);
     } else {
-      // drop non printable characters
       StringBuilder sb = new StringBuilder();
       for (byte aByte : bytes) {
         int c = 0xff & aByte;
-        if (c == '\\') {
-          sb.append("\\\\");
-        } else if (c >= 32 && c <= 126) {
+        if (allowedChars.contains((char) c)) {
           sb.append((char) c);
         } else {
-          log.debug("Dropping non printable char: \\x{}", 
Integer.toHexString(c));
+          // Fail if non-printable characters are detected.
+          throw new UnsupportedOperationException("Non printable char: \\x" + 
Integer.toHexString(c)
+              + " detected. Must use Base64 encoded output.  The behavior 
around non printable chars changed in 2.1.3 to throw an error, the previous 
behavior was likely to cause bugs.");
         }
       }
       return sb.toString();
diff --git 
a/core/src/test/java/org/apache/accumulo/core/file/rfile/GenerateSplitsTest.java
 
b/core/src/test/java/org/apache/accumulo/core/file/rfile/GenerateSplitsTest.java
index d2b7a8577e..b0111bbe2a 100644
--- 
a/core/src/test/java/org/apache/accumulo/core/file/rfile/GenerateSplitsTest.java
+++ 
b/core/src/test/java/org/apache/accumulo/core/file/rfile/GenerateSplitsTest.java
@@ -33,10 +33,13 @@ import java.io.FileOutputStream;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.Base64;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
 import java.util.TreeSet;
+import java.util.stream.Collectors;
 
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
@@ -91,13 +94,13 @@ public class GenerateSplitsTest {
     List<String> args = List.of(rfilePath, "--num", "2", "-sf", 
splitsFilePath);
     log.info("Invoking GenerateSplits with {}", args);
     GenerateSplits.main(args.toArray(new String[0]));
-    verifySplitsFile("r3", "r6");
+    verifySplitsFile(false, "r6", "r3");
 
     // test more splits requested than indices
     args = List.of(rfilePath, "--num", "4", "-sf", splitsFilePath);
     log.info("Invoking GenerateSplits with {}", args);
     GenerateSplits.main(args.toArray(new String[0]));
-    verifySplitsFile("r2", "r3", "r4", "r5");
+    verifySplitsFile(false, "r2", "r3", "r4", "r5");
   }
 
   @Test
@@ -105,15 +108,25 @@ public class GenerateSplitsTest {
     List<String> args = List.of(rfilePath, "-ss", "21", "-sf", splitsFilePath);
     log.info("Invoking GenerateSplits with {}", args);
     GenerateSplits.main(args.toArray(new String[0]));
-    verifySplitsFile("r2", "r4", "r6");
+    verifySplitsFile(false, "r2", "r4", "r6");
   }
 
-  private void verifySplitsFile(String... splits) throws IOException {
-    String splitsFile = Files.readString(Paths.get(splitsFilePath));
-    assertEquals(splits.length, splitsFile.split("\n").length);
-    for (String s : splits) {
-      assertTrue(splitsFile.contains(s), "Did not find " + s + " in: " + 
splitsFile);
+  private void verifySplitsFile(boolean encoded, String... splits) throws 
IOException {
+    String[] gSplits = Files.readString(Paths.get(splitsFilePath)).split("\n");
+    assertEquals(splits.length, gSplits.length);
+    TreeSet<String> expectedSplits =
+        Arrays.stream(splits).collect(Collectors.toCollection(TreeSet::new));
+    TreeSet<String> generatedSplits = Arrays.stream(gSplits).map(s -> 
decode(encoded, s))
+        .map(String::new).collect(Collectors.toCollection(TreeSet::new));
+    assertEquals(expectedSplits, generatedSplits,
+        "Failed to find expected splits in " + generatedSplits);
+  }
+
+  private String decode(boolean decode, String string) {
+    if (decode) {
+      return new String(Base64.getDecoder().decode(string));
     }
+    return string;
   }
 
   @Test
@@ -156,6 +169,41 @@ public class GenerateSplitsTest {
     assertEquals(Set.of("001", "002", "003", "004", "005", "006", "007", 
"008", "009"), desired);
   }
 
+  @Test
+  public void testNullValues() throws Exception {
+    trf.openWriter(false);
+    trf.writer.startNewLocalityGroup("lg1", newColFamByteSequence("cf1", 
"cf2"));
+    trf.writer.append(newKey("r1\0a", "cf1", "cq1", "L1", 55), 
newValue("foo1"));
+    trf.writer.append(newKey("r2\0b", "cf2", "cq1", "L1", 55), 
newValue("foo2"));
+    trf.writer.append(newKey("r3\0c", "cf2", "cq1", "L1", 55), 
newValue("foo3"));
+    trf.writer.startNewLocalityGroup("lg2", newColFamByteSequence("cf3", 
"cf4"));
+    trf.writer.append(newKey("r4\0d", "cf3", "cq1", "L1", 55), 
newValue("foo4"));
+    trf.writer.append(newKey("r5\0e", "cf4", "cq1", "L1", 55), 
newValue("foo5"));
+    trf.writer.append(newKey("r6\0f", "cf4", "cq1", "L1", 55), 
newValue("foo6"));
+    trf.closeWriter();
+
+    File file = new File(tempDir, "testGenerateSplitsWithNulls.rf");
+    assertTrue(file.createNewFile(), "Failed to create file: " + file);
+    try (var fileOutputStream = new FileOutputStream(file)) {
+      fileOutputStream.write(trf.baos.toByteArray());
+    }
+    rfilePath = "file:" + file.getAbsolutePath();
+    log.info("Wrote to file {}", rfilePath);
+
+    File splitsFile = new File(tempDir, "testSplitsFileWithNulls");
+    assertTrue(splitsFile.createNewFile(), "Failed to create file: " + 
splitsFile);
+    splitsFilePath = splitsFile.getAbsolutePath();
+
+    List<String> finalArgs = List.of(rfilePath, "--num", "2", "-sf", 
splitsFilePath);
+    assertThrows(UnsupportedOperationException.class,
+        () -> GenerateSplits.main(finalArgs.toArray(new String[0])));
+
+    List<String> args = List.of(rfilePath, "--num", "2", "-sf", 
splitsFilePath, "-b64");
+    GenerateSplits.main(args.toArray(new String[0]));
+    // Validate that base64 split points are working
+    verifySplitsFile(true, "r6\0f", "r3\0c");
+  }
+
   /**
    * Create the requested number of splits. Works up to 3 digits or max 999.
    */

Reply via email to