This is an automated email from the ASF dual-hosted git repository. ddanielr pushed a commit to branch 2.1 in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/2.1 by this push: new 92331ea113 Throw error when non-standard chars exist (#4348) 92331ea113 is described below commit 92331ea113b12c0a833a0a5ea714294832c6d822 Author: Daniel Roberts <ddani...@gmail.com> AuthorDate: Tue Mar 12 11:12:53 2024 -0400 Throw error when non-standard chars exist (#4348) * Throw error when non-standard chars exist Adds an explicit error message when base64 coding should have been used. Removes specical handling of `\` character and replaces it with an allowed character set. --------- Co-authored-by: Keith Turner <ktur...@apache.org> --- .../accumulo/core/file/rfile/GenerateSplits.java | 32 +++++++++-- .../core/file/rfile/GenerateSplitsTest.java | 64 +++++++++++++++++++--- 2 files changed, 82 insertions(+), 14 deletions(-) diff --git a/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java b/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java index 1812358a59..865210a970 100644 --- a/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java +++ b/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java @@ -30,6 +30,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Base64; import java.util.Collections; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; @@ -72,6 +73,10 @@ import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; public class GenerateSplits implements KeywordExecutable { private static final Logger log = LoggerFactory.getLogger(GenerateSplits.class); + private static final Set<Character> allowedChars = new HashSet<>(); + + private static final String encodeFlag = "-b64"; + static class Opts extends ConfigOpts { @Parameter(names = {"-n", "--num"}, description = "The number of split points to generate. Can be used to create n+1 tablets. Cannot use with the split size option.") @@ -81,7 +86,8 @@ public class GenerateSplits implements KeywordExecutable { description = "The minimum split size in uncompressed bytes. Cannot use with num splits option.") public long splitSize = 0; - @Parameter(names = {"-b64", "--base64encoded"}, description = "Base 64 encode the split points") + @Parameter(names = {encodeFlag, "--base64encoded"}, + description = "Base 64 encode the split points") public boolean base64encode = false; @Parameter(names = {"-sf", "--splits-file"}, description = "Output the splits to a file") @@ -89,6 +95,7 @@ public class GenerateSplits implements KeywordExecutable { @Parameter(description = "<file|directory>[ <file|directory>...] -n <num> | -ss <split_size>") public List<String> files = new ArrayList<>(); + } @Override @@ -145,6 +152,20 @@ public class GenerateSplits implements KeywordExecutable { log.trace("Found the following files: {}", filePaths); } + if (!encode) { + // Generate the allowed Character set + for (int i = 0; i < 10; i++) { + // 0-9 + allowedChars.add((char) (i + 48)); + } + for (int i = 0; i < 26; i++) { + // Uppercase A-Z + allowedChars.add((char) (i + 65)); + // Lowercase a-z + allowedChars.add((char) (i + 97)); + } + } + // if no size specified look at indexed keys first if (opts.splitSize == 0) { splits = getIndexKeys(siteConf, hadoopConf, fs, filePaths, requestedNumSplits, encode, @@ -256,16 +277,15 @@ public class GenerateSplits implements KeywordExecutable { if (encode) { return Base64.getEncoder().encodeToString(bytes); } else { - // drop non printable characters StringBuilder sb = new StringBuilder(); for (byte aByte : bytes) { int c = 0xff & aByte; - if (c == '\\') { - sb.append("\\\\"); - } else if (c >= 32 && c <= 126) { + if (allowedChars.contains((char) c)) { sb.append((char) c); } else { - log.debug("Dropping non printable char: \\x{}", Integer.toHexString(c)); + // Fail if non-printable characters are detected. + throw new UnsupportedOperationException("Non printable char: \\x" + Integer.toHexString(c) + + " detected. Must use Base64 encoded output. The behavior around non printable chars changed in 2.1.3 to throw an error, the previous behavior was likely to cause bugs."); } } return sb.toString(); diff --git a/core/src/test/java/org/apache/accumulo/core/file/rfile/GenerateSplitsTest.java b/core/src/test/java/org/apache/accumulo/core/file/rfile/GenerateSplitsTest.java index d2b7a8577e..b0111bbe2a 100644 --- a/core/src/test/java/org/apache/accumulo/core/file/rfile/GenerateSplitsTest.java +++ b/core/src/test/java/org/apache/accumulo/core/file/rfile/GenerateSplitsTest.java @@ -33,10 +33,13 @@ import java.io.FileOutputStream; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; +import java.util.Arrays; +import java.util.Base64; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.TreeSet; +import java.util.stream.Collectors; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; @@ -91,13 +94,13 @@ public class GenerateSplitsTest { List<String> args = List.of(rfilePath, "--num", "2", "-sf", splitsFilePath); log.info("Invoking GenerateSplits with {}", args); GenerateSplits.main(args.toArray(new String[0])); - verifySplitsFile("r3", "r6"); + verifySplitsFile(false, "r6", "r3"); // test more splits requested than indices args = List.of(rfilePath, "--num", "4", "-sf", splitsFilePath); log.info("Invoking GenerateSplits with {}", args); GenerateSplits.main(args.toArray(new String[0])); - verifySplitsFile("r2", "r3", "r4", "r5"); + verifySplitsFile(false, "r2", "r3", "r4", "r5"); } @Test @@ -105,15 +108,25 @@ public class GenerateSplitsTest { List<String> args = List.of(rfilePath, "-ss", "21", "-sf", splitsFilePath); log.info("Invoking GenerateSplits with {}", args); GenerateSplits.main(args.toArray(new String[0])); - verifySplitsFile("r2", "r4", "r6"); + verifySplitsFile(false, "r2", "r4", "r6"); } - private void verifySplitsFile(String... splits) throws IOException { - String splitsFile = Files.readString(Paths.get(splitsFilePath)); - assertEquals(splits.length, splitsFile.split("\n").length); - for (String s : splits) { - assertTrue(splitsFile.contains(s), "Did not find " + s + " in: " + splitsFile); + private void verifySplitsFile(boolean encoded, String... splits) throws IOException { + String[] gSplits = Files.readString(Paths.get(splitsFilePath)).split("\n"); + assertEquals(splits.length, gSplits.length); + TreeSet<String> expectedSplits = + Arrays.stream(splits).collect(Collectors.toCollection(TreeSet::new)); + TreeSet<String> generatedSplits = Arrays.stream(gSplits).map(s -> decode(encoded, s)) + .map(String::new).collect(Collectors.toCollection(TreeSet::new)); + assertEquals(expectedSplits, generatedSplits, + "Failed to find expected splits in " + generatedSplits); + } + + private String decode(boolean decode, String string) { + if (decode) { + return new String(Base64.getDecoder().decode(string)); } + return string; } @Test @@ -156,6 +169,41 @@ public class GenerateSplitsTest { assertEquals(Set.of("001", "002", "003", "004", "005", "006", "007", "008", "009"), desired); } + @Test + public void testNullValues() throws Exception { + trf.openWriter(false); + trf.writer.startNewLocalityGroup("lg1", newColFamByteSequence("cf1", "cf2")); + trf.writer.append(newKey("r1\0a", "cf1", "cq1", "L1", 55), newValue("foo1")); + trf.writer.append(newKey("r2\0b", "cf2", "cq1", "L1", 55), newValue("foo2")); + trf.writer.append(newKey("r3\0c", "cf2", "cq1", "L1", 55), newValue("foo3")); + trf.writer.startNewLocalityGroup("lg2", newColFamByteSequence("cf3", "cf4")); + trf.writer.append(newKey("r4\0d", "cf3", "cq1", "L1", 55), newValue("foo4")); + trf.writer.append(newKey("r5\0e", "cf4", "cq1", "L1", 55), newValue("foo5")); + trf.writer.append(newKey("r6\0f", "cf4", "cq1", "L1", 55), newValue("foo6")); + trf.closeWriter(); + + File file = new File(tempDir, "testGenerateSplitsWithNulls.rf"); + assertTrue(file.createNewFile(), "Failed to create file: " + file); + try (var fileOutputStream = new FileOutputStream(file)) { + fileOutputStream.write(trf.baos.toByteArray()); + } + rfilePath = "file:" + file.getAbsolutePath(); + log.info("Wrote to file {}", rfilePath); + + File splitsFile = new File(tempDir, "testSplitsFileWithNulls"); + assertTrue(splitsFile.createNewFile(), "Failed to create file: " + splitsFile); + splitsFilePath = splitsFile.getAbsolutePath(); + + List<String> finalArgs = List.of(rfilePath, "--num", "2", "-sf", splitsFilePath); + assertThrows(UnsupportedOperationException.class, + () -> GenerateSplits.main(finalArgs.toArray(new String[0]))); + + List<String> args = List.of(rfilePath, "--num", "2", "-sf", splitsFilePath, "-b64"); + GenerateSplits.main(args.toArray(new String[0])); + // Validate that base64 split points are working + verifySplitsFile(true, "r6\0f", "r3\0c"); + } + /** * Create the requested number of splits. Works up to 3 digits or max 999. */