Repository: spark Updated Branches: refs/heads/master 6d94bf6ac -> a20e743fb
[SPARK-9460] Fix prefix generation for UTF8String. Previously we could be getting garbage data if the number of bytes is 0, or on JVMs that are 4 byte aligned, or when compressedoops is on. Author: Reynold Xin <[email protected]> Closes #7789 from rxin/utf8string and squashes the following commits: 86ffa3e [Reynold Xin] Mask out data outside of valid range. 4d647ed [Reynold Xin] Mask out data. c6e8794 [Reynold Xin] [SPARK-9460] Fix prefix generation for UTF8String. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a20e743f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a20e743f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a20e743f Branch: refs/heads/master Commit: a20e743fb863de809863652931bc982aac2d1f86 Parents: 6d94bf6 Author: Reynold Xin <[email protected]> Authored: Thu Jul 30 13:09:43 2015 -0700 Committer: Reynold Xin <[email protected]> Committed: Thu Jul 30 13:09:43 2015 -0700 ---------------------------------------------------------------------- .../apache/spark/unsafe/types/UTF8String.java | 36 ++++++++++++++++++-- .../spark/unsafe/types/UTF8StringSuite.java | 8 +++++ 2 files changed, 41 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/a20e743f/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java ---------------------------------------------------------------------- diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 5752200..c38953f 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -66,6 +66,19 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable { } /** + * Creates an UTF8String from byte array, which should be encoded 
in UTF-8. + * + * Note: `bytes` will be hold by returned UTF8String. + */ + public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes) { + if (bytes != null) { + return new UTF8String(bytes, BYTE_ARRAY_OFFSET + offset, numBytes); + } else { + return null; + } + } + + /** * Creates an UTF8String from String. */ public static UTF8String fromString(String str) { @@ -89,10 +102,10 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable { return fromBytes(spaces); } - protected UTF8String(Object base, long offset, int size) { + protected UTF8String(Object base, long offset, int numBytes) { this.base = base; this.offset = offset; - this.numBytes = size; + this.numBytes = numBytes; } /** @@ -141,7 +154,24 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable { * Returns a 64-bit integer that can be used as the prefix used in sorting. */ public long getPrefix() { - long p = PlatformDependent.UNSAFE.getLong(base, offset); + // Since JVMs are either 4-byte aligned or 8-byte aligned, we check the size of the string. + // If size is 0, just return 0. + // If size is between 0 and 4 (inclusive), assume data is 4-byte aligned under the hood and + // use a getInt to fetch the prefix. + // If size is greater than 4, assume we have at least 8 bytes of data to fetch. + // After getting the data, we use a mask to mask out data that is not part of the string. 
+ long p; + if (numBytes >= 8) { + p = PlatformDependent.UNSAFE.getLong(base, offset); + } else if (numBytes > 4) { + p = PlatformDependent.UNSAFE.getLong(base, offset); + p = p & ((1L << numBytes * 8) - 1); + } else if (numBytes > 0) { + p = (long) PlatformDependent.UNSAFE.getInt(base, offset); + p = p & ((1L << numBytes * 8) - 1); + } else { + p = 0; + } p = java.lang.Long.reverseBytes(p); return p; } http://git-wip-us.apache.org/repos/asf/spark/blob/a20e743f/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java ---------------------------------------------------------------------- diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index 42e09e4..f2cc19c 100644 --- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -71,6 +71,14 @@ public class UTF8StringSuite { fromString("abbbbbbbbbbbasdf").getPrefix() - fromString("bbbbbbbbbbbbasdf").getPrefix() < 0); assertTrue(fromString("").getPrefix() - fromString("a").getPrefix() < 0); assertTrue(fromString("你好").getPrefix() - fromString("世界").getPrefix() > 0); + + byte[] buf1 = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + byte[] buf2 = {1, 2, 3}; + UTF8String str1 = UTF8String.fromBytes(buf1, 0, 3); + UTF8String str2 = UTF8String.fromBytes(buf1, 0, 8); + UTF8String str3 = UTF8String.fromBytes(buf2); + assertTrue(str1.getPrefix() - str2.getPrefix() < 0); + assertEquals(str1.getPrefix(), str3.getPrefix()); } @Test --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
