Hello linaro-toolchain,
This is a patch that adds a NEON-optimised buffer folding routine to
Hadoop 2.0.2-alpha, allowing for faster CRC32c checksum
computation.

The code is in the form of NEON intrinsics, and details of the
implementation can be found at:
https://wiki.linaro.org/LEG/Engineering/CRC

This patch is against Hadoop 2.0.2-alpha:
https://github.com/apache/hadoop-common/tree/branch-2.0.2-alpha

I've sent this to linaro-toolchain because I would like the NEON
intrinsic implementation to be scrutinised. In particular, I would
appreciate having any superfluous instructions identified, and any
bad practices cleaned up, before opening these patches up more
widely.
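
For anyone who wants to poke at this locally, below is a minimal
sketch of the kind of comparison harness I have in mind (hypothetical;
it assumes crc32c_neon and crc32c_sb8 from bulk_crc32.c are made
linkable, e.g. by temporarily dropping the static qualifier):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Prototypes as they appear in bulk_crc32.c. */
uint32_t crc32c_sb8(uint32_t crc, const uint8_t *buf, size_t length);
uint32_t crc32c_neon(uint32_t crc, const uint8_t *buf, size_t length);

int main(void)
{
	uint8_t buf[4096];
	size_t len, i;

	srand(42);
	for (i = 0; i < sizeof(buf); i++)
		buf[i] = rand() & 0xff;

	/*
	 * The NEON path only engages for whole multiples of 32 bytes
	 * and an initial CRC of 0xffffffff; other inputs fall back to
	 * the sb8 routine.
	 */
	for (len = 32; len <= sizeof(buf); len += 32) {
		uint32_t want = crc32c_sb8(0xffffffff, buf, len);
		uint32_t got = crc32c_neon(0xffffffff, buf, len);

		if (want != got) {
			printf("mismatch at %zu bytes: %08x vs %08x\n",
			       len, (unsigned)want, (unsigned)got);
			return 1;
		}
	}
	printf("all lengths match\n");
	return 0;
}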

Any comments/critique would really be appreciated.

Thanks,
-- 
Steve

Signed-off-by: Steve Capper <steve.cap...@linaro.org>
---
 .../hadoop-common/src/JNIFlags.cmake               |    2 +
 .../native/src/org/apache/hadoop/util/bulk_crc32.c |  142 +++++++++++++++++++-
 2 files changed, 142 insertions(+), 2 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/JNIFlags.cmake b/hadoop-common-project/hadoop-common/src/JNIFlags.cmake
index aba4c18..5c9336a 100644
--- a/hadoop-common-project/hadoop-common/src/JNIFlags.cmake
+++ b/hadoop-common-project/hadoop-common/src/JNIFlags.cmake
@@ -36,6 +36,8 @@ endif (JVM_ARCH_DATA_MODEL EQUAL 32)
 
 # Determine float ABI of JVM on ARM Linux
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm" AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    message("Enabling NEON support.")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon")
     find_program(READELF readelf)
     if (READELF MATCHES "NOTFOUND")
         message(WARNING "readelf not found; JVM float ABI detection disabled")
diff --git a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c
index 8822b5c..6282c60 100644
--- a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c
+++ b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c
@@ -41,6 +41,16 @@ static inline uint32_t crc_val(uint32_t crc);
 static uint32_t crc32_zlib_sb8(uint32_t crc, const uint8_t *buf, size_t length);
 uint32_t crc32c_sb8(uint32_t crc, const uint8_t *buf, size_t length);
 
+#ifdef __ARM_NEON__
+#include <arm_neon.h>
+static uint32_t crc32c_neon(uint32_t crc, const uint8_t *buf, size_t length);
+#undef USE_PIPELINED
+#define CRC32C_FUNC crc32c_neon
+#else
+#define CRC32C_FUNC crc32c_sb8
+#endif /* __ARM_NEON__ */
+
+
 #ifdef USE_PIPELINED
 static void pipelined_crc32c(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, const uint8_t *p_buf, size_t block_size, int num_blocks);
 #endif
@@ -58,7 +68,7 @@ int bulk_calculate_crc(const uint8_t *data, size_t data_len,
       crc_update_func = crc32_zlib_sb8;
       break;
     case CRC32C_POLYNOMIAL:
-      crc_update_func = crc32c_sb8;
+      crc_update_func = CRC32C_FUNC;
       break;
     default:
       return -EINVAL;
@@ -100,7 +110,7 @@ int bulk_verify_crc(const uint8_t *data, size_t data_len,
         do_pipelined = 1;
 #endif
       } else {
-        crc_update_func = crc32c_sb8;
+        crc_update_func = CRC32C_FUNC;
       }
       break;
     default:
@@ -256,6 +266,134 @@ static uint32_t crc32_zlib_sb8(
   return crc;    
 }
 
+#ifdef __ARM_NEON__
+
+/*
+ * Functions to reduce (fold) the size of the input buffer using ARM
+ * NEON. The folded, smaller buffer has the same CRC32c checksum as
+ * the original.
+ *
+ * Most of the NEON buffer folding work takes place in the function
+ * below. We do the following:
+ * 1) 4 sets of vmull.p8's
+ * 2) Combine these to give a "vmull.p32" (lf3)
+ * 3) Shift left 1 bit to account for the endianness of multiplication.
+ *
+ * The folding and multiplication logic can be found documented at:
+ * https://wiki.linaro.org/LEG/Engineering/CRC
+ */
+static inline uint64x1_t crc32c_neon_proc_part(poly8x8_t lhs, poly8x8_t rhs1,
+       poly8x8_t rhs2, poly8x8_t rhs3, poly8x8_t rhs4)
+{
+       poly16x8_t lm1, lm2, lm3, lm4;
+       poly16x4x2_t lz1, lz2;
+       uint16x4_t le1, le2;
+       uint32x2_t le3;
+       uint32x4_t ls1, ls2, lf1, lf2;
+       uint64x2_t ls3, le4;
+       uint64x1_t lf3, lf4;
+
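+       /* 1) vmull.p8 partial products: lhs with rhs1 and with its
+        *    byte-swapped copy rhs2. */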
+       lm1 = vmull_p8(lhs, rhs1);
+       lm2 = vmull_p8(lhs, rhs2);
+       lz1 = vuzp_p16(vget_low_p16(lm2), vget_high_p16(lm2));
+       le1 = veor_u16(vreinterpret_u16_p16(lz1.val[0]),
+                       vreinterpret_u16_p16(lz1.val[1]));
+       ls1 = vshll_n_u16(le1, 8);
+       lf1 = veorq_u32(ls1, vreinterpretq_u32_p16(lm1));
+
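+       /* The other two vmull.p8's: lhs with the half-word-swapped copies. */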
+       lm3 = vmull_p8(lhs, rhs3);
+       lm4 = vmull_p8(lhs, rhs4);
+       lz2 = vuzp_p16(vget_low_p16(lm4), vget_high_p16(lm4));
+       le2 = veor_u16(vreinterpret_u16_p16(lz2.val[0]),
+                       vreinterpret_u16_p16(lz2.val[1]));
+       ls2 = vshll_n_u16(le2, 8);
+       lf2 = veorq_u32(ls2, vreinterpretq_u32_p16(lm3));
+
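+       /* 2) Combine lf1 and lf2 into the full 64-bit product (the "vmull.p32"). */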
+       le3 = veor_u32(vget_low_u32(lf2), vget_high_u32(lf2));
+       ls3 = vshll_n_u32(le3, 16);
+       le4 = veorq_u64(ls3, vreinterpretq_u64_u32(lf1));
+       lf3 = vreinterpret_u64_u32(veor_u32(vget_low_u32(vreinterpretq_u32_u64(le4)),
+                       vget_high_u32(vreinterpretq_u32_u64(le4))));
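+       /* 3) Shift left 1 bit to account for the endianness of the multiplication. */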
+       lf4 = vshl_n_u64(lf3, 1);
+       return lf4;
+}
+
+static uint32_t crc32c_neon(uint32_t crc, const uint8_t *buf, size_t length) {
+       poly8x8_t xor_constant, lhs1, lhs2, lhs3, lhs4, rhs1, rhs2, rhs3, rhs4;
+       poly8x16_t lhl1, lhl2;
+       
+       uint64_t residues[4];
+       uint32_t loop;
+
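+       /* Only whole 32-byte blocks are folded; other lengths take the scalar route. */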
+       if (length % 32)
+               return crc32c_sb8(crc, buf, length);
+
+       /*
+        * Because CRC32c has an initial CRC value of 0xffffffff, we need to
+        * pre-fold the buffer before folding proper begins.
+        * The following constant is computed by:
+        * 1) finding an 8x32 bit value that gives a 0xffffffff crc (with initial value 0)
+        *    (this will be 7x32 bit 0s and 1x32 bit constant)
+        * 2) run a buffer fold (with 0 xor_constant) on this 8x32 bit value to get the
+        *    xor_constant.
+        */
+       xor_constant = vcreate_p8(0x3E43E474A2870290);
+
+       if (crc != 0xffffffff)
+               return crc32c_sb8(crc, buf, length);
+
+       /* k1 = x^288 mod P(x) - bit reversed */
+       /* k2 = x^256 mod P(x) - bit reversed */
+
+       rhs1 = vcreate_p8(0x510AC59A9C25531D);  /* k2:k1 */
+       rhs2 = vcreate_p8(0x0A519AC5259C1D53);  /* byte swap */
+       rhs3 = vcreate_p8(0xC59A510A531D9C25);  /* half word swap */
+       rhs4 = vcreate_p8(0x9AC50A511D53259C);  /* byte swap of half word swap */
+
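+       /* Load the first 32 bytes of input as four 8-byte polynomial vectors. */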
+       lhl1 = vld1q_p8((const poly8_t *) buf);
+       lhl2 = vld1q_p8((const poly8_t *) buf + 16);
+
+       lhs1 = vget_low_p8(lhl1);
+       lhs2 = vget_high_p8(lhl1);
+       lhs3 = vget_low_p8(lhl2);
+       lhs4 = vget_high_p8(lhl2);
+
+       /* pre-fold lhs4 */
+       lhs4 = vreinterpret_p8_u16(veor_u16(vreinterpret_u16_p8(lhs4),
+               vreinterpret_u16_p8(xor_constant)));
+
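+       /*
+        * Fold 32 bytes per iteration: carry-less multiply each 8-byte
+        * chunk by the k constants and XOR the products into the next
+        * 32 bytes of input.
+        */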
+       for(loop = 0; loop < (length - 32)/32; ++loop) {
+               uint64x1_t l1f4, l2f4, l3f4, l4f4;
+
+               l1f4 = crc32c_neon_proc_part(lhs1, rhs1, rhs2, rhs3, rhs4);
+               l2f4 = crc32c_neon_proc_part(lhs2, rhs1, rhs2, rhs3, rhs4);
+               l3f4 = crc32c_neon_proc_part(lhs3, rhs1, rhs2, rhs3, rhs4);
+               l4f4 = crc32c_neon_proc_part(lhs4, rhs1, rhs2, rhs3, rhs4);
+
+               lhl1 = vld1q_p8((const poly8_t *) (buf + 32 * (loop + 1)));
+               lhl2 = vld1q_p8((const poly8_t *) (buf + 32 * (loop + 1) + 16));
+
+               __builtin_prefetch(buf + 32 * (loop + 2));
+
+               lhs1 = vget_low_p8(lhl1);
+               lhs2 = vget_high_p8(lhl1);
+               lhs3 = vget_low_p8(lhl2);
+               lhs4 = vget_high_p8(lhl2);
+
+               lhs1 = vreinterpret_p8_u64(veor_u64(vreinterpret_u64_p8(lhs1), l1f4));
+               lhs2 = vreinterpret_p8_u64(veor_u64(vreinterpret_u64_p8(lhs2), l2f4));
+               lhs3 = vreinterpret_p8_u64(veor_u64(vreinterpret_u64_p8(lhs3), l3f4));
+               lhs4 = vreinterpret_p8_u64(veor_u64(vreinterpret_u64_p8(lhs4), l4f4));
+       }
+
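+       /* CRC the remaining 32-byte residue with the scalar routine; the
+        * 0xffffffff seed was already absorbed by the pre-fold, so start at 0. */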
+       vst1q_p8((poly8_t *) &residues[0], vcombine_p8(lhs1, lhs2));
+       vst1q_p8((poly8_t *) &residues[2], vcombine_p8(lhs3, lhs4));
+
+       return crc32c_sb8(0, (const uint8_t *)residues, 32);
+}
+
+#endif /*__ARM_NEON__ */
+
 ///////////////////////////////////////////////////////////////////////////
 // Begin code for SSE4.2 specific hardware support of CRC32C
 ///////////////////////////////////////////////////////////////////////////
-- 
1.7.9.5

