From: Vijay <[email protected]> Use Neon instructions to perform zero checking of buffer. This is helps in reducing downtime during live migration.
Signed-off-by: Vijaya Kumar K <[email protected]> --- util/cutils.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/util/cutils.c b/util/cutils.c index 43d1afb..d343b9a 100644 --- a/util/cutils.c +++ b/util/cutils.c @@ -352,6 +352,87 @@ static void *can_use_buffer_find_nonzero_offset_ifunc(void) return func; } #pragma GCC pop_options + +#elif defined __aarch64__ +#include "arm_neon.h" + +#define NEON_VECTYPE uint64x2_t +#define NEON_LOAD_N_ORR(v1, v2) vorrq_u64(vld1q_u64(&v1), vld1q_u64(&v2)) +#define NEON_ORR(v1, v2) vorrq_u64(v1, v2) +#define NEON_EQ_ZERO(v1) \ + ((vgetq_lane_u64(vceqzq_u64(v1), 0) == 0) || \ + (vgetq_lane_u64(vceqzq_u64(v1), 1)) == 0) + +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16 + +/* + * Zero page/buffer checking using SIMD(Neon) + */ + +static bool +can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len) +{ + return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON + * sizeof(NEON_VECTYPE)) == 0 + && ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0); +} + +static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len) +{ + size_t i; + NEON_VECTYPE d0, d1, d2, d3, d4, d5, d6; + NEON_VECTYPE d7, d8, d9, d10, d11, d12, d13, d14; + uint64_t const *data = buf; + + assert(can_use_buffer_find_nonzero_offset_neon(buf, len)); + len /= sizeof(unsigned long); + + for (i = 0; i < len; i += 32) { + d0 = NEON_LOAD_N_ORR(data[i], data[i + 2]); + d1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]); + d2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]); + d3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]); + d4 = NEON_ORR(d0, d1); + d5 = NEON_ORR(d2, d3); + d6 = NEON_ORR(d4, d5); + + d7 = NEON_LOAD_N_ORR(data[i + 16], data[i + 18]); + d8 = NEON_LOAD_N_ORR(data[i + 20], data[i + 22]); + d9 = NEON_LOAD_N_ORR(data[i + 24], data[i + 26]); + d10 = NEON_LOAD_N_ORR(data[i + 28], data[i + 30]); + d11 = NEON_ORR(d7, d8); + d12 = NEON_ORR(d9, d10); + d13 = NEON_ORR(d11, d12); + + d14 = NEON_ORR(d6, d13); + if (NEON_EQ_ZERO(d14)) { + break; + } + } + + return i * sizeof(unsigned long); +} + +static inline bool neon_support(void) +{ + /* + * Check if neon feature is supported. + * By default neon is supported for aarch64. + */ + return true; +} + +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) +{ + return neon_support() ? can_use_buffer_find_nonzero_offset_neon(buf, len) : + can_use_buffer_find_nonzero_offset_inner(buf, len); +} + +size_t buffer_find_nonzero_offset(const void *buf, size_t len) +{ + return neon_support() ? buffer_find_nonzero_offset_neon(buf, len) : + buffer_find_nonzero_offset_inner(buf, len); +} #else bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) { -- 1.7.9.5
