> > This patch use the ifunc mechanism to select the proper function when > > running, for platform supports AVX2, excute the AVX2 instructions, > > else, excute the original code. > > > > Signed-off-by: Liang Li <[email protected]> > > --- > > include/qemu-common.h | 28 +++++++++++++++------ > > util/Makefile.objs | 2 ++ > > util/avx2.c | 69 > +++++++++++++++++++++++++++++++++++++++++++++++++++ > > util/cutils.c | 53 +++++++++++++++++++++++++++++++++++++-- > > 4 files changed, 143 insertions(+), 9 deletions(-) create mode > > 100644 util/avx2.c > > > > diff --git a/include/qemu-common.h b/include/qemu-common.h index > > 2f74540..9fa7501 100644 > > --- a/include/qemu-common.h > > +++ b/include/qemu-common.h > > @@ -484,15 +484,29 @@ void qemu_hexdump(const char *buf, FILE *fp, > > const char *prefix, size_t size); #endif > > > > #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8 -static inline > > bool -can_use_buffer_find_nonzero_offset(const void *buf, size_t len) > > -{ > > - return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR > > - * sizeof(VECTYPE)) == 0 > > - && ((uintptr_t) buf) % sizeof(VECTYPE) == 0); > > -} > > +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len); > > + > > size_t buffer_find_nonzero_offset(const void *buf, size_t len); > > > > +extern bool > > +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len); > > + > > +extern size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t > > +len); > > + > > +extern bool > > +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t > > +len); > > + > > +extern size_t buffer_find_nonzero_offset_inner(const void *buf, > > +size_t len); > > + > > +__asm__(".type can_use_buffer_find_nonzero_offset, > > +\%gnu_indirect_function"); __asm__(".type buffer_find_nonzero_offset, > > +\%gnu_indirect_function"); > > + > > + > > +void *can_use_buffer_find_nonzero_offset_ifunc(void) \ > > + __asm__("can_use_buffer_find_nonzero_offset"); > > + > > +void 
*buffer_find_nonzero_offset_ifunc(void) \ > > + __asm__("buffer_find_nonzero_offset"); > > /* > > * helper to parse debug environment variables > > */ > > diff --git a/util/Makefile.objs b/util/Makefile.objs index > > d7cc399..6aacad7 100644 > > --- a/util/Makefile.objs > > +++ b/util/Makefile.objs > > @@ -1,4 +1,5 @@ > > util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o > > +util-obj-y += avx2.o > > util-obj-$(CONFIG_POSIX) += compatfd.o > > util-obj-$(CONFIG_POSIX) += event_notifier-posix.o > > util-obj-$(CONFIG_POSIX) += mmap-alloc.o @@ -29,3 +30,4 @@ util-obj-y > > += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o > > util-obj-y += qemu-coroutine-sleep.o util-obj-y += > > coroutine-$(CONFIG_COROUTINE_BACKEND).o > > util-obj-y += buffer.o > > +avx2.o-cflags := $(AVX2_CFLAGS) > > diff --git a/util/avx2.c b/util/avx2.c new file mode 100644 index > > 0000000..0e6915a > > --- /dev/null > > +++ b/util/avx2.c > > @@ -0,0 +1,69 @@ > > +#include "qemu-common.h" > > + > > +#ifdef __AVX2__ > > +#include <immintrin.h> > > +#define AVX2_VECTYPE __m256i > > +#define AVX2_SPLAT(p) _mm256_set1_epi8(*(p)) > > +#define AVX2_ALL_EQ(v1, v2) \ > > + (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF) > > +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2)) > > + > > +inline bool > > +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len) > > +{ > > + return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR > > + * sizeof(AVX2_VECTYPE)) == 0 > > + && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0); } > > + > > +size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) { > > + const AVX2_VECTYPE *p = buf; > > + const AVX2_VECTYPE zero = (AVX2_VECTYPE){0}; > > + size_t i; > > + > > + assert(can_use_buffer_find_nonzero_offset_avx2(buf, len)); > > + > > + if (!len) { > > + return 0; > > + } > > + > > + for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) { > > + if (!AVX2_ALL_EQ(p[i], zero)) { > > + return i * 
sizeof(AVX2_VECTYPE); > > + } > > + } > > + > > + for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; > > + i < len / sizeof(AVX2_VECTYPE); > > + i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) { > > + AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]); > > + AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]); > > + AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]); > > + AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]); > > + AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1); > > + AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3); > > + if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) { > > + break; > > + } > > + } > > + > > + return i * sizeof(AVX2_VECTYPE); > > +} > > + > > +#else > > +/* use the original functions if avx2 is not enabled when buiding*/ > > + > > +inline bool > > +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len) > > +{ > > + return can_use_buffer_find_nonzero_offset_inner(buf, len); } > > + > > +inline size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t > > +len) { > > + return buffer_find_nonzero_offset_inner(buf, len); } > > + > > +#endif > > + > > diff --git a/util/cutils.c b/util/cutils.c index cfeb848..cd478ce > > 100644 > > --- a/util/cutils.c > > +++ b/util/cutils.c > > @@ -26,6 +26,7 @@ > > #include <math.h> > > #include <limits.h> > > #include <errno.h> > > +#include <cpuid.h> > > > > #include "qemu/sockets.h" > > #include "qemu/iov.h" > > @@ -161,6 +162,54 @@ int qemu_fdatasync(int fd) #endif } > > > > +/* old compiler maynot define bit_AVX2 */ #ifndef bit_AVX2 #define > > +bit_AVX2 (1 << 5) #endif > > + > > +static inline bool avx2_support(void) { > > + int a, b, c, d; > > + > > + if (__get_cpuid_max(0, NULL) < 7) { > > + printf("max cpuid < 7\n"); > > + return false; > > + } > > + > > + __cpuid_count(7, 0, a, b, c, d); > > + printf("b = %x\n", b); > > + return b & bit_AVX2; > > +} > > + > > +void *buffer_find_nonzero_offset_ifunc(void) > > +{ > > + printf("deciding %s\n", __func__); > > + > > + 
typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ? > > + buffer_find_nonzero_offset_avx2 : > > + buffer_find_nonzero_offset_inner; > > + > > + return func; > > +} > > + > > +void *can_use_buffer_find_nonzero_offset_ifunc(void) > > +{ > > + printf("deciding %s\n", __func__); > > + > > + typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ? > > + can_use_buffer_find_nonzero_offset_avx2 : > > + can_use_buffer_find_nonzero_offset_inner; > > + > > + return func; > > +} > > + > > +inline bool > > +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len) > > +{ > > + return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR > > + * sizeof(VECTYPE)) == 0 > > + && ((uintptr_t) buf) % sizeof(VECTYPE) == 0); } > > + > > /* > > * Searches for an area with non-zero content in a buffer > > * > > @@ -181,13 +230,13 @@ int qemu_fdatasync(int fd) > > * If the buffer is all zero the return value is equal to len. > > */ > > > > -size_t buffer_find_nonzero_offset(const void *buf, size_t len) > > +size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len) > > { > > const VECTYPE *p = buf; > > const VECTYPE zero = (VECTYPE){0}; > > size_t i; > > > > - assert(can_use_buffer_find_nonzero_offset(buf, len)); > > + assert(can_use_buffer_find_nonzero_offset_inner(buf, len)); > > > > if (!len) { > > return 0; > > > > The main issue here is that you are not testing whether the compiler supports > gnu_indirect_function. > > I suggest that you start by moving the functions to util/buffer-zero.c > > Then the structure should be something like > > #ifdef CONFIG_HAVE_AVX2 > #include <immintrin.h> > #endif > > ... define buffer_find_nonzero_offset_inner ... > ... define can_use_buffer_find_nonzero_offset_inner ...
> #if defined CONFIG_HAVE_GNU_IFUNC && defined CONFIG_HAVE_AVX2 ... > define buffer_find_nonzero_offset_avx2 ... > ... define can_use_buffer_find_nonzero_offset_avx2 ... > ... define the indirect functions ... > #else > ... define buffer_find_nonzero_offset that just calls > buffer_find_nonzero_offset_inner ... > ... define can_use_buffer_find_nonzero_offset that just calls > can_use_buffer_find_nonzero_offset_inner ... > #endif > > Thanks, > > Paolo The buffer_find_nonzero_offset_inner and buffer_find_nonzero_offset_avx2 functions can't be defined in the same .c file. Otherwise, if '-mavx2' is enabled for that file, "buffer_find_nonzero_offset_inner()" will also be compiled to AVX2 instructions. Liang
