https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89967
Bug ID: 89967 Summary: Inefficient code generation for vld2q_lane_u8 under aarch64 Product: gcc Version: 8.3.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c Assignee: unassigned at gcc dot gnu.org Reporter: carsten.steger at gmail dot com Target Milestone: --- Using vld2q_lane_u8 can generate very inefficient code under aarch64. Consider the following code (compile it with gcc -march=armv8-a -Wall -Wextra -O3): #include <arm_neon.h> void func(unsigned char *in, int *out, int n, int32x4_t o1v, int32x4_t o2v, int32x4_t s1v, int32x4_t s2v, int32x4_t m1v, int32x4_t m2v, int32x4_t m3v, int32x4_t m4v) { int i; int32x4_t l1v, l2v, g1v, g2v, g3v, g4v, gv; uint8x16x2_t g1b, g2b; l1v = o1v; l2v = o2v; g1b.val[0] = vdupq_n_u8(0); g1b.val[1] = vdupq_n_u8(0); g2b.val[0] = vdupq_n_u8(0); g2b.val[1] = vdupq_n_u8(0); for (i=0; i<n; i+=4) { g1b = vld2q_lane_u8(&in[vgetq_lane_s32(l1v, 0)], g1b, 0); g1b = vld2q_lane_u8(&in[vgetq_lane_s32(l1v, 1)], g1b, 4); g1b = vld2q_lane_u8(&in[vgetq_lane_s32(l1v, 2)], g1b, 8); g1b = vld2q_lane_u8(&in[vgetq_lane_s32(l1v, 3)], g1b, 12); g2b = vld2q_lane_u8(&in[vgetq_lane_s32(l2v, 0)], g2b, 0); g2b = vld2q_lane_u8(&in[vgetq_lane_s32(l2v, 1)], g2b, 4); g2b = vld2q_lane_u8(&in[vgetq_lane_s32(l2v, 2)], g2b, 8); g2b = vld2q_lane_u8(&in[vgetq_lane_s32(l2v, 3)], g2b, 12); g1v = vreinterpretq_s32_u8(g1b.val[0]); g2v = vreinterpretq_s32_u8(g1b.val[1]); g3v = vreinterpretq_s32_u8(g2b.val[0]); g4v = vreinterpretq_s32_u8(g2b.val[1]); gv = vmlaq_s32(vmlaq_s32(vmlaq_s32(vmulq_s32(m4v, g4v), m3v, g3v), m2v, g2v), m1v, g1v); vst1q_s32(&out[i], gv); vaddq_s32(l1v, s1v); vaddq_s32(l2v, s2v); } } The calls to vld2q_lane_u8 generate the following assembler code: mov v30.16b, v2.16b mov v31.16b, v28.16b mov v24.16b, v26.16b mov v25.16b, v27.16b ld2 {v30.b - v31.b}[0], [x7] ld2 {v24.b - v25.b}[0], [x10] mov v22.16b, v30.16b mov v23.16b, v31.16b mov v20.16b, v24.16b mov v21.16b, v25.16b ld2 {v22.b - v23.b}[4], [x6] ld2 {v20.b - v21.b}[4], [x9] mov 
v18.16b, v22.16b mov v19.16b, v23.16b mov v0.16b, v20.16b mov v1.16b, v21.16b ld2 {v18.b - v19.b}[8], [x5] ld2 {v0.b - v1.b}[8], [x8] mov v2.16b, v18.16b mov v3.16b, v19.16b mov v16.16b, v0.16b mov v17.16b, v1.16b ld2 {v2.b - v3.b}[12], [x0] ld2 {v16.b - v17.b}[12], [x4] There is a large amount of unnecessary register copying going on. Since the compiler was smart enough to replace the vgetq_lane_s32 and vaddq_s32 calls with direct register accesses and manipulations, I would have expected the vld2q_lane_u8 calls to look like this (saving 16 unnecessary register copy instructions): ld2 {v0.b - v1.b}[0], [x7] ld2 {v2.b - v3.b}[0], [x10] ld2 {v0.b - v1.b}[4], [x6] ld2 {v2.b - v3.b}[4], [x9] ld2 {v0.b - v1.b}[8], [x5] ld2 {v2.b - v3.b}[8], [x8] ld2 {v0.b - v1.b}[12], [x0] ld2 {v2.b - v3.b}[12], [x4] In general (i.e., in cases where the compiler isn't able to replace the vgetq_lane_s32 and vaddq_s32 calls with direct register accesses and manipulations), I would expect the code to look roughly like this (modulo the instruction order and register numbers): umov w0, v4.s[0] umov w1, v4.s[1] umov w2, v4.s[2] umov w3, v4.s[3] add x0, x5, w0, sxtw add x1, x5, w1, sxtw add x2, x5, w2, sxtw add x3, x5, w3, sxtw ld2 {v0.b, v1.b}[0], [x0] ld2 {v0.b, v1.b}[4], [x1] ld2 {v0.b, v1.b}[8], [x2] ld2 {v0.b, v1.b}[12], [x3] umov w0, v5.s[0] umov w1, v5.s[1] umov w2, v5.s[2] umov w3, v5.s[3] add x0, x5, w0, sxtw add x1, x5, w1, sxtw add x2, x5, w2, sxtw add x3, x5, w3, sxtw ld2 {v2.b, v3.b}[0], [x0] ld2 {v2.b, v3.b}[4], [x1] ld2 {v2.b, v3.b}[8], [x2] ld2 {v2.b, v3.b}[12], [x3] Tested with gcc 8.3.0 as an aarch64 cross compiler on an x86_64 system (built with crosstool-ng 1.24.0-rc3). Output of gcc -v: Using built-in specs. 
COLLECT_GCC=[...]/gcc-8.3-aarch64/bin/aarch64-unknown-linux-gnu-gcc COLLECT_LTO_WRAPPER=[...]/gcc-8.3-aarch64/bin/../libexec/gcc/aarch64-unknown-linux-gnu/8.3.0/lto-wrapper Target: aarch64-unknown-linux-gnu Configured with: [...]/.build/aarch64-unknown-linux-gnu/src/gcc/configure --build=x86_64-build_pc-linux-gnu --host=x86_64-build_pc-linux-gnu --target=aarch64-unknown-linux-gnu --prefix=[...]/gcc-8.3-aarch64 --with-sysroot=[...]/gcc-8.3-aarch64/aarch64-unknown-linux-gnu/sysroot --enable-languages=c,c++,fortran --with-pkgversion='crosstool-NG 1.24.0-rc3' --enable-__cxa_atexit --disable-libmudflap --enable-libgomp --disable-libssp --disable-libquadmath --disable-libquadmath-support --disable-libsanitizer --disable-libmpx --with-gmp=[...]/.build/aarch64-unknown-linux-gnu/buildtools --with-mpfr=[...]/.build/aarch64-unknown-linux-gnu/buildtools --with-mpc=[...]/.build/aarch64-unknown-linux-gnu/buildtools --with-isl=[...]/.build/aarch64-unknown-linux-gnu/buildtools --enable-lto --enable-threads=posix --enable-target-optspace --disable-plugin --disable-nls --disable-multilib --with-local-prefix=[...]/gcc-8.3-aarch64/aarch64-unknown-linux-gnu/sysroot --enable-long-long Thread model: posix gcc version 8.3.0 (crosstool-NG 1.24.0-rc3)