https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89967

            Bug ID: 89967
           Summary: Inefficient code generation for vld2q_lane_u8 under
                    aarch64
           Product: gcc
           Version: 8.3.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: carsten.steger at gmail dot com
  Target Milestone: ---

Using vld2q_lane_u8 can generate very inefficient code under aarch64. Consider
the following code (compile it with gcc -march=armv8-a -Wall -Wextra -O3):

#include <arm_neon.h>

void func(unsigned char *in, int *out, int n, int32x4_t o1v, int32x4_t o2v,
          int32x4_t s1v, int32x4_t s2v, int32x4_t m1v, int32x4_t m2v,
          int32x4_t m3v, int32x4_t m4v)
{
  int i;
  int32x4_t l1v, l2v, g1v, g2v, g3v, g4v, gv;
  uint8x16x2_t g1b, g2b;

  l1v = o1v;
  l2v = o2v;
  g1b.val[0] = vdupq_n_u8(0);
  g1b.val[1] = vdupq_n_u8(0);
  g2b.val[0] = vdupq_n_u8(0);
  g2b.val[1] = vdupq_n_u8(0);
  for (i=0; i<n; i+=4)
  {
    g1b = vld2q_lane_u8(&in[vgetq_lane_s32(l1v, 0)], g1b, 0);
    g1b = vld2q_lane_u8(&in[vgetq_lane_s32(l1v, 1)], g1b, 4);
    g1b = vld2q_lane_u8(&in[vgetq_lane_s32(l1v, 2)], g1b, 8);
    g1b = vld2q_lane_u8(&in[vgetq_lane_s32(l1v, 3)], g1b, 12);
    g2b = vld2q_lane_u8(&in[vgetq_lane_s32(l2v, 0)], g2b, 0);
    g2b = vld2q_lane_u8(&in[vgetq_lane_s32(l2v, 1)], g2b, 4);
    g2b = vld2q_lane_u8(&in[vgetq_lane_s32(l2v, 2)], g2b, 8);
    g2b = vld2q_lane_u8(&in[vgetq_lane_s32(l2v, 3)], g2b, 12);
    g1v = vreinterpretq_s32_u8(g1b.val[0]);
    g2v = vreinterpretq_s32_u8(g1b.val[1]);
    g3v = vreinterpretq_s32_u8(g2b.val[0]);
    g4v = vreinterpretq_s32_u8(g2b.val[1]);
    gv = vmlaq_s32(vmlaq_s32(vmlaq_s32(vmulq_s32(m4v, g4v), m3v, g3v),
                             m2v, g2v), m1v, g1v);
    vst1q_s32(&out[i], gv);
    vaddq_s32(l1v, s1v);
    vaddq_s32(l2v, s2v);
  }
}

The calls to vld2q_lane_u8 generate the following assembler code:

        mov     v30.16b, v2.16b
        mov     v31.16b, v28.16b
        mov     v24.16b, v26.16b
        mov     v25.16b, v27.16b
        ld2     {v30.b - v31.b}[0], [x7]
        ld2     {v24.b - v25.b}[0], [x10]
        mov     v22.16b, v30.16b
        mov     v23.16b, v31.16b
        mov     v20.16b, v24.16b
        mov     v21.16b, v25.16b
        ld2     {v22.b - v23.b}[4], [x6]
        ld2     {v20.b - v21.b}[4], [x9]
        mov     v18.16b, v22.16b
        mov     v19.16b, v23.16b
        mov     v0.16b, v20.16b
        mov     v1.16b, v21.16b
        ld2     {v18.b - v19.b}[8], [x5]
        ld2     {v0.b - v1.b}[8], [x8]
        mov     v2.16b, v18.16b
        mov     v3.16b, v19.16b
        mov     v16.16b, v0.16b
        mov     v17.16b, v1.16b
        ld2     {v2.b - v3.b}[12], [x0]
        ld2     {v16.b - v17.b}[12], [x4]

There is a large amount of unnecessary register copying going on. Since the
compiler was smart enough to replace the vgetq_lane_s32 and vaddq_s32 calls
with direct register accesses and manipulations, I would have expected the
vld2q_lane_u8 calls to look like this (saving 16 unnecessary register copy
instructions):

        ld2     {v0.b - v1.b}[0], [x7]
        ld2     {v2.b - v3.b}[0], [x10]
        ld2     {v0.b - v1.b}[4], [x6]
        ld2     {v2.b - v3.b}[4], [x9]
        ld2     {v0.b - v1.b}[8], [x5]
        ld2     {v2.b - v3.b}[8], [x8]
        ld2     {v0.b - v1.b}[12], [x0]
        ld2     {v2.b - v3.b}[12], [x4]

In general (i.e., in cases the compiler isn't able to replace the
vgetq_lane_s32 and vaddq_s32 calls with direct register accesses and
manipulations), I would expect the code to look roughly like this (modulo the
instruction order and register numbers):


        umov w0, v4.s[0]
        umov w1, v4.s[1]
        umov w2, v4.s[2]
        umov w3, v4.s[3]
        add  x0, x5, w0, sxtw
        add  x1, x5, w1, sxtw
        add  x2, x5, w2, sxtw
        add  x3, x5, w3, sxtw
        ld2  {v0.b, v1.b}[0], [x0]
        ld2  {v0.b, v1.b}[4], [x1]
        ld2  {v0.b, v1.b}[8], [x2]
        ld2  {v0.b, v1.b}[12], [x3]
        umov w0, v5.s[0]
        umov w1, v5.s[1]
        umov w2, v5.s[2]
        umov w3, v5.s[3]
        add  x0, x5, w0, sxtw
        add  x1, x5, w1, sxtw
        add  x2, x5, w2, sxtw
        add  x3, x5, w3, sxtw
        ld2  {v2.b, v3.b}[0], [x0]
        ld2  {v2.b, v3.b}[4], [x1]
        ld2  {v2.b, v3.b}[8], [x2]
        ld2  {v2.b, v3.b}[12], [x3]

Tested with gcc 8.3.0 as an aarch64 cross compiler on an x86_64 system (built
with crosstool-ng 1.24.0-rc3). Output of gcc -v:

Using built-in specs.
COLLECT_GCC=[...]/gcc-8.3-aarch64/bin/aarch64-unknown-linux-gnu-gcc
COLLECT_LTO_WRAPPER=[...]/gcc-8.3-aarch64/bin/../libexec/gcc/aarch64-unknown-linux-gnu/8.3.0/lto-wrapper
Target: aarch64-unknown-linux-gnu
Configured with: [...]/.build/aarch64-unknown-linux-gnu/src/gcc/configure
--build=x86_64-build_pc-linux-gnu --host=x86_64-build_pc-linux-gnu
--target=aarch64-unknown-linux-gnu --prefix=[...]/gcc-8.3-aarch64
--with-sysroot=[...]/gcc-8.3-aarch64/aarch64-unknown-linux-gnu/sysroot
--enable-languages=c,c++,fortran --with-pkgversion='crosstool-NG 1.24.0-rc3'
--enable-__cxa_atexit --disable-libmudflap --enable-libgomp --disable-libssp
--disable-libquadmath --disable-libquadmath-support --disable-libsanitizer
--disable-libmpx --with-gmp=[...]/.build/aarch64-unknown-linux-gnu/buildtools
--with-mpfr=[...]/.build/aarch64-unknown-linux-gnu/buildtools
--with-mpc=[...]/.build/aarch64-unknown-linux-gnu/buildtools
--with-isl=[...]/.build/aarch64-unknown-linux-gnu/buildtools --enable-lto
--enable-threads=posix --enable-target-optspace --disable-plugin --disable-nls
--disable-multilib
--with-local-prefix=[...]/gcc-8.3-aarch64/aarch64-unknown-linux-gnu/sysroot
--enable-long-long
Thread model: posix
gcc version 8.3.0 (crosstool-NG 1.24.0-rc3)

Reply via email to