https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93078

            Bug ID: 93078
           Summary: Missing fma and round functions auto-vectorization
                    with x86-64 (sse2)
           Product: gcc
           Version: 9.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: diegoandres91b at hotmail dot com
  Target Milestone: ---

The following code (compiled with -Ofast):

#include <cmath>

using namespace std;

float a[4], b[4], c[4];

void vec_fma() {
    for (int i = 0; i < 4; ++i) c[i] = fma(a[i], b[i], c[i]);
}

void vec_round() {
    for (int i = 0; i < 4; ++i) c[i] = round(a[i]);
}

void vec_floor() {
    for (int i = 0; i < 4; ++i) c[i] = floor(a[i]);
}

void vec_ceil() {
    for (int i = 0; i < 4; ++i) c[i] = ceil(a[i]);
}

void vec_trunc() {
    for (int i = 0; i < 4; ++i) c[i] = trunc(a[i]);
}

void vec_rint() {
    for (int i = 0; i < 4; ++i) c[i] = rint(a[i]);
}

void vec_nearbyint() {
    for (int i = 0; i < 4; ++i) c[i] = nearbyint(a[i]);
}

Compiles without auto-vectorization:

vec_fma():
        sub     rsp, 8
        movss   xmm2, DWORD PTR c[rip]
        movss   xmm1, DWORD PTR b[rip]
        movss   xmm0, DWORD PTR a[rip]
        call    fmaf
        movss   xmm2, DWORD PTR c[rip+4]
        movss   xmm1, DWORD PTR b[rip+4]
        movss   DWORD PTR c[rip], xmm0
        movss   xmm0, DWORD PTR a[rip+4]
        call    fmaf
        movss   xmm2, DWORD PTR c[rip+8]
        movss   xmm1, DWORD PTR b[rip+8]
        movss   DWORD PTR c[rip+4], xmm0
        movss   xmm0, DWORD PTR a[rip+8]
        call    fmaf
        movss   xmm2, DWORD PTR c[rip+12]
        movss   xmm1, DWORD PTR b[rip+12]
        movss   DWORD PTR c[rip+8], xmm0
        movss   xmm0, DWORD PTR a[rip+12]
        call    fmaf
        movss   DWORD PTR c[rip+12], xmm0
        add     rsp, 8
        ret
vec_round():
        movss   xmm3, DWORD PTR a[rip]
        movss   xmm0, DWORD PTR .LC1[rip]
        movss   xmm2, DWORD PTR .LC0[rip]
        movaps  xmm4, xmm0
        movaps  xmm1, xmm3
        andps   xmm1, xmm0
        comiss  xmm2, xmm1
        jbe     .L5
        addss   xmm1, DWORD PTR .LC2[rip]
        andnps  xmm4, xmm3
        movaps  xmm3, xmm4
        cvttss2si       eax, xmm1
        pxor    xmm1, xmm1
        cvtsi2ss        xmm1, eax
        orps    xmm3, xmm1
.L5:
        movss   DWORD PTR c[rip], xmm3
        movss   xmm3, DWORD PTR a[rip+4]
        movaps  xmm4, xmm0
        movaps  xmm1, xmm3
        andps   xmm1, xmm0
        comiss  xmm2, xmm1
        jbe     .L6
        addss   xmm1, DWORD PTR .LC2[rip]
        andnps  xmm4, xmm3
        movaps  xmm3, xmm4
        cvttss2si       eax, xmm1
        pxor    xmm1, xmm1
        cvtsi2ss        xmm1, eax
        orps    xmm3, xmm1
.L6:
        movss   DWORD PTR c[rip+4], xmm3
        movss   xmm3, DWORD PTR a[rip+8]
        movaps  xmm4, xmm0
        movaps  xmm1, xmm3
        andps   xmm1, xmm0
        comiss  xmm2, xmm1
        jbe     .L7
        addss   xmm1, DWORD PTR .LC2[rip]
        andnps  xmm4, xmm3
        movaps  xmm3, xmm4
        cvttss2si       eax, xmm1
        pxor    xmm1, xmm1
        cvtsi2ss        xmm1, eax
        orps    xmm3, xmm1
.L7:
        movss   DWORD PTR c[rip+8], xmm3
        movss   xmm3, DWORD PTR a[rip+12]
        movaps  xmm1, xmm3
        andps   xmm1, xmm0
        comiss  xmm2, xmm1
        jbe     .L8
        addss   xmm1, DWORD PTR .LC2[rip]
        andnps  xmm0, xmm3
        cvttss2si       eax, xmm1
        pxor    xmm1, xmm1
        cvtsi2ss        xmm1, eax
        movaps  xmm3, xmm1
        orps    xmm3, xmm0
.L8:
        movss   DWORD PTR c[rip+12], xmm3
        ret

...

vec_nearbyint():
        sub     rsp, 8
        movss   xmm0, DWORD PTR a[rip]
        call    nearbyintf
        movss   DWORD PTR c[rip], xmm0
        movss   xmm0, DWORD PTR a[rip+4]
        call    nearbyintf
        movss   DWORD PTR c[rip+4], xmm0
        movss   xmm0, DWORD PTR a[rip+8]
        call    nearbyintf
        movss   DWORD PTR c[rip+8], xmm0
        movss   xmm0, DWORD PTR a[rip+12]
        call    nearbyintf
        movss   DWORD PTR c[rip+12], xmm0
        add     rsp, 8
        ret

In comparison, the icc compiler also fails to auto-vectorize fma in sse2 mode
(without the native fma instruction vfmadd132ps), but it does have vectorized
versions of the rounding functions (in sse2 mode, without the native roundps
instruction of sse4.1):

vec_round():
        push      rsi
        movups    xmm0, XMMWORD PTR a[rip]
        call      QWORD PTR [__svml_roundf4@GOTPCREL+rip]
        movups    XMMWORD PTR c[rip], xmm0
        pop       rcx
        ret

...

vec_nearbyint():
        push      rsi
        movups    xmm0, XMMWORD PTR a[rip]
        call      QWORD PTR [__svml_nearbyintf4@GOTPCREL+rip]
        movups    XMMWORD PTR c[rip], xmm0
        pop       rcx
        ret

Compiler Explorer Code: https://gcc.godbolt.org/z/xwKluO

With the -msse4.1 flag, gcc still fails to auto-vectorize fma and nearbyint.
I am also not sure why it does not auto-vectorize the round function directly
to "roundps xmm0, XMMWORD PTR a[rip], 0":

vec_round():
        movaps  xmm0, XMMWORD PTR a[rip]
        andps   xmm0, XMMWORD PTR .LC1[rip]
        orps    xmm0, XMMWORD PTR .LC0[rip]
        addps   xmm0, XMMWORD PTR a[rip]
        roundps xmm0, xmm0, 3
        movaps  XMMWORD PTR c[rip], xmm0
        ret
vec_floor():
        roundps xmm0, XMMWORD PTR a[rip], 1
        movaps  XMMWORD PTR c[rip], xmm0
        ret
vec_ceil():
        roundps xmm0, XMMWORD PTR a[rip], 2
        movaps  XMMWORD PTR c[rip], xmm0
        ret
vec_trunc():
        roundps xmm0, XMMWORD PTR a[rip], 3
        movaps  XMMWORD PTR c[rip], xmm0
        ret
vec_rint():
        roundps xmm0, XMMWORD PTR a[rip], 4
        movaps  XMMWORD PTR c[rip], xmm0
        ret
vec_nearbyint():
        mov     eax, OFFSET FLAT:a
        movss   xmm0, DWORD PTR [rax]
        roundss xmm0, xmm0, 12
        movss   DWORD PTR c[rip], xmm0
        movss   xmm0, DWORD PTR [rax+4]
        roundss xmm0, xmm0, 12
        movss   DWORD PTR c[rip+4], xmm0
        movss   xmm0, DWORD PTR [rax+8]
        roundss xmm0, xmm0, 12
        movss   DWORD PTR c[rip+8], xmm0
        movss   xmm0, DWORD PTR [rax+12]
        roundss xmm0, xmm0, 12
        movss   DWORD PTR c[rip+12], xmm0
        ret

Compiler Explorer Code: https://gcc.godbolt.org/z/Rc63b9

With the -mfma flag, the nearbyint function is still not auto-vectorized:

vec_fma():
        vmovaps xmm1, XMMWORD PTR c[rip]
        vmovaps xmm0, XMMWORD PTR a[rip]
        vfmadd132ps     xmm0, xmm1, XMMWORD PTR b[rip]
        vmovaps XMMWORD PTR c[rip], xmm0
        ret
vec_round():
        vmovaps xmm1, XMMWORD PTR a[rip]
        vandps  xmm0, xmm1, XMMWORD PTR .LC1[rip]
        vorps   xmm0, xmm0, XMMWORD PTR .LC0[rip]
        vaddps  xmm0, xmm0, xmm1
        vroundps        xmm0, xmm0, 3
        vmovaps XMMWORD PTR c[rip], xmm0
        ret
vec_floor():
        vroundps        xmm0, XMMWORD PTR a[rip], 1
        vmovaps XMMWORD PTR c[rip], xmm0
        ret
vec_ceil():
        vroundps        xmm0, XMMWORD PTR a[rip], 2
        vmovaps XMMWORD PTR c[rip], xmm0
        ret
vec_trunc():
        vroundps        xmm0, XMMWORD PTR a[rip], 3
        vmovaps XMMWORD PTR c[rip], xmm0
        ret
vec_rint():
        vroundps        xmm0, XMMWORD PTR a[rip], 4
        vmovaps XMMWORD PTR c[rip], xmm0
        ret
vec_nearbyint():
        mov     eax, OFFSET FLAT:a
        vmovss  xmm1, DWORD PTR [rax]
        vmovss  xmm2, DWORD PTR [rax+4]
        vmovss  xmm3, DWORD PTR [rax+8]
        vmovss  xmm4, DWORD PTR [rax+12]
        vroundss        xmm0, xmm1, xmm1, 12
        vmovss  DWORD PTR c[rip], xmm0
        vroundss        xmm0, xmm2, xmm2, 12
        vmovss  DWORD PTR c[rip+4], xmm0
        vroundss        xmm0, xmm3, xmm3, 12
        vmovss  DWORD PTR c[rip+8], xmm0
        vroundss        xmm0, xmm4, xmm4, 12
        vmovss  DWORD PTR c[rip+12], xmm0
        ret

Compiler Explorer Code: https://gcc.godbolt.org/z/_WeniA

Reply via email to