[Bug tree-optimization/93078] New: Missing fma and round functions auto-vectorization with x86-64 (sse2)

2019-12-26 Thread diegoandres91b at hotmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93078

Bug ID: 93078
   Summary: Missing fma and round functions auto-vectorization
with x86-64 (sse2)
   Product: gcc
   Version: 9.2.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: tree-optimization
  Assignee: unassigned at gcc dot gnu.org
  Reporter: diegoandres91b at hotmail dot com
  Target Milestone: ---

The following code (with -Ofast):

#include <cmath>

using namespace std;

float a[4], b[4], c[4];

void vec_fma() {
for (int i = 0; i < 4; ++i) c[i] = fma(a[i], b[i], c[i]);
}

void vec_round() {
for (int i = 0; i < 4; ++i) c[i] = round(a[i]);
}

void vec_floor() {
for (int i = 0; i < 4; ++i) c[i] = floor(a[i]);
}

void vec_ceil() {
for (int i = 0; i < 4; ++i) c[i] = ceil(a[i]);
}

void vec_trunc() {
for (int i = 0; i < 4; ++i) c[i] = trunc(a[i]);
}

void vec_rint() {
for (int i = 0; i < 4; ++i) c[i] = rint(a[i]);
}

void vec_nearbyint() {
for (int i = 0; i < 4; ++i) c[i] = nearbyint(a[i]);
}

Compiles without auto-vectorization:

vec_fma():
sub rsp, 8
movss   xmm2, DWORD PTR c[rip]
movss   xmm1, DWORD PTR b[rip]
movss   xmm0, DWORD PTR a[rip]
call fmaf
movss   xmm2, DWORD PTR c[rip+4]
movss   xmm1, DWORD PTR b[rip+4]
movss   DWORD PTR c[rip], xmm0
movss   xmm0, DWORD PTR a[rip+4]
call fmaf
movss   xmm2, DWORD PTR c[rip+8]
movss   xmm1, DWORD PTR b[rip+8]
movss   DWORD PTR c[rip+4], xmm0
movss   xmm0, DWORD PTR a[rip+8]
call fmaf
movss   xmm2, DWORD PTR c[rip+12]
movss   xmm1, DWORD PTR b[rip+12]
movss   DWORD PTR c[rip+8], xmm0
movss   xmm0, DWORD PTR a[rip+12]
call fmaf
movss   DWORD PTR c[rip+12], xmm0
add rsp, 8
ret
vec_round():
movss   xmm3, DWORD PTR a[rip]
movss   xmm0, DWORD PTR .LC1[rip]
movss   xmm2, DWORD PTR .LC0[rip]
movaps  xmm4, xmm0
movaps  xmm1, xmm3
andps   xmm1, xmm0
comiss  xmm2, xmm1
jbe .L5
addss   xmm1, DWORD PTR .LC2[rip]
andnps  xmm4, xmm3
movaps  xmm3, xmm4
cvttss2si   eax, xmm1
pxor xmm1, xmm1
cvtsi2ss xmm1, eax
orps xmm3, xmm1
.L5:
movss   DWORD PTR c[rip], xmm3
movss   xmm3, DWORD PTR a[rip+4]
movaps  xmm4, xmm0
movaps  xmm1, xmm3
andps   xmm1, xmm0
comiss  xmm2, xmm1
jbe .L6
addss   xmm1, DWORD PTR .LC2[rip]
andnps  xmm4, xmm3
movaps  xmm3, xmm4
cvttss2si   eax, xmm1
pxor xmm1, xmm1
cvtsi2ss xmm1, eax
orps xmm3, xmm1
.L6:
movss   DWORD PTR c[rip+4], xmm3
movss   xmm3, DWORD PTR a[rip+8]
movaps  xmm4, xmm0
movaps  xmm1, xmm3
andps   xmm1, xmm0
comiss  xmm2, xmm1
jbe .L7
addss   xmm1, DWORD PTR .LC2[rip]
andnps  xmm4, xmm3
movaps  xmm3, xmm4
cvttss2si   eax, xmm1
pxor xmm1, xmm1
cvtsi2ss xmm1, eax
orps xmm3, xmm1
.L7:
movss   DWORD PTR c[rip+8], xmm3
movss   xmm3, DWORD PTR a[rip+12]
movaps  xmm1, xmm3
andps   xmm1, xmm0
comiss  xmm2, xmm1
jbe .L8
addss   xmm1, DWORD PTR .LC2[rip]
andnps  xmm0, xmm3
cvttss2si   eax, xmm1
pxor xmm1, xmm1
cvtsi2ss xmm1, eax
movaps  xmm3, xmm1
orps xmm3, xmm0
.L8:
movss   DWORD PTR c[rip+12], xmm3
ret

...

vec_nearbyint():
sub rsp, 8
movss   xmm0, DWORD PTR a[rip]
call nearbyintf
movss   DWORD PTR c[rip], xmm0
movss   xmm0, DWORD PTR a[rip+4]
call nearbyintf
movss   DWORD PTR c[rip+4], xmm0
movss   xmm0, DWORD PTR a[rip+8]
call nearbyintf
movss   DWORD PTR c[rip+8], xmm0
movss   xmm0, DWORD PTR a[rip+12]
call nearbyintf
movss   DWORD PTR c[rip+12], xmm0
add rsp, 8
ret

In comparison, the icc compiler also fails to auto-vectorize fma in sse2 mode
(without the native vfmadd132ps fma instruction), but it does have vectorized
versions of the rounding functions (in sse2 mode, without the native roundps
instruction from sse4.1):

vec_round():
push  rsi
movups xmm0, XMMWORD PTR a[rip]
call  QWORD PTR [__svml_roundf4@GOTPCREL+rip]
movups XMMWORD PTR c[rip], xmm0
pop   rcx
ret

...

vec_nearbyint():
push  rsi
movups xmm0, XMMWORD PTR a[rip]
call  QWORD PTR [__svml_nearbyintf4@GOTPCREL+rip]
movups XMMWORD PTR c[rip], xmm0
pop   rcx
ret

Compiler

[Bug target/91594] New: Missing horizontal addition auto-vectorization

2019-08-28 Thread diegoandres91b at hotmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91594

Bug ID: 91594
   Summary: Missing horizontal addition auto-vectorization
   Product: gcc
   Version: 9.2.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: target
  Assignee: unassigned at gcc dot gnu.org
  Reporter: diegoandres91b at hotmail dot com
  Target Milestone: ---

The following code (with -O3 -ffast-math -msse3):

float a2[4], b2[4], c2[4];

void hadd2() {
c2[0] = a2[0] + a2[1];
c2[1] = a2[2] + a2[3];
c2[2] = b2[0] + b2[1];
c2[3] = b2[2] + b2[3];
}

Compiles without auto-vectorization:

hadd2():
movss   xmm0, DWORD PTR a2[rip]
addss   xmm0, DWORD PTR a2[rip+4]
movss   DWORD PTR c2[rip], xmm0
movss   xmm0, DWORD PTR a2[rip+8]
addss   xmm0, DWORD PTR a2[rip+12]
movss   DWORD PTR c2[rip+4], xmm0
movss   xmm0, DWORD PTR b2[rip]
addss   xmm0, DWORD PTR b2[rip+4]
movss   DWORD PTR c2[rip+8], xmm0
movss   xmm0, DWORD PTR b2[rip+8]
addss   xmm0, DWORD PTR b2[rip+12]
movss   DWORD PTR c2[rip+12], xmm0
ret

The expected code, using the HADDPS instruction (which GCC does not generate):

hadd2():
movaps  xmm0, XMMWORD PTR a2[rip]
haddps  xmm0, XMMWORD PTR b2[rip]
movaps  XMMWORD PTR c2[rip], xmm0
ret

In contrast, the normal addition code:

void add2() {
c2[0] = a2[0] + b2[0];
c2[1] = a2[1] + b2[1];
c2[2] = a2[2] + b2[2];
c2[3] = a2[3] + b2[3];
}

Compiles with auto-vectorization:

add2():
movaps  xmm0, XMMWORD PTR a2[rip]
addps   xmm0, XMMWORD PTR b2[rip]
movaps  XMMWORD PTR c2[rip], xmm0
ret

Compiler Explorer Code: https://gcc.godbolt.org/z/9Hs9su

[Bug c++/81706] New: std::sin vectorization bug

2017-08-03 Thread diegoandres91b at hotmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81706

Bug ID: 81706
   Summary: std::sin vectorization bug
   Product: gcc
   Version: unknown
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: c++
  Assignee: unassigned at gcc dot gnu.org
  Reporter: diegoandres91b at hotmail dot com
  Target Milestone: ---

The following code (with -O3 -ffast-math):

#include <cmath>

float a[4];

void sin1() {
for(unsigned i = 0; i < 4; i++) a[i] = sinf(a[i]);
}

Compiles vectorized version of sinf (_ZGVbN4v_sinf):

sin1():
sub rsp, 8
movaps  xmm0, XMMWORD PTR a[rip]
call _ZGVbN4v_sinf
movaps  XMMWORD PTR a[rip], xmm0
add rsp, 8
ret

But when I use the C++ version of sinf (std::sin), no vectorization occurs:

void sin2() {
for(unsigned i = 0; i < 4; i++) a[i] = std::sin(a[i]);
}

sin2():
sub rsp, 8
movss   xmm0, DWORD PTR a[rip]
call sinf
movss   DWORD PTR a[rip], xmm0
movss   xmm0, DWORD PTR a[rip+4]
call sinf
movss   DWORD PTR a[rip+4], xmm0
movss   xmm0, DWORD PTR a[rip+8]
call sinf
movss   DWORD PTR a[rip+8], xmm0
movss   xmm0, DWORD PTR a[rip+12]
call sinf
movss   DWORD PTR a[rip+12], xmm0
add rsp, 8
ret

Compiler Explorer Code: https://godbolt.org/g/zSrJrK

[Bug c++/108320] New: Missing vector/array arithmetic optimization compared to valarray

2023-01-06 Thread diegoandres91b at hotmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108320

Bug ID: 108320
   Summary: Missing vector/array arithmetic optimization compared
to valarray
   Product: gcc
   Version: 12.2.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: c++
  Assignee: unassigned at gcc dot gnu.org
  Reporter: diegoandres91b at hotmail dot com
  Target Milestone: ---

The following code (with -O3 -mavx2 -mfma):

#include <valarray>
#include <vector>
#include <array>

using namespace std;

valarray<float> fma1(const valarray<float> &a, const valarray<float> &b, const
valarray<float> &c) {
return a * b + c;
}

template<typename T>
struct vec : vector<T> {
constexpr vec(size_t count) : vector<T>(count) {}
};

template<typename T>
constexpr vec<T> operator*(const vec<T> &a, const vec<T> &b) {
vec<T> c(a.size());
for (size_t i = 0; i < c.size(); ++i) c[i] = a[i] * b[i];
return c;
}

template<typename T>
constexpr vec<T> operator+(const vec<T> &a, const vec<T> &b) {
vec<T> c(a.size());
for (size_t i = 0; i < c.size(); ++i) c[i] = a[i] + b[i];
return c;
}

vec<float> fma2(const vec<float> &a, const vec<float> &b, const vec<float> &c)
{
return a * b + c;
}

template<typename T, size_t N>
struct arr : array<T, N> {
};

template<typename T, size_t N>
constexpr arr<T, N> operator*(const arr<T, N> &a, const arr<T, N> &b) {
arr<T, N> c;
for (size_t i = 0; i < c.size(); ++i) c[i] = a[i] * b[i];
return c;
}

template<typename T, size_t N>
constexpr arr<T, N> operator+(const arr<T, N> &a, const arr<T, N> &b) {
arr<T, N> c;
for (size_t i = 0; i < c.size(); ++i) c[i] = a[i] + b[i];
return c;
}

constexpr size_t N = 1024;

arr<float, N> fma3(const arr<float, N> &a, const arr<float, N> &b, const
arr<float, N> &c) {
return a * b + c;
}

Only the valarray version (fma1) of the fma function is optimized (it uses
vfmadd132ps):

...

.L4:
vmovups ymm0, YMMWORD PTR [rdi+rax]
vmovups ymm1, YMMWORD PTR [rcx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovups YMMWORD PTR [rdx+rax], ymm0
add rax, 32
cmp rax, r8
jne .L4
mov rax, r10
and rax, -8
lea r9, [0+rax*4]
lea r11, [rdx+r9]
test r10b, 7
je  .L22
vzeroupper
.L3:
mov r8, r10
sub r8, rax
lea r12, [r8-1]
cmp r12, 2
jbe .L6
vmovups xmm0, XMMWORD PTR [rdi+rax*4]
vmovups xmm2, XMMWORD PTR [rcx+rax*4]
vfmadd132ps xmm0, xmm2, XMMWORD PTR [rsi+rax*4]
vmovups XMMWORD PTR [rdx+r9], xmm0
test r8b, 3
je  .L1
and r8, -4
add rax, r8
lea r11, [r11+r8*4]
lea r9, [0+rax*4]

...

But it does not optimize the vector or array versions of the function (fma2 and
fma3).

Note: For smaller values of N, fma3 is optimized, but for larger values such as
the 1024 used in the example it is not.

Compiler Explorer code: https://godbolt.org/z/v8dnx5aMo