[Bug target/92188] New: Cannot merge memory write for _mm_cvtps_ph/_mm256_cvtps_ph and x86-64

2019-10-23 Thread fredrik987 at gmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92188

Bug ID: 92188
   Summary: Cannot merge memory write for
_mm_cvtps_ph/_mm256_cvtps_ph and x86-64
   Product: gcc
   Version: 9.2.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: target
  Assignee: unassigned at gcc dot gnu.org
  Reporter: fredrik987 at gmail dot com
  Target Milestone: ---

Created attachment 47089
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=47089&action=edit
Test code

For this code, the memory write cannot be merged with vcvtps2ph.

void test1(__m128i *x, const __m256 *y)
{
// Cannot merge memory write
*x = _mm256_cvtps_ph(*y, _MM_FROUND_CUR_DIRECTION);
}

  ...
  vcvtps2ph $4, %ymm0, %xmm0
  vmovaps %xmm0, (%rdi)
  ...

A workaround is to change the output type to __v8hi, as follows.

void test2(__v8hi *x, const __m256 *y)
{
// Memory write merged
*x = (__v8hi)_mm256_cvtps_ph(*y, _MM_FROUND_CUR_DIRECTION);
}

  ...
  vcvtps2ph $4, %ymm0, (%rdi)
  ...

However, this workaround does not work for the 128-bit variant of vcvtps2ph.

void test4(__v4hi *x, const __m128 *y)
{
// Cannot merge memory write
*x = (__v4hi)(((__v2di)_mm_cvtps_ph(*y, _MM_FROUND_CUR_DIRECTION))[0]);
}

  ...
  vcvtps2ph $4, %xmm0, %xmm0
  vmovq %xmm0, (%rdi)
  ...

The opposite problem exists for e.g. _mm256_extracti128_si256, which normally
merges the memory write, but does not do so when the output type is __v8hi.

void test6(__v8hi *x, const __m256i *y)
{
// Cannot merge memory write
*x = (__v8hi)_mm256_extracti128_si256(*y, 1);
}

  ...
  vextracti128 $0x1, %ymm0, %xmm0
  vmovaps %xmm0, (%rdi)
  ...

It would be good if all variants behaved the same, with the memory write merged.

I use "-O3 -march=core-avx2" when compiling (using compiler explorer).

[Bug target/93930] New: Unnecessary broadcast instructions for AVX512

2020-02-25 Thread fredrik987 at gmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93930

Bug ID: 93930
   Summary: Unnecessary broadcast instructions for AVX512
   Product: gcc
   Version: 9.2.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: target
  Assignee: unassigned at gcc dot gnu.org
  Reporter: fredrik987 at gmail dot com
  Target Milestone: ---

Created attachment 47908
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=47908&action=edit
Test case

The code below generates unnecessary broadcast instructions for AVX512,
compiled with "-Ofast -march=skylake-avx512". This occurs for gcc trunk and
9.2/8.3 but not 7.5.

Most constants are read from memory via vbroadcastss except two, which are read
as scalars and then broadcast within the loop. For gcc 7.5 all constants are
read via vbroadcastss.

The problem seems to be more frequent for larger functions.

 ---

Compiler output for gcc 9.2:

...
.L3:
vmovaps zmm0, ZMMWORD PTR [rdi]
add rdi, 64
vmovaps zmm3, zmm0
vmovaps zmm1, zmm0
vmulps  zmm2, zmm0, zmm0
vfmadd132ps zmm3, zmm11, zmm12
vfmadd132ps zmm1, zmm13, zmm14
vmovaps zmm4, zmm0
vfmadd132ps zmm4, zmm7, zmm8
sub rsi, -128
vfmadd132ps zmm1, zmm3, zmm2
vmovaps zmm3, zmm0
vfmadd132ps zmm3, zmm9, zmm10
vfmadd132ps zmm3, zmm4, zmm2
vbroadcastss zmm4, xmm15 <--- Broadcast within loop
vmulps  zmm3, zmm3, zmm1
vmovaps ZMMWORD PTR [rsi-128], zmm3
vbroadcastss zmm3, xmm16 <--- Broadcast within loop
vfmadd132ps zmm3, zmm4, zmm0
vfmadd132ps zmm0, zmm5, zmm6
vfmadd132ps zmm0, zmm3, zmm2
vmulps  zmm1, zmm1, zmm0
vmovaps ZMMWORD PTR [rsi-64], zmm1
cmp rdi, rax
jne .L3
...

 ---

#include <immintrin.h>

static __m512 f(__m512 x)
{
__m512 a = _mm512_set1_ps(11);
__m512 b = _mm512_set1_ps(12);
__m512 c = _mm512_set1_ps(13);
__m512 d = _mm512_set1_ps(14);

__m512 y = _mm512_mul_ps(x, x);

return _mm512_fmadd_ps(y, _mm512_fmadd_ps(x, a, b), _mm512_fmadd_ps(x, c,
d));
}

static __m512 g(__m512 x)
{
__m512 a = _mm512_set1_ps(21);
__m512 b = _mm512_set1_ps(22);
__m512 c = _mm512_set1_ps(23);
__m512 d = _mm512_set1_ps(24);

__m512 y = _mm512_mul_ps(x, x);

return _mm512_fmadd_ps(y, _mm512_fmadd_ps(x, a, b), _mm512_fmadd_ps(x, c,
d));
}

static __m512 h(__m512 x)
{
__m512 a = _mm512_set1_ps(31);
__m512 b = _mm512_set1_ps(32);
__m512 c = _mm512_set1_ps(33);
__m512 d = _mm512_set1_ps(34);

__m512 y = _mm512_mul_ps(x, x);

return _mm512_fmadd_ps(y, _mm512_fmadd_ps(x, a, b), _mm512_fmadd_ps(x, c,
d));
}

void test(__m512 *x, __m512 *y, int n)
{
for (int i = 0; i < n; i++) {
__m512 u = *x++;
__m512 v = h(u);

*y++ = _mm512_mul_ps(f(u), v);
*y++ = _mm512_mul_ps(g(u), v);
}
}