[gcc r15-2991] AVX10.2 ymm rounding: Support vadd{s, d, h} and vcmp{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:e22e3af1954469c40b139b7cfa8e7708592f4bfd

commit r15-2991-ge22e3af1954469c40b139b7cfa8e7708592f4bfd
Author: Hu, Lin1 
Date:   Mon Aug 19 10:08:51 2024 +0800

AVX10.2 ymm rounding: Support vadd{s,d,h} and vcmp{s,d,h} intrins

gcc/ChangeLog:

* config.gcc: Add avx10_2roundingintrin.h.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V4DF_FTYPE_V4DF_V4DF_V4DF_UQI_INT, 
V8SF_FTYPE_V8SF_V8SF_V8SF_UQI_INT,
V16HF_FTYPE_V16HF_V16HF_V16HF_UHI_INT, 
UQI_FTYPE_V4DF_V4DF_INT_UQI_INT,
UHI_FTYPE_V16HF_V16HF_INT_UHI_INT, UQI_FTYPE_V8SF_V8SF_INT_UQI_INT.
* config/i386/immintrin.h: Include avx10_2roundingintrin.h.
* config/i386/sse.md: Change subst_attr name due to renaming.
* config/i386/subst.md:
(round_mode512bit_condition): Add condition check for avx10.2
rounding control 256bit intrins and renamed to ...
(round_mode_condition): ...this.
(round_saeonly_mode512bit_condition): Add condition check for
avx10.2 rounding control 256 bit intrins and renamed to ...
(round_saeonly_mode_condition): ...this.
* config/i386/avx10_2roundingintrin.h: New file.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add -mavx10.2 and new builtin test.
* gcc.target/i386/avx-2.c: Ditto.
* gcc.target/i386/sse-13.c: Add new tests.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/avx10_2-rounding-1.c: New test.

Diff:
---
 gcc/config.gcc |   2 +-
 gcc/config/i386/avx10_2roundingintrin.h| 337 +
 gcc/config/i386/i386-builtin-types.def |   8 +
 gcc/config/i386/i386-builtin.def   |   8 +
 gcc/config/i386/i386-expand.cc |   6 +
 gcc/config/i386/immintrin.h|   2 +
 gcc/config/i386/sse.md | 100 +++---
 gcc/config/i386/subst.md   |  32 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |  10 +-
 gcc/testsuite/gcc.target/i386/avx-2.c  |   2 +-
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-1.c |  64 
 gcc/testsuite/gcc.target/i386/sse-13.c |   8 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  17 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  17 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   8 +
 15 files changed, 558 insertions(+), 63 deletions(-)

diff --git a/gcc/config.gcc b/gcc/config.gcc
index a36dd1bcbc6..2c0f4518638 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -452,7 +452,7 @@ i[34567]86-*-* | x86_64-*-*)
   cmpccxaddintrin.h amxfp16intrin.h prfchiintrin.h
   raointintrin.h amxcomplexintrin.h avxvnniint16intrin.h
   sm3intrin.h sha512intrin.h sm4intrin.h
-  usermsrintrin.h"
+  usermsrintrin.h avx10_2roundingintrin.h"
;;
 ia64-*-*)
extra_headers=ia64intrin.h
diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
new file mode 100644
index 000..5698ed05c1d
--- /dev/null
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -0,0 +1,337 @@
+/* Copyright (C) 2024 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx10_2roundingintrin.h> directly; include <immintrin.h> 
instead."
+#endif
+
+#ifndef _AVX10_2ROUNDINGINTRIN_H_INCLUDED
+#define _AVX10_2ROUNDINGINTRIN_H_INCLUDED
+
+#ifndef __AVX10_2_256__
+#pragma GCC push_options
+#pragma GCC target("avx10.2-256")
+#define __DISABLE_AVX10_2_256__
+#endif /* __AVX10_2_256__ */
+
+#ifdef  __OPTIMIZE__
+extern __inline __m256d
+__attribute__ ((__gnu_inl

[gcc r15-2992] AVX10.2 ymm rounding: Support vcvtdq2p{s, h} and vcvtpd2p{s, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:85e874d19548f0dcb9a3f14f9e4b1e3411c88c4b

commit r15-2992-g85e874d19548f0dcb9a3f14f9e4b1e3411c88c4b
Author: Hu, Lin1 
Date:   Mon Aug 19 10:08:53 2024 +0800

AVX10.2 ymm rounding: Support vcvtdq2p{s,h} and vcvtpd2p{s,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: Add new intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V8SF_FTYPE_V8SI_V8SF_UQI_INT, V4SF_FTYPE_V4DF_V4SF_UQI_INT,
V8HF_FTYPE_V8SI_V8HF_UQI_INT, V8HF_FTYPE_V4DF_V8HF_UQI_INT.
* config/i386/sse.md:

(avx512fp16_vcvt2ph_):
Add condition check.
(avx512fp16_vcvtpd2ph_v4df_mask_round): New expand.
(*avx512fp16_vcvt2ph__mask): Change name to
avx512fp16_vcvt2ph__mask_1
and extend pattern to generate 256bit insns.
(avx_cvtpd2ps256): Change name to
avx_cvtpd2ps256 and extend pattern to
generate 256bit insns.
* config/i386/subst.md (round_applied): New condition.
(round_suff): New iterator.
(round_mode_condition): Add V32HI check for 512bit.
(round_saeonly_mode_condition): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/sse-14.c: Add new macro test.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/avx10_2-rounding-1.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 210 +
 gcc/config/i386/i386-builtin-types.def |   4 +
 gcc/config/i386/i386-builtin.def   |   4 +
 gcc/config/i386/i386-expand.cc |   4 +
 gcc/config/i386/sse.md |  32 +++-
 gcc/config/i386/subst.md   |   4 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   4 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-1.c |  44 -
 gcc/testsuite/gcc.target/i386/sse-13.c |   4 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   4 +
 12 files changed, 322 insertions(+), 16 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 5698ed05c1d..09285c1ffcd 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -216,6 +216,138 @@ _mm256_mask_cmp_round_ps_mask (__mmask8 __U, __m256 __A, 
__m256 __B,
(__mmask8) __U,
__R);
 }
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundepi32_ph (__m256i __A, const int __R)
+{
+  return (__m128h) __builtin_ia32_vcvtdq2ph256_mask_round ((__v8si) __A,
+  (__v8hf)
+  _mm_setzero_ph (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundepi32_ph (__m128h __W, __mmask8 __U, __m256i __A,
+  const int __R)
+{
+  return (__m128h) __builtin_ia32_vcvtdq2ph256_mask_round ((__v8si) __A,
+  (__v8hf) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundepi32_ph (__mmask8 __U, __m256i __A, const int __R)
+{
+  return (__m128h) __builtin_ia32_vcvtdq2ph256_mask_round ((__v8si) __A,
+  (__v8hf)
+  _mm_setzero_ph (),
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundepi32_ps (__m256i __A, const int __R)
+{
+  return (__m256) __builtin_ia32_cvtdq2ps256_mask_round ((__v8si) __A,
+(__v8sf)
+_mm256_undefined_ps (),
+  

[gcc r15-2993] AVX10.2 ymm rounding: Support vcvtpd2{, u}{dq, qq} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:508ac49e1a94c28346642bff512d0ed5f4f58b64

commit r15-2993-g508ac49e1a94c28346642bff512d0ed5f4f58b64
Author: Hu, Lin1 
Date:   Mon Aug 19 10:08:55 2024 +0800

AVX10.2 ymm rounding: Support vcvtpd2{,u}{dq,qq} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: Add new intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V4DI_FTYPE_V4DF_V4DI_UQI_INT, V4SI_FTYPE_V4DF_V4SI_UQI_INT.
* config/i386/sse.md:
(avx_cvtpd2dq256): Change name to
avx_cvtpd2dq256 and extend pattern to
generate 256bit insns.
(fixuns_notrunc2):
Add round_mode_condition.
* config/i386/subst.md (round_pd2udqsuff): New iterator.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/sse-14.c: Add new macro test.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/avx10_2-rounding-1.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 218 +
 gcc/config/i386/i386-builtin-types.def |   2 +
 gcc/config/i386/i386-builtin.def   |   4 +
 gcc/config/i386/i386-expand.cc |   2 +
 gcc/config/i386/sse.md |  13 +-
 gcc/config/i386/subst.md   |   1 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   4 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-1.c |  33 
 gcc/testsuite/gcc.target/i386/sse-13.c |   4 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   4 +
 12 files changed, 303 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 09285c1ffcd..3e5e9f3ba0e 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -348,6 +348,144 @@ _mm256_maskz_cvt_roundpd_ps (__mmask8 __U, __m256d __A, 
const int __R)
 (__mmask8) __U,
 __R);
 }
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundpd_epi32 (__m256d __A, const int __R)
+{
+  return
+(__m128i) __builtin_ia32_cvtpd2dq256_mask_round ((__v4df) __A,
+(__v4si)
+_mm_undefined_si128 (),
+(__mmask8) -1,
+__R);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A,
+  const int __R)
+{
+  return (__m128i) __builtin_ia32_cvtpd2dq256_mask_round ((__v4df) __A,
+ (__v4si) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundpd_epi32 (__mmask8 __U, __m256d __A, const int __R)
+{
+  return (__m128i) __builtin_ia32_cvtpd2dq256_mask_round ((__v4df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundpd_epi64 (__m256d __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_cvtpd2qq256_mask_round ((__v4df) __A,
+(__v4di)
+_mm256_setzero_si256 (),
+(__mmask8) -1,
+__R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A,
+  const int __R)
+{
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask_round ((__v4df) __A,
+ (__v4di) __W,
+ 

[gcc r15-2994] AVX10.2 ymm rounding: Support vcvtph2p{s, d, sx} and vcvtph2{, u}{dq, qq} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:6f2eac53b6026836f3222961c32312e02c2c7dbc

commit r15-2994-g6f2eac53b6026836f3222961c32312e02c2c7dbc
Author: Hu, Lin1 
Date:   Mon Aug 19 10:08:56 2024 +0800

AVX10.2 ymm rounding: Support vcvtph2p{s,d,sx} and vcvtph2{,u}{dq,qq} 
intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V8SF_FTYPE_V8HF_V8SF_UQI_INT, V8SI_FTYPE_V8HF_V8SI_UQI_INT,
V4DF_FTYPE_V8HF_V4DF_UQI_INT, V4DI_FTYPE_V8HF_V4DI_UQI_INT.
* config/i386/sse.md:
(avx512fp16_float_extend_ph2):
Add condition check.
(avx512fp16_vcvtph2_
):
Ditto.
(avx512fp16_float_extend_ph2): Extend round 
saeonly.
(vcvtph2ps256): Ditto.
* config/i386/subst.md
(round_saeonly_applied): New condition.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-1.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 384 +
 gcc/config/i386/i386-builtin-types.def |   4 +
 gcc/config/i386/i386-builtin.def   |   7 +
 gcc/config/i386/i386-expand.cc |   4 +
 gcc/config/i386/sse.md |  19 +-
 gcc/config/i386/subst.md   |   1 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   7 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-1.c |  57 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   7 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  20 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  21 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   7 +
 12 files changed, 529 insertions(+), 9 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 3e5e9f3ba0e..29966f5e1bf 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -486,6 +486,246 @@ _mm256_maskz_cvt_roundpd_epu64 (__mmask8 __U, __m256d 
__A, const int __R)
  (__mmask8) __U,
  __R);
 }
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundph_epi32 (__m128h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtph2dq256_mask_round ((__v8hf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundph_epi32 (__m256i __W, __mmask8 __U, __m128h __A,
+  const int __R)
+{
+  return (__m256i) __builtin_ia32_vcvtph2dq256_mask_round ((__v8hf) __A,
+  (__v8si) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundph_epi32 (__mmask8 __U, __m128h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtph2dq256_mask_round ((__v8hf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundph_pd (__m128h __A, const int __R)
+{
+  return (__m256d) __builtin_ia32_vcvtph2pd256_mask_round ((__v8hf) __A,
+  (__v4df)
+  _mm256_setzero_pd (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundph_pd (__m256d __W, __mmask8 __U, __m128h __A,
+   const int __R)
+{
+  return (_

[gcc r15-2995] AVX10.2 ymm rounding: Support vcvtph2{, u}w and vcvtps2p{d, hx} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:b70bb94aca7bc10a54f744d793c32c51f91ce195

commit r15-2995-gb70bb94aca7bc10a54f744d793c32c51f91ce195
Author: Hu, Lin1 
Date:   Mon Aug 19 10:08:57 2024 +0800

AVX10.2 ymm rounding: Support vcvtph2{,u}w and vcvtps2p{d,hx} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V16HI_FTYPE_V16HF_V16HI_UHI_INT, V4DF_FTYPE_V4SF_V4DF_UQI_INT,
V8HF_FTYPE_V8SF_V8HF_UQI_INT.
* config/i386/sse.md
(avx512fp16_vcvt2ph_):
Add round condition check.
* config/i386/subst.md (round_mode_condition): Add V16HI check for
256bit.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-1.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 220 +
 gcc/config/i386/i386-builtin-types.def |   3 +
 gcc/config/i386/i386-builtin.def   |   4 +
 gcc/config/i386/i386-expand.cc |   3 +
 gcc/config/i386/sse.md |   2 +-
 gcc/config/i386/subst.md   |   1 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   4 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-1.c |  36 
 gcc/testsuite/gcc.target/i386/sse-13.c |   4 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   4 +
 12 files changed, 304 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 29966f5e1bf..bc3f92a7d1a 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -726,6 +726,143 @@ _mm256_maskz_cvt_roundph_epu64 (__mmask8 __U, __m128h 
__A, const int __R)
   (__mmask8) __U,
   __R);
 }
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundph_epu16 (__m256h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtph2uw256_mask_round ((__v16hf) __A,
+ (__v16hi)
+ _mm256_undefined_si256 (),
+ (__mmask16) -1,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundph_epu16 (__m256i __W, __mmask16 __U, __m256h __A,
+  const int __R)
+{
+  return (__m256i) __builtin_ia32_vcvtph2uw256_mask_round ((__v16hf) __A,
+  (__v16hi) __W,
+  (__mmask16) __U,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundph_epu16 (__mmask16 __U, __m256h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtph2uw256_mask_round ((__v16hf) __A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundph_epi16 (__m256h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtph2w256_mask_round ((__v16hf) __A,
+(__v16hi)
+_mm256_undefined_si256 (),
+(__mmask16) -1,
+__R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundph_epi16 (__m256i __W, __mmask16 __U, __m256h __A,
+  const int __R)
+{
+  return (__m256i) __builtin_ia32_vcvtph2w256_mask_round ((__v16hf) __A,
+ (__v16hi) __W,
+   

[gcc r15-2996] AVX10.2 ymm rounding: Support vcvtps2{, u}{dq, qq} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:0f5a42d41b46b746c6f77374d76a3b918a1e2b57

commit r15-2996-g0f5a42d41b46b746c6f77374d76a3b918a1e2b57
Author: Hu, Lin1 
Date:   Mon Aug 19 10:08:58 2024 +0800

AVX10.2 ymm rounding: Support vcvtps2{,u}{dq,qq} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V8SI_FTYPE_V8SF_V8SI_UQI_INT, V4DI_FTYPE_V4SF_V4DI_UQI_INT.
* config/i386/sse.md
(_fix_notrunc):
Extend to round.

(_fixuns_notrunc):
Add round condition check.
* config/i386/subst.md (round_constraint4): New.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-1.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 226 +
 gcc/config/i386/i386-builtin-types.def |   2 +
 gcc/config/i386/i386-builtin.def   |   4 +
 gcc/config/i386/i386-expand.cc |   2 +
 gcc/config/i386/sse.md |  10 +-
 gcc/config/i386/subst.md   |   1 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   4 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-1.c |  32 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   4 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   4 +
 12 files changed, 308 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index bc3f92a7d1a..fca10a6b586 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -863,6 +863,146 @@ _mm256_maskz_cvtx_roundps_ph (__mmask8 __U, __m256 __A, 
const int __R)
(__mmask8) __U,
__R);
 }
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundps_epi32 (__m256 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtps2dq256_mask_round ((__v8sf) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundps_epi32 (__m256i __W, __mmask8 __U, __m256 __A,
+  const int __R)
+{
+  return (__m256i) __builtin_ia32_vcvtps2dq256_mask_round ((__v8sf) __A,
+  (__v8si) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundps_epi32 (__mmask8 __U, __m256 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtps2dq256_mask_round ((__v8sf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundps_epi64 (__m128 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_cvtps2qq256_mask_round ((__v4sf) __A,
+(__v4di)
+_mm256_setzero_si256 (),
+(__mmask8) -1,
+__R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundps_epi64 (__m256i __W, __mmask8 __U, __m128 __A,
+  const int __R)
+{
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask_round ((__v4sf) __A,
+ (__v4di) __W,
+ (__mmask8) __U,
+   

[gcc r15-2997] AVX10.2 ymm rounding: Support vcvtqq2p{s, d, h} and vcvttpd2{, u}{dq, qq} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:6e231f8504874828b23bbe89f3ef4086dcc15a44

commit r15-2997-g6e231f8504874828b23bbe89f3ef4086dcc15a44
Author: Hu, Lin1 
Date:   Mon Aug 19 10:08:59 2024 +0800

AVX10.2 ymm rounding: Support vcvtqq2p{s,d,h} and vcvttpd2{,u}{dq,qq} 
intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V4DF_FTYPE_V4DI_V4DF_UQI_INT, V4SF_FTYPE_V4DI_V4SF_UQI_INT,
V8HF_FTYPE_V4DI_V8HF_UQI_INT.
* config/i386/sse.md:
(avx512fp16_vcvtqq2ph_v4di_mask_round): New expand.
(*avx512fp16_vcvt2ph__mask):
Extend round control and add "_1" suffix.

(float2):
Add condition check.

(float2):
Ditto.

(float2):
Limit suffix output.
(unspec_fix_truncv4dfv4si2): Extend round control.
(unspec_fixuns_truncv4dfv4si2): Ditto.
* config/i386/subst.md (round_qq2pssuff): New iterator.
(round_saeonly_suff): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-2.c: New test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 390 +
 gcc/config/i386/i386-builtin-types.def |   3 +
 gcc/config/i386/i386-builtin.def   |   7 +
 gcc/config/i386/i386-expand.cc |   3 +
 gcc/config/i386/sse.md |  43 ++-
 gcc/config/i386/subst.md   |   2 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   7 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-2.c |  72 
 gcc/testsuite/gcc.target/i386/sse-13.c |   7 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  21 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  21 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   7 +
 12 files changed, 569 insertions(+), 14 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index fca10a6b586..25efd9d7b96 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -1003,6 +1003,244 @@ _mm256_maskz_cvt_roundps_epu64 (__mmask8 __U, __m128 
__A, const int __R)
  (__mmask8) __U,
  __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundepi64_pd (__m256i __A, const int __R)
+{
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask_round ((__v4di) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundepi64_pd (__m256d __W, __mmask8 __U, __m256i __A,
+  const int __R)
+{
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask_round ((__v4di) __A,
+ (__v4df) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundepi64_pd (__mmask8 __U, __m256i __A, const int __R)
+{
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask_round ((__v4di) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundepi64_ph (__m256i __A, const int __R)
+{
+  return (__m128h) __builtin_ia32_vcvtqq2ph256_mask_round ((__v4di) __A,
+  (__v8hf)
+  _mm_setzero_ph (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m128h
+__attr

[gcc r15-2998] AVX10.2 ymm rounding: Support vcvttph2{, u}{dq, qq, w} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:493c5096050523ebc05e5fa21612683a996b97a7

commit r15-2998-g493c5096050523ebc05e5fa21612683a996b97a7
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:00 2024 +0800

AVX10.2 ymm rounding: Support vcvttph2{,u}{dq,qq,w} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md 
(avx512fp16_fix_trunc2):
Extend round control for 256bit.
(unspec_avx512fp16_fix_trunc2):
Ditto.

(avx512fp16_fix_trunc2):
Add condition check.
* config/i386/subst.md
(round_saeonly_mode_condition): Add V16HI check for 256bit.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-2.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 335 +
 gcc/config/i386/i386-builtin.def   |   6 +
 gcc/config/i386/sse.md |  10 +-
 gcc/config/i386/subst.md   |   1 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   6 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-2.c |  46 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   6 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   6 +
 10 files changed, 447 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 25efd9d7b96..45a04e5a7a8 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -1241,6 +1241,216 @@ _mm256_maskz_cvtt_roundpd_epu64 (__mmask8 __U, __m256d 
__A, const int __R)
   (__mmask8) __U,
   __R);
 }
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtt_roundph_epi32 (__m128h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvttph2dq256_mask_round ((__v8hf) __A,
+  (__v8si)
+  _mm256_setzero_si256 (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtt_roundph_epi32 (__m256i __W, __mmask8 __U, __m128h __A,
+   const int __R)
+{
+  return (__m256i) __builtin_ia32_vcvttph2dq256_mask_round ((__v8hf) __A,
+   (__v8si) __W,
+   (__mmask8) __U,
+   __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtt_roundph_epi32 (__mmask8 __U, __m128h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvttph2dq256_mask_round ((__v8hf) __A,
+  (__v8si)
+  _mm256_setzero_si256 (),
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtt_roundph_epi64 (__m128h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvttph2qq256_mask_round ((__v8hf) __A,
+  (__v4di)
+  _mm256_setzero_si256 (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtt_roundph_epi64 (__m256i __W, __mmask8 __U, __m128h __A,
+   const int __R)
+{
+  return (__m256i) __builtin_ia32_vcvttph2qq256_mask_round ((__v8hf) __A,
+   (__v4di) __W,
+   (__mmask8) __U,
+   __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtt_roundph_epi64 (__mmask8 __U, __m

[gcc r15-3000] AVX10.2 ymm rounding: Support vcvt{, u}w2ph and vdivp{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:3d1b5530ea1d23e26dc5ab70aa4a2e7b9dc19b50

commit r15-3000-g3d1b5530ea1d23e26dc5ab70aa4a2e7b9dc19b50
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:03 2024 +0800

AVX10.2 ymm rounding: Support vcvt{,u}w2ph and vdivp{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V16HF_FTYPE_V16HI_V16HF_UHI_INT.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: New test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 286 +
 gcc/config/i386/i386-builtin-types.def |   1 +
 gcc/config/i386/i386-builtin.def   |   5 +
 gcc/config/i386/i386-expand.cc |   1 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   5 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  58 +
 gcc/testsuite/gcc.target/i386/sse-13.c |   5 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  15 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  15 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   5 +
 10 files changed, 396 insertions(+)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 384facb424c..15ea46b5983 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -1757,6 +1757,183 @@ _mm256_maskz_cvt_roundepu64_ps (__mmask8 __U, __m256i __A, const int __R)
  (__mmask8) __U,
  __R);
 }
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundepu16_ph (__m256i __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_vcvtuw2ph256_mask_round ((__v16hi) __A,
+  (__v16hf)
+  _mm256_setzero_ph (),
+  (__mmask16) -1,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundepu16_ph (__m256h __W, __mmask16 __U, __m256i __A,
+  const int __R)
+{
+  return (__m256h) __builtin_ia32_vcvtuw2ph256_mask_round ((__v16hi) __A,
+  (__v16hf) __W,
+  (__mmask16) __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundepu16_ph (__mmask16 __U, __m256i __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_vcvtuw2ph256_mask_round ((__v16hi) __A,
+  (__v16hf)
+  _mm256_setzero_ph (),
+  (__mmask16) __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundepi16_ph (__m256i __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_vcvtw2ph256_mask_round ((__v16hi) __A,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) -1,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundepi16_ph (__m256h __W, __mmask16 __U, __m256i __A,
+  const int __R)
+{
+  return (__m256h) __builtin_ia32_vcvtw2ph256_mask_round ((__v16hi) __A,
+ (__v16hf) __W,
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundepi16_ph (__mmask16 __U, __m256i __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_vcvtw2ph256_mask_round ((__v16hi) __A,
+  

[gcc r15-2999] AVX10.2 ymm rounding: Support vcvttps2{, u}{dq, qq} and vcvtu{dq, qq}2p{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:b2754227139512adecb6fda067632b587ff4a017

commit r15-2999-gb2754227139512adecb6fda067632b587ff4a017
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:01 2024 +0800

AVX10.2 ymm rounding: Support vcvttps2{,u}{dq,qq} and vcvtu{dq,qq}2p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md
(unspec_fix_truncv8sfv8si2): Extend rounding control.
(fixuns_trunc2):
Ditto.

(floatuns2):
Add condition check.

(fix_trunc2):
Remove round_saeonly_name.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-2.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 492 +
 gcc/config/i386/i386-builtin.def   |   9 +
 gcc/config/i386/sse.md |  27 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   9 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-2.c |  75 
 gcc/testsuite/gcc.target/i386/sse-13.c |   9 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  26 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  27 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   9 +
 9 files changed, 670 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h b/gcc/config/i386/avx10_2roundingintrin.h
index 45a04e5a7a8..384facb424c 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -1451,6 +1451,312 @@ _mm256_maskz_cvtt_roundph_epi16 (__mmask16 __U, __m256h __A, const int __R)
  (__mmask16) __U,
  __R);
 }
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtt_roundps_epi32 (__m256 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_cvttps2dq256_mask_round ((__v8sf) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtt_roundps_epi32 (__m256i __W, __mmask8 __U, __m256 __A,
+   const int __R)
+{
+  return (__m256i) __builtin_ia32_cvttps2dq256_mask_round ((__v8sf) __A,
+  (__v8si) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtt_roundps_epi32 (__mmask8 __U, __m256 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_cvttps2dq256_mask_round ((__v8sf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtt_roundps_epi64 (__m128 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_cvttps2qq256_mask_round ((__v4sf) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtt_roundps_epi64 (__m256i __W, __mmask8 __U, __m128 __A,
+   const int __R)
+{
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask_round ((__v4sf) __A,
+  (__v4di) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtt_roundps_epi64 (__mmask8 __U, __m128 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_cvttps2qq256_mask_round ((__v4sf) __A,
+  

[gcc r15-3001] AVX10.2 ymm rounding: Support vfc{madd, mul}cph, vfixupimmp{s, d} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:95980b292b24110d3f1dffb81926df23c61b4fe7

commit r15-3001-g95980b292b24110d3f1dffb81926df23c61b4fe7
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:04 2024 +0800

AVX10.2 ymm rounding: Support vfc{madd,mul}cph, vfixupimmp{s,d} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V16HF_FTYPE_V16HF_V16HF_INT, V16HF_FTYPE_V16HF_V16HF_V16HF_INT,
V16HF_FTYPE_V16HF_V16HF_V16HF_UQI_INT,
V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI_INT,
V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI_INT.
* config/i386/sse.md:
(<avx512>_fixupimm<mode><sd_maskz_name><round_saeonly_name>):
Add condition check.
(<avx512>_fixupimm<mode>_mask<round_saeonly_name>): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: New test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 247 +
 gcc/config/i386/i386-builtin-types.def |   5 +
 gcc/config/i386/i386-builtin.def   |  10 +
 gcc/config/i386/i386-expand.cc |   5 +
 gcc/config/i386/sse.md |   4 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |  10 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  49 
 gcc/testsuite/gcc.target/i386/sse-13.c |  10 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  13 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  13 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |  10 +
 11 files changed, 374 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h b/gcc/config/i386/avx10_2roundingintrin.h
index 15ea46b5983..d5ea6bc57da 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -1934,6 +1934,164 @@ _mm256_maskz_div_round_ps (__mmask8 __U, __m256 __A, __m256 __B,
  (__mmask8) __U,
  __R);
 }
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fcmadd_round_pch (__m256h __A, __m256h __B, __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfcmaddcph256_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf) __D,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fcmadd_round_pch (__m256h __A, __mmask8 __U, __m256h __B,
+ __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfcmaddcph256_mask_round ((__v16hf) __A,
+   (__v16hf) __B,
+   (__v16hf) __D,
+   __U,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fcmadd_round_pch (__m256h __A, __m256h __B, __m256h __D,
+  __mmask8 __U, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfcmaddcph256_mask3_round ((__v16hf) __A,
+(__v16hf) __B,
+(__v16hf) __D,
+__U,
+__R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fcmadd_round_pch (__mmask8 __U, __m256h __A, __m256h __B,
+  __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfcmaddcph256_maskz_round ((__v16hf) __A,
+(__v16hf) __B,
+(__v16hf) __D,
+__U,
+__R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fcmul_round_pch (__m256h __A, __m256h __B, const int __R)
+{
+  return
+(__m256h) __builtin_ia32_vfcmulcph256_round ((__v16hf) __A,
+  

[gcc r15-3002] AVX10.2 ymm rounding: Support vfmadd{132, 231, 213}p{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:0683ca355a87fd36a2e7ae1721199204ceff4c4c

commit r15-3002-g0683ca355a87fd36a2e7ae1721199204ceff4c4c
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:05 2024 +0800

AVX10.2 ymm rounding: Support vfmadd{132,231,213}p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md:
(<avx512>_fmadd_<mode>_mask3<round_name>): Add condition check.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: New test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 176 +
 gcc/config/i386/i386-builtin.def   |   9 ++
 gcc/config/i386/sse.md |   2 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   9 ++
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  31 
 gcc/testsuite/gcc.target/i386/sse-13.c |   9 ++
 gcc/testsuite/gcc.target/i386/sse-14.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   9 ++
 9 files changed, 268 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h b/gcc/config/i386/avx10_2roundingintrin.h
index d5ea6bc57da..9015095144e 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -2092,6 +2092,146 @@ _mm256_maskz_fixupimm_round_ps (__mmask8 __U, __m256 __A, __m256 __B,
(__mmask8) __U,
__R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_round_pd (__m256d __A, __m256d __B, __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask_round ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __D,
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmadd_round_pd (__m256d __A, __mmask8 __U, __m256d __B,
+   __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask_round ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __D,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmadd_round_pd (__m256d __A, __m256d __B, __m256d __D,
+__mmask8 __U, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask3_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmadd_round_pd (__mmask8 __U, __m256d __A, __m256d __B,
+__m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_round_ph (__m256h __A, __m256h __B, __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmaddph256_mask_round ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __D,
+ (__mmask16) -1,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmadd_round_ph (__m256h __A, __mmask16 __U, __m256h __B,
+   __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmadd

[gcc r15-3003] AVX10.2 ymm rounding: Support vfmaddcph and vfmaddsub{132, 231, 213}p{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:cfbc94eaf167ae7aecd21ee6054556e1cf9d7143

commit r15-3003-gcfbc94eaf167ae7aecd21ee6054556e1cf9d7143
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:06 2024 +0800

AVX10.2 ymm rounding: Support vfmaddcph and vfmaddsub{132,231,213}p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md:
(<avx512>_fmaddsub_<mode>_mask<round_name>): Add condition check.
(<avx512>_fmaddsub_<mode>_mask3<round_name>): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 238 +
 gcc/config/i386/i386-builtin.def   |  13 ++
 gcc/config/i386/sse.md |   4 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |  13 ++
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  43 
 gcc/testsuite/gcc.target/i386/sse-13.c |  13 ++
 gcc/testsuite/gcc.target/i386/sse-14.c |  16 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  15 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |  13 ++
 9 files changed, 366 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h b/gcc/config/i386/avx10_2roundingintrin.h
index 9015095144e..95e42410a10 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -2232,6 +2232,193 @@ _mm256_maskz_fmadd_round_ps (__mmask8 __U, __m256 __A, __m256 __B,
  (__mmask8) __U,
  __R);
 }
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_round_pch (__m256h __A, __m256h __B, __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmaddcph256_round ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __D,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmadd_round_pch (__m256h __A, __mmask16 __U, __m256h __B,
+__m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmaddcph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf) __D,
+  __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmadd_round_pch (__m256h __A, __m256h __B, __m256h __D,
+ __mmask16 __U, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmaddcph256_mask3_round ((__v16hf) __A,
+   (__v16hf) __B,
+   (__v16hf) __D,
+   __U,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmadd_round_pch (__mmask16 __U, __m256h __A, __m256h __B,
+ __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmaddcph256_maskz_round ((__v16hf) __A,
+   (__v16hf) __B,
+   (__v16hf) __D,
+   __U,
+   __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmaddsub_round_pd (__m256d __A, __m256d __B, __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask_round ((__v4df) __A,
+(__v4df) __B,
+(__v4df) __D,
+(__mmask8) -1,
+__R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmaddsub_round_pd (__m256d __A, __mmask8 __U, __m256d __B,
+  __m256d __D, const int __R)

[gcc r15-3005] AVX10.2 ymm rounding: Support vfmulcph and vfnmadd{132, 231, 213}p{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:6f0aa7add1d9177f60016b32ca9ca8b16b173a56

commit r15-3005-g6f0aa7add1d9177f60016b32ca9ca8b16b173a56
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:09 2024 +0800

AVX10.2 ymm rounding: Support vfmulcph and vfnmadd{132,231,213}p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 241 +
 gcc/config/i386/i386-builtin.def   |  11 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |  11 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  50 +
 gcc/testsuite/gcc.target/i386/sse-13.c |  11 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  14 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  14 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |  11 +
 8 files changed, 363 insertions(+)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h b/gcc/config/i386/avx10_2roundingintrin.h
index 346a32c1a8a..3f833bffa54 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -2697,6 +2697,185 @@ _mm256_maskz_fmsubadd_round_ps (__mmask8 __U, __m256 __A, __m256 __B,
 (__mmask8) __U,
 __R);
 }
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmul_round_pch (__m256h __B, __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmulcph256_round ((__v16hf) __B,
+(__v16hf) __D,
+__R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmul_round_pch (__m256h __A, __mmask8 __U, __m256h __B,
+   __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmulcph256_mask_round ((__v16hf) __B,
+ (__v16hf) __D,
+ (__v16hf) __A,
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmul_round_pch (__mmask8 __U, __m256h __B, __m256h __D,
+const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmulcph256_mask_round ((__v16hf) __B,
+ (__v16hf) __D,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmadd_round_pd (__m256d __A, __m256d __B, __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmaddpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmadd_round_pd (__m256d __A, __mmask8 __U, __m256d __B,
+__m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmaddpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fnmadd_round_pd (__m256d __A, __m256d __B, __m256d __D,
+ __mmask8 __U, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmaddpd256_mask3_round ((__v4df) __A,
+   (__v4df) __B,
+   (__v4df) __D,
+  

[gcc r15-3004] AVX10.2 ymm rounding: Support vfm{sub, subadd}{132, 231, 213}p{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:dd48acbe85ca55dd23ffafbb917ffe559d13b6a3

commit r15-3004-gdd48acbe85ca55dd23ffafbb917ffe559d13b6a3
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:08 2024 +0800

AVX10.2 ymm rounding: Support vfm{sub,subadd}{132,231,213}p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md:
(<avx512>_fmsub_<mode>_mask<round_name>): Add condition check.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 350 +
 gcc/config/i386/i386-builtin.def   |  18 ++
 gcc/config/i386/sse.md |   2 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |  18 ++
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  62 
 gcc/testsuite/gcc.target/i386/sse-13.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-14.c |  24 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  24 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |  18 ++
 9 files changed, 533 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h b/gcc/config/i386/avx10_2roundingintrin.h
index 95e42410a10..346a32c1a8a 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -2419,6 +2419,284 @@ _mm256_maskz_fmaddsub_round_ps (__mmask8 __U, __m256 __A, __m256 __B,
 (__mmask8) __U,
 __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsub_round_pd (__m256d __A, __m256d __B, __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_mask_round ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __D,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmsub_round_pd (__m256d __A, __mmask8 __U, __m256d __B,
+   __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_mask_round ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __D,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmsub_round_pd (__m256d __A, __m256d __B, __m256d __D,
+__mmask8 __U, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_mask3_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) __U, __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmsub_round_pd (__mmask8 __U, __m256d __A, __m256d __B,
+__m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_maskz_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) __U, __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsub_round_ph (__m256h __A, __m256h __B, __m256h __D, const int __R)
+{
+  return (__m256h)
+__builtin_ia32_vfmsubph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf) __D,
+  (__mmask16) -1, __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmsub_round_ph (__m256h __A, __mmask16 __U, __m256h __B,
+   __m256h __D, const int __R)
+{
+  return (__m256h)
+__builtin_ia32_vfmsubph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf) __D,
+  (__mmask16) __U, __R);
+}
+
+extern __inline __m256h
+__attribute__

[gcc r15-3006] AVX10.2 ymm rounding: Support vfnmsub{132, 231, 213}p{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:0983d406ae2e84394b25248865f51c686b119a57

commit r15-3006-g0983d406ae2e84394b25248865f51c686b119a57
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:10 2024 +0800

AVX10.2 ymm rounding: Support vfnmsub{132,231,213}p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md:
(<avx512>_fnmsub_<mode>_mask3<round_name>): Add condition check.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 181 +
 gcc/config/i386/i386-builtin.def   |   9 +
 gcc/config/i386/sse.md |   2 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   9 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  31 
 gcc/testsuite/gcc.target/i386/sse-13.c |   9 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   9 +
 9 files changed, 273 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h b/gcc/config/i386/avx10_2roundingintrin.h
index 3f833bffa54..afc1220fea4 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -2876,6 +2876,151 @@ _mm256_maskz_fnmadd_round_ps (__mmask8 __U, __m256 __A, __m256 __B,
   (__mmask8) __U,
   __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmsub_round_pd (__m256d __A, __m256d __B, __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmsub_round_pd (__m256d __A, __mmask8 __U, __m256d __B,
+__m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fnmsub_round_pd (__m256d __A, __m256d __B, __m256d __D,
+ __mmask8 __U, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask3_round ((__v4df) __A,
+   (__v4df) __B,
+   (__v4df) __D,
+   (__mmask8) __U,
+   __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fnmsub_round_pd (__mmask8 __U, __m256d __A, __m256d __B,
+ __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_maskz_round ((__v4df) __A,
+   (__v4df) __B,
+   (__v4df) __D,
+   (__mmask8) __U,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmsub_round_ph (__m256h __A, __m256h __B, __m256h __D, const int __R)
+{
+  return (__m256h)
+__builtin_ia32_vfnmsubph256_mask_round ((__v16hf) __A,
+   (__v16hf) __B,
+   (__v16hf) __D,
+   (__mmask16) -1,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmsub_round_ph (__m256h __A, __mmask16 __U, __m256h __B,
+__m256h __D, const int __R)
+{
+  return

[gcc r15-3007] AVX10.2 ymm rounding: Support vgetexpp{s, d, h} and vgetmantp{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:8d4f542935c09f40bb7fd8fd863cc8df80cc970e

commit r15-3007-g8d4f542935c09f40bb7fd8fd863cc8df80cc970e
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:11 2024 +0800

AVX10.2 ymm rounding: Support vgetexpp{s,d,h} and vgetmantp{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V8SF_FTYPE_V8SF_V8SF_UQI_INT, V4DF_FTYPE_V4DF_V4DF_UQI_INT,
V16HF_FTYPE_V16HF_V16HF_UHI_INT, 
V16HF_FTYPE_V16HF_INT_V16HF_UHI_INT,
V4DF_FTYPE_V4DF_INT_V4DF_UQI_INT, V8SF_FTYPE_V8SF_INT_V8SF_UQI_INT.
* config/i386/sse.md:
(<avx512>_getexp<mode><mask_name><round_saeonly_name>):
Add condition check.
(<avx512>_getmant<mode><mask_name><round_saeonly_name>):
Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 341 +
 gcc/config/i386/i386-builtin-types.def |   6 +
 gcc/config/i386/i386-builtin.def   |   6 +
 gcc/config/i386/i386-expand.cc |   6 +
 gcc/config/i386/sse.md |   4 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   6 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  59 
 gcc/testsuite/gcc.target/i386/sse-13.c |   6 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   6 +
 11 files changed, 474 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h b/gcc/config/i386/avx10_2roundingintrin.h
index afc1220fea4..07729a6cc04 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -3021,6 +3021,217 @@ _mm256_maskz_fnmsub_round_ps (__mmask8 __U, __m256 __A, __m256 __B,
   (__mmask8) __U,
   __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getexp_round_pd (__m256d __A, const int __R)
+{
+  return
+(__m256d) __builtin_ia32_getexppd256_mask_round ((__v4df) __A,
+(__v4df)
+_mm256_undefined_pd (),
+(__mmask8) -1,
+__R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getexp_round_pd (__m256d __W, __mmask8 __U, __m256d __A,
+const int __R)
+{
+  return (__m256d) __builtin_ia32_getexppd256_mask_round ((__v4df) __A,
+ (__v4df) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_getexp_round_pd (__mmask8 __U, __m256d __A, const int __R)
+{
+  return (__m256d) __builtin_ia32_getexppd256_mask_round ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getexp_round_ph (__m256h __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_getexpph256_mask_round ((__v16hf) __A,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) -1,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getexp_round_ph (__m256h __W, __mmask16 __U, __m256h __A,
+const int __R)
+{
+  return (__m256h) __builtin_ia32_getexpph256_mask_round ((__v16hf) __A,
+ (__v16hf) __W,
+ (__mmask16) __

[gcc r15-3008] AVX10.2 ymm rounding: Support v{max, min}p{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:cc8a7596477e9d6ac972aadabbb2fd88baa1abf4

commit r15-3008-gcc8a7596477e9d6ac972aadabbb2fd88baa1abf4
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:13 2024 +0800

AVX10.2 ymm rounding: Support v{max,min}p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 360 +
 gcc/config/i386/i386-builtin.def   |   6 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   6 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  50 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   6 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   6 +
 8 files changed, 470 insertions(+)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 07729a6cc04..a5712f5230a 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -3232,6 +3232,228 @@ _mm256_maskz_getmant_round_ps (__mmask8 __U, __m256 __A,
  _mm256_setzero_ps (),
  __U, __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_round_pd (__m256d __A, __m256d __B, const int __R)
+{
+  return (__m256d) __builtin_ia32_maxpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df)
+  _mm256_undefined_pd (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_round_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B, const int __R)
+{
+  return (__m256d) __builtin_ia32_maxpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_round_pd (__mmask8 __U, __m256d __A, __m256d __B,
+  const int __R)
+{
+  return (__m256d) __builtin_ia32_maxpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df)
+  _mm256_setzero_pd (),
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_round_ph (__m256h __A, __m256h __B, const int __R)
+{
+  return (__m256h) __builtin_ia32_maxph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf)
+  _mm256_undefined_ph (),
+  (__mmask16) -1,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_round_ph (__m256h __W, __mmask16 __U, __m256h __A,
+ __m256h __B, const int __R)
+{
+  return (__m256h) __builtin_ia32_maxph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf) __W,
+  (__mmask16) __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_round_ph (__mmask16 __U, __m256h __A, __m256h __B,
+  const int __R)
+{
+  return (__m256h) __builtin_ia32_maxph256_mask_round ((__v16hf) __A,
+

[gcc r15-3009] AVX10.2 ymm rounding: Support vmulp{s, d, h} and vrangep{s, d} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:90cc5b0c4609a9fb3257d2cce7b7abc896c6faab

commit r15-3009-g90cc5b0c4609a9fb3257d2cce7b7abc896c6faab
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:14 2024 +0800

AVX10.2 ymm rounding: Support vmulp{s,d,h} and vrangep{s,d} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin):
Handle V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI_INT,
V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI_INT.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 313 +
 gcc/config/i386/i386-builtin-types.def |   2 +
 gcc/config/i386/i386-builtin.def   |   5 +
 gcc/config/i386/i386-expand.cc |   2 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   5 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  43 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   5 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  15 +
 gcc/testsuite/gcc.target/i386/sse-22.c |  15 +
 gcc/testsuite/gcc.target/i386/sse-23.c |   5 +
 10 files changed, 410 insertions(+)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index a5712f5230a..ac0914415c9 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -3454,6 +3454,198 @@ _mm256_maskz_min_round_ps (__mmask8 __U, __m256 __A, 
__m256 __B,
  (__mmask8) __U,
  __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_round_pd (__m256d __A, __m256d __B, const int __R)
+{
+  return (__m256d) __builtin_ia32_mulpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df)
+  _mm256_undefined_pd (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mul_round_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B, const int __R)
+{
+  return (__m256d) __builtin_ia32_mulpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mul_round_pd (__mmask8 __U, __m256d __A, __m256d __B,
+  const int __R)
+{
+  return (__m256d) __builtin_ia32_mulpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df)
+  _mm256_setzero_pd (),
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_round_ph (__m256h __A, __m256h __B, const int __R)
+{
+  return (__m256h) __builtin_ia32_mulph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf)
+  _mm256_undefined_ph (),
+  (__mmask16) -1,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mul_round_ph (__m256h __W, __mmask16 __U, __m256h __A,
+ __m256h __B, const int __R)
+{
+  return (__m256h) __builtin_ia32_mulph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf) __W,
+  (__mmask16)

[gcc r15-3010] AVX10.2 ymm rounding: Support vreducep{s, d, h} and vrndscalep{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:9afa5081212e1fc3cb2c4efc9b4f421eecf68810

commit r15-3010-g9afa5081212e1fc3cb2c4efc9b4f421eecf68810
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:18 2024 +0800

AVX10.2 ymm rounding: Support vreducep{s,d,h} and vrndscalep{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md:
(reducep):
Add condition check.
(_rndscale): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 367 +
 gcc/config/i386/i386-builtin.def   |   6 +
 gcc/config/i386/sse.md |   4 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   6 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  50 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   6 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  18 +
 gcc/testsuite/gcc.target/i386/sse-22.c |  18 +
 gcc/testsuite/gcc.target/i386/sse-23.c |   6 +
 9 files changed, 479 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index ac0914415c9..d6b8e2695de 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -3646,6 +3646,233 @@ _mm256_maskz_range_round_ps (__mmask8 __U, __m256 __A, 
__m256 __B,
(__mmask8) __U,
__R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_round_pd (__m256d __A, const int __C, const int __R)
+{
+  return (__m256d) __builtin_ia32_reducepd256_mask_round ((__v4df) __A,
+ __C,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_round_pd (__m256d __W, __mmask8 __U, __m256d __A,
+const int __C, const int __R)
+{
+  return (__m256d) __builtin_ia32_reducepd256_mask_round ((__v4df) __A,
+ __C,
+ (__v4df) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_reduce_round_pd (__mmask8 __U, __m256d __A, const int __C,
+ const int __R)
+{
+  return (__m256d) __builtin_ia32_reducepd256_mask_round ((__v4df) __A,
+ __C,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_round_ph (__m256h __A, const int __C, const int __R)
+{
+  return (__m256h) __builtin_ia32_reduceph256_mask_round ((__v16hf) __A,
+ __C,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) -1,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_round_ph (__m256h __W, __mmask16 __U, __m256h __A,
+const int __C, const int __R)
+{
+  return (__m256h) __builtin_ia32_reduceph256_mask_round ((__v16hf) __A,
+ __C,
+ (__v16hf) __W,
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m256h

[gcc r15-3011] AVX10.2 ymm rounding: Support vscalefp{s,d,h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:1f86cf06c7897f6ab467443b5fe8789cc95fe0c4

commit r15-3011-g1f86cf06c7897f6ab467443b5fe8789cc95fe0c4
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:19 2024 +0800

AVX10.2 ymm rounding: Support vscalefp{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def: Add new builtins.
* config/i386/sse.md:
(_scalef): Add condition check.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 182 +
 gcc/config/i386/i386-builtin.def   |   3 +
 gcc/config/i386/sse.md |   2 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   3 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  25 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   3 +
 gcc/testsuite/gcc.target/i386/sse-14.c |   9 +
 gcc/testsuite/gcc.target/i386/sse-22.c |   9 +
 gcc/testsuite/gcc.target/i386/sse-23.c |   3 +
 9 files changed, 238 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index d6b8e2695de..f35f2337858 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -3873,6 +3873,119 @@ _mm256_maskz_roundscale_round_ps (__mmask8 __U, __m256 
__A, const int __C,
   (__mmask8) __U,
   __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_scalef_round_pd (__m256d __A, __m256d __B, const int __R)
+{
+  return
+(__m256d) __builtin_ia32_scalefpd256_mask_round ((__v4df) __A,
+(__v4df) __B,
+(__v4df)
+_mm256_undefined_pd (),
+(__mmask8) -1,
+__R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_scalef_round_pd (__m256d __W, __mmask8 __U, __m256d __A,
+__m256d __B, const int __R)
+{
+  return (__m256d) __builtin_ia32_scalefpd256_mask_round ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_scalef_round_pd (__mmask8 __U, __m256d __A, __m256d __B,
+ const int __R)
+{
+  return (__m256d) __builtin_ia32_scalefpd256_mask_round ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_scalef_round_ph (__m256h __A, __m256h __B, const int __R)
+{
+  return
+(__m256h) __builtin_ia32_scalefph256_mask_round ((__v16hf) __A,
+(__v16hf) __B,
+(__v16hf)
+_mm256_undefined_ph (),
+(__mmask16) -1,
+__R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_scalef_round_ph (__m256h __W, __mmask16 __U, __m256h __A,
+__m256h __B, const int __R)
+{
+  return (__m256h) __builtin_ia32_scalefph256_mask_round ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __W,
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artifi

[gcc r15-3012] AVX10.2 ymm rounding: Support vsqrtp{s, d, h} and vsubp{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:7f62e7104ebc11c4570745972a023579922ef265

commit r15-3012-g7f62e7104ebc11c4570745972a023579922ef265
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:20 2024 +0800

AVX10.2 ymm rounding: Support vsqrtp{s,d,h} and vsubp{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 339 +
 gcc/config/i386/i386-builtin.def   |   6 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   6 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  50 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   7 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  15 +
 gcc/testsuite/gcc.target/i386/sse-23.c |   6 +
 8 files changed, 447 insertions(+)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index f35f2337858..c7146e37ec9 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -3986,6 +3986,216 @@ _mm256_maskz_scalef_round_ps (__mmask8 __U, __m256 __A, 
__m256 __B,
 (__mmask8) __U,
 __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrt_round_pd (__m256d __A, const int __R)
+{
+  return (__m256d) __builtin_ia32_sqrtpd256_mask_round ((__v4df) __A,
+   (__v4df)
+   _mm256_undefined_pd (),
+   (__mmask8) -1,
+   __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sqrt_round_pd (__m256d __W, __mmask8 __U, __m256d __A,
+  const int __R)
+{
+  return (__m256d) __builtin_ia32_sqrtpd256_mask_round ((__v4df) __A,
+   (__v4df) __W,
+   (__mmask8) __U,
+   __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sqrt_round_pd (__mmask8 __U, __m256d __A, const int __R)
+{
+  return (__m256d) __builtin_ia32_sqrtpd256_mask_round ((__v4df) __A,
+   (__v4df)
+   _mm256_setzero_pd (),
+   (__mmask8) __U,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrt_round_ph (__m256h __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_sqrtph256_mask_round ((__v16hf) __A,
+   (__v16hf)
+   _mm256_undefined_ph (),
+   (__mmask16) -1,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sqrt_round_ph (__m256h __W, __mmask16 __U, __m256h __A,
+  const int __R)
+{
+  return (__m256h) __builtin_ia32_sqrtph256_mask_round ((__v16hf) __A,
+   (__v16hf) __W,
+   (__mmask16) __U,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sqrt_round_ph (__mmask16 __U, __m256h __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_sqrtph256_mask_round ((__v16hf) __A,
+   (__v16hf)
+   _mm256_setzero_ph (),
+   (__mmask16) __U,
+   __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrt_round_ps (__m256 __A, const int __R)
+{
+  return (__m256) __builtin_ia3

[gcc r15-3175] i386: Refactor m512-check.h

2024-08-25 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:cba4566879192abdc54bdf76b010e22d67484129

commit r15-3175-gcba4566879192abdc54bdf76b010e22d67484129
Author: Haochen Jiang 
Date:   Mon Aug 26 10:53:35 2024 +0800

i386: Refactor m512-check.h

After the introduction of AVX10, we still want to use the AVX512 helper
functions to avoid duplicating code. In order to reuse them, we need to
refactor the header so that each function definition happens under the
correct ISA, which avoids ABI warnings.

gcc/testsuite/ChangeLog:

* gcc.target/i386/m512-check.h: Wrap the function definitions with
the correct vector size.

Diff:
---
 gcc/testsuite/gcc.target/i386/m512-check.h | 66 --
 1 file changed, 35 insertions(+), 31 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/m512-check.h 
b/gcc/testsuite/gcc.target/i386/m512-check.h
index 68e74fce68d2..d5d183729473 100644
--- a/gcc/testsuite/gcc.target/i386/m512-check.h
+++ b/gcc/testsuite/gcc.target/i386/m512-check.h
@@ -61,6 +61,12 @@ typedef union
  unsigned long long a[8];
 } union512i_uq;
 
+typedef union
+{
+  __m512h x;
+  _Float16 a[32];
+} union512h;
+
 typedef union
 {
   __m128h x;
@@ -73,27 +79,6 @@ typedef union
   _Float16 a[16];
 } union256h;
 
-typedef union
-{
-  __m512h x;
-  _Float16 a[32];
-} union512h;
-
-CHECK_EXP (union512i_b, char, "%d")
-CHECK_EXP (union512i_w, short, "%d")
-CHECK_EXP (union512i_d, int, "0x%x")
-CHECK_EXP (union512i_q, long long, "0x%llx")
-CHECK_EXP (union512, float, "%f")
-CHECK_EXP (union512d, double, "%f")
-CHECK_EXP (union512i_ub, unsigned char, "%d")
-CHECK_EXP (union512i_uw, unsigned short, "%d")
-CHECK_EXP (union512i_ud, unsigned int, "0x%x")
-CHECK_EXP (union512i_uq, unsigned long long, "0x%llx")
- 
-
-CHECK_FP_EXP (union512, float, ESP_FLOAT, "%f")
-CHECK_FP_EXP (union512d, double, ESP_DOUBLE, "%f")
-
 #define CHECK_ROUGH_EXP(UNION_TYPE, VALUE_TYPE, FMT)   \
 static int \
 __attribute__((noinline, unused))  \
@@ -126,28 +111,47 @@ check_rough_##UNION_TYPE (UNION_TYPE u, const VALUE_TYPE 
*v,  \
   return err;  \
 }
 
-CHECK_ROUGH_EXP (union512, float, "%f")
-CHECK_ROUGH_EXP (union512d, double, "%f")
+#ifndef ESP_FLOAT16
+#define ESP_FLOAT16 0.27
+#endif
+
 CHECK_ROUGH_EXP (union256, float, "%f")
 CHECK_ROUGH_EXP (union256d, double, "%f")
 CHECK_ROUGH_EXP (union128, float, "%f")
 CHECK_ROUGH_EXP (union128d, double, "%f")
 
-#ifdef AVX512FP16
+#ifndef AVX512F_LEN
+CHECK_EXP (union512i_b, char, "%d")
+CHECK_EXP (union512i_w, short, "%d")
+CHECK_EXP (union512i_d, int, "0x%x")
+CHECK_EXP (union512i_q, long long, "0x%llx")
+CHECK_EXP (union512, float, "%f")
+CHECK_EXP (union512d, double, "%f")
+CHECK_EXP (union512i_ub, unsigned char, "%d")
+CHECK_EXP (union512i_uw, unsigned short, "%d")
+CHECK_EXP (union512i_ud, unsigned int, "0x%x")
+CHECK_EXP (union512i_uq, unsigned long long, "0x%llx")
+ 
+CHECK_FP_EXP (union512, float, ESP_FLOAT, "%f")
+CHECK_FP_EXP (union512d, double, ESP_DOUBLE, "%f")
 
-CHECK_EXP (union128h, _Float16, "%f")
-CHECK_EXP (union256h, _Float16, "%f")
-CHECK_EXP (union512h, _Float16, "%f")
+CHECK_ROUGH_EXP (union512, float, "%f")
+CHECK_ROUGH_EXP (union512d, double, "%f")
 
-#ifndef ESP_FLOAT16
-#define ESP_FLOAT16 0.27
+#if defined(AVX512FP16)
+CHECK_EXP (union512h, _Float16, "%f")
+CHECK_FP_EXP (union512h, _Float16, ESP_FLOAT16, "%f")
+CHECK_ROUGH_EXP (union512h, _Float16, "%f")
+#endif
 #endif
 
+#if defined(AVX512FP16)
+CHECK_EXP (union128h, _Float16, "%f")
+CHECK_EXP (union256h, _Float16, "%f")
+
 CHECK_FP_EXP (union128h, _Float16, ESP_FLOAT16, "%f")
 CHECK_FP_EXP (union256h, _Float16, ESP_FLOAT16, "%f")
-CHECK_FP_EXP (union512h, _Float16, ESP_FLOAT16, "%f")
 
 CHECK_ROUGH_EXP (union128h, _Float16, "%f")
 CHECK_ROUGH_EXP (union256h, _Float16, "%f")
-CHECK_ROUGH_EXP (union512h, _Float16, "%f")
 #endif


[gcc r15-3176] [PATCH 1/2] AVX10.2: Support media instructions

2024-08-25 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:8db80b2735782d793a83a9ef7eb012d83be7660d

commit r15-3176-g8db80b2735782d793a83a9ef7eb012d83be7660d
Author: Hongyu Wang 
Date:   Mon Aug 26 10:53:37 2024 +0800

[PATCH 1/2] AVX10.2: Support media instructions

gcc/ChangeLog

* config.gcc: Add avx10_2mediaintrin.h and
avx10_2-512mediaintrin.h.
* config/i386/i386-builtin.def: Add new builtins.
* config/i386/i386-builtins.cc (def_builtin): Handle shared
builtins between AVXVNNIINT8 and AVX10.2.
* config/i386/i386-expand.cc (ix86_check_builtin_isa_match):
Ditto.
* config/i386/immintrin.h: Include avx10_2mediaintrin.h and
avx10_2-512mediaintrin.h
* config/i386/sse.md: (VI4_AVX10_2): New.
(vpdp_): Add AVX10_2_256.
(vpdp_v16si): New define_insn.
(vpdp__mask): Ditto.
(*vpdp__maskz): Ditto.
(vpdp__maskz): New expander.
* config/i386/avx10_2-512mediaintrin.h: New file.
* config/i386/avx10_2mediaintrin.h: Ditto.

gcc/testsuite/ChangeLog

* gcc.target/i386/avx512f-helper.h: Reuse AVX512F macros
for AVX10.
* gcc.target/i386/funcspec-56.inc: Add new target attribute.
* lib/target-supports.exp
(check_effective_target_avx10_2): New.
(check_effective_target_avx10_2_512): Ditto.
* gcc.target/i386/avx10-check.h: New test file.
* gcc.target/i386/avx10-helper.h: Ditto.
* gcc.target/i386/avx10_2-builtin-1.c: Ditto.
* gcc.target/i386/avx10_2-512-media-1.c: Ditto.
* gcc.target/i386/avx10_2-media-1.c: Ditto.
* gcc.target/i386/avxvnniint8-builtin.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpbssd-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpbssds-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpbsud-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpbsuds-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpbuud-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpbuuds-2.c: Ditto.
* gcc.target/i386/avx10_2-vpdpbssd-2.c: Ditto.
* gcc.target/i386/avx10_2-vpdpbssds-2.c: Ditto.
* gcc.target/i386/avx10_2-vpdpbsud-2.c: Ditto.
* gcc.target/i386/avx10_2-vpdpbsuds-2.c: Ditto.
* gcc.target/i386/avx10_2-vpdpbuud-2.c: Ditto.
* gcc.target/i386/avx10_2-vpdpbuuds-2.c: Ditto.

Co-authored-by: Haochen Jiang 

Diff:
---
 gcc/config.gcc |   3 +-
 gcc/config/i386/avx10_2-512mediaintrin.h   | 234 +
 gcc/config/i386/avx10_2mediaintrin.h   | 367 +
 gcc/config/i386/i386-builtin.def   |  68 +++-
 gcc/config/i386/i386-builtins.cc   |  10 +-
 gcc/config/i386/i386-expand.cc |   3 +
 gcc/config/i386/immintrin.h|   4 +
 gcc/config/i386/sse.md |  66 +++-
 gcc/testsuite/gcc.target/i386/avx10-check.h|  61 
 gcc/testsuite/gcc.target/i386/avx10-helper.h   |  23 ++
 gcc/testsuite/gcc.target/i386/avx10-os-support.h   |  23 ++
 .../gcc.target/i386/avx10_2-512-media-1.c  |  52 +++
 .../gcc.target/i386/avx10_2-512-vpdpbssd-2.c   |  71 
 .../gcc.target/i386/avx10_2-512-vpdpbssds-2.c  |  74 +
 .../gcc.target/i386/avx10_2-512-vpdpbsud-2.c   |  71 
 .../gcc.target/i386/avx10_2-512-vpdpbsuds-2.c  |  74 +
 .../gcc.target/i386/avx10_2-512-vpdpbuud-2.c   |  70 
 .../gcc.target/i386/avx10_2-512-vpdpbuuds-2.c  |  73 
 gcc/testsuite/gcc.target/i386/avx10_2-builtin-1.c  |   8 +
 gcc/testsuite/gcc.target/i386/avx10_2-media-1.c|  96 ++
 gcc/testsuite/gcc.target/i386/avx10_2-vpdpbssd-2.c |  16 +
 .../gcc.target/i386/avx10_2-vpdpbssds-2.c  |  16 +
 gcc/testsuite/gcc.target/i386/avx10_2-vpdpbsud-2.c |  16 +
 .../gcc.target/i386/avx10_2-vpdpbsuds-2.c  |  16 +
 gcc/testsuite/gcc.target/i386/avx10_2-vpdpbuud-2.c |  16 +
 .../gcc.target/i386/avx10_2-vpdpbuuds-2.c  |  16 +
 gcc/testsuite/gcc.target/i386/avx512f-helper.h |   6 +-
 .../gcc.target/i386/avxvnniint8-builtin.c  |   8 +
 gcc/testsuite/gcc.target/i386/funcspec-56.inc  |   4 +
 gcc/testsuite/lib/target-supports.exp  |  36 ++
 30 files changed, 1577 insertions(+), 24 deletions(-)

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 2c0f4518638d..22353f2d69e6 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -452,7 +452,8 @@ i[34567]86-*-* | x86_64-*-*)
   cmpccxaddintrin.h amxfp16intrin.h prfchiintrin.h
   raointintrin.h amxcomplexintrin.h avxvnniint16intrin.h
   sm3intrin.h sha512intrin.h sm4intrin.h
-  usermsrintrin.h avx10_2roundingintrin.h"
+  us

[gcc r15-3177] [PATCH 2/2] AVX10.2: Support media instructions

2024-08-25 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:af0a06274fce2ca64456f5b13b4bc8ff864a45e4

commit r15-3177-gaf0a06274fce2ca64456f5b13b4bc8ff864a45e4
Author: Haochen Jiang 
Date:   Mon Aug 26 10:53:39 2024 +0800

[PATCH 2/2] AVX10.2: Support media instructions

gcc/ChangeLog:

* config/i386/avx10_2-512mediaintrin.h: Add new intrins.
* config/i386/avx10_2mediaintrin.h: Ditto.
* config/i386/i386-builtin.def: Add new builtins.
* config/i386/i386-builtins.cc (def_builtin): Handle shared
builtins between AVXVNNIINT16 and AVX10.2.
* config/i386/i386-expand.cc (ix86_check_builtin_isa_match):
Ditto.
* config/i386/sse.md (unspec): Add UNSPEC_VDPPHPS.
(avx10_2_mpsadbw): New define_insn.
(_mpsadbw): Ditto.
(vpdp_): Add AVX10_2_256.
(vpdp_v16si): New define_insn.
(vpdp__mask): Ditto.
(*vpdp__maskz): Ditto.
(vpdp__maskz): New expander.
(vdpphps_): New define_insn.
(vdpphps__mask): Ditto.
(*vdpphps__maskz): Ditto.
(vdpphps__maskz): New expander.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avxvnniint16-1.c: Add new macro test.
* gcc.target/i386/avx-1.c: Ditto.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-512-media-1.c: Add test.
* gcc.target/i386/avx10_2-media-1.c: Ditto.
* gcc.target/i386/avxvnniint16-builtin.c: New test.
* gcc.target/i386/avx10_2-512-vdpphps-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vmpsadbw-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpwsud-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpwsuds-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpwusd-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpwusds-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpwuud-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpwuuds-2.c: Ditto.
* gcc.target/i386/avx10_2-builtin-2.c: Ditto.
* gcc.target/i386/avx10_2-vdpphps-2.c: Ditto.
* gcc.target/i386/avx10_2-vmpsadbw-2.c: Ditto.
* gcc.target/i386/avx10_2-vpdpwsud-2.c: Ditto.
* gcc.target/i386/avx10_2-vpdpwsuds-2.c: Ditto.
* gcc.target/i386/avx10_2-vpdpwusd-2.c: Ditto.
* gcc.target/i386/avx10_2-vpdpwusds-2.c: Ditto.
* gcc.target/i386/avx10_2-vpdpwuud-2.c: Ditto.
* gcc.target/i386/avx10_2-vpdpwuuds-2.c: Ditto.

Co-authored-by: Hongyu Wang 

Diff:
---
 gcc/config/i386/avx10_2-512mediaintrin.h   | 280 
 gcc/config/i386/avx10_2mediaintrin.h   | 472 +
 gcc/config/i386/i386-builtin.def   |  76 +++-
 gcc/config/i386/i386-builtins.cc   |  11 +-
 gcc/config/i386/i386-expand.cc |   3 +
 gcc/config/i386/sse.md | 145 ++-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   8 +
 .../gcc.target/i386/avx10_2-512-media-1.c  |  60 +++
 .../gcc.target/i386/avx10_2-512-vdpphps-2.c|  71 
 .../gcc.target/i386/avx10_2-512-vmpsadbw-2.c   |  93 
 .../gcc.target/i386/avx10_2-512-vpdpwsud-2.c   |  71 
 .../gcc.target/i386/avx10_2-512-vpdpwsuds-2.c  |  74 
 .../gcc.target/i386/avx10_2-512-vpdpwusd-2.c   |  71 
 .../gcc.target/i386/avx10_2-512-vpdpwusds-2.c  |  74 
 .../gcc.target/i386/avx10_2-512-vpdpwuud-2.c   |  70 +++
 .../gcc.target/i386/avx10_2-512-vpdpwuuds-2.c  |  73 
 gcc/testsuite/gcc.target/i386/avx10_2-builtin-2.c  |   8 +
 gcc/testsuite/gcc.target/i386/avx10_2-media-1.c| 112 +
 gcc/testsuite/gcc.target/i386/avx10_2-vdpphps-2.c  |  16 +
 gcc/testsuite/gcc.target/i386/avx10_2-vmpsadbw-2.c |  16 +
 gcc/testsuite/gcc.target/i386/avx10_2-vpdpwsud-2.c |  16 +
 .../gcc.target/i386/avx10_2-vpdpwsuds-2.c  |  16 +
 gcc/testsuite/gcc.target/i386/avx10_2-vpdpwusd-2.c |  16 +
 .../gcc.target/i386/avx10_2-vpdpwusds-2.c  |  16 +
 gcc/testsuite/gcc.target/i386/avx10_2-vpdpwuud-2.c |  16 +
 .../gcc.target/i386/avx10_2-vpdpwuuds-2.c  |  16 +
 gcc/testsuite/gcc.target/i386/avxvnniint16-1.c |  42 +-
 .../gcc.target/i386/avxvnniint16-builtin.c |   8 +
 gcc/testsuite/gcc.target/i386/sse-13.c |   8 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  11 +
 gcc/testsuite/gcc.target/i386/sse-22.c |  11 +
 gcc/testsuite/gcc.target/i386/sse-23.c |   8 +
 32 files changed, 1953 insertions(+), 35 deletions(-)

diff --git a/gcc/config/i386/avx10_2-512mediaintrin.h 
b/gcc/config/i386/avx10_2-512mediaintrin.h
index 02d826b24cd2..e471c83b1c48 100644
--- a/gcc/config/i386/avx10_2-512mediaintr

[gcc r15-3179] [PATCH 1/2] AVX10.2: Support BF16 instructions

2024-08-25 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:9023662464ac7a0bbac72d94078ea0845bf99c86

commit r15-3179-g9023662464ac7a0bbac72d94078ea0845bf99c86
Author: konglin1 
Date:   Mon Aug 26 10:53:43 2024 +0800

[PATCH 1/2] AVX10.2: Support BF16 instructions

gcc/ChangeLog:

* config.gcc: Add avx10_2-512bf16intrin.h and avx10_2bf16intrin.h.
* config/i386/i386-builtin-types.def: Add new
DEF_FUNCTION_TYPE for V32BF_FTYPE_V32BF_V32BF,
V16BF_FTYPE_V16BF_V16BF, V8BF_FTYPE_V8BF_V8BF,
V8BF_FTYPE_V8BF_V8BF_UQI, V16BF_FTYPE_V16BF_V16BF_UHI,
V32BF_FTYPE_V32BF_V32BF_USI, V32BF_FTYPE_V32BF_V32BF_V32BF_USI,
V8BF_FTYPE_V8BF_V8BF_V8BF_UQI and V16BF_FTYPE_V16BF_V16BF_V16BF_UHI.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_args_builtin):
Handle new DEF_FUNCTION_TYPE.
* config/i386/immintrin.h: Include avx10_2-512bf16intrin.h and
avx10_2bf16intrin.h.
* config/i386/sse.md
(VBF_AVX10_2): New iterator.
(avx10_2_scalefpbf16_): New define_insn.
(avx10_2_nepbf16_): Ditto.
(avx10_2_nepbf16_): Ditto.
(avx10_2_fmaddnepbf16__maskz): New expander.
(avx10_2_fnmaddnepbf16__maskz): Ditto.
(avx10_2_fmsubnepbf16__maskz): Ditto.
(avx10_2_fnmsubnepbf16__maskz): Ditto.
(avx10_2_fmaddnepbf16_): New define_insn.
(avx10_2_fmaddnepbf16__mask): Ditto.
(avx10_2_fmaddnepbf16__mask3): Ditto.
(avx10_2_fnmaddnepbf16_): Ditto.
(avx10_2_fnmaddnepbf16__mask): Ditto.
(avx10_2_fnmaddnepbf16__mask3): Ditto.
(avx10_2_fmsubnepbf16_): Ditto.
(avx10_2_fmsubnepbf16__mask): Ditto.
(avx10_2_fmsubnepbf16__mask3): Ditto.
(avx10_2_fnmsubnepbf16_): Ditto.
(avx10_2_fnmsubnepbf16__mask): Ditto.
(avx10_2_fnmsubnepbf16__mask3): Ditto.
* config/i386/avx10_2-512bf16intrin.h: New file.
* config/i386/avx10_2bf16intrin.h: Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512f-helper.h: Add MAKE_MASK_MERGE and
MAKE_MASK_ZERO for bf16_uw.
* gcc.target/i386/m512-check.h: Add union512bf16_uw,
union256bf16_uw, union128bf16_uw and CHECK_EXP for them.
* gcc.target/i386/avx10-helper.h: New file.
* gcc.target/i386/avx10_2-512-bf16-1.c: New test.
* gcc.target/i386/avx10_2-512-vaddnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vdivnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vfmaddXXXnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vfmsubXXXnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vfnmaddXXXnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vfnmsubXXXnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vmaxpbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vminpbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vscalefpbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vsubnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-bf16-1.c: Ditto.
* gcc.target/i386/avx10_2-vaddnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vdivnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vfmaddXXXnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vfmsubXXXnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vfnmaddXXXnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vfnmsubXXXnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vmaxpbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vminpbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vmulnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vscalefpbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vsubnepbf16-2.c: Ditto.

Co-authored-by: Levy Hsu 

Diff:
---
 gcc/config.gcc |   2 +-
 gcc/config/i386/avx10_2-512bf16intrin.h| 364 +++
 gcc/config/i386/avx10_2bf16intrin.h| 685 +
 gcc/config/i386/i386-builtin-types.def |   9 +
 gcc/config/i386/i386-builtin.def   |  78 +++
 gcc/config/i386/i386-expand.cc |   9 +
 gcc/config/i386/immintrin.h|   4 +
 gcc/config/i386/sse.md | 293 +
 gcc/testsuite/gcc.target/i386/avx10-helper.h   |  48 +-
 gcc/testsuite/gcc.target/i386/avx10_2-512-bf16-1.c |  87 +++
 .../gcc.target/i386/avx10_2-512-vaddnepbf16-2.c|  49 ++
 .../gcc.target/i386/avx10_2-512-vdivnepbf16-2.c|  49 ++
 .../i386/avx10_2-512-vfmaddXXXnepbf16-2.c  |  52 ++
 .../i386/avx10_2-512-vfmsubXXXnepbf16-2.c  |  53 ++
 .../i386/avx10_2-512-vfnmaddXXXnepbf16-2.c |  53 ++
 .../i386/avx10_2-

[gcc r15-3180] [PATCH 2/2] AVX10.2: Support BF16 instructions

2024-08-25 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:5cb67ddd8240610f39c211b2f73070dc70b0230b

commit r15-3180-g5cb67ddd8240610f39c211b2f73070dc70b0230b
Author: konglin1 
Date:   Mon Aug 26 10:53:45 2024 +0800

[PATCH 2/2] AVX10.2: Support BF16 instructions

gcc/ChangeLog:

* config/i386/avx10_2-512bf16intrin.h: Add new intrinsics.
* config/i386/avx10_2bf16intrin.h: Ditto.
* config/i386/i386-builtin-types.def : Add new DEF_FUNCTION_TYPE
for new type.
* config/i386/i386-builtin.def (BDESC): Add new builtin.
* config/i386/i386-expand.cc (ix86_expand_args_builtin):
Handle new type.
* config/i386/sse.md (vecmemsuffix): Add vector BF mode.
(avx10_2_rsqrtpbf16_): New define_insn.
(avx10_2_sqrtnepbf16_): Ditto.
(avx10_2_rcppbf16_): Ditto.
(avx10_2_getexppbf16_): Ditto.
(BF16IMMOP): New iterator.
(bf16immop): Ditto.
(avx10_2_pbf16_): New define_insn.
(avx10_2_fpclasspbf16_): Ditto.
(avx10_2_cmppbf16_): Ditto.
(avx10_2_comsbf16_v8bf): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx10-check.h: Add AVX10_SCALAR.
* gcc.target/i386/avx10-helper.h: Add helper functions.
* gcc.target/i386/avx10_2-512-bf16-1.c: Add new tests.
* gcc.target/i386/avx10_2-bf16-1.c: Ditto.
* gcc.target/i386/avx-1.c: Add macros.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-512-vcmppbf16-2.c: New test.
* gcc.target/i386/avx10_2-512-vfpclasspbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vgetexppbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vgetmantpbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vrcppbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vreducenepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vrndscalenepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vrsqrtpbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vsqrtnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vcmppbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vcomsbf16-1.c: Ditto.
* gcc.target/i386/avx10_2-vcomsbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vfpclasspbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vgetexppbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vgetmantpbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vrcppbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vreducenepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vrndscalenepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vrsqrtpbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vsqrtnepbf16-2.c: Ditto.

Co-authored-by: Levy Hsu 

Diff:
---
 gcc/config/i386/avx10_2-512bf16intrin.h| 317 ++
 gcc/config/i386/avx10_2bf16intrin.h| 650 +
 gcc/config/i386/i386-builtin-types.def |  10 +
 gcc/config/i386/i386-builtin.def   |  33 ++
 gcc/config/i386/i386-expand.cc |  16 +
 gcc/config/i386/sse.md |  92 +++
 gcc/testsuite/gcc.target/i386/avx-1.c  |  19 +
 gcc/testsuite/gcc.target/i386/avx10-check.h|   4 +-
 gcc/testsuite/gcc.target/i386/avx10-helper.h   |  28 +
 gcc/testsuite/gcc.target/i386/avx10_2-512-bf16-1.c |  58 ++
 .../gcc.target/i386/avx10_2-512-vcmppbf16-2.c  |  36 ++
 .../gcc.target/i386/avx10_2-512-vfpclasspbf16-2.c  |  44 ++
 .../gcc.target/i386/avx10_2-512-vgetexppbf16-2.c   |  47 ++
 .../gcc.target/i386/avx10_2-512-vgetmantpbf16-2.c  |  50 ++
 .../gcc.target/i386/avx10_2-512-vrcppbf16-2.c  |  45 ++
 .../gcc.target/i386/avx10_2-512-vreducenepbf16-2.c |  50 ++
 .../i386/avx10_2-512-vrndscalenepbf16-2.c  |  46 ++
 .../gcc.target/i386/avx10_2-512-vrsqrtpbf16-2.c|  47 ++
 .../gcc.target/i386/avx10_2-512-vsqrtnepbf16-2.c   |  47 ++
 gcc/testsuite/gcc.target/i386/avx10_2-bf16-1.c | 114 
 .../gcc.target/i386/avx10_2-vcmppbf16-2.c  |  16 +
 .../gcc.target/i386/avx10_2-vcomsbf16-1.c  |  19 +
 .../gcc.target/i386/avx10_2-vcomsbf16-2.c  |  58 ++
 .../gcc.target/i386/avx10_2-vfpclasspbf16-2.c  |  16 +
 .../gcc.target/i386/avx10_2-vgetexppbf16-2.c   |  16 +
 .../gcc.target/i386/avx10_2-vgetmantpbf16-2.c  |  16 +
 .../gcc.target/i386/avx10_2-vrcppbf16-2.c  |  16 +
 .../gcc.target/i386/avx10_2-vreducenepbf16-2.c |  16 +
 .../gcc.target/i386/avx10_2-vrndscalenepbf16-2.c   |  16 +
 .../gcc.target/i386/avx10_2-vrsqrtpbf16-2.c|  16 +
 .../gcc.target/i386/avx10_2-vsqrtnepbf16-2.c   |  16 +
 gcc/testsuite/gcc.target/i386/sse-13.c |  19 +
 gcc/testsui

[gcc r15-3181] [PATCH 1/2] AVX10.2: Support saturating convert instructions

2024-08-25 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:e2c80d237223f8524c2bd930b681aa891a13db99

commit r15-3181-ge2c80d237223f8524c2bd930b681aa891a13db99
Author: Hu, Lin1 
Date:   Mon Aug 26 10:53:47 2024 +0800

[PATCH 1/2] AVX10.2: Support saturating convert instructions

gcc/ChangeLog:

* config.gcc: Add avx10_2satcvtintrin.h and
avx10_2-512satcvtintrin.h.
* config/i386/i386-builtin-types.def:
Add DEF_FUNCTION_TYPE (V8HI, V8BF, V8HI, UQI),
(V16HI, V16BF, V16HI, UHI), (V32HI, V32BF, V32HI, USI),
(V16SI, V16SF, V16SI, UHI, INT), (V16HI, V16BF, V16HI, UHI, INT),
(V32HI, V32BF, V32HI, USI, INT).
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_args_builtin): Handle
V32HI_FTYPE_V32BF_V32HI_USI, V16HI_FTYPE_V16BF_V16HI_UHI,
V8HI_FTYPE_V8BF_V8HI_UQI.
(ix86_expand_round_builtin): Handle V32HI_FTYPE_V32BF_V32HI_USI_INT,
V16SI_FTYPE_V16SF_V16SI_UHI_INT, V16HI_FTYPE_V16BF_V16HI_UHI_INT.
* config/i386/immintrin.h: Include avx10_2satcvtintrin.h and
avx10_2-512satcvtintrin.h.
* config/i386/sse.md:
(UNSPEC_CVTNE_BF16_IBS_ITER): New iterator.
(sat_cvt_sign_prefix): Ditto.
(sat_cvt_trunc_prefix): Ditto.
(UNSPEC_CVT_PH_IBS_ITER): Ditto.
(UNSPEC_CVTT_PH_IBS_ITER): Ditto.
(UNSPEC_CVT_PS_IBS_ITER): Ditto.
(UNSPEC_CVTT_PS_IBS_ITER): Ditto.

(avx10_2_cvtnebf162ibs):
New define_insn.

(avx10_2_cvtph2ibs):
Ditto.

(avx10_2_cvttph2ibs):
Ditto.

(avx10_2_cvtps2ibs):
Ditto.

(avx10_2_cvttps2ibs):
Ditto.
* config/i386/avx10_2-512satcvtintrin.h: New file.
* config/i386/avx10_2satcvtintrin.h: Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add macros.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx512f-helper.h: Add new test macro.
* gcc.target/i386/m512-check.h: Add new type.
* gcc.target/i386/avx10_2-512-satcvt-1.c: New test.
* gcc.target/i386/avx10_2-512-vcvtnebf162ibs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvtnebf162iubs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvtph2ibs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvtph2iubs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvtps2ibs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvtps2iubs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttnebf162ibs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttnebf162iubs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttph2ibs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttph2iubs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttps2ibs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttps2iubs-2.c: Ditto.
* gcc.target/i386/avx10_2-satcvt-1.c: Ditto.
* gcc.target/i386/avx10_2-vcvtnebf162ibs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvtnebf162iubs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvtph2ibs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvtph2iubs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvtps2ibs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvttnebf162ibs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvttnebf162iubs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvttph2ibs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvttph2iubs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvttps2ibs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvttps2iubs-2.c: Ditto.

Diff:
---
 gcc/config.gcc |4 +-
 gcc/config/i386/avx10_2-512satcvtintrin.h  |  624 
 gcc/config/i386/avx10_2satcvtintrin.h  | 1022 
 gcc/config/i386/i386-builtin-types.def |6 +
 gcc/config/i386/i386-builtin.def   |   36 +
 gcc/config/i386/i386-expand.cc |6 +
 gcc/config/i386/immintrin.h|3 +
 gcc/config/i386/sse.md |  109 +++
 gcc/testsuite/gcc.target/i386/avx-1.c  |   20 +
 .../gcc.target/i386/avx10_2-512-satcvt-1.c |  100 ++
 .../gcc.target/i386/avx10_2-512-vcvtnebf162ibs-2.c |   69 ++
 .../i386/avx10_2-512-vcvtnebf162iubs-2.c   |   69 ++
 .../gcc.target/i386/avx10_2-512-vcvtph2ibs-2.c |   74 ++
 .../gcc.target/i386/avx10_2-512-vcvtph2iubs-2.c|   74 ++
 .../gcc.target/i386/avx10_2-512-vcvtps2ibs-2.c |   75 ++
 .../gcc.target/i386/avx10_2-512-vcvtps2iubs-2.c

[gcc r15-3182] [PATCH 2/2] AVX10.2: Support saturating convert instructions

2024-08-25 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:3a97ce179f75ec32b7f591422ba254c814567e4d

commit r15-3182-g3a97ce179f75ec32b7f591422ba254c814567e4d
Author: Hu, Lin1 
Date:   Mon Aug 26 10:53:49 2024 +0800

[PATCH 2/2] AVX10.2: Support saturating convert instructions

gcc/ChangeLog:

* config/i386/avx10_2-512satcvtintrin.h: Add new intrin.
* config/i386/avx10_2satcvtintrin.h: Ditto.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md (VF1_VF2_AVX10_2): New iterator.
(VF2_AVX10_2): Ditto.
(VI8_AVX10_2): Ditto.
(sat_cvt_sign_prefix): Add new UNSPEC.
(UNSPEC_SAT_CVT_DS_SIGN_ITER): New iterator.
(pd2dqssuff): Ditto.

(avx10_2_vcvtt2dqs):
New.

(avx10_2_vcvttpd2qqs):
Ditto.

(avx10_2_vcvttps2qqs):
Ditto.

(avx10_2_vcvttsd2sis):
Ditto.

(avx10_2_vcvttss2sis):
Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add macros.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-satcvt-1.c: Add test.
* gcc.target/i386/avx10_2-512-satcvt-1.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttpd2dqs-2.c: New test.
* gcc.target/i386/avx10_2-512-vcvttpd2qqs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttpd2udqs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttpd2uqqs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttps2dqs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttps2qqs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttps2udqs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttps2uqqs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvttpd2dqs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvttpd2qqs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvttpd2udqs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvttpd2uqqs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvttps2dqs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvttps2qqs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvttps2udqs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvttps2uqqs-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvttsd2sis-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvttsd2usis-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvttss2sis-2.c: Ditto.
* gcc.target/i386/avx10_2-vcvttss2usis-2.c: Ditto.

Diff:
---
 gcc/config/i386/avx10_2-512satcvtintrin.h  |  456 +
 gcc/config/i386/avx10_2satcvtintrin.h  | 1055 +++-
 gcc/config/i386/i386-builtin.def   |   32 +
 gcc/config/i386/sse.md |   83 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   26 +
 .../gcc.target/i386/avx10_2-512-satcvt-1.c |   59 ++
 .../gcc.target/i386/avx10_2-512-vcvttpd2dqs-2.c|   72 ++
 .../gcc.target/i386/avx10_2-512-vcvttpd2qqs-2.c|   72 ++
 .../gcc.target/i386/avx10_2-512-vcvttpd2udqs-2.c   |   72 ++
 .../gcc.target/i386/avx10_2-512-vcvttpd2uqqs-2.c   |   72 ++
 .../gcc.target/i386/avx10_2-512-vcvttps2dqs-2.c|   72 ++
 .../gcc.target/i386/avx10_2-512-vcvttps2qqs-2.c|   73 ++
 .../gcc.target/i386/avx10_2-512-vcvttps2udqs-2.c   |   72 ++
 .../gcc.target/i386/avx10_2-512-vcvttps2uqqs-2.c   |   72 ++
 gcc/testsuite/gcc.target/i386/avx10_2-satcvt-1.c   |  138 +++
 .../gcc.target/i386/avx10_2-vcvttpd2dqs-2.c|   16 +
 .../gcc.target/i386/avx10_2-vcvttpd2qqs-2.c|   16 +
 .../gcc.target/i386/avx10_2-vcvttpd2udqs-2.c   |   16 +
 .../gcc.target/i386/avx10_2-vcvttpd2uqqs-2.c   |   16 +
 .../gcc.target/i386/avx10_2-vcvttps2dqs-2.c|   16 +
 .../gcc.target/i386/avx10_2-vcvttps2qqs-2.c|   16 +
 .../gcc.target/i386/avx10_2-vcvttps2udqs-2.c   |   16 +
 .../gcc.target/i386/avx10_2-vcvttps2uqqs-2.c   |   16 +
 .../gcc.target/i386/avx10_2-vcvttsd2sis-2.c|   47 +
 .../gcc.target/i386/avx10_2-vcvttsd2usis-2.c   |   47 +
 .../gcc.target/i386/avx10_2-vcvttss2sis-2.c|   47 +
 .../gcc.target/i386/avx10_2-vcvttss2usis-2.c   |   46 +
 gcc/testsuite/gcc.target/i386/sse-13.c |   26 +
 gcc/testsuite/gcc.target/i386/sse-14.c |   58 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |   58 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   26 +
 31 files changed, 2869 insertions(+), 40 deletions(-)

diff --git a/gcc/config/i386/avx10_2-512satcvtintrin.h 
b/gcc/config/i386/avx10_2-512satcvtintrin.h
index 4286458c413a..d625a6449481 100644
--- a/gcc/config/i386/avx10_2-512satcvtintrin.h
+++ b/gcc/config/i386/avx10_2-512satcvtintrin.h
@@ -438,6 +438,286 @@ _mm512_maskz_ipcvtt_roundps_epu32 (__mmask1

[gcc r15-3183] AVX10.2: Support minmax instructions

2024-08-25 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:889f6dd0d8c7317f62578c900c0f662e919786a2

commit r15-3183-g889f6dd0d8c7317f62578c900c0f662e919786a2
Author: Mo, Zewei 
Date:   Mon Aug 26 10:53:50 2024 +0800

AVX10.2: Support minmax instructions

gcc/ChangeLog:

* config.gcc: Add avx10_2-512minmaxintrin.h and
avx10_2minmaxintrin.h.
* config/i386/i386-builtin-types.def:
Add DEF_FUNCTION_TYPE (V8BF, V8BF, V8BF, INT, V8BF, UQI),
(V16BF, V16BF, V16BF, INT, V16BF, UHI),
(V32BF, V32BF, V32BF, INT, V32BF, USI),
(V8HF, V8HF, V8HF, INT, V8HF, UQI),
(V8DF, V8DF, V8DF, INT, V8DF, UQI, INT),
(V32HF, V32HF, V32HF, INT, V32HF, USI, INT),
(V16HF, V16HF, V16HF, INT, V16HF, UHI, INT),
(V16SF, V16SF, V16SF, INT, V16SF, UHI, INT).
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc
(ix86_expand_args_builtin): Handle 
V8BF_FTYPE_V8BF_V8BF_INT_V8BF_UQI,
V16BF_FTYPE_V16BF_V16BF_INT_V16BF_UHI,
V32BF_FTYPE_V32BF_V32BF_INT_V32BF_USI,
V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI,
(ix86_expand_round_builtin): Handle 
V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI_INT,
V32HF_FTYPE_V32HF_V32HF_INT_V32HF_USI_INT,
V16HF_FTYPE_V16HF_V16HF_INT_V16HF_UHI_INT,
V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI_INT.
* config/i386/immintrin.h: Include avx10_2-512minmaxintrin.h and
avx10_2minmaxintrin.h.
* config/i386/sse.md (VFH_AVX10_2): New.
(avx10_2_vminmaxnepbf16_): New define_insn.
(avx10_2_minmaxp): Ditto.

(avx10_2_minmaxs): Ditto.
* config/i386/avx10_2-512minmaxintrin.h: New file.
* config/i386/avx10_2minmaxintrin.h: Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add macros.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx512f-helper.h: Add helper function.
* gcc.target/i386/avx10-minmax-helper.h: New helper file.
* gcc.target/i386/avx10_2-512-minmax-1.c: New test.
* gcc.target/i386/avx10_2-512-vminmaxnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vminmaxpd-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vminmaxph-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vminmaxps-2.c: Ditto.
* gcc.target/i386/avx10_2-minmax-1.c: Ditto.
* gcc.target/i386/avx10_2-vminmaxnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-vminmaxsd-2.c: Ditto.
* gcc.target/i386/avx10_2-vminmaxsh-2.c: Ditto.
* gcc.target/i386/avx10_2-vminmaxss-2.c: Ditto.
* gcc.target/i386/avx10_2-vminmaxpd-2.c: Ditto.
* gcc.target/i386/avx10_2-vminmaxph-2.c: Ditto.
* gcc.target/i386/avx10_2-vminmaxps-2.c: Ditto.

Co-authored-by: Hu, Lin1 
Co-authored-by: Haochen Jiang 

Diff:
---
 gcc/config.gcc |3 +-
 gcc/config/i386/avx10_2-512minmaxintrin.h  |  489 +
 gcc/config/i386/avx10_2minmaxintrin.h  | 1063 
 gcc/config/i386/i386-builtin-types.def |8 +
 gcc/config/i386/i386-builtin.def   |   15 +
 gcc/config/i386/i386-expand.cc |8 +
 gcc/config/i386/immintrin.h|5 +
 gcc/config/i386/sse.md |   46 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   19 +
 .../gcc.target/i386/avx10-minmax-helper.h  |  257 +
 .../gcc.target/i386/avx10_2-512-minmax-1.c |   51 +
 .../gcc.target/i386/avx10_2-512-vminmaxnepbf16-2.c |   35 +
 .../gcc.target/i386/avx10_2-512-vminmaxpd-2.c  |   35 +
 .../gcc.target/i386/avx10_2-512-vminmaxph-2.c  |   35 +
 .../gcc.target/i386/avx10_2-512-vminmaxps-2.c  |   35 +
 gcc/testsuite/gcc.target/i386/avx10_2-minmax-1.c   |  122 +++
 .../gcc.target/i386/avx10_2-vminmaxnepbf16-2.c |   13 +
 .../gcc.target/i386/avx10_2-vminmaxpd-2.c  |   13 +
 .../gcc.target/i386/avx10_2-vminmaxph-2.c  |   15 +
 .../gcc.target/i386/avx10_2-vminmaxps-2.c  |   13 +
 .../gcc.target/i386/avx10_2-vminmaxsd-2.c  |   34 +
 .../gcc.target/i386/avx10_2-vminmaxsh-2.c  |   34 +
 .../gcc.target/i386/avx10_2-vminmaxss-2.c  |   34 +
 gcc/testsuite/gcc.target/i386/avx512f-helper.h |2 +
 gcc/testsuite/gcc.target/i386/sse-13.c |   19 +
 gcc/testsuite/gcc.target/i386/sse-14.c |   67 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |   67 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   19 +
 28 files changed, 2555 insertions(+), 1 deletion(-)

diff --git a/gcc/config.gcc b/gcc/config.

[gcc r15-3184] AVX10.2: Support vector copy instructions

2024-08-25 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:f6fe2962daf7b8d8532c768c3b9eab00f99cce5b

commit r15-3184-gf6fe2962daf7b8d8532c768c3b9eab00f99cce5b
Author: Zhang, Jun 
Date:   Mon Aug 26 10:53:52 2024 +0800

AVX10.2: Support vector copy instructions

gcc/ChangeLog:

* config.gcc: Add avx10_2copyintrin.h.
* config/i386/i386.md (avx10_2): New isa attribute.
* config/i386/immintrin.h: Include avx10_2copyintrin.h.
* config/i386/sse.md
(sse_movss_): Add new constraints to handle AVX10.2.
(vec_set_0): Ditto.
(@vec_set_0): Ditto.
(vec_set_0): Ditto.
(avx512fp16_mov): Ditto.
(*vec_set_0_1): New split.
* config/i386/avx10_2copyintrin.h: New file.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx10_2-vmovd-1.c: New test.
* gcc.target/i386/avx10_2-vmovd-2.c: Ditto.
* gcc.target/i386/avx10_2-vmovw-1.c: Ditto.
* gcc.target/i386/avx10_2-vmovw-2.c: Ditto.

Diff:
---
 gcc/config.gcc  |   3 +-
 gcc/config/i386/avx10_2copyintrin.h |  38 +++
 gcc/config/i386/i386.md |   3 +-
 gcc/config/i386/immintrin.h |   2 +
 gcc/config/i386/sse.md  | 138 +++-
 gcc/testsuite/gcc.target/i386/avx10_2-vmovd-1.c |  48 +
 gcc/testsuite/gcc.target/i386/avx10_2-vmovd-2.c |  44 
 gcc/testsuite/gcc.target/i386/avx10_2-vmovw-1.c |  69 
 gcc/testsuite/gcc.target/i386/avx10_2-vmovw-2.c |  64 +++
 9 files changed, 356 insertions(+), 53 deletions(-)

diff --git a/gcc/config.gcc b/gcc/config.gcc
index cd8a34b292fd..e887c9c74321 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -457,7 +457,8 @@ i[34567]86-*-* | x86_64-*-*)
   avx10_2convertintrin.h avx10_2-512convertintrin.h
   avx10_2bf16intrin.h avx10_2-512bf16intrin.h
   avx10_2satcvtintrin.h avx10_2-512satcvtintrin.h
-  avx10_2minmaxintrin.h avx10_2-512minmaxintrin.h"
+  avx10_2minmaxintrin.h avx10_2-512minmaxintrin.h
+  avx10_2copyintrin.h"
;;
 ia64-*-*)
extra_headers=ia64intrin.h
diff --git a/gcc/config/i386/avx10_2copyintrin.h 
b/gcc/config/i386/avx10_2copyintrin.h
new file mode 100644
index ..f1150c71dbf0
--- /dev/null
+++ b/gcc/config/i386/avx10_2copyintrin.h
@@ -0,0 +1,38 @@
+/* Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of GCC.
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <avx10_2copyintrin.h> directly; include <immintrin.h> 
instead."
+#endif
+
+#ifndef _AVX10_2COPYINTRIN_H_INCLUDED
+#define _AVX10_2COPYINTRIN_H_INCLUDED
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+_mm_move_epi32 (__m128i __A)
+{
+  return _mm_set_epi32 (0, 0, 0, ((__v4si) __A)[0]);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+_mm_move_epi16 (__m128i __A)
+{
+  return _mm_set_epi16 (0, 0, 0, 0, 0, 0, 0, ((__v8hi) __A)[0]);
+}
+
+#endif /* _AVX10_2COPYINTRIN_H_INCLUDED */
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 36108e5c2c9e..34f9214115ea 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -579,7 +579,7 @@
noavx512dq,fma_or_avx512vl,avx512vl,noavx512vl,avxvnni,
avx512vnnivl,avx512fp16,avxifma,avx512ifmavl,avxneconvert,
avx512bf16vl,vpclmulqdqvl,avx_noavx512f,avx_noavx512vl,
-   vaes_avx512vl,noapx_nf"
+   vaes_avx512vl,noapx_nf,avx10_2"
   (const_string "base"))
 
 ;; The (bounding maximum) length of an instruction immediate.
@@ -976,6 +976,7 @@
   (symbol_ref "TARGET_APX_NDD && Pmode == DImode")
 (eq_attr "isa" "vaes_avx512vl")
   (symbol_ref "TARGET_VAES && TARGET_AVX512VL")
+(eq_attr "isa" "avx10_2") (symbol_ref "TARGET_AVX10_2_256")
 
 (

[gcc r15-3185] AVX10.2: Support compare instructions

2024-08-25 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:576bd309ded9dfe258023f26924c064a7bf12875

commit r15-3185-g576bd309ded9dfe258023f26924c064a7bf12875
Author: Zhang, Jun 
Date:   Mon Aug 26 10:53:54 2024 +0800

AVX10.2: Support compare instructions

gcc/ChangeLog:

* config/i386/i386-expand.cc
(ix86_ssecom_setcc): Mention behavior change on flags.
(ix86_expand_sse_comi): Handle AVX10.2 behavior.
(ix86_expand_sse_comi_round): Ditto.
(ix86_expand_round_builtin): Ditto.
(ix86_expand_builtin): Change function call.
* config/i386/i386.md (UNSPEC_COMX): New unspec.
* config/i386/sse.md
(avx10_2_vcomx): New.
(_comi): Add HFmode.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx10_2-compare-1.c: New test.

Co-authored-by: Haochen Jiang 
Co-authored-by: Hongtao Liu 

Diff:
---
 gcc/config/i386/i386-expand.cc| 170 ++
 gcc/config/i386/i386.md   |   1 +
 gcc/config/i386/sse.md|  18 ++-
 gcc/testsuite/gcc.target/i386/avx10_2-compare-1.c |  21 +++
 4 files changed, 183 insertions(+), 27 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 56fc433e9933..d692008ffe7e 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -10601,7 +10601,9 @@ ix86_ssecom_setcc (const enum rtx_code comparison,
   rtx_code_label *label = NULL;
 
   /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
- with NAN operands.  */
+ with NAN operands.
+ Under TARGET_AVX10_2_256, VCOMX/VUCOMX are generated instead of
+ COMI/UCOMI.  VCOMX/VUCOMX will not set ZF for NAN operands.  */
   if (check_unordered)
 {
   gcc_assert (comparison == EQ || comparison == NE);
@@ -10640,7 +10642,7 @@ ix86_ssecom_setcc (const enum rtx_code comparison,
 
 static rtx
 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
- rtx target)
+ rtx target, bool comx_ok)
 {
   rtx pat, set_dst;
   tree arg0 = CALL_EXPR_ARG (exp, 0);
@@ -10673,11 +10675,13 @@ ix86_expand_sse_comi (const struct 
builtin_description *d, tree exp,
 case GE:
   break;
 case EQ:
-  check_unordered = true;
+  if (!TARGET_AVX10_2_256 || !comx_ok)
+   check_unordered = true;
   mode = CCZmode;
   break;
 case NE:
-  check_unordered = true;
+  if (!TARGET_AVX10_2_256 || !comx_ok)
+   check_unordered = true;
   mode = CCZmode;
   const_val = const1_rtx;
   break;
@@ -10696,6 +10700,28 @@ ix86_expand_sse_comi (const struct builtin_description 
*d, tree exp,
   || !insn_p->operand[1].predicate (op1, mode1))
 op1 = copy_to_mode_reg (mode1, op1);
 
+  if ((comparison == EQ || comparison == NE)
+  && TARGET_AVX10_2_256 && comx_ok)
+{
+  switch (icode)
+   {
+   case CODE_FOR_sse_comi:
+ icode = CODE_FOR_avx10_2_comxsf;
+ break;
+   case CODE_FOR_sse_ucomi:
+ icode = CODE_FOR_avx10_2_ucomxsf;
+ break;
+   case CODE_FOR_sse2_comi:
+ icode = CODE_FOR_avx10_2_comxdf;
+ break;
+   case CODE_FOR_sse2_ucomi:
+ icode = CODE_FOR_avx10_2_ucomxdf;
+ break;
+
+   default:
+ gcc_unreachable ();
+   }
+}
   pat = GEN_FCN (icode) (op0, op1);
   if (! pat)
 return 0;
@@ -12190,7 +12216,7 @@ ix86_erase_embedded_rounding (rtx pat)
with rounding.  */
 static rtx
 ix86_expand_sse_comi_round (const struct builtin_description *d,
-   tree exp, rtx target)
+   tree exp, rtx target, bool comx_ok)
 {
   rtx pat, set_dst;
   tree arg0 = CALL_EXPR_ARG (exp, 0);
@@ -12252,6 +12278,7 @@ ix86_expand_sse_comi_round (const struct 
builtin_description *d,
 op1 = safe_vector_operand (op1, mode1);
 
   enum rtx_code comparison = comparisons[INTVAL (op2)];
+  enum rtx_code orig_comp = comparison;
   bool ordered = ordereds[INTVAL (op2)];
   bool non_signaling = non_signalings[INTVAL (op2)];
   rtx const_val = const0_rtx;
@@ -12263,10 +12290,21 @@ ix86_expand_sse_comi_round (const struct 
builtin_description *d,
 case ORDERED:
   if (!ordered)
{
- /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
- if (!non_signaling)
-   ordered = true;
- mode = CCSmode;
+ if (TARGET_AVX10_2_256 && comx_ok)
+   {
+ /* Unlike VCOMI{SH,SS,SD}, VCOMX{SH,SS,SD} will set SF
+differently. So directly return true here.  */
+ target = gen_reg_rtx (SImode);
+ emit_move_insn (target, const1_rtx);
+ return target;
+   }
+ else
+   {
+ /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
+ if (!non_signaling)
+   ordered = true;
+   

[gcc r15-3186] i386: Add bf8 -> fp16 intrin

2024-08-25 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:b4ac2c23d8745d98984954e88f02aa73f1c3594b

commit r15-3186-gb4ac2c23d8745d98984954e88f02aa73f1c3594b
Author: Haochen Jiang 
Date:   Mon Aug 26 10:53:56 2024 +0800

i386: Add bf8 -> fp16 intrin

Since BF8 and FP16 have the same bits for the exponent, the type conversion
between them is just a cast for the fraction part. We will use a sequence
of instructions instead of new instructions to do that. For convenience,
intrins are also provided.

gcc/ChangeLog:

* config/i386/avx10_2-512convertintrin.h
(_mm512_cvtpbf8_ph): New.
(_mm512_mask_cvtpbf8_ph): Ditto.
(_mm512_maskz_cvtpbf8_ph): Ditto.
* config/i386/avx10_2convertintrin.h
(_mm_cvtpbf8_ph): Ditto.
(_mm_mask_cvtpbf8_ph): Ditto.
(_mm_maskz_cvtpbf8_ph): Ditto.
(_mm256_cvtpbf8_ph): Ditto.
(_mm256_mask_cvtpbf8_ph): Ditto.
(_mm256_maskz_cvtpbf8_ph): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx10_2-512-convert-1.c: Add tests for new
intrin.
* gcc.target/i386/avx10_2-convert-1.c: Ditto.

Diff:
---
 gcc/config/i386/avx10_2-512convertintrin.h | 24 +++
 gcc/config/i386/avx10_2convertintrin.h | 48 ++
 .../gcc.target/i386/avx10_2-512-convert-1.c| 16 +++-
 gcc/testsuite/gcc.target/i386/avx10_2-convert-1.c  | 26 ++--
 4 files changed, 109 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/avx10_2-512convertintrin.h 
b/gcc/config/i386/avx10_2-512convertintrin.h
index 4ad339bbbf97..dfbdfc3e51bb 100644
--- a/gcc/config/i386/avx10_2-512convertintrin.h
+++ b/gcc/config/i386/avx10_2-512convertintrin.h
@@ -540,6 +540,30 @@ _mm512_maskz_cvtnesph_phf8 (__mmask32 __U, __m512h __A)
 (__mmask32) __U);
 }
 
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpbf8_ph (__m256i __A)
+{
+  return (__m512h) _mm512_castsi512_ph ((__m512i) _mm512_slli_epi16 (
+(__m512i) _mm512_cvtepi8_epi16 (__A), 8));
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpbf8_ph (__m512h __S, __mmask16 __U, __m256i __A)
+{
+  return (__m512h) _mm512_castsi512_ph ((__m512i) _mm512_mask_slli_epi16 (
+(__m512i) __S, __U, (__m512i) _mm512_cvtepi8_epi16 (__A), 8));
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpbf8_ph (__mmask16 __U, __m256i __A)
+{
+  return (__m512h) _mm512_castsi512_ph ((__m512i) _mm512_slli_epi16 (
+(__m512i) _mm512_maskz_cvtepi8_epi16 (__U, __A), 8));
+}
+
 #ifdef __DISABLE_AVX10_2_512__
 #undef __DISABLE_AVX10_2_512__
 #pragma GCC pop_options
diff --git a/gcc/config/i386/avx10_2convertintrin.h 
b/gcc/config/i386/avx10_2convertintrin.h
index ac62d1290a5c..8d2c1a54147a 100644
--- a/gcc/config/i386/avx10_2convertintrin.h
+++ b/gcc/config/i386/avx10_2convertintrin.h
@@ -970,6 +970,54 @@ _mm256_maskz_cvtnesph_phf8 (__mmask16 __U, __m256h __A)
 (__mmask16) __U);
 }
 
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpbf8_ph (__m128i __A)
+{
+  return (__m128h) _mm_castsi128_ph ((__m128i) _mm_slli_epi16 (
+(__m128i) _mm_cvtepi8_epi16 (__A), 8));
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtpbf8_ph (__m128h __S, __mmask8 __U, __m128i __A)
+{
+  return (__m128h) _mm_castsi128_ph ((__m128i) _mm_mask_slli_epi16 (
+(__m128i) __S, __U, (__m128i) _mm_cvtepi8_epi16 (__A), 8));
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtpbf8_ph (__mmask8 __U, __m128i __A)
+{
+  return (__m128h) _mm_castsi128_ph ((__m128i) _mm_slli_epi16 (
+(__m128i) _mm_maskz_cvtepi8_epi16 (__U, __A), 8));
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpbf8_ph (__m128i __A)
+{
+  return (__m256h) _mm256_castsi256_ph ((__m256i) _mm256_slli_epi16 (
+(__m256i) _mm256_cvtepi8_epi16 (__A), 8));
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtpbf8_ph (__m256h __S, __mmask8 __U, __m128i __A)
+{
+  return (__m256h) _mm256_castsi256_ph ((__m256i) _mm256_mask_slli_epi16 (
+(__m256i) __S, __U, (__m256i) _mm256_cvtepi8_epi16 (__A), 8));
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtpbf8_ph (__mmask8 __U, __m128i __A)
+{
+  return (__m256h) _mm256_castsi256_ph ((__m256i) _mm256_slli_epi16 (
+(__m256i) _mm256_maskz_cvtepi8_epi16 (__U, __A), 8));
+}
+
 #ifdef __DISABLE_AVX10_2_256__
 #undef __DISABLE_AV

[gcc r15-3358] i386: Support vectorized BF16 sqrt with AVX10.2 instruction

2024-09-01 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:e19f65b0be1e91ff86689feb7695080dad4c9197

commit r15-3358-ge19f65b0be1e91ff86689feb7695080dad4c9197
Author: Levy Hsu 
Date:   Mon Sep 2 10:24:48 2024 +0800

i386: Support vectorized BF16 sqrt with AVX10.2 instruction

gcc/ChangeLog:

* config/i386/sse.md: Expand VF2H to VF2HB with VBF modes.

Diff:
---
 gcc/config/i386/sse.md | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b374783429cb..2de592a9c8fa 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -447,9 +447,12 @@
 (define_mode_iterator VF2_AVX10_2
   [(V8DF "TARGET_AVX10_2_512") V4DF V2DF])
 
-;; All DFmode & HFmode vector float modes
-(define_mode_iterator VF2H
-  [(V32HF "TARGET_AVX512FP16 && TARGET_EVEX512")
+;; All DFmode & HFmode & BFmode vector float modes
+(define_mode_iterator VF2HB
+  [(V32BF "TARGET_AVX10_2_512")
+   (V16BF "TARGET_AVX10_2_256")
+   (V8BF "TARGET_AVX10_2_256")
+   (V32HF "TARGET_AVX512FP16 && TARGET_EVEX512")
(V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V8DF "TARGET_AVX512F && TARGET_EVEX512") (V4DF "TARGET_AVX") V2DF])
@@ -2933,8 +2936,8 @@
(set_attr "mode" "")])
 
 (define_expand "sqrt2"
-  [(set (match_operand:VF2H 0 "register_operand")
-   (sqrt:VF2H (match_operand:VF2H 1 "vector_operand")))]
+  [(set (match_operand:VF2HB 0 "register_operand")
+   (sqrt:VF2HB (match_operand:VF2HB 1 "vector_operand")))]
   "TARGET_SSE2")
 
 (define_expand "sqrt2"


[gcc r15-3359] i386: Support vec_cmp for V8BF/V16BF/V32BF in AVX10.2

2024-09-01 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:f77435aa3911c437cba71991509eee57b333b3ce

commit r15-3359-gf77435aa3911c437cba71991509eee57b333b3ce
Author: Levy Hsu 
Date:   Mon Sep 2 10:24:49 2024 +0800

i386: Support vec_cmp for V8BF/V16BF/V32BF in AVX10.2

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_use_mask_cmp_p): Add BFmode
for int mask cmp.
* config/i386/sse.md (vec_cmp<mode><avx512fmaskmodelower>): New
vec_cmp expand for VBF modes.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx10_2-512-bf-vector-cmpp-1.c: New test.
* gcc.target/i386/avx10_2-bf-vector-cmpp-1.c: Ditto.

Diff:
---
 gcc/config/i386/i386-expand.cc |  2 ++
 gcc/config/i386/sse.md | 13 ++
 .../gcc.target/i386/avx10_2-512-bf-vector-cmpp-1.c | 19 ++
 .../gcc.target/i386/avx10_2-bf-vector-cmpp-1.c | 29 ++
 4 files changed, 63 insertions(+)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 53327544620f..124cb976ec87 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -4036,6 +4036,8 @@ ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
 return true;
   else if (GET_MODE_INNER (cmp_mode) == HFmode)
 return true;
+  else if (GET_MODE_INNER (cmp_mode) == BFmode)
+return true;
 
   /* When op_true is NULL, op_false must be NULL, or vice versa.  */
   gcc_assert (!op_true == !op_false);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2de592a9c8fa..3bf95f0b0e53 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4797,6 +4797,19 @@
   DONE;
 })
 
+(define_expand "vec_cmp"
+  [(set (match_operand: 0 "register_operand")
+   (match_operator: 1 ""
+ [(match_operand:VBF_AVX10_2 2 "register_operand")
+  (match_operand:VBF_AVX10_2 3 "nonimmediate_operand")]))]
+  "TARGET_AVX10_2_256"
+{
+  bool ok = ix86_expand_mask_vec_cmp (operands[0], GET_CODE (operands[1]),
+ operands[2], operands[3]);
+  gcc_assert (ok);
+  DONE;
+})
+
 (define_expand "vec_cmp"
   [(set (match_operand: 0 "register_operand")
(match_operator: 1 ""
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-cmpp-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-cmpp-1.c
new file mode 100644
index ..416fcaa36289
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-cmpp-1.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx10.2-512 -O2 -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-times "vcmppbf16" 5 } } */
+
+typedef __bf16 v32bf __attribute__ ((__vector_size__ (64)));
+
+#define VCMPMN(type, op, name) \
+type  \
+__attribute__ ((noinline, noclone)) \
+vec_cmp_##type##type##name (type a, type b) \
+{ \
+  return a op b;  \
+}
+
+VCMPMN (v32bf, <, lt)
+VCMPMN (v32bf, <=, le)
+VCMPMN (v32bf, >, gt)
+VCMPMN (v32bf, >=, ge)
+VCMPMN (v32bf, ==, eq)
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-bf-vector-cmpp-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-bf-vector-cmpp-1.c
new file mode 100644
index ..6234116039f0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-bf-vector-cmpp-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx10.2 -O2" } */
+/* { dg-final { scan-assembler-times "vcmppbf16" 10 } } */
+
+typedef __bf16 v16bf __attribute__ ((__vector_size__ (32)));
+typedef __bf16 v8bf __attribute__ ((__vector_size__ (16)));
+
+#define VCMPMN(type, op, name) \
+type  \
+__attribute__ ((noinline, noclone)) \
+vec_cmp_##type##type##name (type a, type b) \
+{ \
+  return a op b;  \
+}
+
+VCMPMN (v16bf, <, lt)
+VCMPMN (v8bf, <, lt)
+
+VCMPMN (v16bf, <=, le)
+VCMPMN (v8bf, <=, le)
+
+VCMPMN (v16bf, >, gt)
+VCMPMN (v8bf, >, gt)
+
+VCMPMN (v16bf, >=, ge)
+VCMPMN (v8bf, >=, ge)
+
+VCMPMN (v16bf, ==, eq)
+VCMPMN (v8bf, ==, eq)


[gcc r15-3356] i386: Support vectorized BF16 FMA with AVX10.2 instructions

2024-09-01 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:6d294fb8ac9baf2624446deaa4c995b7a7719823

commit r15-3356-g6d294fb8ac9baf2624446deaa4c995b7a7719823
Author: Levy Hsu 
Date:   Mon Sep 2 10:24:46 2024 +0800

i386: Support vectorized BF16 FMA with AVX10.2 instructions

gcc/ChangeLog:

* config/i386/sse.md: Add V8BF/V16BF/V32BF to mode iterator FMAMODEM.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx10_2-512-bf-vector-fma-1.c: New test.
* gcc.target/i386/avx10_2-bf-vector-fma-1.c: New test.

Diff:
---
 gcc/config/i386/sse.md |  5 +-
 .../gcc.target/i386/avx10_2-512-bf-vector-fma-1.c  | 34 
 .../gcc.target/i386/avx10_2-bf-vector-fma-1.c  | 63 ++
 3 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index ebca462bae8b..85fbef331ea4 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5677,7 +5677,10 @@
(HF "TARGET_AVX512FP16")
(V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
-   (V32HF "TARGET_AVX512FP16 && TARGET_EVEX512")])
+   (V32HF "TARGET_AVX512FP16 && TARGET_EVEX512")
+   (V8BF "TARGET_AVX10_2_256")
+   (V16BF "TARGET_AVX10_2_256")
+   (V32BF "TARGET_AVX10_2_512")])
 
 (define_expand "fma4"
   [(set (match_operand:FMAMODEM 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-fma-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-fma-1.c
new file mode 100644
index ..a857f9b90db4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-fma-1.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx10.2-512 -O2" } */
+/* { dg-final { scan-assembler-times "vfmadd132nepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vfmsub132nepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vfnmadd132nepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vfnmsub132nepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include 
+
+typedef __bf16 v32bf __attribute__ ((__vector_size__ (64)));
+
+v32bf
+foo_madd (v32bf a, v32bf b, v32bf c)
+{
+  return a * b + c;
+}
+
+v32bf
+foo_msub (v32bf a, v32bf b, v32bf c)
+{
+  return a * b - c;
+}
+
+v32bf
+foo_nmadd (v32bf a, v32bf b, v32bf c)
+{
+  return -a * b + c;
+}
+
+v32bf
+foo_nmsub (v32bf a, v32bf b, v32bf c)
+{
+  return -a * b - c;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-bf-vector-fma-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-bf-vector-fma-1.c
new file mode 100644
index ..0fd78efe0493
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-bf-vector-fma-1.c
@@ -0,0 +1,63 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx10.2 -O2" } */
+/* { dg-final { scan-assembler-times "vfmadd132nepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vfmsub132nepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vfnmadd132nepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vfnmsub132nepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vfmadd132nepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vfmsub132nepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vfnmadd132nepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vfnmsub132nepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include 
+
+typedef __bf16 v16bf __attribute__ ((__vector_size__ (32)));
+typedef __bf16 v8bf __attribute__ ((__vector_size__ (16)));
+
+v16bf
+foo_madd_256 (v16bf a, v16bf b, v16bf c)
+{
+  return a * b + c;
+}
+
+v16bf
+foo_msub_256 (v16bf a, v16bf b, v16bf c)
+{
+  return a * b - c;
+}
+
+v16bf
+foo_nmadd_256 (v16bf a, v16bf b, v16bf c)
+{
+  return -a * b + c;
+}
+
+v16bf
+foo_nmsub_256 (v16bf a, v16bf b, v16bf c)
+{
+  return -a * b - c;
+}
+
+v8bf
+foo_madd_128 (v8bf a, v8bf b, v8bf c)
+{
+  return a * b + c;
+}
+
+v8bf
+foo_msub_128 (v8bf a, v8bf b, v8bf c)
+{
+  return a * b - c;
+}
+
+v8bf
+foo_nmadd_128 (v8b

[gcc r15-3353] i386: Optimize ordered and nonequal

2024-09-01 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:86f5031c804220274a9bbebd26b8ebf47a2207ac

commit r15-3353-g86f5031c804220274a9bbebd26b8ebf47a2207ac
Author: Hu, Lin1 
Date:   Mon Sep 2 10:24:31 2024 +0800

i386: Optimize ordered and nonequal

Currently, for the input !__builtin_isunordered (a, b) && (a != b), GCC
emits
  ucomiss %xmm1, %xmm0
  movl $1, %ecx
  setp %dl
  setnp %al
  cmovne %ecx, %edx
  andl %edx, %eax
  movzbl %al, %eax

However, the shorter sequence
  xorl %eax, %eax
  ucomiss %xmm1, %xmm0
  setne %al
is better.

gcc/ChangeLog:

* match.pd: Optimize (and ordered non-equal) to
(not (or unordered equal)).

gcc/testsuite/ChangeLog:

* gcc.target/i386/optimize_one.c: New test.

Diff:
---
 gcc/match.pd | 3 +++
 gcc/testsuite/gcc.target/i386/optimize_one.c | 9 +
 2 files changed, 12 insertions(+)

diff --git a/gcc/match.pd b/gcc/match.pd
index be211535a49f..4298e89dad6d 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -6651,6 +6651,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (ltgt @0 @0)
  (if (!flag_trapping_math || !tree_expr_maybe_nan_p (@0))
   { constant_boolean_node (false, type); }))
+(simplify
+ (bit_and (ordered @0 @1) (ne @0 @1))
+ (bit_not (uneq @0 @1)))
 
 /* x == ~x -> false */
 /* x != ~x -> true */
diff --git a/gcc/testsuite/gcc.target/i386/optimize_one.c b/gcc/testsuite/gcc.target/i386/optimize_one.c
new file mode 100644
index ..62728d3c5ba4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/optimize_one.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mfpmath=sse" } */
+/* { dg-final { scan-assembler-times "comi" 1 } } */
+/* { dg-final { scan-assembler-times "set" 1 } } */
+
+int is_ordered_or_nonequal_sh (float a, float b)
+{
+  return !__builtin_isunordered (a, b) && (a != b);
+}


[gcc r15-3357] i386: Support vectorized BF16 smaxmin with AVX10.2 instructions

2024-09-01 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:29ef601973d7b79338694e59581d4c24bcd07f69

commit r15-3357-g29ef601973d7b79338694e59581d4c24bcd07f69
Author: Levy Hsu 
Date:   Mon Sep 2 10:24:47 2024 +0800

i386: Support vectorized BF16 smaxmin with AVX10.2 instructions

gcc/ChangeLog:

* config/i386/sse.md
(<code><mode>3): New define_expand pattern for BF smaxmin.

gcc/testsuite/ChangeLog:
* gcc.target/i386/avx10_2-512-bf-vector-smaxmin-1.c: New test.
* gcc.target/i386/avx10_2-bf-vector-smaxmin-1.c: New test.

Diff:
---
 gcc/config/i386/sse.md |  7 +
 .../i386/avx10_2-512-bf-vector-smaxmin-1.c | 20 
 .../gcc.target/i386/avx10_2-bf-vector-smaxmin-1.c  | 36 ++
 3 files changed, 63 insertions(+)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 85fbef331ea4..b374783429cb 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -31901,6 +31901,13 @@
"vscalefpbf16\t{%2, %1, %0|%0, %1, %2}"
[(set_attr "prefix" "evex")])
 
+(define_expand "3"
+  [(set (match_operand:VBF_AVX10_2 0 "register_operand")
+ (smaxmin:VBF_AVX10_2
+   (match_operand:VBF_AVX10_2 1 "register_operand")
+   (match_operand:VBF_AVX10_2 2 "nonimmediate_operand")))]
+  "TARGET_AVX10_2_256")
+
 (define_insn "avx10_2_pbf16_"
[(set (match_operand:VBF_AVX10_2 0 "register_operand" "=v")
   (smaxmin:VBF_AVX10_2
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-smaxmin-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-smaxmin-1.c
new file mode 100644
index ..e33c325e2da9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-smaxmin-1.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx10.2-512 -mprefer-vector-width=512 -Ofast" } */
+/* /* { dg-final { scan-assembler-times "vmaxpbf16" 1 } } */
+/* /* { dg-final { scan-assembler-times "vminpbf16" 1 } } */
+
+void
+maxpbf16_512 (__bf16* dest, __bf16* src1, __bf16* src2)
+{
+  int i;
+  for (i = 0; i < 32; i++)
+dest[i] = src1[i] > src2[i] ? src1[i] : src2[i];
+}
+
+void
+minpbf16_512 (__bf16* dest, __bf16* src1, __bf16* src2)
+{
+  int i;
+  for (i = 0; i < 32; i++)
+dest[i] = src1[i] < src2[i] ? src1[i] : src2[i];
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-bf-vector-smaxmin-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-bf-vector-smaxmin-1.c
new file mode 100644
index ..9bae073c95aa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-bf-vector-smaxmin-1.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx10.2 -Ofast" } */
+/* /* { dg-final { scan-assembler-times "vmaxpbf16" 2 } } */
+/* /* { dg-final { scan-assembler-times "vminpbf16" 2 } } */
+
+void
+maxpbf16_256 (__bf16* dest, __bf16* src1, __bf16* src2)
+{
+  int i;
+  for (i = 0; i < 16; i++)
+dest[i] = src1[i] > src2[i] ? src1[i] : src2[i];
+}
+
+void
+minpbf16_256 (__bf16* dest, __bf16* src1, __bf16* src2)
+{
+  int i;
+  for (i = 0; i < 16; i++)
+dest[i] = src1[i] < src2[i] ? src1[i] : src2[i];
+}
+
+void
+maxpbf16_128 (__bf16* dest, __bf16* src1, __bf16* src2)
+{
+  int i;
+  for (i = 0; i < 16; i++)
+dest[i] = src1[i] > src2[i] ? src1[i] : src2[i];
+}
+
+void
+minpbf16_128 (__bf16* dest, __bf16* src1, __bf16* src2)
+{
+  int i;
+  for (i = 0; i < 16; i++)
+dest[i] = src1[i] < src2[i] ? src1[i] : src2[i];
+}


[gcc r15-3354] i386: Optimize generate insn for AVX10.2 compare

2024-09-01 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:3b1decef83003db9cf8667977c293435c0f3d024

commit r15-3354-g3b1decef83003db9cf8667977c293435c0f3d024
Author: Hu, Lin1 
Date:   Mon Sep 2 10:24:36 2024 +0800

i386: Optimize generate insn for AVX10.2 compare

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_expand_fp_compare): Add UNSPEC to
support the optimization.
* config/i386/i386.cc (ix86_fp_compare_code_to_integer): Add NE/EQ.
* config/i386/i386.md (*cmpx): New define_insn.
(*cmpxhf): Ditto.
* config/i386/predicates.md (ix86_trivial_fp_comparison_operator):
Add ne/eq.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx10_2-compare-1b.c: New test.

Diff:
---
 gcc/config/i386/i386-expand.cc |  5 ++
 gcc/config/i386/i386.cc|  5 ++
 gcc/config/i386/i386.md| 31 ++-
 gcc/config/i386/predicates.md  | 12 +++
 gcc/testsuite/gcc.target/i386/avx10_2-compare-1b.c | 96 ++
 5 files changed, 147 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index d692008ffe7e..53327544620f 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -2916,6 +2916,11 @@ ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
   switch (ix86_fp_comparison_strategy (code))
 {
 case IX86_FPCMP_COMI:
+  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
+  if (TARGET_AVX10_2_256 && (code == EQ || code == NE))
+   tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_OPTCOMX);
+  if (unordered_compare)
+   tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
   cmp_mode = CCFPmode;
   emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
   break;
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 546c964d2a47..7af9ceca429f 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -16634,6 +16634,11 @@ ix86_fp_compare_code_to_integer (enum rtx_code code)
   return LEU;
 case LTGT:
   return NE;
+case EQ:
+case NE:
+  if (TARGET_AVX10_2_256)
+   return code;
+  /* FALLTHRU.  */
 default:
   return UNKNOWN;
 }
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index b56a51be09fb..0fae3c1eb878 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -117,6 +117,7 @@
   UNSPEC_STC
   UNSPEC_PUSHFL
   UNSPEC_POPFL
+  UNSPEC_OPTCOMX
 
   ;; For SSE/MMX support:
   UNSPEC_FIX_NOTRUNC
@@ -1736,7 +1737,7 @@
(compare:CC (match_operand:XF 1 "nonmemory_operand")
(match_operand:XF 2 "nonmemory_operand")))
(set (pc) (if_then_else
-  (match_operator 0 "ix86_fp_comparison_operator"
+  (match_operator 0 "ix86_fp_comparison_operator_xf"
[(reg:CC FLAGS_REG)
 (const_int 0)])
   (label_ref (match_operand 3))
@@ -1753,7 +1754,7 @@
(compare:CC (match_operand:XF 2 "nonmemory_operand")
(match_operand:XF 3 "nonmemory_operand")))
(set (match_operand:QI 0 "register_operand")
-  (match_operator 1 "ix86_fp_comparison_operator"
+  (match_operator 1 "ix86_fp_comparison_operator_xf"
[(reg:CC FLAGS_REG)
 (const_int 0)]))]
   "TARGET_80387"
@@ -2017,6 +2018,32 @@
(set_attr "bdver1_decode" "double")
(set_attr "znver1_decode" "double")])
 
+(define_insn "*cmpx"
+  [(set (reg:CCFP FLAGS_REG)
+   (unspec:CCFP [
+ (compare:CCFP
+   (match_operand:MODEF 0 "register_operand" "v")
+   (match_operand:MODEF 1 "nonimmediate_operand" "vm"))]
+ UNSPEC_OPTCOMX))]
+  "TARGET_AVX10_2_256"
+  "%vcomx\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecomi")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "")])
+
+(define_insn "*cmpxhf"
+  [(set (reg:CCFP FLAGS_REG)
+   (unspec:CCFP [
+ (compare:CCFP
+   (match_operand:HF 0 "register_operand" "v")
+   (match_operand:HF 1 "nonimmediate_operand" "vm"))]
+ UNSPEC_OPTCOMX))]
+  "TARGET_AVX10_2_256"
+  "vcomxsh\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecomi")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "HF")])
+
 (define_insn "*cmpi"
   [(set (reg:CCFP FLAGS_REG)
(compare:CCFP
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index ab6a2e14d355..053312bbe27c 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1633,7 +1633,13 @@
 })
 
 ;; Return true if this comparison only requires testing one flag bit.
+;; VCOMX/VUCOMX set ZF, SF, OF, differently from COMI/UCOMI.
 (define_predicate "ix86_trivial_fp_comparison_operator"
+  (if_then_else (match_test "TARGET_AVX10_2_256")
+   (match_code "gt,ge,unlt,unle,eq,uneq,ne,ltgt,ordered,unordered")
+

[gcc r15-3355] i386: Support vectorized BF16 add/sub/mul/div with AVX10.2 instructions

2024-09-01 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:f82fa0da4d9e1fdaf5e4edd70364d5781534ce11

commit r15-3355-gf82fa0da4d9e1fdaf5e4edd70364d5781534ce11
Author: Levy Hsu 
Date:   Mon Sep 2 10:24:45 2024 +0800

i386: Support vectorized BF16 add/sub/mul/div with AVX10.2 instructions

AVX10.2 introduces several non-exception instructions for BF16 vectors.
Enable vectorized BF16 add/sub/mul/div operations by supporting the
standard optabs for them.

gcc/ChangeLog:

* config/i386/sse.md (div<mode>3): New expander for BFmode div.
(VF_BHSD): New mode iterator with vector BFmodes.
(<insn><mode>3<mask_name><round_name>): Change mode to VF_BHSD.
(mul<mode>3<mask_name><round_name>): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx10_2-512-bf-vector-operations-1.c: New test.
* gcc.target/i386/avx10_2-bf-vector-operations-1.c: Ditto.

Diff:
---
 gcc/config/i386/sse.md | 49 +++---
 .../i386/avx10_2-512-bf-vector-operations-1.c  | 42 
 .../i386/avx10_2-bf-vector-operations-1.c  | 79 ++
 3 files changed, 162 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 442ac93afa2b..ebca462bae8b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -391,6 +391,19 @@
(V8DF "TARGET_AVX512F && TARGET_EVEX512") (V4DF "TARGET_AVX")
(V2DF "TARGET_SSE2")])
 
+(define_mode_iterator VF_BHSD
+  [(V32HF "TARGET_AVX512FP16 && TARGET_EVEX512")
+   (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+   (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+   (V16SF "TARGET_AVX512F && TARGET_EVEX512")
+   (V8SF "TARGET_AVX") V4SF
+   (V8DF "TARGET_AVX512F && TARGET_EVEX512")
+   (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")
+   (V32BF "TARGET_AVX10_2_512")
+   (V16BF "TARGET_AVX10_2_256")
+   (V8BF "TARGET_AVX10_2_256")
+  ])
+
 ;; 128-, 256- and 512-bit float vector modes for bitwise operations
 (define_mode_iterator VFB
   [(V32BF "TARGET_AVX512F && TARGET_EVEX512")
@@ -2527,10 +2540,10 @@
 })
 
 (define_expand "3"
-  [(set (match_operand:VFH 0 "register_operand")
-   (plusminus:VFH
- (match_operand:VFH 1 "")
- (match_operand:VFH 2 "")))]
+  [(set (match_operand:VF_BHSD 0 "register_operand")
+   (plusminus:VF_BHSD
+ (match_operand:VF_BHSD 1 "")
+ (match_operand:VF_BHSD 2 "")))]
   "TARGET_SSE &&  && "
   "ix86_fixup_binary_operands_no_copy (, mode, operands);")
 
@@ -2616,10 +2629,10 @@
 })
 
 (define_expand "mul3"
-  [(set (match_operand:VFH 0 "register_operand")
-   (mult:VFH
- (match_operand:VFH 1 "")
- (match_operand:VFH 2 "")))]
+  [(set (match_operand:VF_BHSD 0 "register_operand")
+   (mult:VF_BHSD
+ (match_operand:VF_BHSD 1 "")
+ (match_operand:VF_BHSD 2 "")))]
   "TARGET_SSE &&  && "
   "ix86_fixup_binary_operands_no_copy (MULT, mode, operands);")
 
@@ -2734,6 +2747,26 @@
 }
 })
 
+(define_expand "div3"
+  [(set (match_operand:VBF_AVX10_2 0 "register_operand")
+   (div:VBF_AVX10_2
+ (match_operand:VBF_AVX10_2 1 "register_operand")
+ (match_operand:VBF_AVX10_2 2 "vector_operand")))]
+  "TARGET_AVX10_2_256"
+{
+  if (TARGET_RECIP_VEC_DIV
+  && optimize_insn_for_speed_p ()
+  && flag_finite_math_only
+  && flag_unsafe_math_optimizations)
+{
+  rtx op = gen_reg_rtx (mode);
+  operands[2] = force_reg (mode, operands[2]);
+  emit_insn (gen_avx10_2_rcppbf16_ (op, operands[2]));
+  emit_insn (gen_avx10_2_mulnepbf16_ (operands[0], operands[1], op));
+  DONE;
+}
+})
+
 (define_expand "cond_div"
   [(set (match_operand:VFH 0 "register_operand")
(vec_merge:VFH
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-operations-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-operations-1.c
new file mode 100644
index ..d6b0750c2334
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-operations-1.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx10.2-512 -O2" } */
+/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vaddnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vdivnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vsubnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include 
+
+typedef __bf16 v32bf __attribute__ ((__vector_size__ (64)));
+
+v32bf
+foo_mul (v32bf a, v32bf b)
+{
+  return a * b;
+}
+
+v32bf
+foo_add (v32bf a, v32bf b)
+{
+  return a + b;
+}
+
+v32

[gcc r15-3352] i386: Auto vectorize sdot_prod, usdot_prod, udot_prod with AVX10.2 instructions

2024-09-01 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:b1f9fbb6da1a3ced57c3668cecc9f9449e1b237e

commit r15-3352-gb1f9fbb6da1a3ced57c3668cecc9f9449e1b237e
Author: Haochen Jiang 
Date:   Mon Sep 2 10:24:29 2024 +0800

i386: Auto vectorize sdot_prod, usdot_prod, udot_prod with AVX10.2 instructions

gcc/ChangeLog:

* config/i386/sse.md (VI1_AVX512VNNIBW): New.
(VI2_AVX10_2): Ditto.
(sdot_prod): Add AVX10.2
to auto vectorize and combine 512 bit part.
(udot_prod): Ditto.
(sdot_prodv64qi): Removed.
(udot_prodv64qi): Ditto.
(usdot_prod): Add AVX10.2 to auto vectorize.
(udot_prod): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/vnniint16-auto-vectorize-2.c: Only define
TEST when not defined.
* gcc.target/i386/vnniint8-auto-vectorize-2.c: Ditto.
* gcc.target/i386/vnniint16-auto-vectorize-3.c: New test.
* gcc.target/i386/vnniint16-auto-vectorize-4.c: Ditto.
* gcc.target/i386/vnniint8-auto-vectorize-3.c: Ditto.
* gcc.target/i386/vnniint8-auto-vectorize-4.c: Ditto.

Diff:
---
 gcc/config/i386/sse.md | 93 +-
 .../gcc.target/i386/vnniint16-auto-vectorize-2.c   | 11 ++-
 .../gcc.target/i386/vnniint16-auto-vectorize-3.c   |  6 ++
 .../gcc.target/i386/vnniint16-auto-vectorize-4.c   | 18 +
 .../gcc.target/i386/vnniint8-auto-vectorize-2.c| 12 ++-
 .../gcc.target/i386/vnniint8-auto-vectorize-3.c|  6 ++
 .../gcc.target/i386/vnniint8-auto-vectorize-4.c| 18 +
 7 files changed, 86 insertions(+), 78 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index da91d39cf8eb..442ac93afa2b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -610,6 +610,10 @@
 (define_mode_iterator VI1_AVX512VNNI
   [(V64QI "TARGET_AVX512VNNI && TARGET_EVEX512") (V32QI "TARGET_AVX2") V16QI])
 
+(define_mode_iterator VI1_AVX512VNNIBW
+  [(V64QI "(TARGET_AVX512BW || TARGET_AVX512VNNI) && TARGET_EVEX512")
+   (V32QI "TARGET_AVX2") V16QI])
+
 (define_mode_iterator VI12_256_512_AVX512VL
   [(V64QI "TARGET_EVEX512") (V32QI "TARGET_AVX512VL")
(V32HI "TARGET_EVEX512") (V16HI "TARGET_AVX512VL")])
@@ -627,6 +631,9 @@
   [(V32HI "(TARGET_AVX512BW || TARGET_AVX512VNNI) && TARGET_EVEX512")
(V16HI "TARGET_AVX2") V8HI])
 
+(define_mode_iterator VI2_AVX10_2
+  [(V32HI "TARGET_AVX10_2_512") V16HI V8HI])
+
 (define_mode_iterator VI4_AVX
   [(V8SI "TARGET_AVX") V4SI])
 
@@ -31232,12 +31239,13 @@
 
 (define_expand "sdot_prod"
   [(match_operand: 0 "register_operand")
-   (match_operand:VI1_AVX2 1 "register_operand")
-   (match_operand:VI1_AVX2 2 "register_operand")
+   (match_operand:VI1_AVX512VNNIBW 1 "register_operand")
+   (match_operand:VI1_AVX512VNNIBW 2 "register_operand")
(match_operand: 3 "register_operand")]
   "TARGET_SSE2"
 {
-  if (TARGET_AVXVNNIINT8)
+  if (( == 64 && TARGET_AVX10_2_512)
+  || ( < 64 && (TARGET_AVXVNNIINT8 || TARGET_AVX10_2_256)))
 {
   operands[1] = lowpart_subreg (mode,
force_reg (mode, operands[1]),
@@ -31276,44 +31284,15 @@
   DONE;
 })
 
-(define_expand "sdot_prodv64qi"
-  [(match_operand:V16SI 0 "register_operand")
-   (match_operand:V64QI 1 "register_operand")
-   (match_operand:V64QI 2 "register_operand")
-   (match_operand:V16SI 3 "register_operand")]
-  "(TARGET_AVX512VNNI || TARGET_AVX512BW) && TARGET_EVEX512"
-{
-  /* Emulate with vpdpwssd.  */
-  rtx op1_lo = gen_reg_rtx (V32HImode);
-  rtx op1_hi = gen_reg_rtx (V32HImode);
-  rtx op2_lo = gen_reg_rtx (V32HImode);
-  rtx op2_hi = gen_reg_rtx (V32HImode);
-
-  emit_insn (gen_vec_unpacks_lo_v64qi (op1_lo, operands[1]));
-  emit_insn (gen_vec_unpacks_lo_v64qi (op2_lo, operands[2]));
-  emit_insn (gen_vec_unpacks_hi_v64qi (op1_hi, operands[1]));
-  emit_insn (gen_vec_unpacks_hi_v64qi (op2_hi, operands[2]));
-
-  rtx res1 = gen_reg_rtx (V16SImode);
-  rtx res2 = gen_reg_rtx (V16SImode);
-  rtx sum = gen_reg_rtx (V16SImode);
-
-  emit_move_insn (sum, CONST0_RTX (V16SImode));
-  emit_insn (gen_sdot_prodv32hi (res1, op1_lo, op2_lo, sum));
-  emit_insn (gen_sdot_prodv32hi (res2, op1_hi, op2_hi, operands[3]));
-
-  emit_insn (gen_addv16si3 (operands[0], res1, res2));
-  DONE;
-})
-
 (define_expand "udot_prod"
   [(match_operand: 0 "register_operand")
-   (match_operand:VI1_AVX2 1 "register_operand")
-   (match_operand:VI1_AVX2 2 "register_operand")
+   (match_operand:VI1_AVX512VNNIBW 1 "register_operand")
+   (match_operand:VI1_AVX512VNNIBW 2 "register_operand")
(match_operand: 3 "register_operand")]
   "TARGET_SSE2"
 {
-  if (TARGET_AVXVNNIINT8)
+  if (( == 64 && TARGET_AVX10_2_512)
+  || ( < 64 && (TARGET_AVXVNNIINT8 || TARGET_AVX10_2_256)))
 {
   operands[1] = lowpart_subreg (mode,
force_reg (mode, operands[1]),
@@ -31352,36 +31331,6 @@
   DONE;
 })
 
-(define_expand "udot_prodv64qi"
-

[gcc r15-3410] i386: Fix vfpclassph non-optimizied intrin

2024-09-03 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:9b312595f9ac073f55d858b6f833097608b40bba

commit r15-3410-g9b312595f9ac073f55d858b6f833097608b40bba
Author: Haochen Jiang 
Date:   Mon Sep 2 15:00:22 2024 +0800

i386: Fix vfpclassph non-optimizied intrin

The non-optimized (macro) version of the intrinsic had a typo in the mask
type, which caused the high bits of the __mmask32 mask to be unexpectedly
zeroed.

The existing 1b test does not fail at -O0 because the testcase itself is
wrong: avx512f-mask-type.h must be included after SIZE is defined, or the
mask type will always be __mmask8. The same problem exists in the AVX10.2
testcases. I will write a separate patch to fix that.

gcc/ChangeLog:

* config/i386/avx512fp16intrin.h
(_mm512_mask_fpclass_ph_mask): Correct mask type to __mmask32.
(_mm512_fpclass_ph_mask): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512fp16-vfpclassph-1c.c: New test.

Diff:
---
 gcc/config/i386/avx512fp16intrin.h |  4 +-
 .../gcc.target/i386/avx512fp16-vfpclassph-1c.c | 77 ++
 2 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h
index 1869a920dd32..c3096b74ad2b 100644
--- a/gcc/config/i386/avx512fp16intrin.h
+++ b/gcc/config/i386/avx512fp16intrin.h
@@ -3961,11 +3961,11 @@ _mm512_fpclass_ph_mask (__m512h __A, const int __imm)
 #else
 #define _mm512_mask_fpclass_ph_mask(u, x, c)   \
   ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
-(int) (c),(__mmask8)(u)))
+(int) (c),(__mmask32)(u)))
 
 #define _mm512_fpclass_ph_mask(x, c)\
   ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
-(int) (c),(__mmask8)-1))
+(int) (c),(__mmask32)-1))
 #endif /* __OPIMTIZE__ */
 
 /* Intrinsics vgetexpph.  */
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c
new file mode 100644
index ..4739f1228e32
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c
@@ -0,0 +1,77 @@
+/* { dg-do run } */
+/* { dg-options "-O0 -mavx512fp16" } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#define AVX512FP16
+#include "avx512f-helper.h"
+
+#include 
+#include 
+#include 
+#define SIZE (AVX512F_LEN / 16)
+#include "avx512f-mask-type.h"
+
+#ifndef __FPCLASSPH__
+#define __FPCLASSPH__
+int check_fp_class_hp (_Float16 src, int imm)
+{
+  int qNaN_res = isnan (src);
+  int sNaN_res = isnan (src);
+  int Pzero_res = (src == 0.0);
+  int Nzero_res = (src == -0.0);
+  int PInf_res = (isinf (src) == 1);
+  int NInf_res = (isinf (src) == -1);
+  int Denorm_res = (fpclassify (src) == FP_SUBNORMAL);
+  int FinNeg_res = __builtin_finite (src) && (src < 0);
+
+  int result = (((imm & 1) && qNaN_res)
+   || (((imm >> 1) & 1) && Pzero_res)
+   || (((imm >> 2) & 1) && Nzero_res)
+   || (((imm >> 3) & 1) && PInf_res)
+   || (((imm >> 4) & 1) && NInf_res)
+   || (((imm >> 5) & 1) && Denorm_res)
+   || (((imm >> 6) & 1) && FinNeg_res)
+   || (((imm >> 7) & 1) && sNaN_res));
+  return result;
+}
+#endif
+
+MASK_TYPE
+CALC (_Float16 *s1, int imm)
+{
+  int i;
+  MASK_TYPE res = 0;
+
+  for (i = 0; i < SIZE; i++)
+if (check_fp_class_hp(s1[i], imm))
+  res = res | (1 << i);
+
+  return res;
+}
+
+void
+TEST (void)
+{
+  int i;
+  UNION_TYPE (AVX512F_LEN, h) src;
+  MASK_TYPE res1, res2, res_ref = 0;
+  MASK_TYPE mask = MASK_VALUE;
+
+  src.a[SIZE - 1] = NAN;
+  src.a[SIZE - 2] = 1.0 / 0.0;
+  for (i = 0; i < SIZE - 2; i++)
+{
+  src.a[i] = -24.43 + 0.6 * i;
+}
+
+  res1 = INTRINSIC (_fpclass_ph_mask) (src.x, 0xFF);
+  res2 = INTRINSIC (_mask_fpclass_ph_mask) (mask, src.x, 0xFF);
+
+  res_ref = CALC (src.a, 0xFF);
+
+  if (res_ref != res1)
+abort ();
+
+  if ((mask & res_ref) != res2)
+abort ();
+}


[gcc r12-10696] i386: Fix vfpclassph non-optimizied intrin

2024-09-03 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:6e59b188c4a051d4f2de5220d30681e6963d96c0

commit r12-10696-g6e59b188c4a051d4f2de5220d30681e6963d96c0
Author: Haochen Jiang 
Date:   Mon Sep 2 15:00:22 2024 +0800

i386: Fix vfpclassph non-optimizied intrin

The non-optimized (macro) version of the intrinsic had a typo in the mask
type, which caused the high bits of the __mmask32 mask to be unexpectedly
zeroed.

The existing 1b test does not fail at -O0 because the testcase itself is
wrong: avx512f-mask-type.h must be included after SIZE is defined, or the
mask type will always be __mmask8. The same problem exists in the AVX10.2
testcases. I will write a separate patch to fix that.

gcc/ChangeLog:

* config/i386/avx512fp16intrin.h
(_mm512_mask_fpclass_ph_mask): Correct mask type to __mmask32.
(_mm512_fpclass_ph_mask): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512fp16-vfpclassph-1c.c: New test.

Diff:
---
 gcc/config/i386/avx512fp16intrin.h |  4 +-
 .../gcc.target/i386/avx512fp16-vfpclassph-1c.c | 77 ++
 2 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h
index b16ccfcb7f17..6330e57ebb85 100644
--- a/gcc/config/i386/avx512fp16intrin.h
+++ b/gcc/config/i386/avx512fp16intrin.h
@@ -2321,11 +2321,11 @@ _mm512_fpclass_ph_mask (__m512h __A, const int __imm)
 #else
 #define _mm512_mask_fpclass_ph_mask(u, x, c)   \
   ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
-(int) (c),(__mmask8)(u)))
+(int) (c),(__mmask32)(u)))
 
 #define _mm512_fpclass_ph_mask(x, c)\
   ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
-(int) (c),(__mmask8)-1))
+(int) (c),(__mmask32)-1))
 #endif /* __OPIMTIZE__ */
 
 /* Intrinsics vgetexpph, vgetexpsh.  */
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c
new file mode 100644
index ..4739f1228e32
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c
@@ -0,0 +1,77 @@
+/* { dg-do run } */
+/* { dg-options "-O0 -mavx512fp16" } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#define AVX512FP16
+#include "avx512f-helper.h"
+
+#include 
+#include 
+#include 
+#define SIZE (AVX512F_LEN / 16)
+#include "avx512f-mask-type.h"
+
+#ifndef __FPCLASSPH__
+#define __FPCLASSPH__
+int check_fp_class_hp (_Float16 src, int imm)
+{
+  int qNaN_res = isnan (src);
+  int sNaN_res = isnan (src);
+  int Pzero_res = (src == 0.0);
+  int Nzero_res = (src == -0.0);
+  int PInf_res = (isinf (src) == 1);
+  int NInf_res = (isinf (src) == -1);
+  int Denorm_res = (fpclassify (src) == FP_SUBNORMAL);
+  int FinNeg_res = __builtin_finite (src) && (src < 0);
+
+  int result = (((imm & 1) && qNaN_res)
+   || (((imm >> 1) & 1) && Pzero_res)
+   || (((imm >> 2) & 1) && Nzero_res)
+   || (((imm >> 3) & 1) && PInf_res)
+   || (((imm >> 4) & 1) && NInf_res)
+   || (((imm >> 5) & 1) && Denorm_res)
+   || (((imm >> 6) & 1) && FinNeg_res)
+   || (((imm >> 7) & 1) && sNaN_res));
+  return result;
+}
+#endif
+
+MASK_TYPE
+CALC (_Float16 *s1, int imm)
+{
+  int i;
+  MASK_TYPE res = 0;
+
+  for (i = 0; i < SIZE; i++)
+if (check_fp_class_hp(s1[i], imm))
+  res = res | (1 << i);
+
+  return res;
+}
+
+void
+TEST (void)
+{
+  int i;
+  UNION_TYPE (AVX512F_LEN, h) src;
+  MASK_TYPE res1, res2, res_ref = 0;
+  MASK_TYPE mask = MASK_VALUE;
+
+  src.a[SIZE - 1] = NAN;
+  src.a[SIZE - 2] = 1.0 / 0.0;
+  for (i = 0; i < SIZE - 2; i++)
+{
+  src.a[i] = -24.43 + 0.6 * i;
+}
+
+  res1 = INTRINSIC (_fpclass_ph_mask) (src.x, 0xFF);
+  res2 = INTRINSIC (_mask_fpclass_ph_mask) (mask, src.x, 0xFF);
+
+  res_ref = CALC (src.a, 0xFF);
+
+  if (res_ref != res1)
+abort ();
+
+  if ((mask & res_ref) != res2)
+abort ();
+}


[gcc r13-9002] i386: Fix vfpclassph non-optimizied intrin

2024-09-03 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:e152aee5709dd3e341ef965450500f754f8b0a46

commit r13-9002-ge152aee5709dd3e341ef965450500f754f8b0a46
Author: Haochen Jiang 
Date:   Mon Sep 2 15:00:22 2024 +0800

i386: Fix vfpclassph non-optimizied intrin

The intrin for non-optimized got a typo in mask type, which will cause
the high bits of __mmask32 to be unexpectedly zeroed.

The test does not fail under O0 with current 1b since the testcase is
wrong. We need to include avx512f-mask-type.h after SIZE is defined, or
it will always be __mmask8. I will write a separate patch to fix that
on trunk ONLY.

gcc/ChangeLog:

* config/i386/avx512fp16intrin.h
(_mm512_mask_fpclass_ph_mask): Correct mask type to __mmask32.
(_mm512_fpclass_ph_mask): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512fp16-vfpclassph-1c.c: New test.

Diff:
---
 gcc/config/i386/avx512fp16intrin.h |  4 +-
 .../gcc.target/i386/avx512fp16-vfpclassph-1c.c | 77 ++
 2 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx512fp16intrin.h 
b/gcc/config/i386/avx512fp16intrin.h
index dd083e5ed67b..4702c56c0dc7 100644
--- a/gcc/config/i386/avx512fp16intrin.h
+++ b/gcc/config/i386/avx512fp16intrin.h
@@ -2322,11 +2322,11 @@ _mm512_fpclass_ph_mask (__m512h __A, const int __imm)
 #else
 #define _mm512_mask_fpclass_ph_mask(u, x, c)   \
   ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
-(int) (c),(__mmask8)(u)))
+(int) (c),(__mmask32)(u)))
 
 #define _mm512_fpclass_ph_mask(x, c)\
   ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
-(int) (c),(__mmask8)-1))
+(int) (c),(__mmask32)-1))
 #endif /* __OPIMTIZE__ */
 
 /* Intrinsics vgetexpph, vgetexpsh.  */
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c
new file mode 100644
index ..4739f1228e32
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c
@@ -0,0 +1,77 @@
+/* { dg-do run } */
+/* { dg-options "-O0 -mavx512fp16" } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#define AVX512FP16
+#include "avx512f-helper.h"
+
+#include 
+#include 
+#include 
+#define SIZE (AVX512F_LEN / 16)
+#include "avx512f-mask-type.h"
+
+#ifndef __FPCLASSPH__
+#define __FPCLASSPH__
+int check_fp_class_hp (_Float16 src, int imm)
+{
+  int qNaN_res = isnan (src);
+  int sNaN_res = isnan (src);
+  int Pzero_res = (src == 0.0);
+  int Nzero_res = (src == -0.0);
+  int PInf_res = (isinf (src) == 1);
+  int NInf_res = (isinf (src) == -1);
+  int Denorm_res = (fpclassify (src) == FP_SUBNORMAL);
+  int FinNeg_res = __builtin_finite (src) && (src < 0);
+
+  int result = (((imm & 1) && qNaN_res)
+   || (((imm >> 1) & 1) && Pzero_res)
+   || (((imm >> 2) & 1) && Nzero_res)
+   || (((imm >> 3) & 1) && PInf_res)
+   || (((imm >> 4) & 1) && NInf_res)
+   || (((imm >> 5) & 1) && Denorm_res)
+   || (((imm >> 6) & 1) && FinNeg_res)
+   || (((imm >> 7) & 1) && sNaN_res));
+  return result;
+}
+#endif
+
+MASK_TYPE
+CALC (_Float16 *s1, int imm)
+{
+  int i;
+  MASK_TYPE res = 0;
+
+  for (i = 0; i < SIZE; i++)
+if (check_fp_class_hp(s1[i], imm))
+  res = res | (1 << i);
+
+  return res;
+}
+
+void
+TEST (void)
+{
+  int i;
+  UNION_TYPE (AVX512F_LEN, h) src;
+  MASK_TYPE res1, res2, res_ref = 0;
+  MASK_TYPE mask = MASK_VALUE;
+
+  src.a[SIZE - 1] = NAN;
+  src.a[SIZE - 2] = 1.0 / 0.0;
+  for (i = 0; i < SIZE - 2; i++)
+{
+  src.a[i] = -24.43 + 0.6 * i;
+}
+
+  res1 = INTRINSIC (_fpclass_ph_mask) (src.x, 0xFF);
+  res2 = INTRINSIC (_mask_fpclass_ph_mask) (mask, src.x, 0xFF);
+
+  res_ref = CALC (src.a, 0xFF);
+
+  if (res_ref != res1)
+abort ();
+
+  if ((mask & res_ref) != res2)
+abort ();
+}


[gcc r14-10627] i386: Fix vfpclassph non-optimizied intrin

2024-09-03 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:59157c038d683e91c419a1fadd5f91f15218f57b

commit r14-10627-g59157c038d683e91c419a1fadd5f91f15218f57b
Author: Haochen Jiang 
Date:   Mon Sep 2 15:00:22 2024 +0800

i386: Fix vfpclassph non-optimizied intrin

The intrin for non-optimized got a typo in mask type, which will cause
the high bits of __mmask32 to be unexpectedly zeroed.

The test does not fail under O0 with current 1b since the testcase is
wrong. We need to include avx512f-mask-type.h after SIZE is defined, or
it will always be __mmask8. I will write a separate patch to fix that
on trunk ONLY.

gcc/ChangeLog:

* config/i386/avx512fp16intrin.h
(_mm512_mask_fpclass_ph_mask): Correct mask type to __mmask32.
(_mm512_fpclass_ph_mask): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512fp16-vfpclassph-1c.c: New test.

Diff:
---
 gcc/config/i386/avx512fp16intrin.h |  4 +-
 .../gcc.target/i386/avx512fp16-vfpclassph-1c.c | 77 ++
 2 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx512fp16intrin.h 
b/gcc/config/i386/avx512fp16intrin.h
index f86050b20873..e8baebd41d3c 100644
--- a/gcc/config/i386/avx512fp16intrin.h
+++ b/gcc/config/i386/avx512fp16intrin.h
@@ -3961,11 +3961,11 @@ _mm512_fpclass_ph_mask (__m512h __A, const int __imm)
 #else
 #define _mm512_mask_fpclass_ph_mask(u, x, c)   \
   ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
-(int) (c),(__mmask8)(u)))
+(int) (c),(__mmask32)(u)))
 
 #define _mm512_fpclass_ph_mask(x, c)\
   ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
-(int) (c),(__mmask8)-1))
+(int) (c),(__mmask32)-1))
 #endif /* __OPIMTIZE__ */
 
 /* Intrinsics vgetexpph.  */
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c
new file mode 100644
index ..4739f1228e32
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c
@@ -0,0 +1,77 @@
+/* { dg-do run } */
+/* { dg-options "-O0 -mavx512fp16" } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#define AVX512FP16
+#include "avx512f-helper.h"
+
+#include 
+#include 
+#include 
+#define SIZE (AVX512F_LEN / 16)
+#include "avx512f-mask-type.h"
+
+#ifndef __FPCLASSPH__
+#define __FPCLASSPH__
+int check_fp_class_hp (_Float16 src, int imm)
+{
+  int qNaN_res = isnan (src);
+  int sNaN_res = isnan (src);
+  int Pzero_res = (src == 0.0);
+  int Nzero_res = (src == -0.0);
+  int PInf_res = (isinf (src) == 1);
+  int NInf_res = (isinf (src) == -1);
+  int Denorm_res = (fpclassify (src) == FP_SUBNORMAL);
+  int FinNeg_res = __builtin_finite (src) && (src < 0);
+
+  int result = (((imm & 1) && qNaN_res)
+   || (((imm >> 1) & 1) && Pzero_res)
+   || (((imm >> 2) & 1) && Nzero_res)
+   || (((imm >> 3) & 1) && PInf_res)
+   || (((imm >> 4) & 1) && NInf_res)
+   || (((imm >> 5) & 1) && Denorm_res)
+   || (((imm >> 6) & 1) && FinNeg_res)
+   || (((imm >> 7) & 1) && sNaN_res));
+  return result;
+}
+#endif
+
+MASK_TYPE
+CALC (_Float16 *s1, int imm)
+{
+  int i;
+  MASK_TYPE res = 0;
+
+  for (i = 0; i < SIZE; i++)
+if (check_fp_class_hp(s1[i], imm))
+  res = res | (1 << i);
+
+  return res;
+}
+
+void
+TEST (void)
+{
+  int i;
+  UNION_TYPE (AVX512F_LEN, h) src;
+  MASK_TYPE res1, res2, res_ref = 0;
+  MASK_TYPE mask = MASK_VALUE;
+
+  src.a[SIZE - 1] = NAN;
+  src.a[SIZE - 2] = 1.0 / 0.0;
+  for (i = 0; i < SIZE - 2; i++)
+{
+  src.a[i] = -24.43 + 0.6 * i;
+}
+
+  res1 = INTRINSIC (_fpclass_ph_mask) (src.x, 0xFF);
+  res2 = INTRINSIC (_mask_fpclass_ph_mask) (mask, src.x, 0xFF);
+
+  res_ref = CALC (src.a, 0xFF);
+
+  if (res_ref != res1)
+abort ();
+
+  if ((mask & res_ref) != res2)
+abort ();
+}


[gcc r15-3539] doc: Enhance Intel CPU documentation

2024-09-08 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:91bc2ad28c58ca3f4c2f96601d8af51f570e08c4

commit r15-3539-g91bc2ad28c58ca3f4c2f96601d8af51f570e08c4
Author: Haochen Jiang 
Date:   Fri Sep 6 11:19:26 2024 +0800

doc: Enhance Intel CPU documentation

This patch adds the recently aliased CPU names to the documentation
for clarity.

gcc/ChangeLog:

PR target/116617
* doc/invoke.texi: Add meteorlake, raptorlake and lunarlake.

Diff:
---
 gcc/doc/invoke.texi | 25 ++---
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 019e0a5ca805..b9a86a9a181f 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -34741,12 +34741,14 @@ UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, 
AVX512-FP16 and AVX512BF16
 instruction set support.
 
 @item alderlake
-Intel Alderlake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3,
-SSE4.1, SSE4.2, POPCNT, AES, PREFETCHW, PCLMUL, RDRND, XSAVE, XSAVEC, XSAVES,
-XSAVEOPT, FSGSBASE, PTWRITE, RDPID, SGX, GFNI-SSE, CLWB, MOVDIRI, MOVDIR64B,
-CLDEMOTE, WAITPKG, ADCX, AVX, AVX2, BMI, BMI2, F16C, FMA, LZCNT, PCONFIG, PKU,
-VAES, VPCLMULQDQ, SERIALIZE, HRESET, KL, WIDEKL and AVX-VNNI instruction set
-support.
+@itemx raptorlake
+@itemx meteorlake
+Intel Alderlake/Raptorlake/Meteorlake CPU with 64-bit extensions, MOVBE, MMX,
+SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AES, PREFETCHW, PCLMUL, RDRND,
+XSAVE, XSAVEC, XSAVES, XSAVEOPT, FSGSBASE, PTWRITE, RDPID, SGX, GFNI-SSE, CLWB,
+MOVDIRI, MOVDIR64B, CLDEMOTE, WAITPKG, ADCX, AVX, AVX2, BMI, BMI2, F16C, FMA,
+LZCNT, PCONFIG, PKU, VAES, VPCLMULQDQ, SERIALIZE, HRESET, KL, WIDEKL and
+AVX-VNNI instruction set support.
 
 @item rocketlake
 Intel Rocketlake CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3
@@ -34788,11 +34790,12 @@ UINTR, AVXIFMA, AVXVNNIINT8, AVXNECONVERT and 
CMPCCXADD instruction set
 support.
 
 @item arrowlake-s
-Intel Arrow Lake S CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3,
-SSSE3, SSE4.1, SSE4.2, POPCNT, AES, PREFETCHW, PCLMUL, RDRND, XSAVE, XSAVEC,
-XSAVES, XSAVEOPT, FSGSBASE, PTWRITE, RDPID, SGX, GFNI-SSE, CLWB, MOVDIRI,
-MOVDIR64B, CLDEMOTE, WAITPKG, ADCX, AVX, AVX2, BMI, BMI2, F16C, FMA, LZCNT,
-PCONFIG, PKU, VAES, VPCLMULQDQ, SERIALIZE, HRESET, KL, WIDEKL, AVX-VNNI,
+@itemx lunarlake
+Intel Arrow Lake S/Lunarlake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2,
+SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AES, PREFETCHW, PCLMUL, RDRND, XSAVE,
+XSAVEC, XSAVES, XSAVEOPT, FSGSBASE, PTWRITE, RDPID, SGX, GFNI-SSE, CLWB,
+MOVDIRI, MOVDIR64B, CLDEMOTE, WAITPKG, ADCX, AVX, AVX2, BMI, BMI2, F16C, FMA,
+LZCNT, PCONFIG, PKU, VAES, VPCLMULQDQ, SERIALIZE, HRESET, KL, WIDEKL, AVX-VNNI,
 UINTR, AVXIFMA, AVXVNNIINT8, AVXNECONVERT, CMPCCXADD, AVXVNNIINT16, SHA512,
 SM3 and SM4 instruction set support.


[gcc r14-10658] doc: Enhance Intel CPU documentation

2024-09-08 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:3951efed1cce970a5c61eacbad7e5f5314a9fc17

commit r14-10658-g3951efed1cce970a5c61eacbad7e5f5314a9fc17
Author: Haochen Jiang 
Date:   Fri Sep 6 11:19:26 2024 +0800

doc: Enhance Intel CPU documentation

This patch adds the recently aliased CPU names to the documentation
for clarity.

gcc/ChangeLog:

PR target/116617
* doc/invoke.texi: Add meteorlake, raptorlake and lunarlake.

Diff:
---
 gcc/doc/invoke.texi | 25 ++---
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 80476bcf37a1..176851baf61d 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -34484,12 +34484,14 @@ UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, 
AVX512-FP16 and AVX512BF16
 instruction set support.
 
 @item alderlake
-Intel Alderlake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3,
-SSE4.1, SSE4.2, POPCNT, AES, PREFETCHW, PCLMUL, RDRND, XSAVE, XSAVEC, XSAVES,
-XSAVEOPT, FSGSBASE, PTWRITE, RDPID, SGX, GFNI-SSE, CLWB, MOVDIRI, MOVDIR64B,
-CLDEMOTE, WAITPKG, ADCX, AVX, AVX2, BMI, BMI2, F16C, FMA, LZCNT, PCONFIG, PKU,
-VAES, VPCLMULQDQ, SERIALIZE, HRESET, KL, WIDEKL and AVX-VNNI instruction set
-support.
+@itemx raptorlake
+@itemx meteorlake
+Intel Alderlake/Raptorlake/Meteorlake CPU with 64-bit extensions, MOVBE, MMX,
+SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AES, PREFETCHW, PCLMUL, RDRND,
+XSAVE, XSAVEC, XSAVES, XSAVEOPT, FSGSBASE, PTWRITE, RDPID, SGX, GFNI-SSE, CLWB,
+MOVDIRI, MOVDIR64B, CLDEMOTE, WAITPKG, ADCX, AVX, AVX2, BMI, BMI2, F16C, FMA,
+LZCNT, PCONFIG, PKU, VAES, VPCLMULQDQ, SERIALIZE, HRESET, KL, WIDEKL and
+AVX-VNNI instruction set support.
 
 @item rocketlake
 Intel Rocketlake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3
@@ -34531,11 +34533,12 @@ UINTR, AVXIFMA, AVXVNNIINT8, AVXNECONVERT and 
CMPCCXADD instruction set
 support.
 
 @item arrowlake-s
-Intel Arrow Lake S CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3,
-SSSE3, SSE4.1, SSE4.2, POPCNT, AES, PREFETCHW, PCLMUL, RDRND, XSAVE, XSAVEC,
-XSAVES, XSAVEOPT, FSGSBASE, PTWRITE, RDPID, SGX, GFNI-SSE, CLWB, MOVDIRI,
-MOVDIR64B, CLDEMOTE, WAITPKG, ADCX, AVX, AVX2, BMI, BMI2, F16C, FMA, LZCNT,
-PCONFIG, PKU, VAES, VPCLMULQDQ, SERIALIZE, HRESET, KL, WIDEKL, AVX-VNNI,
+@itemx lunarlake
+Intel Arrow Lake S/Lunarlake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2,
+SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AES, PREFETCHW, PCLMUL, RDRND, XSAVE,
+XSAVEC, XSAVES, XSAVEOPT, FSGSBASE, PTWRITE, RDPID, SGX, GFNI-SSE, CLWB,
+MOVDIRI, MOVDIR64B, CLDEMOTE, WAITPKG, ADCX, AVX, AVX2, BMI, BMI2, F16C, FMA,
+LZCNT, PCONFIG, PKU, VAES, VPCLMULQDQ, SERIALIZE, HRESET, KL, WIDEKL, AVX-VNNI,
 UINTR, AVXIFMA, AVXVNNIINT8, AVXNECONVERT, CMPCCXADD, AVXVNNIINT16, SHA512,
 SM3 and SM4 instruction set support.


[gcc r13-9011] doc: Enhance Intel CPU documentation

2024-09-08 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:0a16b1b97c112e41a0d37235e83678a67abd9454

commit r13-9011-g0a16b1b97c112e41a0d37235e83678a67abd9454
Author: Haochen Jiang 
Date:   Fri Sep 6 11:19:26 2024 +0800

doc: Enhance Intel CPU documentation

This patch adds the recently aliased CPU names to the documentation
for clarity, partly backported from the GCC 15 trunk patch.

gcc/ChangeLog:

PR target/116617
* doc/invoke.texi: Add meteorlake, raptorlake and lunarlake.

Diff:
---
 gcc/doc/invoke.texi | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index b17d0cf93411..0f665ed6779a 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -32532,12 +32532,14 @@ UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, 
AVX512-FP16 and AVX512BF16
 instruction set support.
 
 @item alderlake
-Intel Alderlake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3,
-SSE4.1, SSE4.2, POPCNT, AES, PREFETCHW, PCLMUL, RDRND, XSAVE, XSAVEC, XSAVES,
-XSAVEOPT, FSGSBASE, PTWRITE, RDPID, SGX, GFNI-SSE, CLWB, MOVDIRI, MOVDIR64B,
-CLDEMOTE, WAITPKG, ADCX, AVX, AVX2, BMI, BMI2, F16C, FMA, LZCNT, PCONFIG, PKU,
-VAES, VPCLMULQDQ, SERIALIZE, HRESET, KL, WIDEKL and AVX-VNNI instruction set
-support.
+@itemx raptorlake
+@itemx meteorlake
+Intel Alderlake/Raptorlake/Meteorlake CPU with 64-bit extensions, MOVBE, MMX,
+SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AES, PREFETCHW, PCLMUL, RDRND,
+XSAVE, XSAVEC, XSAVES, XSAVEOPT, FSGSBASE, PTWRITE, RDPID, SGX, GFNI-SSE, CLWB,
+MOVDIRI, MOVDIR64B, CLDEMOTE, WAITPKG, ADCX, AVX, AVX2, BMI, BMI2, F16C, FMA,
+LZCNT, PCONFIG, PKU, VAES, VPCLMULQDQ, SERIALIZE, HRESET, KL, WIDEKL and
+AVX-VNNI instruction set support.
 
 @item rocketlake
 Intel Rocketlake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3


[gcc r15-3594] i386: Fix incorrect avx512f-mask-type.h include

2024-09-11 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:5958279509c4601499ac22629512f1723e6744b4

commit r15-3594-g5958279509c4601499ac22629512f1723e6744b4
Author: Haochen Jiang 
Date:   Tue Sep 3 13:38:36 2024 +0800

i386: Fix incorrect avx512f-mask-type.h include

In avx512f-mask-type.h, we need SIZE to be defined to get
MASK_TYPE defined correctly. Fix those testcases where
SIZE is not defined before the include of avx512f-mask-type.h.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx10-helper.h: Do not include
avx512f-mask-type.h.
* gcc.target/i386/avx10_2-512-vaddnepbf16-2.c:
Define SIZE and include avx512f-mask-type.h.
* gcc.target/i386/avx10_2-512-vcmppbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvtnebf162ibs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvtnebf162iubs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvtph2ibs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvtph2iubs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvtps2ibs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvtps2iubs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttnebf162ibs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttnebf162iubs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttpd2dqs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttpd2qqs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttpd2udqs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttpd2uqqs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttph2ibs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttph2iubs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttps2dqs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttps2ibs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttps2iubs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttps2qqs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttps2udqs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vcvttps2uqqs-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vdivnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vdpphps-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vfmaddXXXnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vfmsubXXXnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vfnmaddXXXnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vfnmsubXXXnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vfpclasspbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vgetexppbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vgetmantpbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vmaxpbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vminmaxnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vminmaxpd-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vminmaxph-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vminmaxps-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vminpbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vmpsadbw-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vmulnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpbssd-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpbssds-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpbsud-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpbsuds-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpbuud-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpbuuds-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpwsud-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpwsuds-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpwusd-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpwusds-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpwuud-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vpdpwuuds-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vrcppbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vreducenepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vrndscalenepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vrsqrtpbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vscalefpbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vsqrtnepbf16-2.c: Ditto.
* gcc.target/i386/avx10_2-512-vsubnepbf16-2.c: Ditto.
* gcc.target/i386/avx512fp16-vfpclassph-1b.c: Ditto.

Diff:
---
 gcc/testsuite/gcc.target/i386/avx10-helper.h   |  1 -
 .../gcc.target/i386/avx10_2-512-vaddnepbf16-2.c| 11 ++-
 .../gcc.target/i386/avx10_2-512-vcmppbf16-2.c  |  5 +++--
 .../gcc.target/i386/avx10_2-512-vcvtnebf162ibs-2.c | 16 
 .../i386/avx10_2-512-vcvtnebf162iubs-2.c   | 16 
 .../gcc.target/i386/avx10_2-512-vcvtph2ibs-2.c | 16 
 .../gcc.target/i386/avx10_2-512-vcvtph2iubs-2.c| 16 ++

[gcc r15-1908] i386: Correct AVX10 CPUID emulation

2024-07-09 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:298a576f00c49b8f4529ea2f87b9943a32743250

commit r15-1908-g298a576f00c49b8f4529ea2f87b9943a32743250
Author: Haochen Jiang 
Date:   Tue Jul 9 16:31:02 2024 +0800

i386: Correct AVX10 CPUID emulation

The AVX10 documentation specifies an ecx value of 0 for the AVX10 version
and vector size under the 0x24 subleaf. Although for ecx=1 the bits are all
reserved for now, we still need to specify ecx as 0 to avoid a dirty
value in ecx.

gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_available_features): Correct
AVX10 CPUID emulation to specify ecx value.

Diff:
---
 gcc/common/config/i386/cpuinfo.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 936039725ab6..2ae77d335d24 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -998,10 +998,10 @@ get_available_features (struct __processor_model 
*cpu_model,
}
 }
 
-  /* Get Advanced Features at level 0x24 (eax = 0x24).  */
+  /* Get Advanced Features at level 0x24 (eax = 0x24, ecx = 0).  */
   if (avx10_set && max_cpuid_level >= 0x24)
 {
-  __cpuid (0x24, eax, ebx, ecx, edx);
+  __cpuid_count (0x24, 0, eax, ebx, ecx, edx);
   version = ebx & 0xff;
   if (ebx & bit_AVX10_256)
switch (version)


[gcc r14-10397] i386: Correct AVX10 CPUID emulation

2024-07-09 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:74c15cb93b3830fee79f75805329d4299ff4a2f0

commit r14-10397-g74c15cb93b3830fee79f75805329d4299ff4a2f0
Author: Haochen Jiang 
Date:   Tue Jul 9 16:31:02 2024 +0800

i386: Correct AVX10 CPUID emulation

The AVX10 documentation specifies an ecx value of 0 for the AVX10 version
and vector size under the 0x24 subleaf. Although for ecx=1 the bits are all
reserved for now, we still need to specify ecx as 0 to avoid a dirty
value in ecx.

gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_available_features): Correct
AVX10 CPUID emulation to specify ecx value.

Diff:
---
 gcc/common/config/i386/cpuinfo.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 017a952a5db0..56427474b7be 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -1014,10 +1014,10 @@ get_available_features (struct __processor_model 
*cpu_model,
}
 }
 
-  /* Get Advanced Features at level 0x24 (eax = 0x24).  */
+  /* Get Advanced Features at level 0x24 (eax = 0x24, ecx = 0).  */
   if (avx10_set && max_cpuid_level >= 0x24)
 {
-  __cpuid (0x24, eax, ebx, ecx, edx);
+  __cpuid_count (0x24, 0, eax, ebx, ecx, edx);
   version = ebx & 0xff;
   if (ebx & bit_AVX10_256)
switch (version)


[gcc r15-2129] i386: Fix testcases generating invalid asm

2024-07-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:4b58697cecbd72fd7db5a0fcdf7af8deb3be2b14

commit r15-2129-g4b58697cecbd72fd7db5a0fcdf7af8deb3be2b14
Author: Haochen Jiang 
Date:   Wed Jul 17 16:26:35 2024 +0800

i386: Fix testcases generating invalid asm

For compile test, we should generate valid asm except for special purposes.
Fix the compile test that generates invalid asm.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-egprs-names.c: Use ax for short and
al for char instead of eax.
* gcc.target/i386/avx512bw-kandnq-1.c: Do not run the test
under -m32 since kmovq with register is invalid. Use long
long to use 64 bit register instead of 32 bit register for
kmovq.
* gcc.target/i386/avx512bw-kandq-1.c: Ditto.
* gcc.target/i386/avx512bw-knotq-1.c: Ditto.
* gcc.target/i386/avx512bw-korq-1.c: Ditto.
* gcc.target/i386/avx512bw-kshiftlq-1.c: Ditto.
* gcc.target/i386/avx512bw-kshiftrq-1.c: Ditto.
* gcc.target/i386/avx512bw-kxnorq-1.c: Ditto.
* gcc.target/i386/avx512bw-kxorq-1.c: Ditto.

Diff:
---
 gcc/testsuite/gcc.target/i386/apx-egprs-names.c | 8 
 gcc/testsuite/gcc.target/i386/avx512bw-kandnq-1.c   | 6 +++---
 gcc/testsuite/gcc.target/i386/avx512bw-kandq-1.c| 6 +++---
 gcc/testsuite/gcc.target/i386/avx512bw-knotq-1.c| 4 ++--
 gcc/testsuite/gcc.target/i386/avx512bw-korq-1.c | 6 +++---
 gcc/testsuite/gcc.target/i386/avx512bw-kshiftlq-1.c | 4 ++--
 gcc/testsuite/gcc.target/i386/avx512bw-kshiftrq-1.c | 4 ++--
 gcc/testsuite/gcc.target/i386/avx512bw-kxnorq-1.c   | 6 +++---
 gcc/testsuite/gcc.target/i386/avx512bw-kxorq-1.c| 6 +++---
 9 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/apx-egprs-names.c 
b/gcc/testsuite/gcc.target/i386/apx-egprs-names.c
index f0517e47c334..917ef5054951 100644
--- a/gcc/testsuite/gcc.target/i386/apx-egprs-names.c
+++ b/gcc/testsuite/gcc.target/i386/apx-egprs-names.c
@@ -10,8 +10,8 @@ void foo ()
   register int b __asm ("r30");
   register short c __asm ("r29");
   register char d __asm ("r28");
-  __asm__ __volatile__ ("mov %0, %%rax" : : "r" (a) : "rax");
-  __asm__ __volatile__ ("mov %0, %%eax" : : "r" (b) : "eax");
-  __asm__ __volatile__ ("mov %0, %%eax" : : "r" (c) : "eax");
-  __asm__ __volatile__ ("mov %0, %%eax" : : "r" (d) : "eax");
+  __asm__ __volatile__ ("movq %0, %%rax" : : "r" (a) : "rax");
+  __asm__ __volatile__ ("movl %0, %%eax" : : "r" (b) : "eax");
+  __asm__ __volatile__ ("movw %0, %%ax" : : "r" (c) : "ax");
+  __asm__ __volatile__ ("movb %0, %%al" : : "r" (d) : "al");
 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kandnq-1.c 
b/gcc/testsuite/gcc.target/i386/avx512bw-kandnq-1.c
index e8b7a5f9aa21..f9f03c90782f 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-kandnq-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-kandnq-1.c
@@ -1,4 +1,4 @@
-/* { dg-do compile } */
+/* { dg-do compile { target { ! ia32 } } } */
 /* { dg-options "-mavx512bw -O2" } */
 /* { dg-final { scan-assembler-times "kandnq\[ 
\\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
 
@@ -10,8 +10,8 @@ avx512bw_test ()
   __mmask64 k1, k2, k3;
   volatile __m512i x = _mm512_setzero_si512 ();
 
-  __asm__( "kmovq %1, %0" : "=k" (k1) : "r" (1) );
-  __asm__( "kmovq %1, %0" : "=k" (k2) : "r" (2) );
+  __asm__( "kmovq %1, %0" : "=k" (k1) : "r" (1ULL) );
+  __asm__( "kmovq %1, %0" : "=k" (k2) : "r" (2ULL) );
 
   k3 = _kandn_mask64 (k1, k2);
   x = _mm512_mask_add_epi8 (x, k3, x, x);
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kandq-1.c 
b/gcc/testsuite/gcc.target/i386/avx512bw-kandq-1.c
index a1aaed67c66c..6ad836087adc 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-kandq-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-kandq-1.c
@@ -1,4 +1,4 @@
-/* { dg-do compile } */
+/* { dg-do compile { target { ! ia32 } } } */
 /* { dg-options "-mavx512bw -O2" } */
 /* { dg-final { scan-assembler-times "kandq\[ 
\\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
 
@@ -10,8 +10,8 @@ avx512bw_test ()
   __mmask64 k1, k2, k3;
   volatile __m512i x = _mm512_setzero_epi32();
 
-  __asm__( "kmovq %1, %0" : "=k" (k1) : "r" (1) );
-  __asm__( "kmovq %1, %0" : "=k" (k2) : "r" (2) );
+  __asm__( "kmovq %1, %0" : "=k" (k1) : "r" (1ULL) );
+  __asm__( "kmovq %1, %0" : "=k" (k2) : "r" (2ULL) );
 
   k3 = _kand_mask64 (k1, k2);
   x = _mm512_mask_add_epi8 (x, k3, x, x);
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-knotq-1.c 
b/gcc/testsuite/gcc.target/i386/avx512bw-knotq-1.c
index deb657957600..341bbc038479 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-knotq-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-knotq-1.c
@@ -1,4 +1,4 @@
-/* { dg-do compile } */
+/* { dg-do compile { target { ! ia32 } } } */
 /* { dg-options "-mavx512bw -O2" } */
 /* { dg-final { scan-assembler-times "knotq\[ 
\\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
 
@@ -10,7 +10,7 @@ av

[gcc r15-2213] i386: Change prefetchi output template

2024-07-22 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:062e46a813799684c6f900815fd22451d6294ae1

commit r15-2213-g062e46a813799684c6f900815fd22451d6294ae1
Author: Haochen Jiang 
Date:   Mon Jul 22 14:06:18 2024 +0800

i386: Change prefetchi output template

For prefetchi instructions, a RIP-relative address is explicitly specified
for the operand, and the assembler obeys that rule strictly. This makes
an instruction like:

prefetchit0 bar

illegal for the assembler, although it should be a common usage of prefetchi.

Change to %a to explicitly add (%rip) after the function label, making it
legal for the assembler so that it can be passed to the linker to get the
real address.

gcc/ChangeLog:

* config/i386/i386.md (prefetchi): Change to %a.

gcc/testsuite/ChangeLog:

* gcc.target/i386/prefetchi-1.c: Check (%rip).

Diff:
---
 gcc/config/i386/i386.md | 2 +-
 gcc/testsuite/gcc.target/i386/prefetchi-1.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 90d3aa450f05..6207036a2a01 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -28004,7 +28004,7 @@
   "TARGET_PREFETCHI && TARGET_64BIT"
 {
   static const char * const patterns[2] = {
-"prefetchit1\t%0", "prefetchit0\t%0"
+"prefetchit1\t%a0", "prefetchit0\t%a0"
   };
 
   int locality = INTVAL (operands[1]);
diff --git a/gcc/testsuite/gcc.target/i386/prefetchi-1.c 
b/gcc/testsuite/gcc.target/i386/prefetchi-1.c
index 80f25e70e8e3..03dfdc55e86c 100644
--- a/gcc/testsuite/gcc.target/i386/prefetchi-1.c
+++ b/gcc/testsuite/gcc.target/i386/prefetchi-1.c
@@ -1,7 +1,7 @@
 /* { dg-do compile { target { ! ia32 } } } */
 /* { dg-options "-mprefetchi -O2" } */
-/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit0\[ \\t\]+" 2 } } */
-/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit1\[ \\t\]+" 2 } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit0\[ 
\\t\]+bar\\(%rip\\)" 2 } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit1\[ 
\\t\]+bar\\(%rip\\)" 2 } } */
 
 #include 


[gcc r14-10500] i386: Change prefetchi output template

2024-07-22 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:81f356f9f72fc3159eeaa5a037cf6c3eb701224b

commit r14-10500-g81f356f9f72fc3159eeaa5a037cf6c3eb701224b
Author: Haochen Jiang 
Date:   Mon Jul 22 14:06:18 2024 +0800

i386: Change prefetchi output template

For prefetchi instructions, the operand is explicitly specified to be a
RIP-relative address, and the assembler enforces that rule strictly. As a
result, an instruction like:

prefetchit0 bar

is rejected by the assembler, even though this should be a common usage
of prefetchi.

Change the output template to %a so that (%rip) is explicitly added after
the function label. This makes the instruction legal for the assembler, so
it can be passed on to the linker to get the real address.

gcc/ChangeLog:

* config/i386/i386.md (prefetchi): Change to %a.

gcc/testsuite/ChangeLog:

* gcc.target/i386/prefetchi-1.c: Check (%rip).

Diff:
---
 gcc/config/i386/i386.md | 2 +-
 gcc/testsuite/gcc.target/i386/prefetchi-1.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 80da60c1569e..f9757433687e 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -27142,7 +27142,7 @@
   "TARGET_PREFETCHI && TARGET_64BIT"
 {
   static const char * const patterns[2] = {
-"prefetchit1\t%0", "prefetchit0\t%0"
+"prefetchit1\t%a0", "prefetchit0\t%a0"
   };
 
   int locality = INTVAL (operands[1]);
diff --git a/gcc/testsuite/gcc.target/i386/prefetchi-1.c 
b/gcc/testsuite/gcc.target/i386/prefetchi-1.c
index 80f25e70e8e3..03dfdc55e86c 100644
--- a/gcc/testsuite/gcc.target/i386/prefetchi-1.c
+++ b/gcc/testsuite/gcc.target/i386/prefetchi-1.c
@@ -1,7 +1,7 @@
 /* { dg-do compile { target { ! ia32 } } } */
 /* { dg-options "-mprefetchi -O2" } */
-/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit0\[ \\t\]+" 2 } } */
-/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit1\[ \\t\]+" 2 } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit0\[ 
\\t\]+bar\\(%rip\\)" 2 } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit1\[ 
\\t\]+bar\\(%rip\\)" 2 } } */
 
 #include 


[gcc r13-8935] i386: Change prefetchi output template

2024-07-22 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:dec571e95cf47e21a1a60ed337e68e3474f57f7d

commit r13-8935-gdec571e95cf47e21a1a60ed337e68e3474f57f7d
Author: Haochen Jiang 
Date:   Mon Jul 22 14:06:18 2024 +0800

i386: Change prefetchi output template

For prefetchi instructions, the operand is explicitly specified to be a
RIP-relative address, and the assembler enforces that rule strictly. As a
result, an instruction like:

prefetchit0 bar

is rejected by the assembler, even though this should be a common usage
of prefetchi.

Change the output template to %a so that (%rip) is explicitly added after
the function label. This makes the instruction legal for the assembler, so
it can be passed on to the linker to get the real address.

gcc/ChangeLog:

* config/i386/i386.md (prefetchi): Change to %a.

gcc/testsuite/ChangeLog:

* gcc.target/i386/prefetchi-1.c: Check (%rip).

Diff:
---
 gcc/config/i386/i386.md | 2 +-
 gcc/testsuite/gcc.target/i386/prefetchi-1.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index b4b39865d677..a224c9f940ec 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -24291,7 +24291,7 @@
   "TARGET_PREFETCHI && TARGET_64BIT"
 {
   static const char * const patterns[2] = {
-"prefetchit1\t%0", "prefetchit0\t%0"
+"prefetchit1\t%a0", "prefetchit0\t%a0"
   };
 
   int locality = INTVAL (operands[1]);
diff --git a/gcc/testsuite/gcc.target/i386/prefetchi-1.c 
b/gcc/testsuite/gcc.target/i386/prefetchi-1.c
index 80f25e70e8e3..03dfdc55e86c 100644
--- a/gcc/testsuite/gcc.target/i386/prefetchi-1.c
+++ b/gcc/testsuite/gcc.target/i386/prefetchi-1.c
@@ -1,7 +1,7 @@
 /* { dg-do compile { target { ! ia32 } } } */
 /* { dg-options "-mprefetchi -O2" } */
-/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit0\[ \\t\]+" 2 } } */
-/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit1\[ \\t\]+" 2 } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit0\[ 
\\t\]+bar\\(%rip\\)" 2 } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit1\[ 
\\t\]+bar\\(%rip\\)" 2 } } */
 
 #include 


[gcc r14-10072] i386: Fix Sierra Forest auto dispatch

2024-04-22 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:6b5248d15c6d10325c6cbb92a0e0a9eb04e3f122

commit r14-10072-g6b5248d15c6d10325c6cbb92a0e0a9eb04e3f122
Author: Haochen Jiang 
Date:   Mon Apr 22 16:57:36 2024 +0800

i386: Fix Sierra Forest auto dispatch

gcc/ChangeLog:

* common/config/i386/i386-common.cc (processor_alias_table):
Let Sierra Forest map to CPU_TYPE enum.

Diff:
---
 gcc/common/config/i386/i386-common.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index f814df8385b..77b154663bc 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -2302,7 +2302,7 @@ const pta processor_alias_table[] =
   {"gracemont", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE,
M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2},
   {"sierraforest", PROCESSOR_SIERRAFOREST, CPU_HASWELL, PTA_SIERRAFOREST,
-M_CPU_SUBTYPE (INTEL_SIERRAFOREST), P_PROC_AVX2},
+M_CPU_TYPE (INTEL_SIERRAFOREST), P_PROC_AVX2},
   {"grandridge", PROCESSOR_GRANDRIDGE, CPU_HASWELL, PTA_GRANDRIDGE,
 M_CPU_TYPE (INTEL_GRANDRIDGE), P_PROC_AVX2},
   {"clearwaterforest", PROCESSOR_CLEARWATERFOREST, CPU_HASWELL,


[gcc r13-8641] i386: Fix Sierra Forest auto dispatch

2024-04-22 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:d80c9df20ed77a26eb71457679dad2b564c5da60

commit r13-8641-gd80c9df20ed77a26eb71457679dad2b564c5da60
Author: Haochen Jiang 
Date:   Mon Apr 22 16:57:36 2024 +0800

i386: Fix Sierra Forest auto dispatch

gcc/ChangeLog:

* common/config/i386/i386-common.cc (processor_alias_table):
Let Sierra Forest map to CPU_TYPE enum.

Diff:
---
 gcc/common/config/i386/i386-common.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index 988805a3aed..a8809889360 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -2110,7 +2110,7 @@ const pta processor_alias_table[] =
   {"gracemont", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE,
M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2},
   {"sierraforest", PROCESSOR_SIERRAFOREST, CPU_HASWELL, PTA_SIERRAFOREST,
-M_CPU_SUBTYPE (INTEL_SIERRAFOREST), P_PROC_AVX2},
+M_CPU_TYPE (INTEL_SIERRAFOREST), P_PROC_AVX2},
   {"grandridge", PROCESSOR_GRANDRIDGE, CPU_HASWELL, PTA_GRANDRIDGE,
 M_CPU_TYPE (INTEL_GRANDRIDGE), P_PROC_AVX2},
   {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL,


[gcc r14-10104] i386: Fix behavior for both using AVX10.1-256 in options and function attribute

2024-04-24 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:d279c9d89b2f6ce89c1eec0ff4b980e9c5f51fd1

commit r14-10104-gd279c9d89b2f6ce89c1eec0ff4b980e9c5f51fd1
Author: Haochen Jiang 
Date:   Wed Apr 24 10:43:18 2024 +0800

i386: Fix behavior for both using AVX10.1-256 in options and function 
attribute

When -mavx10.1-256 is used on the command line together with avx10.1-256
in a target attribute, zmm registers should never be generated. However,
GCC currently generates zmm because it wrongly enables EVEX512 even when
AVX512F has not been explicitly set. This patch fixes that issue.

gcc/ChangeLog:

* config/i386/i386-options.cc (ix86_valid_target_attribute_tree):
Check whether AVX512F is explicitly enabled.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx10_1-24.c: New test.

Diff:
---
 gcc/config/i386/i386-options.cc| 1 +
 gcc/testsuite/gcc.target/i386/avx10_1-24.c | 7 +++
 2 files changed, 8 insertions(+)

diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 68a2e1c6910..ac48b5c61c4 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1431,6 +1431,7 @@ ix86_valid_target_attribute_tree (tree fndecl, tree args,
  scenario.  */
   if ((def->x_ix86_isa_flags2 & OPTION_MASK_ISA2_AVX10_1_256)
   && (opts->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512F)
+  && (opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F)
   && !(def->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_EVEX512)
   && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_EVEX512))
 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_EVEX512;
diff --git a/gcc/testsuite/gcc.target/i386/avx10_1-24.c 
b/gcc/testsuite/gcc.target/i386/avx10_1-24.c
new file mode 100644
index 000..2e93f041760
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_1-24.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mavx10.1" } */
+/* { dg-final { scan-assembler-not "%zmm" } } */
+
+typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__));
+
+void __attribute__((target("avx10.1-256"))) callee256(__m512 *a, __m512 *b) { 
*a = *b; }


[gcc r14-10137] i386: Fix array index overflow in pr105354-2.c

2024-04-26 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:4a2e55b3ada20fe6457466bb687a66c8d03e056e

commit r14-10137-g4a2e55b3ada20fe6457466bb687a66c8d03e056e
Author: Haochen Jiang 
Date:   Fri Apr 26 16:48:29 2024 +0800

i386: Fix array index overflow in pr105354-2.c

The array index must be less than 8 for v8hi; otherwise the test fails
under -O0 or when using -fstack-protector.

gcc/testsuite/ChangeLog:

PR target/110621
* gcc.target/i386/pr105354-2.c: As mentioned.

Diff:
---
 gcc/testsuite/gcc.target/i386/pr105354-2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr105354-2.c 
b/gcc/testsuite/gcc.target/i386/pr105354-2.c
index b78b62e1e7e..1c592e84860 100644
--- a/gcc/testsuite/gcc.target/i386/pr105354-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr105354-2.c
@@ -17,7 +17,7 @@ sse2_test (void)
   b.a[i] = i + 16;
   res_ab.a[i] = 0;
   exp_ab.a[i] = -1;
-  if (i <= 8)
+  if (i < 8)
{
  c.a[i] = i;
  d.a[i] = i + 8;


[gcc r13-8652] i386: Fix array index overflow in pr105354-2.c

2024-04-26 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:7425436b5382a04f3eb28c7c7912f4d9a1cad0bd

commit r13-8652-g7425436b5382a04f3eb28c7c7912f4d9a1cad0bd
Author: Haochen Jiang 
Date:   Fri Apr 26 16:48:29 2024 +0800

i386: Fix array index overflow in pr105354-2.c

The array index must be less than 8 for v8hi; otherwise the test fails
under -O0 or when using -fstack-protector.

gcc/testsuite/ChangeLog:

PR target/110621
* gcc.target/i386/pr105354-2.c: As mentioned.

Diff:
---
 gcc/testsuite/gcc.target/i386/pr105354-2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr105354-2.c 
b/gcc/testsuite/gcc.target/i386/pr105354-2.c
index b78b62e1e7e..1c592e84860 100644
--- a/gcc/testsuite/gcc.target/i386/pr105354-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr105354-2.c
@@ -17,7 +17,7 @@ sse2_test (void)
   b.a[i] = i + 16;
   res_ab.a[i] = 0;
   exp_ab.a[i] = -1;
-  if (i <= 8)
+  if (i < 8)
{
  c.a[i] = i;
  d.a[i] = i + 8;


[gcc r15-2335] i386: Use BLKmode for {ld,st}tilecfg

2024-07-26 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:f145f5411609dca5493a6709e8139609b584622f

commit r15-2335-gf145f5411609dca5493a6709e8139609b584622f
Author: Haochen Jiang 
Date:   Fri Jul 26 16:49:08 2024 +0800

i386: Use BLKmode for {ld,st}tilecfg

Hi all,

For AMX instructions that take a memory operand, we treat the memory size
as unspecified, since there are no variants with different sizes that
could make the memory operand ambiguous.

This changes the output under Intel mode, which currently does not work
with the assembler, and aligns it with current binutils behavior.

Bootstrapped and regtested on x86-64-pc-linux-gnu. Ok for trunk?

Thx,
Haochen

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_expand_builtin): Change
from XImode to BLKmode.
* config/i386/i386.md (ldtilecfg): Change XI to BLK.
(sttilecfg): Ditto.

Diff:
---
 gcc/config/i386/i386-expand.cc |  2 +-
 gcc/config/i386/i386.md| 12 +---
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 9a31e6df2aa2..d9ad06264aaf 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -14198,7 +14198,7 @@ ix86_expand_builtin (tree exp, rtx target, rtx 
subtarget,
  op0 = convert_memory_address (Pmode, op0);
  op0 = copy_addr_to_reg (op0);
}
-  op0 = gen_rtx_MEM (XImode, op0);
+  op0 = gen_rtx_MEM (BLKmode, op0);
   if (fcode == IX86_BUILTIN_LDTILECFG)
icode = CODE_FOR_ldtilecfg;
   else
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 6207036a2a01..fb10fdc9f96d 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -29032,24 +29032,22 @@
(set_attr "type" "other")])
 
 (define_insn "ldtilecfg"
-  [(unspec_volatile [(match_operand:XI 0 "memory_operand" "m")]
+  [(unspec_volatile [(match_operand:BLK 0 "memory_operand" "m")]
 UNSPECV_LDTILECFG)]
   "TARGET_AMX_TILE"
   "ldtilecfg\t%0"
   [(set_attr "type" "other")
(set_attr "prefix" "maybe_evex")
-   (set_attr "memory" "load")
-   (set_attr "mode" "XI")])
+   (set_attr "memory" "load")])
 
 (define_insn "sttilecfg"
-  [(set (match_operand:XI 0 "memory_operand" "=m")
-(unspec_volatile:XI [(const_int 0)] UNSPECV_STTILECFG))]
+  [(set (match_operand:BLK 0 "memory_operand" "=m")
+(unspec_volatile:BLK [(const_int 0)] UNSPECV_STTILECFG))]
   "TARGET_AMX_TILE"
   "sttilecfg\t%0"
   [(set_attr "type" "other")
(set_attr "prefix" "maybe_evex")
-   (set_attr "memory" "store")
-   (set_attr "mode" "XI")])
+   (set_attr "memory" "store")])
 
 (include "mmx.md")
 (include "sse.md")


[gcc r15-2373] i386: Fix AVX512 intrin macro typo

2024-07-28 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:16daeb262af4566e665a941368cb15bc2cba3f07

commit r15-2373-g16daeb262af4566e665a941368cb15bc2cba3f07
Author: Haochen Jiang 
Date:   Thu Jul 25 16:12:20 2024 +0800

i386: Fix AVX512 intrin macro typo

There are several typos in the AVX512 intrinsic macro definitions. Correct
them to fix errors when compiling with -O0.

gcc/ChangeLog:

* config/i386/avx512dqintrin.h
(_mm_mask_fpclass_ss_mask): Correct operand order.
(_mm_mask_fpclass_sd_mask): Ditto.
(_mm_maskz_reduce_round_ss): Use __builtin_ia32_reducess_mask_round
instead of __builtin_ia32_reducesd_mask_round.
(_mm_reduce_round_sd): Use -1 as mask since it is non-mask.
(_mm_reduce_round_ss): Ditto.
* config/i386/avx512vlbwintrin.h
(_mm256_mask_alignr_epi8): Correct operand usage.
(_mm_mask_alignr_epi8): Ditto.
* config/i386/avx512vlintrin.h (_mm_mask_alignr_epi64): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bw-vpalignr-1b.c: New test.
* gcc.target/i386/avx512dq-vfpclasssd-1b.c: Ditto.
* gcc.target/i386/avx512dq-vfpclassss-1b.c: Ditto.
* gcc.target/i386/avx512dq-vreducesd-1b.c: Ditto.
* gcc.target/i386/avx512dq-vreducess-1b.c: Ditto.
* gcc.target/i386/avx512vl-valignq-1b.c: Ditto.

Diff:
---
 gcc/config/i386/avx512dqintrin.h   | 16 +---
 gcc/config/i386/avx512vlbwintrin.h |  4 ++--
 gcc/config/i386/avx512vlintrin.h   |  2 +-
 gcc/testsuite/gcc.target/i386/avx512bw-vpalignr-1b.c   | 18 ++
 gcc/testsuite/gcc.target/i386/avx512dq-vfpclasssd-1b.c | 14 ++
 gcc/testsuite/gcc.target/i386/avx512dq-vfpcla-1b.c | 14 ++
 gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-1b.c  | 16 
 gcc/testsuite/gcc.target/i386/avx512dq-vreducess-1b.c  | 16 
 gcc/testsuite/gcc.target/i386/avx512vl-valignq-1b.c| 15 +++
 9 files changed, 105 insertions(+), 10 deletions(-)

diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h
index 3beed7e649a9..d9890c6da1dc 100644
--- a/gcc/config/i386/avx512dqintrin.h
+++ b/gcc/config/i386/avx512dqintrin.h
@@ -572,11 +572,11 @@ _mm_mask_fpclass_sd_mask (__mmask8 __U, __m128d __A, 
const int __imm)
   ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X),   \
 (int) (C), (__mmask8) (-1))) \
 
-#define _mm_mask_fpclass_ss_mask(X, C, U)  \
+#define _mm_mask_fpclass_ss_mask(U, X, C)  \
   ((__mmask8) __builtin_ia32_fpcla_mask ((__v4sf) (__m128) (X),\
 (int) (C), (__mmask8) (U)))
 
-#define _mm_mask_fpclass_sd_mask(X, C, U)  \
+#define _mm_mask_fpclass_sd_mask(U, X, C)  \
   ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X),   \
 (int) (C), (__mmask8) (U)))
 #define _mm_reduce_sd(A, B, C) \
@@ -594,8 +594,9 @@ _mm_mask_fpclass_sd_mask (__mmask8 __U, __m128d __A, const 
int __imm)
 (__mmask8)(U)))
 
 #define _mm_reduce_round_sd(A, B, C, R)   \
-  ((__m128d) __builtin_ia32_reducesd_round ((__v2df)(__m128d)(A),  \
-(__v2df)(__m128d)(B), (int)(C), (__mmask8)(U), (int)(R)))
+  ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \
+(__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_avx512_setzero_pd (), \
+(__mmask8)(-1), (int)(R)))
 
 #define _mm_mask_reduce_round_sd(W, U, A, B, C, R)\
   ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \
@@ -622,8 +623,9 @@ _mm_mask_fpclass_sd_mask (__mmask8 __U, __m128d __A, const 
int __imm)
 (__mmask8)(U)))
 
 #define _mm_reduce_round_ss(A, B, C, R)   \
-  ((__m128) __builtin_ia32_reducess_round ((__v4sf)(__m128)(A),   \
-(__v4sf)(__m128)(B), (int)(C), (__mmask8)(U), (int)(R)))
+  ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A),   \
+(__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512_setzero_ps (),  \
+(__mmask8)(-1), (int)(R)))
 
 #define _mm_mask_reduce_round_ss(W, U, A, B, C, R)\
   ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A),   \
@@ -631,7 +633,7 @@ _mm_mask_fpclass_sd_mask (__mmask8 __U, __m128d __A, const 
int __imm)
 (__mmask8)(U), (int)(R)))
 
 #define _mm_maskz_reduce_round_ss(U, A, B, C, R)  \
-  ((__m128) __builtin_ia32_reducesd_mask_round ((__v4sf)(__m128)(A),   \
+  ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A),   \
 (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512_

[gcc r12-10648] i386: Fix AVX512 intrin macro typo

2024-07-28 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:bd0fbdc30d831f8c84223f583bcb5e5f6d7d72fc

commit r12-10648-gbd0fbdc30d831f8c84223f583bcb5e5f6d7d72fc
Author: Haochen Jiang 
Date:   Thu Jul 25 16:12:20 2024 +0800

i386: Fix AVX512 intrin macro typo

There are several typos in the AVX512 intrinsic macro definitions. Correct
them to fix errors when compiling with -O0.

gcc/ChangeLog:

* config/i386/avx512dqintrin.h
(_mm_mask_fpclass_ss_mask): Correct operand order.
(_mm_mask_fpclass_sd_mask): Ditto.
(_mm_maskz_reduce_round_ss): Use __builtin_ia32_reducess_mask_round
instead of __builtin_ia32_reducesd_mask_round.
(_mm_reduce_round_sd): Use -1 as mask since it is non-mask.
(_mm_reduce_round_ss): Ditto.
* config/i386/avx512vlbwintrin.h
(_mm256_mask_alignr_epi8): Correct operand usage.
(_mm_mask_alignr_epi8): Ditto.
* config/i386/avx512vlintrin.h (_mm_mask_alignr_epi64): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bw-vpalignr-1b.c: New test.
* gcc.target/i386/avx512dq-vfpclasssd-1b.c: Ditto.
* gcc.target/i386/avx512dq-vfpclassss-1b.c: Ditto.
* gcc.target/i386/avx512dq-vreducesd-1b.c: Ditto.
* gcc.target/i386/avx512dq-vreducess-1b.c: Ditto.
* gcc.target/i386/avx512vl-valignq-1b.c: Ditto.

Diff:
---
 gcc/config/i386/avx512dqintrin.h   | 16 +---
 gcc/config/i386/avx512vlbwintrin.h |  4 ++--
 gcc/config/i386/avx512vlintrin.h   |  2 +-
 gcc/testsuite/gcc.target/i386/avx512bw-vpalignr-1b.c   | 18 ++
 gcc/testsuite/gcc.target/i386/avx512dq-vfpclasssd-1b.c | 14 ++
 gcc/testsuite/gcc.target/i386/avx512dq-vfpcla-1b.c | 14 ++
 gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-1b.c  | 16 
 gcc/testsuite/gcc.target/i386/avx512dq-vreducess-1b.c  | 16 
 gcc/testsuite/gcc.target/i386/avx512vl-valignq-1b.c| 15 +++
 9 files changed, 105 insertions(+), 10 deletions(-)

diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h
index e924250a4ad9..4f9451e949b5 100644
--- a/gcc/config/i386/avx512dqintrin.h
+++ b/gcc/config/i386/avx512dqintrin.h
@@ -2800,11 +2800,11 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm)
   ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X),   \
 (int) (C), (__mmask8) (-1))) \
 
-#define _mm_mask_fpclass_ss_mask(X, C, U)  \
+#define _mm_mask_fpclass_ss_mask(U, X, C)  \
   ((__mmask8) __builtin_ia32_fpcla_mask ((__v4sf) (__m128) (X),\
 (int) (C), (__mmask8) (U)))
 
-#define _mm_mask_fpclass_sd_mask(X, C, U)  \
+#define _mm_mask_fpclass_sd_mask(U, X, C)  \
   ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X),   \
 (int) (C), (__mmask8) (U)))
 
@@ -2839,8 +2839,9 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm)
 (__mmask8)(U)))
 
 #define _mm_reduce_round_sd(A, B, C, R)   \
-  ((__m128d) __builtin_ia32_reducesd_round ((__v2df)(__m128d)(A),  \
-(__v2df)(__m128d)(B), (int)(C), (__mmask8)(U), (int)(R)))
+  ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \
+(__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_avx512_setzero_pd (), \
+(__mmask8)(-1), (int)(R)))
 
 #define _mm_mask_reduce_round_sd(W, U, A, B, C, R)\
   ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \
@@ -2867,8 +2868,9 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm)
 (__mmask8)(U)))
 
 #define _mm_reduce_round_ss(A, B, C, R)   \
-  ((__m128) __builtin_ia32_reducess_round ((__v4sf)(__m128)(A),   \
-(__v4sf)(__m128)(B), (int)(C), (__mmask8)(U), (int)(R)))
+  ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A),   \
+(__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512_setzero_ps (),  \
+(__mmask8)(-1), (int)(R)))
 
 #define _mm_mask_reduce_round_ss(W, U, A, B, C, R)\
   ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A),   \
@@ -2876,7 +2878,7 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm)
 (__mmask8)(U), (int)(R)))
 
 #define _mm_maskz_reduce_round_ss(U, A, B, C, R)  \
-  ((__m128) __builtin_ia32_reducesd_mask_round ((__v4sf)(__m128)(A),   \
+  ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A),   \
 (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (),\
 (__mmask8)(U), (int)(R)))
 
diff --git a/gcc/config/i386/avx512vlbwintrin.h 
b/gcc/config/i386/avx512vlbwintrin.h

[gcc r13-8949] i386: Fix AVX512 intrin macro typo

2024-07-28 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:bb15c4cf21dbe76df5a225342d1fbe8ecd3c7971

commit r13-8949-gbb15c4cf21dbe76df5a225342d1fbe8ecd3c7971
Author: Haochen Jiang 
Date:   Thu Jul 25 16:12:20 2024 +0800

i386: Fix AVX512 intrin macro typo

There are several typos in the AVX512 intrinsic macro definitions. Correct
them to fix errors when compiling with -O0.

gcc/ChangeLog:

* config/i386/avx512dqintrin.h
(_mm_mask_fpclass_ss_mask): Correct operand order.
(_mm_mask_fpclass_sd_mask): Ditto.
(_mm_maskz_reduce_round_ss): Use __builtin_ia32_reducess_mask_round
instead of __builtin_ia32_reducesd_mask_round.
(_mm_reduce_round_sd): Use -1 as mask since it is non-mask.
(_mm_reduce_round_ss): Ditto.
* config/i386/avx512vlbwintrin.h
(_mm256_mask_alignr_epi8): Correct operand usage.
(_mm_mask_alignr_epi8): Ditto.
* config/i386/avx512vlintrin.h (_mm_mask_alignr_epi64): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bw-vpalignr-1b.c: New test.
* gcc.target/i386/avx512dq-vfpclasssd-1b.c: Ditto.
* gcc.target/i386/avx512dq-vfpclassss-1b.c: Ditto.
* gcc.target/i386/avx512dq-vreducesd-1b.c: Ditto.
* gcc.target/i386/avx512dq-vreducess-1b.c: Ditto.
* gcc.target/i386/avx512vl-valignq-1b.c: Ditto.

Diff:
---
 gcc/config/i386/avx512dqintrin.h   | 16 +---
 gcc/config/i386/avx512vlbwintrin.h |  4 ++--
 gcc/config/i386/avx512vlintrin.h   |  2 +-
 gcc/testsuite/gcc.target/i386/avx512bw-vpalignr-1b.c   | 18 ++
 gcc/testsuite/gcc.target/i386/avx512dq-vfpclasssd-1b.c | 14 ++
 gcc/testsuite/gcc.target/i386/avx512dq-vfpcla-1b.c | 14 ++
 gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-1b.c  | 16 
 gcc/testsuite/gcc.target/i386/avx512dq-vreducess-1b.c  | 16 
 gcc/testsuite/gcc.target/i386/avx512vl-valignq-1b.c| 15 +++
 9 files changed, 105 insertions(+), 10 deletions(-)

diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h
index 93900a0b5c75..4383ff146702 100644
--- a/gcc/config/i386/avx512dqintrin.h
+++ b/gcc/config/i386/avx512dqintrin.h
@@ -2800,11 +2800,11 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm)
   ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X),   \
 (int) (C), (__mmask8) (-1))) \
 
-#define _mm_mask_fpclass_ss_mask(X, C, U)  \
+#define _mm_mask_fpclass_ss_mask(U, X, C)  \
   ((__mmask8) __builtin_ia32_fpcla_mask ((__v4sf) (__m128) (X),\
 (int) (C), (__mmask8) (U)))
 
-#define _mm_mask_fpclass_sd_mask(X, C, U)  \
+#define _mm_mask_fpclass_sd_mask(U, X, C)  \
   ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X),   \
 (int) (C), (__mmask8) (U)))
 
@@ -2839,8 +2839,9 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm)
 (__mmask8)(U)))
 
 #define _mm_reduce_round_sd(A, B, C, R)   \
-  ((__m128d) __builtin_ia32_reducesd_round ((__v2df)(__m128d)(A),  \
-(__v2df)(__m128d)(B), (int)(C), (__mmask8)(U), (int)(R)))
+  ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \
+(__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_avx512_setzero_pd (), \
+(__mmask8)(-1), (int)(R)))
 
 #define _mm_mask_reduce_round_sd(W, U, A, B, C, R)\
   ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \
@@ -2867,8 +2868,9 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm)
 (__mmask8)(U)))
 
 #define _mm_reduce_round_ss(A, B, C, R)   \
-  ((__m128) __builtin_ia32_reducess_round ((__v4sf)(__m128)(A),   \
-(__v4sf)(__m128)(B), (int)(C), (__mmask8)(U), (int)(R)))
+  ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A),   \
+(__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512_setzero_ps (),  \
+(__mmask8)(-1), (int)(R)))
 
 #define _mm_mask_reduce_round_ss(W, U, A, B, C, R)\
   ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A),   \
@@ -2876,7 +2878,7 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm)
 (__mmask8)(U), (int)(R)))
 
 #define _mm_maskz_reduce_round_ss(U, A, B, C, R)  \
-  ((__m128) __builtin_ia32_reducesd_mask_round ((__v4sf)(__m128)(A),   \
+  ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A),   \
 (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (),\
 (__mmask8)(U), (int)(R)))
 
diff --git a/gcc/config/i386/avx512vlbwintrin.h 
b/gcc/config/i386/avx512vlbwintrin.h

[gcc r13-8950] i386: Use _mm_setzero_ps/d instead of _mm_avx512_setzero_ps/d for GCC13/12

2024-07-28 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:b2ab34b2bb292948bfe103f56b13e9911d143d74

commit r13-8950-gb2ab34b2bb292948bfe103f56b13e9911d143d74
Author: Haochen Jiang 
Date:   Mon Jul 29 14:10:49 2024 +0800

i386: Use _mm_setzero_ps/d instead of _mm_avx512_setzero_ps/d for GCC13/12

GCC 13 and GCC 12 do not have _mm_avx512_setzero_ps/d, since these were
only introduced in GCC 14.

gcc/ChangeLog:

* config/i386/avx512dqintrin.h (_mm_reduce_round_sd): Use
_mm_setzero_pd instead of _mm_avx512_setzero_pd.
(_mm_reduce_round_ss): Use _mm_setzero_ps instead of
_mm_avx512_setzero_ps.

Diff:
---
 gcc/config/i386/avx512dqintrin.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h
index 4383ff146702..82f4acc7d2eb 100644
--- a/gcc/config/i386/avx512dqintrin.h
+++ b/gcc/config/i386/avx512dqintrin.h
@@ -2840,7 +2840,7 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm)
 
 #define _mm_reduce_round_sd(A, B, C, R)   \
   ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \
-(__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_avx512_setzero_pd (), \
+(__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (),   \
 (__mmask8)(-1), (int)(R)))
 
 #define _mm_mask_reduce_round_sd(W, U, A, B, C, R)\
@@ -2869,7 +2869,7 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm)
 
 #define _mm_reduce_round_ss(A, B, C, R)   \
   ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A),   \
-(__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512_setzero_ps (),  \
+(__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (),\
 (__mmask8)(-1), (int)(R)))
 
 #define _mm_mask_reduce_round_ss(W, U, A, B, C, R)\


[gcc r12-10649] i386: Use _mm_setzero_ps/d instead of _mm_avx512_setzero_ps/d for GCC13/12

2024-07-28 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:77ad22e4eaa97bb10068c6170f53caca77c99392

commit r12-10649-g77ad22e4eaa97bb10068c6170f53caca77c99392
Author: Haochen Jiang 
Date:   Mon Jul 29 14:10:49 2024 +0800

i386: Use _mm_setzero_ps/d instead of _mm_avx512_setzero_ps/d for GCC13/12

GCC 13 and GCC 12 do not have _mm_avx512_setzero_ps/d, since these were
only introduced in GCC 14.

gcc/ChangeLog:

* config/i386/avx512dqintrin.h (_mm_reduce_round_sd): Use
_mm_setzero_pd instead of _mm_avx512_setzero_pd.
(_mm_reduce_round_ss): Use _mm_setzero_ps instead of
_mm_avx512_setzero_ps.

Diff:
---
 gcc/config/i386/avx512dqintrin.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h
index 4f9451e949b5..e8f8efe3be85 100644
--- a/gcc/config/i386/avx512dqintrin.h
+++ b/gcc/config/i386/avx512dqintrin.h
@@ -2840,7 +2840,7 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm)
 
 #define _mm_reduce_round_sd(A, B, C, R)   \
   ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \
-(__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_avx512_setzero_pd (), \
+(__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (),   \
 (__mmask8)(-1), (int)(R)))
 
 #define _mm_mask_reduce_round_sd(W, U, A, B, C, R)\
@@ -2869,7 +2869,7 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm)
 
 #define _mm_reduce_round_ss(A, B, C, R)   \
   ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A),   \
-(__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512_setzero_ps (),  \
+(__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (),\
 (__mmask8)(-1), (int)(R)))
 
 #define _mm_mask_reduce_round_ss(W, U, A, B, C, R)\


[gcc r14-10514] i386: Fix AVX512 intrin macro typo

2024-07-29 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:affb2e8f87e3982ee82b72dc3c44486daefd22e3

commit r14-10514-gaffb2e8f87e3982ee82b72dc3c44486daefd22e3
Author: Haochen Jiang 
Date:   Thu Jul 25 16:12:20 2024 +0800

i386: Fix AVX512 intrin macro typo

There are several typos in the AVX512 intrinsic macro definitions. Correct
them to fix errors when compiling with -O0.

gcc/ChangeLog:

* config/i386/avx512dqintrin.h
(_mm_mask_fpclass_ss_mask): Correct operand order.
(_mm_mask_fpclass_sd_mask): Ditto.
(_mm_maskz_reduce_round_ss): Use __builtin_ia32_reducess_mask_round
instead of __builtin_ia32_reducesd_mask_round.
(_mm_reduce_round_sd): Use -1 as mask since it is non-mask.
(_mm_reduce_round_ss): Ditto.
* config/i386/avx512vlbwintrin.h
(_mm256_mask_alignr_epi8): Correct operand usage.
(_mm_mask_alignr_epi8): Ditto.
* config/i386/avx512vlintrin.h (_mm_mask_alignr_epi64): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bw-vpalignr-1b.c: New test.
* gcc.target/i386/avx512dq-vfpclasssd-1b.c: Ditto.
* gcc.target/i386/avx512dq-vfpclassss-1b.c: Ditto.
* gcc.target/i386/avx512dq-vreducesd-1b.c: Ditto.
* gcc.target/i386/avx512dq-vreducess-1b.c: Ditto.
* gcc.target/i386/avx512vl-valignq-1b.c: Ditto.

Diff:
---
 gcc/config/i386/avx512dqintrin.h   | 16 +---
 gcc/config/i386/avx512vlbwintrin.h |  4 ++--
 gcc/config/i386/avx512vlintrin.h   |  2 +-
 gcc/testsuite/gcc.target/i386/avx512bw-vpalignr-1b.c   | 18 ++
 gcc/testsuite/gcc.target/i386/avx512dq-vfpclasssd-1b.c | 14 ++
 gcc/testsuite/gcc.target/i386/avx512dq-vfpcla-1b.c | 14 ++
 gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-1b.c  | 16 
 gcc/testsuite/gcc.target/i386/avx512dq-vreducess-1b.c  | 16 
 gcc/testsuite/gcc.target/i386/avx512vl-valignq-1b.c| 15 +++
 9 files changed, 105 insertions(+), 10 deletions(-)

diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h
index 3beed7e649a9..d9890c6da1dc 100644
--- a/gcc/config/i386/avx512dqintrin.h
+++ b/gcc/config/i386/avx512dqintrin.h
@@ -572,11 +572,11 @@ _mm_mask_fpclass_sd_mask (__mmask8 __U, __m128d __A, 
const int __imm)
   ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X),   \
 (int) (C), (__mmask8) (-1))) \
 
-#define _mm_mask_fpclass_ss_mask(X, C, U)  \
+#define _mm_mask_fpclass_ss_mask(U, X, C)  \
   ((__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) (__m128) (X),\
 (int) (C), (__mmask8) (U)))
 
-#define _mm_mask_fpclass_sd_mask(X, C, U)  \
+#define _mm_mask_fpclass_sd_mask(U, X, C)  \
   ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X),   \
 (int) (C), (__mmask8) (U)))
 #define _mm_reduce_sd(A, B, C) \
@@ -594,8 +594,9 @@ _mm_mask_fpclass_sd_mask (__mmask8 __U, __m128d __A, const 
int __imm)
 (__mmask8)(U)))
 
 #define _mm_reduce_round_sd(A, B, C, R)   \
-  ((__m128d) __builtin_ia32_reducesd_round ((__v2df)(__m128d)(A),  \
-(__v2df)(__m128d)(B), (int)(C), (__mmask8)(U), (int)(R)))
+  ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \
+(__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_avx512_setzero_pd (), \
+(__mmask8)(-1), (int)(R)))
 
 #define _mm_mask_reduce_round_sd(W, U, A, B, C, R)\
   ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \
@@ -622,8 +623,9 @@ _mm_mask_fpclass_sd_mask (__mmask8 __U, __m128d __A, const 
int __imm)
 (__mmask8)(U)))
 
 #define _mm_reduce_round_ss(A, B, C, R)   \
-  ((__m128) __builtin_ia32_reducess_round ((__v4sf)(__m128)(A),   \
-(__v4sf)(__m128)(B), (int)(C), (__mmask8)(U), (int)(R)))
+  ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A),   \
+(__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512_setzero_ps (),  \
+(__mmask8)(-1), (int)(R)))
 
 #define _mm_mask_reduce_round_ss(W, U, A, B, C, R)\
   ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A),   \
@@ -631,7 +633,7 @@ _mm_mask_fpclass_sd_mask (__mmask8 __U, __m128d __A, const 
int __imm)
 (__mmask8)(U), (int)(R)))
 
 #define _mm_maskz_reduce_round_ss(U, A, B, C, R)  \
-  ((__m128) __builtin_ia32_reducesd_mask_round ((__v4sf)(__m128)(A),   \
+  ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A),   \
 (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512

[gcc r15-2394] i386: Add non-optimize prefetchi intrins

2024-07-29 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:b4524c4430ba9771265bd9fc31e69a3f35dfe117

commit r15-2394-gb4524c4430ba9771265bd9fc31e69a3f35dfe117
Author: Haochen Jiang 
Date:   Thu Jul 25 16:16:05 2024 +0800

i386: Add non-optimize prefetchi intrins

Under -O0, with the "newly" introduced intrinsics, the variable will be
transformed into a mem instead of the original symbol_ref. The compiler will
then treat the operand as invalid and turn the operation into a nop, which
is not expected. Use macros when not optimizing to keep the variable as a
symbol_ref, just as the prefetch intrinsic does.

gcc/ChangeLog:

* config/i386/prfchiintrin.h
(_m_prefetchit0): Add macro for non-optimized option.
(_m_prefetchit1): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/prefetchi-1b.c: New test.

Diff:
---
 gcc/config/i386/prfchiintrin.h   |  9 +
 gcc/testsuite/gcc.target/i386/prefetchi-1b.c | 26 ++
 2 files changed, 35 insertions(+)

diff --git a/gcc/config/i386/prfchiintrin.h b/gcc/config/i386/prfchiintrin.h
index dfca89c7d169..d6580e504c04 100644
--- a/gcc/config/i386/prfchiintrin.h
+++ b/gcc/config/i386/prfchiintrin.h
@@ -37,6 +37,7 @@
 #define __DISABLE_PREFETCHI__
 #endif /* __PREFETCHI__ */
 
+#ifdef __OPTIMIZE__
 extern __inline void
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _m_prefetchit0 (void* __P)
@@ -50,6 +51,14 @@ _m_prefetchit1 (void* __P)
 {
   __builtin_ia32_prefetchi (__P, 2);
 }
+#else
+#define _m_prefetchit0(P)  \
+  __builtin_ia32_prefetchi(P, 3);
+
+#define _m_prefetchit1(P)  \
+  __builtin_ia32_prefetchi(P, 2);
+
+#endif
 
 #ifdef __DISABLE_PREFETCHI__
 #undef __DISABLE_PREFETCHI__
diff --git a/gcc/testsuite/gcc.target/i386/prefetchi-1b.c 
b/gcc/testsuite/gcc.target/i386/prefetchi-1b.c
new file mode 100644
index 000000000000..93139554d3cd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/prefetchi-1b.c
@@ -0,0 +1,26 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mprefetchi -O0" } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit0\[ 
\\t\]+bar\\(%rip\\)" 1 } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit1\[ 
\\t\]+bar\\(%rip\\)" 1 } } */
+
+#include <x86intrin.h>
+
+int
+bar (int a)
+{
+  return a + 1;
+}
+
+int
+foo1 (int b)
+{
+  _m_prefetchit0 (bar);
+  return bar (b) + 1;
+}
+
+int
+foo2 (int b)
+{
+  _m_prefetchit1 (bar);
+  return bar (b) + 1;
+}


[gcc r13-8952] i386: Add non-optimize prefetchi intrins

2024-07-29 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:d80abba35edda7b508e29b723daebc0e475ddd87

commit r13-8952-gd80abba35edda7b508e29b723daebc0e475ddd87
Author: Haochen Jiang 
Date:   Thu Jul 25 16:16:05 2024 +0800

i386: Add non-optimize prefetchi intrins

Under -O0, with the "newly" introduced intrinsics, the variable will be
transformed into a mem instead of the original symbol_ref. The compiler will
then treat the operand as invalid and turn the operation into a nop, which
is not expected. Use macros when not optimizing to keep the variable as a
symbol_ref, just as the prefetch intrinsic does.

gcc/ChangeLog:

* config/i386/prfchiintrin.h
(_m_prefetchit0): Add macro for non-optimized option.
(_m_prefetchit1): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/prefetchi-1b.c: New test.

Diff:
---
 gcc/config/i386/prfchiintrin.h   |  9 +
 gcc/testsuite/gcc.target/i386/prefetchi-1b.c | 26 ++
 2 files changed, 35 insertions(+)

diff --git a/gcc/config/i386/prfchiintrin.h b/gcc/config/i386/prfchiintrin.h
index 382fc0795518..84cf27fe49c4 100644
--- a/gcc/config/i386/prfchiintrin.h
+++ b/gcc/config/i386/prfchiintrin.h
@@ -37,6 +37,7 @@
 #define __DISABLE_PREFETCHI__
 #endif /* __PREFETCHI__ */
 
+#ifdef __OPTIMIZE__
 extern __inline void
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _m_prefetchit0 (void* __P)
@@ -50,6 +51,14 @@ _m_prefetchit1 (void* __P)
 {
   __builtin_ia32_prefetchi (__P, 2);
 }
+#else
+#define _m_prefetchit0(P)  \
+  __builtin_ia32_prefetchi(P, 3);
+
+#define _m_prefetchit1(P)  \
+  __builtin_ia32_prefetchi(P, 2);
+
+#endif
 
 #ifdef __DISABLE_PREFETCHI__
 #undef __DISABLE_PREFETCHI__
diff --git a/gcc/testsuite/gcc.target/i386/prefetchi-1b.c 
b/gcc/testsuite/gcc.target/i386/prefetchi-1b.c
new file mode 100644
index 000000000000..93139554d3cd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/prefetchi-1b.c
@@ -0,0 +1,26 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mprefetchi -O0" } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit0\[ 
\\t\]+bar\\(%rip\\)" 1 } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit1\[ 
\\t\]+bar\\(%rip\\)" 1 } } */
+
+#include <x86intrin.h>
+
+int
+bar (int a)
+{
+  return a + 1;
+}
+
+int
+foo1 (int b)
+{
+  _m_prefetchit0 (bar);
+  return bar (b) + 1;
+}
+
+int
+foo2 (int b)
+{
+  _m_prefetchit1 (bar);
+  return bar (b) + 1;
+}


[gcc r14-10550] i386: Add non-optimize prefetchi intrins

2024-08-01 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:30f4fa3f53e4c1476b4cb771f8d006c03804788a

commit r14-10550-g30f4fa3f53e4c1476b4cb771f8d006c03804788a
Author: Haochen Jiang 
Date:   Thu Jul 25 16:16:05 2024 +0800

i386: Add non-optimize prefetchi intrins

Under -O0, with the "newly" introduced intrinsics, the variable will be
transformed into a mem instead of the original symbol_ref. The compiler will
then treat the operand as invalid and turn the operation into a nop, which
is not expected. Use macros when not optimizing to keep the variable as a
symbol_ref, just as the prefetch intrinsic does.

gcc/ChangeLog:

* config/i386/prfchiintrin.h
(_m_prefetchit0): Add macro for non-optimized option.
(_m_prefetchit1): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/prefetchi-1b.c: New test.

Diff:
---
 gcc/config/i386/prfchiintrin.h   |  9 +
 gcc/testsuite/gcc.target/i386/prefetchi-1b.c | 26 ++
 2 files changed, 35 insertions(+)

diff --git a/gcc/config/i386/prfchiintrin.h b/gcc/config/i386/prfchiintrin.h
index dfca89c7d169..d6580e504c04 100644
--- a/gcc/config/i386/prfchiintrin.h
+++ b/gcc/config/i386/prfchiintrin.h
@@ -37,6 +37,7 @@
 #define __DISABLE_PREFETCHI__
 #endif /* __PREFETCHI__ */
 
+#ifdef __OPTIMIZE__
 extern __inline void
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _m_prefetchit0 (void* __P)
@@ -50,6 +51,14 @@ _m_prefetchit1 (void* __P)
 {
   __builtin_ia32_prefetchi (__P, 2);
 }
+#else
+#define _m_prefetchit0(P)  \
+  __builtin_ia32_prefetchi(P, 3);
+
+#define _m_prefetchit1(P)  \
+  __builtin_ia32_prefetchi(P, 2);
+
+#endif
 
 #ifdef __DISABLE_PREFETCHI__
 #undef __DISABLE_PREFETCHI__
diff --git a/gcc/testsuite/gcc.target/i386/prefetchi-1b.c 
b/gcc/testsuite/gcc.target/i386/prefetchi-1b.c
new file mode 100644
index 000000000000..93139554d3cd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/prefetchi-1b.c
@@ -0,0 +1,26 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mprefetchi -O0" } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit0\[ 
\\t\]+bar\\(%rip\\)" 1 } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit1\[ 
\\t\]+bar\\(%rip\\)" 1 } } */
+
+#include <x86intrin.h>
+
+int
+bar (int a)
+{
+  return a + 1;
+}
+
+int
+foo1 (int b)
+{
+  _m_prefetchit0 (bar);
+  return bar (b) + 1;
+}
+
+int
+foo2 (int b)
+{
+  _m_prefetchit1 (bar);
+  return bar (b) + 1;
+}


[gcc r15-2881] Initial support for AVX10.2

2024-08-12 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:4bcb480d103b36c389daaf711f0f25d74379adb6

commit r15-2881-g4bcb480d103b36c389daaf711f0f25d74379adb6
Author: Haochen Jiang 
Date:   Mon Aug 12 15:30:07 2024 +0800

Initial support for AVX10.2

gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_available_features): Handle
avx10.2.
* common/config/i386/i386-common.cc
(OPTION_MASK_ISA2_AVX10_2_256_SET): New.
(OPTION_MASK_ISA2_AVX10_2_512_SET): Ditto.
(OPTION_MASK_ISA2_AVX10_1_256_UNSET):
Add OPTION_MASK_ISA2_AVX10_2_256_UNSET.
(OPTION_MASK_ISA2_AVX10_1_512_UNSET):
Add OPTION_MASK_ISA2_AVX10_2_512_UNSET.
(OPTION_MASK_ISA2_AVX10_2_256_UNSET): New.
(OPTION_MASK_ISA2_AVX10_2_512_UNSET): Ditto.
(ix86_handle_option): Handle avx10.2-256 and avx10.2-512.
* common/config/i386/i386-cpuinfo.h (enum processor_features):
Add FEATURE_AVX10_2_256 and FEATURE_AVX10_2_512.
* common/config/i386/i386-isas.h: Add ISA_NAMES_TABLE_ENTRY for
avx10.2-256 and avx10.2-512.
* config/i386/i386-c.cc (ix86_target_macros_internal): Define
__AVX10_2_256__ and __AVX10_2_512__.
* config/i386/i386-isa.def (AVX10_2): Add DEF_PTA(AVX10_2_256)
and DEF_PTA(AVX10_2_512).
* config/i386/i386-options.cc (isa2_opts): Add -mavx10.2-256 and
-mavx10.2-512.
(ix86_valid_target_attribute_inner_p): Handle avx10.2-256 and
avx10.2-512.
* config/i386/i386.opt: Add option -mavx10.2, -mavx10.2-256 and
-mavx10.2-512.
* config/i386/i386.opt.urls: Regenerated.
* doc/extend.texi: Document avx10.2, avx10.2-256 and avx10.2-512.
* doc/invoke.texi: Document -mavx10.2, -mavx10.2-256 and
-mavx10.2-512.
* doc/sourcebuild.texi: Document target avx10.2, avx10.2-256,
avx10.2-512.

gcc/testsuite/ChangeLog:

* g++.dg/other/i386-2.C: Ditto.
* g++.dg/other/i386-3.C: Ditto.
* gcc.target/i386/sse-12.c: Ditto.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/sse-23.c: Ditto.

Diff:
---
 gcc/common/config/i386/cpuinfo.h   |  6 +
 gcc/common/config/i386/i386-common.cc  | 43 --
 gcc/common/config/i386/i386-cpuinfo.h  |  2 ++
 gcc/common/config/i386/i386-isas.h |  3 +++
 gcc/config/i386/i386-c.cc  |  4 
 gcc/config/i386/i386-isa.def   |  2 ++
 gcc/config/i386/i386-options.cc|  7 +-
 gcc/config/i386/i386.opt   | 15 
 gcc/config/i386/i386.opt.urls  |  9 +++
 gcc/doc/extend.texi| 15 
 gcc/doc/invoke.texi| 17 +++---
 gcc/doc/sourcebuild.texi   |  9 +++
 gcc/testsuite/g++.dg/other/i386-2.C|  9 ---
 gcc/testsuite/g++.dg/other/i386-3.C|  9 ---
 gcc/testsuite/gcc.target/i386/sse-12.c |  2 +-
 gcc/testsuite/gcc.target/i386/sse-13.c |  2 +-
 gcc/testsuite/gcc.target/i386/sse-14.c |  2 +-
 gcc/testsuite/gcc.target/i386/sse-22.c |  4 ++--
 gcc/testsuite/gcc.target/i386/sse-23.c |  2 +-
 19 files changed, 140 insertions(+), 22 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 2ae77d335d24..2ae383eb6ab5 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -1006,6 +1006,9 @@ get_available_features (struct __processor_model 
*cpu_model,
   if (ebx & bit_AVX10_256)
switch (version)
  {
+ case 2:
+   set_feature (FEATURE_AVX10_2_256);
+   /* Fall through.  */
  case 1:
set_feature (FEATURE_AVX10_1_256);
break;
@@ -1016,6 +1019,9 @@ get_available_features (struct __processor_model 
*cpu_model,
   if (ebx & bit_AVX10_512)
switch (version)
  {
+ case 2:
+   set_feature (FEATURE_AVX10_2_512);
+   /* Fall through.  */
  case 1:
set_feature (FEATURE_AVX10_1_512);
break;
diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index e38b1b22ffb1..fb744319b05e 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -122,6 +122,11 @@ along with GCC; see the file COPYING3.  If not see
 #define OPTION_MASK_ISA2_AVX10_1_256_SET OPTION_MASK_ISA2_AVX10_1_256
 #define OPTION_MASK_ISA2_AVX10_1_512_SET \
   (OPTION_MASK_ISA2_AVX10_1_256_SET | OPTION_MASK_ISA2_AVX10_1_512)
+#define OPTION_MASK_ISA2_AVX10_2_256_SET \
+  (OPTION_MASK_ISA2_AVX10_1_256_SET | OPTION_MASK_ISA2_AVX10_2_256)
+#define OPTION_MASK_ISA2_AVX10_2_512_SET \
+  (OPTION_MASK_ISA2_AVX10_1_512_SET | OPTION_MASK_ISA

[gcc] Created branch 'ix86/heads/avx10.2' in namespace 'refs/vendors'

2024-08-14 Thread Haochen Jiang via Gcc-cvs
The branch 'ix86/heads/avx10.2' was created in namespace 'refs/vendors' 
pointing to:

 4d2e8fcdaf32... Daily bump.


[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vadd{s, d, h} and vcmp{s, d, h} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:7eed0d3f707f4bade3edb3268d8d5b2c9b8ef8d8

commit 7eed0d3f707f4bade3edb3268d8d5b2c9b8ef8d8
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:15 2024 +0800

AVX10.2 ymm rounding: Support vadd{s,d,h} and vcmp{s,d,h} intrins

gcc/ChangeLog:

* config.gcc: Add avx10_2roundingintrin.h.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V4DF_FTYPE_V4DF_V4DF_V4DF_UQI_INT, 
V8SF_FTYPE_V8SF_V8SF_V8SF_UQI_INT,
V16HF_FTYPE_V16HF_V16HF_V16HF_UHI_INT, 
UQI_FTYPE_V4DF_V4DF_INT_UQI_INT,
UHI_FTYPE_V16HF_V16HF_INT_UHI_INT, UQI_FTYPE_V8SF_V8SF_INT_UQI_INT.
* config/i386/immintrin.h: Include avx10_2roundingintrin.h.
* config/i386/sse.md: Change subst_attr name due to renaming.
* config/i386/subst.md:
(<round_mode512bit_condition>): Add condition check for avx10.2
rounding control 256bit intrins and renamed to ...
(<round_mode_condition>): ...this.
(round_saeonly_mode512bit_condition): Add condition check for
avx10.2 rounding control 256bit intrins and renamed to ...
(round_saeonly_mode_condition): ...this.
* config/i386/avx10_2roundingintrin.h: New file.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add -mavx10.2 and new builtin test.
* gcc.target/i386/avx-2.c: Ditto.
* gcc.target/i386/sse-13.c: Add new tests.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/avx10_2-rounding-1.c: New test.

Diff:
---
 gcc/config.gcc |   2 +-
 gcc/config/i386/avx10_2roundingintrin.h| 337 +
 gcc/config/i386/i386-builtin-types.def |   8 +
 gcc/config/i386/i386-builtin.def   |   8 +
 gcc/config/i386/i386-expand.cc |   6 +
 gcc/config/i386/immintrin.h|   2 +
 gcc/config/i386/sse.md | 100 +++---
 gcc/config/i386/subst.md   |  32 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |  10 +-
 gcc/testsuite/gcc.target/i386/avx-2.c  |   2 +-
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-1.c |  64 
 gcc/testsuite/gcc.target/i386/sse-13.c |   8 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  17 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  17 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   8 +
 15 files changed, 558 insertions(+), 63 deletions(-)

diff --git a/gcc/config.gcc b/gcc/config.gcc
index a36dd1bcbc66..2c0f4518638d 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -452,7 +452,7 @@ i[34567]86-*-* | x86_64-*-*)
   cmpccxaddintrin.h amxfp16intrin.h prfchiintrin.h
   raointintrin.h amxcomplexintrin.h avxvnniint16intrin.h
   sm3intrin.h sha512intrin.h sm4intrin.h
-  usermsrintrin.h"
+  usermsrintrin.h avx10_2roundingintrin.h"
;;
 ia64-*-*)
extra_headers=ia64intrin.h
diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
new file mode 100644
index 000000000000..5698ed05c1d6
--- /dev/null
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -0,0 +1,337 @@
+/* Copyright (C) 2024 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx10_2roundingintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX10_2ROUNDINGINTRIN_H_INCLUDED
+#define _AVX10_2ROUNDINGINTRIN_H_INCLUDED
+
+#ifndef __AVX10_2_256__
+#pragma GCC push_options
+#pragma GCC target("avx10.2-256")
+#define __DISABLE_AVX10_2_256__
+#endif /* __AVX10_2_256__ */
+
+#ifdef  __OPTIMIZE__
+extern __inline __m256d
+__attribute__ ((__gnu_inline__,

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vcvtpd2{, u}{dq, qq} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:3c755fad847af4d38d06ed7c22fe6bfdf227b718

commit 3c755fad847af4d38d06ed7c22fe6bfdf227b718
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:17 2024 +0800

AVX10.2 ymm rounding: Support vcvtpd2{,u}{dq,qq} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: Add new intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V4DI_FTYPE_V4DF_V4DI_UQI_INT, V4SI_FTYPE_V4DF_V4SI_UQI_INT.
* config/i386/sse.md:
(avx_cvtpd2dq256): Change name to
avx_cvtpd2dq256 and extend pattern to
generate 256bit insns.
(fixuns_notrunc2):
Add round_mode_condition.
* config/i386/subst.md (round_pd2udqsuff): New iterator.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/sse-14.c: Add new macro test.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/avx10_2-rounding-1.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 218 +
 gcc/config/i386/i386-builtin-types.def |   2 +
 gcc/config/i386/i386-builtin.def   |   4 +
 gcc/config/i386/i386-expand.cc |   2 +
 gcc/config/i386/sse.md |  13 +-
 gcc/config/i386/subst.md   |   1 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   4 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-1.c |  33 
 gcc/testsuite/gcc.target/i386/sse-13.c |   4 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   4 +
 12 files changed, 303 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 09285c1ffcdd..3e5e9f3ba0ec 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -348,6 +348,144 @@ _mm256_maskz_cvt_roundpd_ps (__mmask8 __U, __m256d __A, 
const int __R)
 (__mmask8) __U,
 __R);
 }
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundpd_epi32 (__m256d __A, const int __R)
+{
+  return
+(__m128i) __builtin_ia32_cvtpd2dq256_mask_round ((__v4df) __A,
+(__v4si)
+_mm_undefined_si128 (),
+(__mmask8) -1,
+__R);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A,
+  const int __R)
+{
+  return (__m128i) __builtin_ia32_cvtpd2dq256_mask_round ((__v4df) __A,
+ (__v4si) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundpd_epi32 (__mmask8 __U, __m256d __A, const int __R)
+{
+  return (__m128i) __builtin_ia32_cvtpd2dq256_mask_round ((__v4df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundpd_epi64 (__m256d __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_cvtpd2qq256_mask_round ((__v4df) __A,
+(__v4di)
+_mm256_setzero_si256 (),
+(__mmask8) -1,
+__R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A,
+  const int __R)
+{
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask_round ((__v4df) __A,
+ (__v4di) __W,
+ 

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vcvtdq2p{s, h} and vcvtpd2p{s, h} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:33181163a09cb315cc8d19464cb1feca063c959c

commit 33181163a09cb315cc8d19464cb1feca063c959c
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:16 2024 +0800

AVX10.2 ymm rounding: Support vcvtdq2p{s,h} and vcvtpd2p{s,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: Add new intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V8SF_FTYPE_V8SI_V8SF_UQI_INT, V4SF_FTYPE_V4DF_V4SF_UQI_INT,
V8HF_FTYPE_V8SI_V8HF_UQI_INT, V8HF_FTYPE_V4DF_V8HF_UQI_INT.
* config/i386/sse.md:

(avx512fp16_vcvt2ph_):
Add condition check.
(avx512fp16_vcvtpd2ph_v4df_mask_round): New expand.
(*avx512fp16_vcvt2ph__mask): Change name to
avx512fp16_vcvt2ph__mask_1
and extend pattern to generate 256bit insns.
(avx_cvtpd2ps256): Change name to
avx_cvtpd2ps256 and extend pattern to
generate 256bit insns.
* config/i386/subst.md (round_applied): New condition.
(round_suff): New iterator.
(round_mode_condition): Add V32HI check for 512bit.
(round_saeonly_mode_condition): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/sse-14.c: Add new macro test.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/avx10_2-rounding-1.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 210 +
 gcc/config/i386/i386-builtin-types.def |   4 +
 gcc/config/i386/i386-builtin.def   |   4 +
 gcc/config/i386/i386-expand.cc |   4 +
 gcc/config/i386/sse.md |  32 +++-
 gcc/config/i386/subst.md   |   4 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   4 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-1.c |  44 -
 gcc/testsuite/gcc.target/i386/sse-13.c |   4 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   4 +
 12 files changed, 322 insertions(+), 16 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 5698ed05c1d6..09285c1ffcdd 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -216,6 +216,138 @@ _mm256_mask_cmp_round_ps_mask (__mmask8 __U, __m256 __A, 
__m256 __B,
(__mmask8) __U,
__R);
 }
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundepi32_ph (__m256i __A, const int __R)
+{
+  return (__m128h) __builtin_ia32_vcvtdq2ph256_mask_round ((__v8si) __A,
+  (__v8hf)
+  _mm_setzero_ph (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundepi32_ph (__m128h __W, __mmask8 __U, __m256i __A,
+  const int __R)
+{
+  return (__m128h) __builtin_ia32_vcvtdq2ph256_mask_round ((__v8si) __A,
+  (__v8hf) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundepi32_ph (__mmask8 __U, __m256i __A, const int __R)
+{
+  return (__m128h) __builtin_ia32_vcvtdq2ph256_mask_round ((__v8si) __A,
+  (__v8hf)
+  _mm_setzero_ph (),
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundepi32_ps (__m256i __A, const int __R)
+{
+  return (__m256) __builtin_ia32_cvtdq2ps256_mask_round ((__v8si) __A,
+(__v8sf)
+_mm256_undefined_ps (),
+  

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vcvtph2p{s, d, sx} and vcvtph2{, u}{dq, qq} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:6fc313aad25ad678e1d32f290edadc5ac2481c7d

commit 6fc313aad25ad678e1d32f290edadc5ac2481c7d
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:18 2024 +0800

AVX10.2 ymm rounding: Support vcvtph2p{s,d,sx} and vcvtph2{,u}{dq,qq} 
intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V8SF_FTYPE_V8HF_V8SF_UQI_INT, V8SI_FTYPE_V8HF_V8SI_UQI_INT,
V4DF_FTYPE_V8HF_V4DF_UQI_INT, V4DI_FTYPE_V8HF_V4DI_UQI_INT.
* config/i386/sse.md:
(avx512fp16_float_extend_ph2):
Add condition check.
(avx512fp16_vcvtph2_
 ):
Ditto.
(avx512fp16_float_extend_ph2): Extend round 
saeonly.
(vcvtph2ps256): Ditto.
* config/i386/subst.md
(round_saeonly_applied): New condition.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-1.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 384 +
 gcc/config/i386/i386-builtin-types.def |   4 +
 gcc/config/i386/i386-builtin.def   |   7 +
 gcc/config/i386/i386-expand.cc |   4 +
 gcc/config/i386/sse.md |  19 +-
 gcc/config/i386/subst.md   |   1 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   7 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-1.c |  57 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   7 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  20 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  21 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   7 +
 12 files changed, 529 insertions(+), 9 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 3e5e9f3ba0ec..29966f5e1bf8 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -486,6 +486,246 @@ _mm256_maskz_cvt_roundpd_epu64 (__mmask8 __U, __m256d 
__A, const int __R)
  (__mmask8) __U,
  __R);
 }
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundph_epi32 (__m128h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtph2dq256_mask_round ((__v8hf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundph_epi32 (__m256i __W, __mmask8 __U, __m128h __A,
+  const int __R)
+{
+  return (__m256i) __builtin_ia32_vcvtph2dq256_mask_round ((__v8hf) __A,
+  (__v8si) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundph_epi32 (__mmask8 __U, __m128h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtph2dq256_mask_round ((__v8hf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundph_pd (__m128h __A, const int __R)
+{
+  return (__m256d) __builtin_ia32_vcvtph2pd256_mask_round ((__v8hf) __A,
+  (__v4df)
+  _mm256_setzero_pd (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundph_pd (__m256d __W, __mmask8 __U, __m128h __A,
+   const int __R)
+{
+  return (__m256d)

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vcvtph2{, u}w and vcvtps2p{d, hx} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:a0e52c6dc21d8d2713e63e70e8bdbdc5dc536185

commit a0e52c6dc21d8d2713e63e70e8bdbdc5dc536185
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:18 2024 +0800

AVX10.2 ymm rounding: Support vcvtph2{,u}w and vcvtps2p{d,hx} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V16HI_FTYPE_V16HF_V16HI_UHI_INT, V4DF_FTYPE_V4SF_V4DF_UQI_INT,
V8HF_FTYPE_V8SF_V8HF_UQI_INT.
* config/i386/sse.md
(avx512fp16_vcvt2ph_):
Add round condition check.
* config/i386/subst.md (round_mode_condition): Add V16HI check for
256bit.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-1.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 220 +
 gcc/config/i386/i386-builtin-types.def |   3 +
 gcc/config/i386/i386-builtin.def   |   4 +
 gcc/config/i386/i386-expand.cc |   3 +
 gcc/config/i386/sse.md |   2 +-
 gcc/config/i386/subst.md   |   1 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   4 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-1.c |  36 
 gcc/testsuite/gcc.target/i386/sse-13.c |   4 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   4 +
 12 files changed, 304 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 29966f5e1bf8..bc3f92a7d1ae 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -726,6 +726,143 @@ _mm256_maskz_cvt_roundph_epu64 (__mmask8 __U, __m128h 
__A, const int __R)
   (__mmask8) __U,
   __R);
 }
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundph_epu16 (__m256h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtph2uw256_mask_round ((__v16hf) __A,
+ (__v16hi)
+ _mm256_undefined_si256 (),
+ (__mmask16) -1,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundph_epu16 (__m256i __W, __mmask16 __U, __m256h __A,
+  const int __R)
+{
+  return (__m256i) __builtin_ia32_vcvtph2uw256_mask_round ((__v16hf) __A,
+  (__v16hi) __W,
+  (__mmask16) __U,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundph_epu16 (__mmask16 __U, __m256h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtph2uw256_mask_round ((__v16hf) __A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundph_epi16 (__m256h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtph2w256_mask_round ((__v16hf) __A,
+(__v16hi)
+_mm256_undefined_si256 (),
+(__mmask16) -1,
+__R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundph_epi16 (__m256i __W, __mmask16 __U, __m256h __A,
+  const int __R)
+{
+  return (__m256i) __builtin_ia32_vcvtph2w256_mask_round ((__v16hf) __A,
+ (__v16hi) __W,
+   

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vcvtps2{, u}{dq, qq} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:4d8dc583f64af45f4f0fd4c0cce5c40283b9f1cc

commit 4d8dc583f64af45f4f0fd4c0cce5c40283b9f1cc
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:19 2024 +0800

AVX10.2 ymm rounding: Support vcvtps2{,u}{dq,qq} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V8SI_FTYPE_V8SF_V8SI_UQI_INT, V4DI_FTYPE_V4SF_V4DI_UQI_INT.
* config/i386/sse.md
(_fix_notrunc):
Extend to round.

(_fixuns_notrunc):
Add round condition check.
* config/i386/subst.md (round_constraint4): New.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-1.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 226 +
 gcc/config/i386/i386-builtin-types.def |   2 +
 gcc/config/i386/i386-builtin.def   |   4 +
 gcc/config/i386/i386-expand.cc |   2 +
 gcc/config/i386/sse.md |  10 +-
 gcc/config/i386/subst.md   |   1 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   4 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-1.c |  32 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   4 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   4 +
 12 files changed, 308 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index bc3f92a7d1ae..fca10a6b586f 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -863,6 +863,146 @@ _mm256_maskz_cvtx_roundps_ph (__mmask8 __U, __m256 __A, 
const int __R)
(__mmask8) __U,
__R);
 }
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundps_epi32 (__m256 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtps2dq256_mask_round ((__v8sf) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundps_epi32 (__m256i __W, __mmask8 __U, __m256 __A,
+  const int __R)
+{
+  return (__m256i) __builtin_ia32_vcvtps2dq256_mask_round ((__v8sf) __A,
+  (__v8si) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundps_epi32 (__mmask8 __U, __m256 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtps2dq256_mask_round ((__v8sf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundps_epi64 (__m128 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_cvtps2qq256_mask_round ((__v4sf) __A,
+(__v4di)
+_mm256_setzero_si256 (),
+(__mmask8) -1,
+__R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundps_epi64 (__m256i __W, __mmask8 __U, __m128 __A,
+  const int __R)
+{
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask_round ((__v4sf) __A,
+ (__v4di) __W,
+ (__mmask8) __U,
+   

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vcvtqq2p{s, d, h} and vcvttpd2{, u}{dq, qq} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:76f880db8637df5fd682cb5e77854c4014b4c59f

commit 76f880db8637df5fd682cb5e77854c4014b4c59f
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:20 2024 +0800

AVX10.2 ymm rounding: Support vcvtqq2p{s,d,h} and vcvttpd2{,u}{dq,qq} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V4DF_FTYPE_V4DI_V4DF_UQI_INT, V4SF_FTYPE_V4DI_V4SF_UQI_INT,
V8HF_FTYPE_V4DI_V8HF_UQI_INT.
* config/i386/sse.md:
(avx512fp16_vcvtqq2ph_v4di_mask_round): New expand.
(*avx512fp16_vcvt2ph__mask):
Extend round control and add "_1" suffix.

(float2):
Add condition check.

(float2):
Ditto.

(float2):
Limit suffix output.
(unspec_fix_truncv4dfv4si2): Extend round control.
(unspec_fixuns_truncv4dfv4si2): Ditto.
* config/i386/subst.md (round_qq2pssuff): New iterator.
(round_saeonly_suff): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-2.c: New test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 390 +
 gcc/config/i386/i386-builtin-types.def |   3 +
 gcc/config/i386/i386-builtin.def   |   7 +
 gcc/config/i386/i386-expand.cc |   3 +
 gcc/config/i386/sse.md |  43 ++-
 gcc/config/i386/subst.md   |   2 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   7 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-2.c |  72 
 gcc/testsuite/gcc.target/i386/sse-13.c |   7 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  21 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  21 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   7 +
 12 files changed, 569 insertions(+), 14 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index fca10a6b586f..25efd9d7b96b 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -1003,6 +1003,244 @@ _mm256_maskz_cvt_roundps_epu64 (__mmask8 __U, __m128 
__A, const int __R)
  (__mmask8) __U,
  __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundepi64_pd (__m256i __A, const int __R)
+{
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask_round ((__v4di) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundepi64_pd (__m256d __W, __mmask8 __U, __m256i __A,
+  const int __R)
+{
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask_round ((__v4di) __A,
+ (__v4df) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundepi64_pd (__mmask8 __U, __m256i __A, const int __R)
+{
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask_round ((__v4di) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundepi64_ph (__m256i __A, const int __R)
+{
+  return (__m128h) __builtin_ia32_vcvtqq2ph256_mask_round ((__v4di) __A,
+  (__v8hf)
+  _mm_setzero_ph (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m128h
+__attribute__ 

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vcvttph2{, u}{dq, qq, w} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:33714da7770ba84eb3afcedef5ea1406e7d218bf

commit 33714da7770ba84eb3afcedef5ea1406e7d218bf
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:20 2024 +0800

AVX10.2 ymm rounding: Support vcvttph2{,u}{dq,qq,w} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md 
(avx512fp16_fix_trunc2):
Extend round control for 256bit.
(unspec_avx512fp16_fix_trunc2):
Ditto.

(avx512fp16_fix_trunc2):
Add condition check.
* config/i386/subst.md
(round_saeonly_mode_condition): Add V16HI check for 256bit.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-2.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 335 +
 gcc/config/i386/i386-builtin.def   |   6 +
 gcc/config/i386/sse.md |  10 +-
 gcc/config/i386/subst.md   |   1 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   6 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-2.c |  46 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   6 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   6 +
 10 files changed, 447 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 25efd9d7b96b..45a04e5a7a87 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -1241,6 +1241,216 @@ _mm256_maskz_cvtt_roundpd_epu64 (__mmask8 __U, __m256d 
__A, const int __R)
   (__mmask8) __U,
   __R);
 }
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtt_roundph_epi32 (__m128h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvttph2dq256_mask_round ((__v8hf) __A,
+  (__v8si)
+  _mm256_setzero_si256 (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtt_roundph_epi32 (__m256i __W, __mmask8 __U, __m128h __A,
+   const int __R)
+{
+  return (__m256i) __builtin_ia32_vcvttph2dq256_mask_round ((__v8hf) __A,
+   (__v8si) __W,
+   (__mmask8) __U,
+   __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtt_roundph_epi32 (__mmask8 __U, __m128h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvttph2dq256_mask_round ((__v8hf) __A,
+  (__v8si)
+  _mm256_setzero_si256 (),
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtt_roundph_epi64 (__m128h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvttph2qq256_mask_round ((__v8hf) __A,
+  (__v4di)
+  _mm256_setzero_si256 (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtt_roundph_epi64 (__m256i __W, __mmask8 __U, __m128h __A,
+   const int __R)
+{
+  return (__m256i) __builtin_ia32_vcvttph2qq256_mask_round ((__v8hf) __A,
+   (__v4di) __W,
+   (__mmask8) __U,
+   __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__,

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vcvttps2{, u}{dq, qq} and vcvtu{dq, qq}2p{s, d, h} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:2434026e4bcc5bde764e97a964cdea6940e73413

commit 2434026e4bcc5bde764e97a964cdea6940e73413
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:21 2024 +0800

AVX10.2 ymm rounding: Support vcvttps2{,u}{dq,qq} and vcvtu{dq,qq}2p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md
(unspec_fix_truncv8sfv8si2): Extend rounding control.
(fixuns_trunc2):
Ditto.

(floatuns2):
Add condition check.

(fix_trunc2):
Remove round_saeonly_name.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-2.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 492 +
 gcc/config/i386/i386-builtin.def   |   9 +
 gcc/config/i386/sse.md |  27 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   9 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-2.c |  75 
 gcc/testsuite/gcc.target/i386/sse-13.c |   9 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  26 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  27 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   9 +
 9 files changed, 670 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 45a04e5a7a87..384facb424c0 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -1451,6 +1451,312 @@ _mm256_maskz_cvtt_roundph_epi16 (__mmask16 __U, __m256h 
__A, const int __R)
  (__mmask16) __U,
  __R);
 }
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtt_roundps_epi32 (__m256 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_cvttps2dq256_mask_round ((__v8sf) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtt_roundps_epi32 (__m256i __W, __mmask8 __U, __m256 __A,
+   const int __R)
+{
+  return (__m256i) __builtin_ia32_cvttps2dq256_mask_round ((__v8sf) __A,
+  (__v8si) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtt_roundps_epi32 (__mmask8 __U, __m256 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_cvttps2dq256_mask_round ((__v8sf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtt_roundps_epi64 (__m128 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_cvttps2qq256_mask_round ((__v4sf) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtt_roundps_epi64 (__m256i __W, __mmask8 __U, __m128 __A,
+   const int __R)
+{
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask_round ((__v4sf) __A,
+  (__v4di) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtt_roundps_epi64 (__mmask8 __U, __m128 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vcvt{, u}w2ph and vdivp{s, d, h} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:d2cf76a13b5a513489c14e15faf0e722ab94d663

commit d2cf76a13b5a513489c14e15faf0e722ab94d663
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:22 2024 +0800

AVX10.2 ymm rounding: Support vcvt{,u}w2ph and vdivp{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V16HF_FTYPE_V16HI_V16HF_UHI_INT.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: New test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 286 +
 gcc/config/i386/i386-builtin-types.def |   1 +
 gcc/config/i386/i386-builtin.def   |   5 +
 gcc/config/i386/i386-expand.cc |   1 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   5 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  58 +
 gcc/testsuite/gcc.target/i386/sse-13.c |   5 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  15 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  15 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   5 +
 10 files changed, 396 insertions(+)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 384facb424c0..15ea46b59835 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -1757,6 +1757,183 @@ _mm256_maskz_cvt_roundepu64_ps (__mmask8 __U, __m256i 
__A, const int __R)
  (__mmask8) __U,
  __R);
 }
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundepu16_ph (__m256i __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_vcvtuw2ph256_mask_round ((__v16hi) __A,
+  (__v16hf)
+  _mm256_setzero_ph (),
+  (__mmask16) -1,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundepu16_ph (__m256h __W, __mmask16 __U, __m256i __A,
+  const int __R)
+{
+  return (__m256h) __builtin_ia32_vcvtuw2ph256_mask_round ((__v16hi) __A,
+  (__v16hf) __W,
+  (__mmask16) __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundepu16_ph (__mmask16 __U, __m256i __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_vcvtuw2ph256_mask_round ((__v16hi) __A,
+  (__v16hf)
+  _mm256_setzero_ph (),
+  (__mmask16) __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundepi16_ph (__m256i __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_vcvtw2ph256_mask_round ((__v16hi) __A,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) -1,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundepi16_ph (__m256h __W, __mmask16 __U, __m256i __A,
+  const int __R)
+{
+  return (__m256h) __builtin_ia32_vcvtw2ph256_mask_round ((__v16hi) __A,
+ (__v16hf) __W,
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundepi16_ph (__mmask16 __U, __m256i __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_vcvtw2ph256_mask_round ((__v16hi) __A,
+  

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vfmadd{132, 231, 213}p{s, d, h} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:8194aafd95d05e5a49f730f5cf7c09b562fb7f95

commit 8194aafd95d05e5a49f730f5cf7c09b562fb7f95
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:23 2024 +0800

AVX10.2 ymm rounding: Support vfmadd{132,231,213}p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md:
(_fmadd__mask3): Add condition check.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 176 +
 gcc/config/i386/i386-builtin.def   |   9 ++
 gcc/config/i386/sse.md |   2 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   9 ++
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  31 
 gcc/testsuite/gcc.target/i386/sse-13.c |   9 ++
 gcc/testsuite/gcc.target/i386/sse-14.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   9 ++
 9 files changed, 268 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index d5ea6bc57da9..9015095144e4 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -2092,6 +2092,146 @@ _mm256_maskz_fixupimm_round_ps (__mmask8 __U, __m256 
__A, __m256 __B,
(__mmask8) __U,
__R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_round_pd (__m256d __A, __m256d __B, __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask_round ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __D,
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmadd_round_pd (__m256d __A, __mmask8 __U, __m256d __B,
+   __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask_round ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __D,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmadd_round_pd (__m256d __A, __m256d __B, __m256d __D,
+__mmask8 __U, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask3_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmadd_round_pd (__mmask8 __U, __m256d __A, __m256d __B,
+__m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_round_ph (__m256h __A, __m256h __B, __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmaddph256_mask_round ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __D,
+ (__mmask16) -1,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmadd_round_ph (__m256h __A, __mmask16 __U, __m256h __B,
+   __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmaddph256_ma

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vfmaddcph and vfmaddsub{132, 231, 213}p{s, d, h} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:3e30845566e8334c2e654f31bcd21ef4ac5f96cc

commit 3e30845566e8334c2e654f31bcd21ef4ac5f96cc
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:23 2024 +0800

AVX10.2 ymm rounding: Support vfmaddcph and vfmaddsub{132,231,213}p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md:
(_fmaddsub__mask): Add condition check.
(_fmaddsub__mask3): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 238 +
 gcc/config/i386/i386-builtin.def   |  13 ++
 gcc/config/i386/sse.md |   4 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |  13 ++
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  43 
 gcc/testsuite/gcc.target/i386/sse-13.c |  13 ++
 gcc/testsuite/gcc.target/i386/sse-14.c |  16 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  15 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |  13 ++
 9 files changed, 366 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 9015095144e4..95e42410a109 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -2232,6 +2232,193 @@ _mm256_maskz_fmadd_round_ps (__mmask8 __U, __m256 __A, 
__m256 __B,
  (__mmask8) __U,
  __R);
 }
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_round_pch (__m256h __A, __m256h __B, __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmaddcph256_round ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __D,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmadd_round_pch (__m256h __A, __mmask16 __U, __m256h __B,
+__m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmaddcph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf) __D,
+  __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmadd_round_pch (__m256h __A, __m256h __B, __m256h __D,
+ __mmask16 __U, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmaddcph256_mask3_round ((__v16hf) __A,
+   (__v16hf) __B,
+   (__v16hf) __D,
+   __U,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmadd_round_pch (__mmask16 __U, __m256h __A, __m256h __B,
+ __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmaddcph256_maskz_round ((__v16hf) __A,
+   (__v16hf) __B,
+   (__v16hf) __D,
+   __U,
+   __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmaddsub_round_pd (__m256d __A, __m256d __B, __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask_round ((__v4df) __A,
+(__v4df) __B,
+(__v4df) __D,
+(__mmask8) -1,
+__R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmaddsub_round_pd (__m256d __A, __mmask8 __U, __

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vfc{madd, mul}cph, vfixupimmp{s, d} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:f39193494511bdfac24accf5da95388ac32bb272

commit f39193494511bdfac24accf5da95388ac32bb272
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:22 2024 +0800

AVX10.2 ymm rounding: Support vfc{madd,mul}cph, vfixupimmp{s,d} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V16HF_FTYPE_V16HF_V16HF_INT, V16HF_FTYPE_V16HF_V16HF_V16HF_INT,
V16HF_FTYPE_V16HF_V16HF_V16HF_UQI_INT,
V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI_INT,
V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI_INT.
* config/i386/sse.md:
(_fixupimm):
Add condition check.
(_fixupimm_mask): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 247 +
 gcc/config/i386/i386-builtin-types.def |   5 +
 gcc/config/i386/i386-builtin.def   |  10 +
 gcc/config/i386/i386-expand.cc |   5 +
 gcc/config/i386/sse.md |   4 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |  10 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  49 
 gcc/testsuite/gcc.target/i386/sse-13.c |  10 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  13 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  13 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |  10 +
 11 files changed, 374 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 15ea46b59835..d5ea6bc57da9 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -1934,6 +1934,164 @@ _mm256_maskz_div_round_ps (__mmask8 __U, __m256 __A, 
__m256 __B,
  (__mmask8) __U,
  __R);
 }
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fcmadd_round_pch (__m256h __A, __m256h __B, __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfcmaddcph256_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf) __D,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fcmadd_round_pch (__m256h __A, __mmask8 __U, __m256h __B,
+ __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfcmaddcph256_mask_round ((__v16hf) __A,
+   (__v16hf) __B,
+   (__v16hf) __D,
+   __U,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fcmadd_round_pch (__m256h __A, __m256h __B, __m256h __D,
+  __mmask8 __U, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfcmaddcph256_mask3_round ((__v16hf) __A,
+(__v16hf) __B,
+(__v16hf) __D,
+__U,
+__R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fcmadd_round_pch (__mmask8 __U, __m256h __A, __m256h __B,
+  __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfcmaddcph256_maskz_round ((__v16hf) __A,
+(__v16hf) __B,
+(__v16hf) __D,
+__U,
+__R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fcmul_round_pch (__m256h __A, __m256h __B, const int __R)
+{
+  return
+(__m256h) __builtin_ia32_vfcmulcph256_round ((__v16hf) __A,
+  

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vfm{sub, subadd}{132, 231, 213}p{s, d, h} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:4d822f755a34ea5e5a3c891881f4ba0226151448

commit 4d822f755a34ea5e5a3c891881f4ba0226151448
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:24 2024 +0800

AVX10.2 ymm rounding: Support vfm{sub,subadd}{132,231,213}p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md:
(_fmsub__mask): Add condition check.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 350 +
 gcc/config/i386/i386-builtin.def   |  18 ++
 gcc/config/i386/sse.md |   2 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |  18 ++
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  62 
 gcc/testsuite/gcc.target/i386/sse-13.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-14.c |  24 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  24 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |  18 ++
 9 files changed, 533 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 95e42410a109..346a32c1a8a5 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -2419,6 +2419,284 @@ _mm256_maskz_fmaddsub_round_ps (__mmask8 __U, __m256 __A, __m256 __B,
 (__mmask8) __U,
 __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsub_round_pd (__m256d __A, __m256d __B, __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_mask_round ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __D,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmsub_round_pd (__m256d __A, __mmask8 __U, __m256d __B,
+   __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_mask_round ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __D,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmsub_round_pd (__m256d __A, __m256d __B, __m256d __D,
+__mmask8 __U, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_mask3_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) __U, __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmsub_round_pd (__mmask8 __U, __m256d __A, __m256d __B,
+__m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_maskz_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) __U, __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsub_round_ph (__m256h __A, __m256h __B, __m256h __D, const int __R)
+{
+  return (__m256h)
+__builtin_ia32_vfmsubph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf) __D,
+  (__mmask16) -1, __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmsub_round_ph (__m256h __A, __mmask16 __U, __m256h __B,
+   __m256h __D, const int __R)
+{
+  return (__m256h)
+__builtin_ia32_vfmsubph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf) __D,
+

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vfmulcph and vfnmadd{132, 231, 213}p{s, d, h} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:81f1bcf7c470d1797bfdddc17f3a9b7be50fa040

commit 81f1bcf7c470d1797bfdddc17f3a9b7be50fa040
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:24 2024 +0800

AVX10.2 ymm rounding: Support vfmulcph and vfnmadd{132,231,213}p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 241 +
 gcc/config/i386/i386-builtin.def   |  11 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |  11 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  50 +
 gcc/testsuite/gcc.target/i386/sse-13.c |  11 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  14 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  14 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |  11 +
 8 files changed, 363 insertions(+)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h b/gcc/config/i386/avx10_2roundingintrin.h
index 346a32c1a8a5..3f833bffa54d 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -2697,6 +2697,185 @@ _mm256_maskz_fmsubadd_round_ps (__mmask8 __U, __m256 __A, __m256 __B,
 (__mmask8) __U,
 __R);
 }
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmul_round_pch (__m256h __B, __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmulcph256_round ((__v16hf) __B,
+(__v16hf) __D,
+__R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmul_round_pch (__m256h __A, __mmask8 __U, __m256h __B,
+   __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmulcph256_mask_round ((__v16hf) __B,
+ (__v16hf) __D,
+ (__v16hf) __A,
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmul_round_pch (__mmask8 __U, __m256h __B, __m256h __D,
+const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmulcph256_mask_round ((__v16hf) __B,
+ (__v16hf) __D,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmadd_round_pd (__m256d __A, __m256d __B, __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmaddpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmadd_round_pd (__m256d __A, __mmask8 __U, __m256d __B,
+__m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmaddpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fnmadd_round_pd (__m256d __A, __m256d __B, __m256d __D,
+ __mmask8 __U, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmaddpd256_mask3_round ((__v4df) __A,
+   (__v4df) __B,
+   (__v4df) __D,
+  

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vfnmsub{132, 231, 213}p{s, d, h} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:93c5a73e0ef33fa4ef46a3fe87225bd584830dc3

commit 93c5a73e0ef33fa4ef46a3fe87225bd584830dc3
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:25 2024 +0800

AVX10.2 ymm rounding: Support vfnmsub{132,231,213}p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md:
(<avx512>_fnmsub_<mode>_mask3<round_name>): Add condition check.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 181 +
 gcc/config/i386/i386-builtin.def   |   9 +
 gcc/config/i386/sse.md |   2 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   9 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  31 
 gcc/testsuite/gcc.target/i386/sse-13.c |   9 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   9 +
 9 files changed, 273 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h b/gcc/config/i386/avx10_2roundingintrin.h
index 3f833bffa54d..afc1220fea42 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -2876,6 +2876,151 @@ _mm256_maskz_fnmadd_round_ps (__mmask8 __U, __m256 __A, __m256 __B,
   (__mmask8) __U,
   __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmsub_round_pd (__m256d __A, __m256d __B, __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmsub_round_pd (__m256d __A, __mmask8 __U, __m256d __B,
+__m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fnmsub_round_pd (__m256d __A, __m256d __B, __m256d __D,
+ __mmask8 __U, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask3_round ((__v4df) __A,
+   (__v4df) __B,
+   (__v4df) __D,
+   (__mmask8) __U,
+   __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fnmsub_round_pd (__mmask8 __U, __m256d __A, __m256d __B,
+ __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_maskz_round ((__v4df) __A,
+   (__v4df) __B,
+   (__v4df) __D,
+   (__mmask8) __U,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmsub_round_ph (__m256h __A, __m256h __B, __m256h __D, const int __R)
+{
+  return (__m256h)
+__builtin_ia32_vfnmsubph256_mask_round ((__v16hf) __A,
+   (__v16hf) __B,
+   (__v16hf) __D,
+   (__mmask16) -1,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmsub_round_ph (__m256h __A, __mmask16 __U, __m256h __B,
+__m256h __D, const int __R)
+{
+  return (__m256

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vgetexpp{s, d, h} and vgetmantp{s, d, h} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:5da970fcb303434bc27d35a7ba9e6f5d4ee00e2d

commit 5da970fcb303434bc27d35a7ba9e6f5d4ee00e2d
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:26 2024 +0800

AVX10.2 ymm rounding: Support vgetexpp{s,d,h} and vgetmantp{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V8SF_FTYPE_V8SF_V8SF_UQI_INT, V4DF_FTYPE_V4DF_V4DF_UQI_INT,
V16HF_FTYPE_V16HF_V16HF_UHI_INT, 
V16HF_FTYPE_V16HF_INT_V16HF_UHI_INT,
V4DF_FTYPE_V4DF_INT_V4DF_UQI_INT, V8SF_FTYPE_V8SF_INT_V8SF_UQI_INT.
* config/i386/sse.md:
(<avx512>_getexp<mode><mask_name><round_saeonly_name>):
Add condition check.
(<avx512>_getmant<mode><mask_name><round_saeonly_name>):
Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 341 +
 gcc/config/i386/i386-builtin-types.def |   6 +
 gcc/config/i386/i386-builtin.def   |   6 +
 gcc/config/i386/i386-expand.cc |   6 +
 gcc/config/i386/sse.md |   4 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   6 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  59 
 gcc/testsuite/gcc.target/i386/sse-13.c |   6 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   6 +
 11 files changed, 474 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h b/gcc/config/i386/avx10_2roundingintrin.h
index afc1220fea42..07729a6cc04f 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -3021,6 +3021,217 @@ _mm256_maskz_fnmsub_round_ps (__mmask8 __U, __m256 __A, __m256 __B,
   (__mmask8) __U,
   __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getexp_round_pd (__m256d __A, const int __R)
+{
+  return
+(__m256d) __builtin_ia32_getexppd256_mask_round ((__v4df) __A,
+(__v4df)
+_mm256_undefined_pd (),
+(__mmask8) -1,
+__R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getexp_round_pd (__m256d __W, __mmask8 __U, __m256d __A,
+const int __R)
+{
+  return (__m256d) __builtin_ia32_getexppd256_mask_round ((__v4df) __A,
+ (__v4df) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_getexp_round_pd (__mmask8 __U, __m256d __A, const int __R)
+{
+  return (__m256d) __builtin_ia32_getexppd256_mask_round ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getexp_round_ph (__m256h __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_getexpph256_mask_round ((__v16hf) __A,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) -1,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getexp_round_ph (__m256h __W, __mmask16 __U, __m256h __A,
+const int __R)
+{
+  return (__m256h) __builtin_ia32_getexpph256_mask_round ((__v16hf) __A,
+ (__v16hf) __W,
+ (__mmask16) __U,
+

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vmulp{s, d, h} and vrangep{s, d} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:ef07d9e1195a7ba200b4c26194bcc17948cb97f1

commit ef07d9e1195a7ba200b4c26194bcc17948cb97f1
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:27 2024 +0800

AVX10.2 ymm rounding: Support vmulp{s,d,h} and vrangep{s,d} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin):
Handle V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI_INT,
V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI_INT.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 313 +
 gcc/config/i386/i386-builtin-types.def |   2 +
 gcc/config/i386/i386-builtin.def   |   5 +
 gcc/config/i386/i386-expand.cc |   2 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   5 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  43 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   5 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  15 +
 gcc/testsuite/gcc.target/i386/sse-22.c |  15 +
 gcc/testsuite/gcc.target/i386/sse-23.c |   5 +
 10 files changed, 410 insertions(+)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h b/gcc/config/i386/avx10_2roundingintrin.h
index a5712f5230aa..ac0914415c94 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -3454,6 +3454,198 @@ _mm256_maskz_min_round_ps (__mmask8 __U, __m256 __A, __m256 __B,
  (__mmask8) __U,
  __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_round_pd (__m256d __A, __m256d __B, const int __R)
+{
+  return (__m256d) __builtin_ia32_mulpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df)
+  _mm256_undefined_pd (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mul_round_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B, const int __R)
+{
+  return (__m256d) __builtin_ia32_mulpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mul_round_pd (__mmask8 __U, __m256d __A, __m256d __B,
+  const int __R)
+{
+  return (__m256d) __builtin_ia32_mulpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df)
+  _mm256_setzero_pd (),
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_round_ph (__m256h __A, __m256h __B, const int __R)
+{
+  return (__m256h) __builtin_ia32_mulph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf)
+  _mm256_undefined_ph (),
+  (__mmask16) -1,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mul_round_ph (__m256h __W, __mmask16 __U, __m256h __A,
+ __m256h __B, const int __R)
+{
+  return (__m256h) __builtin_ia32_mulph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf) __W,
+  (__mmask16) __U,
+ 

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support v{max, min}p{s, d, h} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:ce16fa99857c057ad95ad7cef8ce6f5ffbe9ef48

commit ce16fa99857c057ad95ad7cef8ce6f5ffbe9ef48
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:26 2024 +0800

AVX10.2 ymm rounding: Support v{max,min}p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 360 +
 gcc/config/i386/i386-builtin.def   |   6 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   6 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  50 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   6 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   6 +
 8 files changed, 470 insertions(+)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h b/gcc/config/i386/avx10_2roundingintrin.h
index 07729a6cc04f..a5712f5230aa 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -3232,6 +3232,228 @@ _mm256_maskz_getmant_round_ps (__mmask8 __U, __m256 __A,
  _mm256_setzero_ps (),
  __U, __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_round_pd (__m256d __A, __m256d __B, const int __R)
+{
+  return (__m256d) __builtin_ia32_maxpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df)
+  _mm256_undefined_pd (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_round_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B, const int __R)
+{
+  return (__m256d) __builtin_ia32_maxpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_round_pd (__mmask8 __U, __m256d __A, __m256d __B,
+  const int __R)
+{
+  return (__m256d) __builtin_ia32_maxpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df)
+  _mm256_setzero_pd (),
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_round_ph (__m256h __A, __m256h __B, const int __R)
+{
+  return (__m256h) __builtin_ia32_maxph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf)
+  _mm256_undefined_ph (),
+  (__mmask16) -1,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_round_ph (__m256h __W, __mmask16 __U, __m256h __A,
+ __m256h __B, const int __R)
+{
+  return (__m256h) __builtin_ia32_maxph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf) __W,
+  (__mmask16) __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_round_ph (__mmask16 __U, __m256h __A, __m256h __B,
+  const int __R)
+{
+  return (__m256h) __builtin_ia32_maxph256_mask_round ((__v16hf) __A,
+

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vscalefp{s, d, h} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:144b9ca72a05d9f7235bf939821a3ff425a251c8

commit 144b9ca72a05d9f7235bf939821a3ff425a251c8
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:28 2024 +0800

AVX10.2 ymm rounding: Support vscalefp{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/sse.md:
(<avx512>_scalef<mode><mask_name><round_name>): Add condition check.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 182 +
 gcc/config/i386/i386-builtin.def   |   3 +
 gcc/config/i386/sse.md |   2 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   3 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  25 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   3 +
 gcc/testsuite/gcc.target/i386/sse-14.c |   9 +
 gcc/testsuite/gcc.target/i386/sse-22.c |   9 +
 gcc/testsuite/gcc.target/i386/sse-23.c |   3 +
 9 files changed, 238 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h b/gcc/config/i386/avx10_2roundingintrin.h
index d6b8e2695de3..f35f23378583 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -3873,6 +3873,119 @@ _mm256_maskz_roundscale_round_ps (__mmask8 __U, __m256 __A, const int __C,
   (__mmask8) __U,
   __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_scalef_round_pd (__m256d __A, __m256d __B, const int __R)
+{
+  return
+(__m256d) __builtin_ia32_scalefpd256_mask_round ((__v4df) __A,
+(__v4df) __B,
+(__v4df)
+_mm256_undefined_pd (),
+(__mmask8) -1,
+__R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_scalef_round_pd (__m256d __W, __mmask8 __U, __m256d __A,
+__m256d __B, const int __R)
+{
+  return (__m256d) __builtin_ia32_scalefpd256_mask_round ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_scalef_round_pd (__mmask8 __U, __m256d __A, __m256d __B,
+ const int __R)
+{
+  return (__m256d) __builtin_ia32_scalefpd256_mask_round ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_scalef_round_ph (__m256h __A, __m256h __B, const int __R)
+{
+  return
+(__m256h) __builtin_ia32_scalefph256_mask_round ((__v16hf) __A,
+(__v16hf) __B,
+(__v16hf)
+_mm256_undefined_ph (),
+(__mmask16) -1,
+__R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_scalef_round_ph (__m256h __W, __mmask16 __U, __m256h __A,
+__m256h __B, const int __R)
+{
+  return (__m256h) __builtin_ia32_scalefph256_mask_round ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __W,
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, _

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vreducep{s, d, h} and vrndscalep{s, d, h} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:2d78e6dbb4620da8e4c2e87ef14134f627559467

commit 2d78e6dbb4620da8e4c2e87ef14134f627559467
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:27 2024 +0800

AVX10.2 ymm rounding: Support vreducep{s,d,h} and vrndscalep{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md:
(<mask_codefor>reducep<mode><mask_name><round_saeonly_name>):
Add condition check.
(<avx512>_rndscale<mode><mask_name><round_saeonly_name>): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 367 +
 gcc/config/i386/i386-builtin.def   |   6 +
 gcc/config/i386/sse.md |   4 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   6 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  50 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   6 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  18 +
 gcc/testsuite/gcc.target/i386/sse-22.c |  18 +
 gcc/testsuite/gcc.target/i386/sse-23.c |   6 +
 9 files changed, 479 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h b/gcc/config/i386/avx10_2roundingintrin.h
index ac0914415c94..d6b8e2695de3 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -3646,6 +3646,233 @@ _mm256_maskz_range_round_ps (__mmask8 __U, __m256 __A, __m256 __B,
(__mmask8) __U,
__R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_round_pd (__m256d __A, const int __C, const int __R)
+{
+  return (__m256d) __builtin_ia32_reducepd256_mask_round ((__v4df) __A,
+ __C,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_round_pd (__m256d __W, __mmask8 __U, __m256d __A,
+const int __C, const int __R)
+{
+  return (__m256d) __builtin_ia32_reducepd256_mask_round ((__v4df) __A,
+ __C,
+ (__v4df) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_reduce_round_pd (__mmask8 __U, __m256d __A, const int __C,
+ const int __R)
+{
+  return (__m256d) __builtin_ia32_reducepd256_mask_round ((__v4df) __A,
+ __C,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_round_ph (__m256h __A, const int __C, const int __R)
+{
+  return (__m256h) __builtin_ia32_reduceph256_mask_round ((__v16hf) __A,
+ __C,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) -1,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_round_ph (__m256h __W, __mmask16 __U, __m256h __A,
+const int __C, const int __R)
+{
+  return (__m256h) __builtin_ia32_reduceph256_mask_round ((__v16hf) __A,
+ __C,
+ (__v16hf) __W,
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attri

[gcc(refs/vendors/ix86/heads/avx10.2)] AVX10.2 ymm rounding: Support vsqrtp{s, d, h} and vsubp{s, d, h} intrins

2024-08-14 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:bab0adf0069e69115aa984aa750a03fb3d1d0cdb

commit bab0adf0069e69115aa984aa750a03fb3d1d0cdb
Author: Hu, Lin1 
Date:   Thu Aug 15 09:38:29 2024 +0800

AVX10.2 ymm rounding: Support vsqrtp{s,d,h} and vsubp{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 339 +
 gcc/config/i386/i386-builtin.def   |   6 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   6 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  50 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   7 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  15 +
 gcc/testsuite/gcc.target/i386/sse-23.c |   6 +
 8 files changed, 447 insertions(+)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h b/gcc/config/i386/avx10_2roundingintrin.h
index f35f23378583..c7146e37ec9a 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -3986,6 +3986,216 @@ _mm256_maskz_scalef_round_ps (__mmask8 __U, __m256 __A, __m256 __B,
 (__mmask8) __U,
 __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrt_round_pd (__m256d __A, const int __R)
+{
+  return (__m256d) __builtin_ia32_sqrtpd256_mask_round ((__v4df) __A,
+   (__v4df)
+   _mm256_undefined_pd (),
+   (__mmask8) -1,
+   __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sqrt_round_pd (__m256d __W, __mmask8 __U, __m256d __A,
+  const int __R)
+{
+  return (__m256d) __builtin_ia32_sqrtpd256_mask_round ((__v4df) __A,
+   (__v4df) __W,
+   (__mmask8) __U,
+   __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sqrt_round_pd (__mmask8 __U, __m256d __A, const int __R)
+{
+  return (__m256d) __builtin_ia32_sqrtpd256_mask_round ((__v4df) __A,
+   (__v4df)
+   _mm256_setzero_pd (),
+   (__mmask8) __U,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrt_round_ph (__m256h __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_sqrtph256_mask_round ((__v16hf) __A,
+   (__v16hf)
+   _mm256_undefined_ph (),
+   (__mmask16) -1,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sqrt_round_ph (__m256h __W, __mmask16 __U, __m256h __A,
+  const int __R)
+{
+  return (__m256h) __builtin_ia32_sqrtph256_mask_round ((__v16hf) __A,
+   (__v16hf) __W,
+   (__mmask16) __U,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sqrt_round_ph (__mmask16 __U, __m256h __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_sqrtph256_mask_round ((__v16hf) __A,
+   (__v16hf)
+   _mm256_setzero_ph (),
+   (__mmask16) __U,
+   __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrt_round_ps (__m256 __A, const int __R)
+{
+  return (__m256) __builtin_ia32_sqrtps

[gcc r15-764] i386: Disable ix86_expand_vecop_qihi2 when !TARGET_AVX512BW

2024-05-21 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:73a167cfa225d5ee7092d41596b9fea1719898ff

commit r15-764-g73a167cfa225d5ee7092d41596b9fea1719898ff
Author: Haochen Jiang 
Date:   Tue May 21 14:10:43 2024 +0800

i386: Disable ix86_expand_vecop_qihi2 when !TARGET_AVX512BW

Since vpermq is really slow, we should avoid using it for permutation
when vpmovwb is not available (needs AVX512BW) for ix86_expand_vecop_qihi2
and fall back to ix86_expand_vecop_qihi.

gcc/ChangeLog:

PR target/115069
* config/i386/i386-expand.cc (ix86_expand_vecop_qihi2):
Do not enable the optimization when AVX512BW is not enabled.

gcc/testsuite/ChangeLog:

PR target/115069
* gcc.target/i386/pr115069.c: New.

Diff:
---
 gcc/config/i386/i386-expand.cc   | 7 +++
 gcc/testsuite/gcc.target/i386/pr115069.c | 9 +
 2 files changed, 16 insertions(+)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 7142c0a9d77..ec402a78a09 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -24188,6 +24188,13 @@ ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
   bool uns_p = code != ASHIFTRT;
 
+  /* Without VPMOVWB (provided by AVX512BW ISA), the expansion uses the
+ generic permutation to merge the data back into the right place.  This
+ permutation results in VPERMQ, which is slow, so better fall back to
+ ix86_expand_vecop_qihi.  */
+  if (!TARGET_AVX512BW)
+return false;
+
   if ((qimode == V16QImode && !TARGET_AVX2)
   || (qimode == V32QImode && (!TARGET_AVX512BW || !TARGET_EVEX512))
   /* There are no V64HImode instructions.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr115069.c b/gcc/testsuite/gcc.target/i386/pr115069.c
new file mode 100644
index 00000000000..50a3e033079
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115069.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2" } */
+/* { dg-final { scan-assembler-not "vpermq" } } */
+
+typedef char v16qi __attribute__((vector_size(16)));
+
+v16qi foo (v16qi a, v16qi b) {
+return a * b;
+}


[gcc r14-10229] i386: Disable ix86_expand_vecop_qihi2 when !TARGET_AVX512BW

2024-05-21 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:1ad5c9d524d8fa99773045e75da04ae958012085

commit r14-10229-g1ad5c9d524d8fa99773045e75da04ae958012085
Author: Haochen Jiang 
Date:   Tue May 21 14:10:43 2024 +0800

i386: Disable ix86_expand_vecop_qihi2 when !TARGET_AVX512BW

Since vpermq is really slow, we should avoid using it for permutation
when vpmovwb is not available (needs AVX512BW) for ix86_expand_vecop_qihi2
and fall back to ix86_expand_vecop_qihi.

gcc/ChangeLog:

PR target/115069
* config/i386/i386-expand.cc (ix86_expand_vecop_qihi2):
Do not enable the optimization when AVX512BW is not enabled.

gcc/testsuite/ChangeLog:

PR target/115069
* gcc.target/i386/pr115069.c: New.

Diff:
---
 gcc/config/i386/i386-expand.cc   | 7 +++
 gcc/testsuite/gcc.target/i386/pr115069.c | 9 +
 2 files changed, 16 insertions(+)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 8bb8f21e686..51efe6fdd7d 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -23963,6 +23963,13 @@ ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, 
rtx op1, rtx op2)
   bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
   bool uns_p = code != ASHIFTRT;
 
+  /* Without VPMOVWB (provided by AVX512BW ISA), the expansion uses the
+ generic permutation to merge the data back into the right place.  This
+ permutation results in VPERMQ, which is slow, so better fall back to
+ ix86_expand_vecop_qihi.  */
+  if (!TARGET_AVX512BW)
+return false;
+
   if ((qimode == V16QImode && !TARGET_AVX2)
   || (qimode == V32QImode && (!TARGET_AVX512BW || !TARGET_EVEX512))
   /* There are no V64HImode instructions.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr115069.c 
b/gcc/testsuite/gcc.target/i386/pr115069.c
new file mode 100644
index 000..50a3e033079
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115069.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2" } */
+/* { dg-final { scan-assembler-not "vpermq" } } */
+
+typedef char v16qi __attribute__((vector_size(16)));
+
+v16qi foo (v16qi a, v16qi b) {
+return a * b;
+}


[gcc r14-10253] Adjust generic loop alignment from 16:11:8 to 16 for Intel processors

2024-05-28 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:80600352d1282f084900ab444f2d4c83986f2ae5

commit r14-10253-g80600352d1282f084900ab444f2d4c83986f2ae5
Author: Haochen Jiang 
Date:   Wed May 29 11:12:37 2024 +0800

Adjust generic loop alignment from 16:11:8 to 16 for Intel processors

Previously, we used 16:11:8 in generic tune for Intel processors, which
led to cross-cache-line issues and resulted in random performance
penalties, varying from commit to commit, in benchmarks with small loops.

Always aligning loops to 16 bytes solves the issue to some extent.

gcc/ChangeLog:

* config/i386/x86-tune-costs.h (generic_cost): Change from
16:11:8 to 16.

Diff:
---
 gcc/config/i386/x86-tune-costs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 65d7d1f7e42..d34b5cc 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -3758,7 +3758,7 @@ struct processor_costs generic_cost = {
   generic_memset,
   COSTS_N_INSNS (4),   /* cond_taken_branch_cost.  */
   COSTS_N_INSNS (2),   /* cond_not_taken_branch_cost.  */
-  "16:11:8",   /* Loop alignment.  */
+  "16",/* Loop alignment.  */
   "16:11:8",   /* Jump alignment.  */
   "0:0:8", /* Label alignment.  */
   "16",/* Func alignment.  */


[gcc r14-10254] Align tight&hot loop without considering max skipping bytes.

2024-05-28 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:b4d4ece0443433cd5c3078cfe03f18429e73b77a

commit r14-10254-gb4d4ece0443433cd5c3078cfe03f18429e73b77a
Author: liuhongt 
Date:   Wed May 29 11:12:51 2024 +0800

Align tight&hot loop without considering max skipping bytes.

When a hot loop is small enough to fit into one cacheline, we should align
the loop with ceil_log2 (loop_size) without considering the maximum
skip bytes. This helps code prefetch.

gcc/ChangeLog:

* config/i386/i386.cc (ix86_avoid_jump_mispredicts): Change
gen_pad to gen_max_skip_align.
(ix86_align_loops): New function.
(ix86_reorg): Call ix86_align_loops.
* config/i386/i386.md (pad): Rename to ..
(max_skip_align): .. this, and accept 2 operands for align and
skip.

Diff:
---
 gcc/config/i386/i386.cc | 148 +++-
 gcc/config/i386/i386.md |  10 ++--
 2 files changed, 153 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index fbd9b4dac2e..984ba37beeb 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23135,7 +23135,7 @@ ix86_avoid_jump_mispredicts (void)
  if (dump_file)
fprintf (dump_file, "Padding insn %i by %i bytes!\n",
 INSN_UID (insn), padsize);
-  emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
+ emit_insn_before (gen_max_skip_align (GEN_INT (4), GEN_INT 
(padsize)), insn);
}
 }
 }
@@ -23408,6 +23408,150 @@ ix86_split_stlf_stall_load ()
 }
 }
 
+/* When a hot loop can be fit into one cacheline,
+   force align the loop without considering the max skip.  */
+static void
+ix86_align_loops ()
+{
+  basic_block bb;
+
+  /* Don't do this when we don't know cache line size.  */
+  if (ix86_cost->prefetch_block == 0)
+return;
+
+  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+  profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
+  FOR_EACH_BB_FN (bb, cfun)
+{
+  rtx_insn *label = BB_HEAD (bb);
+  bool has_fallthru = 0;
+  edge e;
+  edge_iterator ei;
+
+  if (!LABEL_P (label))
+   continue;
+
+  profile_count fallthru_count = profile_count::zero ();
+  profile_count branch_count = profile_count::zero ();
+
+  FOR_EACH_EDGE (e, ei, bb->preds)
+   {
+ if (e->flags & EDGE_FALLTHRU)
+   has_fallthru = 1, fallthru_count += e->count ();
+ else
+   branch_count += e->count ();
+   }
+
+  if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
+   continue;
+
+  if (bb->loop_father
+ && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
+ && (has_fallthru
+ ? (!(single_succ_p (bb)
+  && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
+&& optimize_bb_for_speed_p (bb)
+&& branch_count + fallthru_count > count_threshold
+&& (branch_count > fallthru_count * 
param_align_loop_iterations))
+ /* In case there'no fallthru for the loop.
+Nops inserted won't be executed.  */
+ : (branch_count > count_threshold
+|| (bb->count > bb->prev_bb->count * 10
+&& (bb->prev_bb->count
+<= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)
+   {
+ rtx_insn* insn, *end_insn;
+ HOST_WIDE_INT size = 0;
+ bool padding_p = true;
+ basic_block tbb = bb;
+ unsigned cond_branch_num = 0;
+ bool detect_tight_loop_p = false;
+
+ for (unsigned int i = 0; i != bb->loop_father->num_nodes;
+  i++, tbb = tbb->next_bb)
+   {
+ /* Only handle continuous cfg layout. */
+ if (bb->loop_father != tbb->loop_father)
+   {
+ padding_p = false;
+ break;
+   }
+
+ FOR_BB_INSNS (tbb, insn)
+   {
+ if (!NONDEBUG_INSN_P (insn))
+   continue;
+ size += ix86_min_insn_size (insn);
+
+ /* We don't know size of inline asm.
+Don't align loop for call.  */
+ if (asm_noperands (PATTERN (insn)) >= 0
+ || CALL_P (insn))
+   {
+ size = -1;
+ break;
+   }
+   }
+
+ if (size == -1 || size > ix86_cost->prefetch_block)
+   {
+ padding_p = false;
+ break;
+   }
+
+ FOR_EACH_EDGE (e, ei, tbb->succs)
+   {
+ /* It could be part of the loop.  */
+ if (e->dest == bb)
+   {
+ detect_tight_loop_p = true;
+ break;
+   }
+   }
+
+ if (

[gcc r15-887] Adjust generic loop alignment from 16:11:8 to 16 for Intel processors

2024-05-28 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:00ed5424b1d4dcccfa187f55205521826794898c

commit r15-887-g00ed5424b1d4dcccfa187f55205521826794898c
Author: Haochen Jiang 
Date:   Wed May 29 11:13:55 2024 +0800

Adjust generic loop alignment from 16:11:8 to 16 for Intel processors

Previously, we used 16:11:8 in generic tune for Intel processors, which
led to cross-cache-line issues and resulted in random performance
penalties, varying from commit to commit, in benchmarks with small loops.

Always aligning loops to 16 bytes solves the issue to some extent.

gcc/ChangeLog:

* config/i386/x86-tune-costs.h (generic_cost): Change from
16:11:8 to 16.

Diff:
---
 gcc/config/i386/x86-tune-costs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 65d7d1f7e42..d34b5cc 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -3758,7 +3758,7 @@ struct processor_costs generic_cost = {
   generic_memset,
   COSTS_N_INSNS (4),   /* cond_taken_branch_cost.  */
   COSTS_N_INSNS (2),   /* cond_not_taken_branch_cost.  */
-  "16:11:8",   /* Loop alignment.  */
+  "16",/* Loop alignment.  */
   "16:11:8",   /* Jump alignment.  */
   "0:0:8", /* Label alignment.  */
   "16",/* Func alignment.  */


[gcc r15-888] Align tight&hot loop without considering max skipping bytes.

2024-05-28 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:b644126237a1aa8599f767a5e0bbada1d7286f44

commit r15-888-gb644126237a1aa8599f767a5e0bbada1d7286f44
Author: liuhongt 
Date:   Wed May 29 11:14:26 2024 +0800

Align tight&hot loop without considering max skipping bytes.

When a hot loop is small enough to fit into one cacheline, we should align
the loop with ceil_log2 (loop_size) without considering the maximum
skip bytes. This helps code prefetch.

gcc/ChangeLog:

* config/i386/i386.cc (ix86_avoid_jump_mispredicts): Change
gen_pad to gen_max_skip_align.
(ix86_align_loops): New function.
(ix86_reorg): Call ix86_align_loops.
* config/i386/i386.md (pad): Rename to ..
(max_skip_align): .. this, and accept 2 operands for align and
skip.

Diff:
---
 gcc/config/i386/i386.cc | 148 +++-
 gcc/config/i386/i386.md |  10 ++--
 2 files changed, 153 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 85d87b9f778..1a0206ab573 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23146,7 +23146,7 @@ ix86_avoid_jump_mispredicts (void)
  if (dump_file)
fprintf (dump_file, "Padding insn %i by %i bytes!\n",
 INSN_UID (insn), padsize);
-  emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
+ emit_insn_before (gen_max_skip_align (GEN_INT (4), GEN_INT 
(padsize)), insn);
}
 }
 }
@@ -23419,6 +23419,150 @@ ix86_split_stlf_stall_load ()
 }
 }
 
+/* When a hot loop can be fit into one cacheline,
+   force align the loop without considering the max skip.  */
+static void
+ix86_align_loops ()
+{
+  basic_block bb;
+
+  /* Don't do this when we don't know cache line size.  */
+  if (ix86_cost->prefetch_block == 0)
+return;
+
+  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+  profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
+  FOR_EACH_BB_FN (bb, cfun)
+{
+  rtx_insn *label = BB_HEAD (bb);
+  bool has_fallthru = 0;
+  edge e;
+  edge_iterator ei;
+
+  if (!LABEL_P (label))
+   continue;
+
+  profile_count fallthru_count = profile_count::zero ();
+  profile_count branch_count = profile_count::zero ();
+
+  FOR_EACH_EDGE (e, ei, bb->preds)
+   {
+ if (e->flags & EDGE_FALLTHRU)
+   has_fallthru = 1, fallthru_count += e->count ();
+ else
+   branch_count += e->count ();
+   }
+
+  if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
+   continue;
+
+  if (bb->loop_father
+ && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
+ && (has_fallthru
+ ? (!(single_succ_p (bb)
+  && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
+&& optimize_bb_for_speed_p (bb)
+&& branch_count + fallthru_count > count_threshold
+&& (branch_count > fallthru_count * 
param_align_loop_iterations))
+ /* In case there'no fallthru for the loop.
+Nops inserted won't be executed.  */
+ : (branch_count > count_threshold
+|| (bb->count > bb->prev_bb->count * 10
+&& (bb->prev_bb->count
+<= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)
+   {
+ rtx_insn* insn, *end_insn;
+ HOST_WIDE_INT size = 0;
+ bool padding_p = true;
+ basic_block tbb = bb;
+ unsigned cond_branch_num = 0;
+ bool detect_tight_loop_p = false;
+
+ for (unsigned int i = 0; i != bb->loop_father->num_nodes;
+  i++, tbb = tbb->next_bb)
+   {
+ /* Only handle continuous cfg layout. */
+ if (bb->loop_father != tbb->loop_father)
+   {
+ padding_p = false;
+ break;
+   }
+
+ FOR_BB_INSNS (tbb, insn)
+   {
+ if (!NONDEBUG_INSN_P (insn))
+   continue;
+ size += ix86_min_insn_size (insn);
+
+ /* We don't know size of inline asm.
+Don't align loop for call.  */
+ if (asm_noperands (PATTERN (insn)) >= 0
+ || CALL_P (insn))
+   {
+ size = -1;
+ break;
+   }
+   }
+
+ if (size == -1 || size > ix86_cost->prefetch_block)
+   {
+ padding_p = false;
+ break;
+   }
+
+ FOR_EACH_EDGE (e, ei, tbb->succs)
+   {
+ /* It could be part of the loop.  */
+ if (e->dest == bb)
+   {
+ detect_tight_loop_p = true;
+ break;
+   }
+   }
+
+ if (de

  1   2   >