[PATCH] RISC-V: Adjusting the comments of the emit_vlmax_insn/emit_vlmax_insn_lra/emit_nonvlmax_insn functions

2023-09-21 Thread Lehua Ding
This patch adjusts the comments of the
emit_vlmax_insn/emit_vlmax_insn_lra/emit_nonvlmax_insn functions.
The purpose of the adjustment is to make it clear that "vlmax" here is not
necessarily VLMAX as defined in the RVV ISA, because these functions are used
by RVV modes (e.g. RVVM1SImode) as well as VLS modes (e.g. V16QI). For RVV
modes it means the same thing; for VLS modes it means setting vl to the
number of units of the mode. I only changed the comments because I couldn't
think of a better name. If you have a more suitable name, feel free to suggest it.

gcc/ChangeLog:

* config/riscv/riscv-v.cc (emit_vlmax_insn): Adjust comments.
(emit_vlmax_insn_lra): Adjust comments.
(emit_nonvlmax_insn): Adjust comments.

---
 gcc/config/riscv/riscv-v.cc | 23 +++
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 64a71a128d4..df4d2ac1b2b 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -347,9 +347,8 @@ private:
   expand_operand m_ops[MAX_OPERANDS];
 };

-/* Emit RVV insn which vl is VLMAX.
-   This function can only be used before LRA pass or
-   for VLS_AVL_IMM modes.  */
+/* Emit RVV insn which vl is the number of units of the vector mode.
+   This function can only be used before LRA pass or for VLS_AVL_IMM modes.  */
 void
 emit_vlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops)
 {
@@ -357,23 +356,23 @@ emit_vlmax_insn (unsigned icode, unsigned insn_flags, rtx 
*ops)
   e.emit_insn ((enum insn_code) icode, ops);
 }

-/* Emit RVV insn which vl is VL.  */
+/* Like emit_vlmax_insn but can be only used after LRA pass that can't create
+   pseudo register.  */
 void
-emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
+emit_vlmax_insn_lra (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
 {
-  insn_expander e (insn_flags, false);
+  gcc_assert (!can_create_pseudo_p ());
+
+  insn_expander e (insn_flags, true);
   e.set_vl (vl);
   e.emit_insn ((enum insn_code) icode, ops);
 }

-/* Emit RVV insn which vl is VL but the AVL_TYPE insn attr is VLMAX.
-   This function used after LRA pass that cann't create pseudo register.  */
+/* Emit RVV insn which vl is the VL argument.  */
 void
-emit_vlmax_insn_lra (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
+emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
 {
-  gcc_assert (!can_create_pseudo_p ());
-
-  insn_expander e (insn_flags, true);
+  insn_expander e (insn_flags, false);
   e.set_vl (vl);
   e.emit_insn ((enum insn_code) icode, ops);
 }
--
2.36.3



[PATCH 01/18] Initial support for -mevex512

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* common/config/i386/i386-common.cc
(OPTION_MASK_ISA2_EVEX512_SET): New.
(OPTION_MASK_ISA2_EVEX512_UNSET): Ditto.
(ix86_handle_option): Handle EVEX512.
* config/i386/i386-c.cc (ix86_target_macros_internal): Ditto.
* config/i386/i386-options.cc (isa2_opts): Ditto.
(ix86_valid_target_attribute_inner_p): Ditto.
(ix86_option_override_internal): Set EVEX512 target if it is not
explicitly set when AVX512 is enabled. Disable
AVX512{PF,ER,4VNNIW,4FAMPS} for -mno-evex512.
* config/i386/i386.opt: Add mevex512. Temporarily RejectNegative.
---
 gcc/common/config/i386/i386-common.cc | 15 +++
 gcc/config/i386/i386-c.cc |  2 ++
 gcc/config/i386/i386-options.cc   | 19 ++-
 gcc/config/i386/i386.opt  |  4 
 4 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index 95468b7c405..8cc59e08d06 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -123,6 +123,7 @@ along with GCC; see the file COPYING3.  If not see
 #define OPTION_MASK_ISA2_SM3_SET OPTION_MASK_ISA2_SM3
 #define OPTION_MASK_ISA2_SHA512_SET OPTION_MASK_ISA2_SHA512
 #define OPTION_MASK_ISA2_SM4_SET OPTION_MASK_ISA2_SM4
+#define OPTION_MASK_ISA2_EVEX512_SET OPTION_MASK_ISA2_EVEX512
 
 /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
as -msse4.2.  */
@@ -309,6 +310,7 @@ along with GCC; see the file COPYING3.  If not see
 #define OPTION_MASK_ISA2_SM3_UNSET OPTION_MASK_ISA2_SM3
 #define OPTION_MASK_ISA2_SHA512_UNSET OPTION_MASK_ISA2_SHA512
 #define OPTION_MASK_ISA2_SM4_UNSET OPTION_MASK_ISA2_SM4
+#define OPTION_MASK_ISA2_EVEX512_UNSET OPTION_MASK_ISA2_EVEX512
 
 /* SSE4 includes both SSE4.1 and SSE4.2.  -mno-sse4 should the same
as -mno-sse4.1. */
@@ -1341,6 +1343,19 @@ ix86_handle_option (struct gcc_options *opts,
}
   return true;
 
+case OPT_mevex512:
+  if (value)
+   {
+ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_EVEX512_SET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_EVEX512_SET;
+   }
+  else
+   {
+ opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_EVEX512_UNSET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_EVEX512_UNSET;
+   }
+  return true;
+
 case OPT_mfma:
   if (value)
{
diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc
index 47768fa0940..93154efa7ff 100644
--- a/gcc/config/i386/i386-c.cc
+++ b/gcc/config/i386/i386-c.cc
@@ -707,6 +707,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
 def_or_undef (parse_in, "__SHA512__");
   if (isa_flag2 & OPTION_MASK_ISA2_SM4)
 def_or_undef (parse_in, "__SM4__");
+  if (isa_flag2 & OPTION_MASK_ISA2_EVEX512)
+def_or_undef (parse_in, "__EVEX512__");
   if (TARGET_IAMCU)
 {
   def_or_undef (parse_in, "__iamcu");
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index e47f9ed5d5f..a1a7a92da9f 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -250,7 +250,8 @@ static struct ix86_target_opts isa2_opts[] =
   { "-mavxvnniint16",  OPTION_MASK_ISA2_AVXVNNIINT16 },
   { "-msm3",   OPTION_MASK_ISA2_SM3 },
   { "-msha512",OPTION_MASK_ISA2_SHA512 },
-  { "-msm4",OPTION_MASK_ISA2_SM4 }
+  { "-msm4",OPTION_MASK_ISA2_SM4 },
+  { "-mevex512",OPTION_MASK_ISA2_EVEX512 }
 };
 static struct ix86_target_opts isa_opts[] =
 {
@@ -1109,6 +1110,7 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree 
args, char *p_strings[],
 IX86_ATTR_ISA ("sm3", OPT_msm3),
 IX86_ATTR_ISA ("sha512", OPT_msha512),
 IX86_ATTR_ISA ("sm4", OPT_msm4),
+IX86_ATTR_ISA ("evex512", OPT_mevex512),
 
 /* enum options */
 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
@@ -2559,6 +2561,21 @@ ix86_option_override_internal (bool main_args_p,
   &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
   & ~opts->x_ix86_isa_flags_explicit);
 
+  /* Set EVEX512 target if it is not explicitly set
+ when AVX512 is enabled.  */
+  if (TARGET_AVX512F_P(opts->x_ix86_isa_flags)
+  && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_EVEX512))
+opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_EVEX512;
+
+  /* Disable AVX512{PF,ER,4VNNIW,4FAMPS} for -mno-evex512.  */
+  if (!TARGET_EVEX512_P(opts->x_ix86_isa_flags2))
+{
+  opts->x_ix86_isa_flags
+   &= ~(OPTION_MASK_ISA_AVX512PF | OPTION_MASK_ISA_AVX512ER);
+  opts->x_ix86_isa_flags2
+   &= ~(OPTION_MASK_ISA2_AVX5124FMAPS | OPTION_MASK_ISA2_AVX5124VNNIW);
+}
+
   /* Validate -mpreferred-stack-boundary= value or default it to
  PREFERRED_STACK_BOUNDARY_DEFAULT.  */
   ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
diff --g

[PATCH 00/18] Support -mevex512 for AVX512

2023-09-21 Thread Hu, Lin1
Hi all,

After the previous discussion, instead of supporting option -mavx10.1, we
will first introduce option -m[no-]evex512, which will enable/disable
512 bit registers and 64 bit mask registers.

It will not change the current option behavior since if AVX512F is
enabled with no evex512 option specified, it will automatically enable
512 bit register and 64 bit mask register.

The patches are organized as follows:

Patch 1 added initial support for option -mevex512.

Patch 2-6 refined current intrin file to push evex512 target for all
512 bit intrins. Those scalar intrins remained untouched.

Patch 7-11 added OPTION_MASK_ISA2_EVEX512 for all related builtins.

Patch 12 disabled zmm register, 512 bit libmvec call for no-evex512,
also requested evex512 for vectorization when using 512 bit register.

Patch 13-17 supported evex512 in related patterns.

Patch 18 added testcases for -mno-evex512 and allowed its usage.

The patches currently cause scan-asm failures for pr89229-{5,6,7}b.c since
we will emit scalar vmovss there. When trying to use x/ymm 16+ w/o
avx512vl but with avx512f+evex512, I suppose we could emit either scalar
or zmm instructions. This is quite a rare case on HW since there is no
HW w/o avx512vl but with avx512f, so I prefer not to add maintenance
effort here for a slight perf improvement. But it could be changed
back to the former behavior.

Discussion is welcome on all the patches.

Thx,
Haochen

Haochen Jiang (18):
  Initial support for -mevex512
  Push evex512 target for 512 bit intrins
  Push evex512 target for 512 bit intrins
  Push evex512 target for 512 bit intrins
  Push evex512 target for 512 bit intrins
  Push evex512 target for 512 bit intrins
  Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
  Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
  Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
  Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
  Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
  Disable zmm register and 512 bit libmvec call when !TARGET_EVEX512
  Support -mevex512 for AVX512F intrins
  Support -mevex512 for AVX512DQ intrins
  Support -mevex512 for AVX512BW intrins
  Support -mevex512 for AVX512{IFMA,VBMI,VNNI,BF16,VPOPCNTDQ,VBMI2,BITALG,VP2INTERSECT},VAES,GFNI,VPCLMULQDQ intrins
  Support -mevex512 for AVX512FP16 intrins
  Allow -mno-evex512 usage

 gcc/common/config/i386/i386-common.cc   |15 +
 gcc/config.gcc  |19 +-
 gcc/config/i386/avx5124fmapsintrin.h| 2 +-
 gcc/config/i386/avx5124vnniwintrin.h| 2 +-
 gcc/config/i386/avx512bf16intrin.h  |31 +-
 gcc/config/i386/avx512bitalgintrin.h|   155 +-
 gcc/config/i386/avx512bitalgvlintrin.h  |   180 +
 gcc/config/i386/avx512bwintrin.h|   291 +-
 gcc/config/i386/avx512dqintrin.h|  1840 +-
 gcc/config/i386/avx512erintrin.h| 2 +-
 gcc/config/i386/avx512fintrin.h | 19663 +-
 gcc/config/i386/avx512fp16intrin.h  |  8925 
 gcc/config/i386/avx512ifmaintrin.h  | 4 +-
 gcc/config/i386/avx512pfintrin.h| 2 +-
 gcc/config/i386/avx512vbmi2intrin.h | 4 +-
 gcc/config/i386/avx512vbmiintrin.h  | 4 +-
 gcc/config/i386/avx512vnniintrin.h  | 4 +-
 gcc/config/i386/avx512vp2intersectintrin.h  | 4 +-
 gcc/config/i386/avx512vpopcntdqintrin.h | 4 +-
 gcc/config/i386/gfniintrin.h|76 +-
 gcc/config/i386/i386-builtin.def|  1312 +-
 gcc/config/i386/i386-builtins.cc|96 +-
 gcc/config/i386/i386-c.cc   | 2 +
 gcc/config/i386/i386-expand.cc  |18 +-
 gcc/config/i386/i386-options.cc |33 +-
 gcc/config/i386/i386.cc |   168 +-
 gcc/config/i386/i386.h  | 7 +-
 gcc/config/i386/i386.md |   127 +-
 gcc/config/i386/i386.opt| 4 +
 gcc/config/i386/immintrin.h | 2 +
 gcc/config/i386/predicates.md   | 3 +-
 gcc/config/i386/sse.md  |   854 +-
 gcc/config/i386/vaesintrin.h| 4 +-
 gcc/config/i386/vpclmulqdqintrin.h  | 4 +-
 gcc/testsuite/gcc.target/i386/noevex512-1.c |13 +
 gcc/testsuite/gcc.target/i386/noevex512-2.c |13 +
 gcc/testsuite/gcc.target/i386/noevex512-3.c |13 +
 gcc/testsuite/gcc.target/i386/pr89229-5b.c  | 2 +-
 gcc/testsuite/gcc.target/i386/pr89229-6b.c  | 2 +-
 gcc/testsuite/gcc.target/i386/pr89229-7b.c  | 2 +-
 gcc/testsuite/gcc.target/i386/pr90096.c | 2 +-
 41 files changed, 17170 insertions(+), 16738 deletions(-)
 create mode 100644 gcc/config/i386/avx512bitalgvlintrin.h
 create mode 100644 gcc/testsuite/gcc.target/i386/noevex512-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/noevex512-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/noevex512-3.c

-- 
2.31.1



[PATCH 08/18] [PATCH 2/5] Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386-builtin.def (BDESC): Add
OPTION_MASK_ISA2_EVEX512.
---
 gcc/config/i386/i386-builtin.def | 94 
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 0cc526383db..7a0dec9bc8b 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -2408,37 +2408,37 @@ BDESC (OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_avx512vl_cmpv2df3_mask, "__builtin_
 BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_cmpv4sf3_mask, 
"__builtin_ia32_cmpps128_mask", IX86_BUILTIN_CMPPS128_MASK, UNKNOWN, (int) 
UQI_FTYPE_V4SF_V4SF_INT_UQI)
 
 /* AVX512DQ.  */
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_broadcastv16sf_mask, 
"__builtin_ia32_broadcastf32x2_512_mask", IX86_BUILTIN_BROADCASTF32x2_512, 
UNKNOWN, (int) V16SF_FTYPE_V4SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_broadcastv16si_mask, 
"__builtin_ia32_broadcasti32x2_512_mask", IX86_BUILTIN_BROADCASTI32x2_512, 
UNKNOWN, (int) V16SI_FTYPE_V4SI_V16SI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_broadcastv8df_mask_1, 
"__builtin_ia32_broadcastf64x2_512_mask", IX86_BUILTIN_BROADCASTF64X2_512, 
UNKNOWN, (int) V8DF_FTYPE_V2DF_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_broadcastv8di_mask_1, 
"__builtin_ia32_broadcasti64x2_512_mask", IX86_BUILTIN_BROADCASTI64X2_512, 
UNKNOWN, (int) V8DI_FTYPE_V2DI_V8DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_broadcastv16sf_mask_1, 
"__builtin_ia32_broadcastf32x8_512_mask", IX86_BUILTIN_BROADCASTF32X8_512, 
UNKNOWN, (int) V16SF_FTYPE_V8SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_broadcastv16si_mask_1, 
"__builtin_ia32_broadcasti32x8_512_mask", IX86_BUILTIN_BROADCASTI32X8_512, 
UNKNOWN, (int) V16SI_FTYPE_V8SI_V16SI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_vextractf64x2_mask, 
"__builtin_ia32_extractf64x2_512_mask", IX86_BUILTIN_EXTRACTF64X2_512, UNKNOWN, 
(int) V2DF_FTYPE_V8DF_INT_V2DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_vextractf32x8_mask, 
"__builtin_ia32_extractf32x8_mask", IX86_BUILTIN_EXTRACTF32X8, UNKNOWN, (int) 
V8SF_FTYPE_V16SF_INT_V8SF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_vextracti64x2_mask, 
"__builtin_ia32_extracti64x2_512_mask", IX86_BUILTIN_EXTRACTI64X2_512, UNKNOWN, 
(int) V2DI_FTYPE_V8DI_INT_V2DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_vextracti32x8_mask, 
"__builtin_ia32_extracti32x8_mask", IX86_BUILTIN_EXTRACTI32X8, UNKNOWN, (int) 
V8SI_FTYPE_V16SI_INT_V8SI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_reducepv8df_mask, 
"__builtin_ia32_reducepd512_mask", IX86_BUILTIN_REDUCEPD512_MASK, UNKNOWN, 
(int) V8DF_FTYPE_V8DF_INT_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_reducepv16sf_mask, 
"__builtin_ia32_reduceps512_mask", IX86_BUILTIN_REDUCEPS512_MASK, UNKNOWN, 
(int) V16SF_FTYPE_V16SF_INT_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_mulv8di3_mask, 
"__builtin_ia32_pmullq512_mask", IX86_BUILTIN_PMULLQ512, UNKNOWN, (int) 
V8DI_FTYPE_V8DI_V8DI_V8DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_xorv8df3_mask, 
"__builtin_ia32_xorpd512_mask", IX86_BUILTIN_XORPD512, UNKNOWN, (int) 
V8DF_FTYPE_V8DF_V8DF_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_xorv16sf3_mask, 
"__builtin_ia32_xorps512_mask", IX86_BUILTIN_XORPS512, UNKNOWN, (int) 
V16SF_FTYPE_V16SF_V16SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_iorv8df3_mask, 
"__builtin_ia32_orpd512_mask", IX86_BUILTIN_ORPD512, UNKNOWN, (int) 
V8DF_FTYPE_V8DF_V8DF_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_iorv16sf3_mask, 
"__builtin_ia32_orps512_mask", IX86_BUILTIN_ORPS512, UNKNOWN, (int) 
V16SF_FTYPE_V16SF_V16SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_andv8df3_mask, 
"__builtin_ia32_andpd512_mask", IX86_BUILTIN_ANDPD512, UNKNOWN, (int) 
V8DF_FTYPE_V8DF_V8DF_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_andv16sf3_mask, 
"__builtin_ia32_andps512_mask", IX86_BUILTIN_ANDPS512, UNKNOWN, (int) 
V16SF_FTYPE_V16SF_V16SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512f_andnotv8df3_mask, 
"__builtin_ia32_andnpd512_mask", IX86_BUILTIN_ANDNPD512, UNKNOWN, (int) 
V8DF_FTYPE_V8DF_V8DF_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512f_andnotv16sf3_mask, 
"__builtin_ia32_andnps512_mask", IX86_BUILTIN_ANDNPS512, UNKNOWN, (int) 
V16SF_FTYPE_V16SF_V16SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_vinsertf32x8_mask, 
"__builtin_ia32_insertf32x8_mask", IX86_BUILTIN_INSERTF32X8, UNKNOWN, (int) 
V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_vinserti32x8_mask, 
"__builtin_ia32_inserti32x8_mask", IX86_BUILTIN_INSERTI32X8, UNKNOWN, (int) 
V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR

[PATCH 16/18] Support -mevex512 for AVX512{IFMA, VBMI, VNNI, BF16, VPOPCNTDQ, VBMI2, BITALG, VP2INTERSECT}, VAES, GFNI, VPCLMULQDQ intrins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/sse.md (VI1_AVX512VL): Add TARGET_EVEX512.
(VI8_FVL): Ditto.
(VI1_AVX512F): Ditto.
(VI1_AVX512VNNI): Ditto.
(VI1_AVX512VL_F): Ditto.
(VI12_VI48F_AVX512VL): Ditto.
(*avx512f_permvar_truncv32hiv32qi_1): Ditto.
(sdot_prod): Ditto.
(VEC_PERM_AVX2): Ditto.
(VPERMI2): Ditto.
(VPERMI2I): Ditto.
(vpmadd52v8di): Ditto.
(usdot_prod): Ditto.
(vpdpbusd_v16si): Ditto.
(vpdpbusds_v16si): Ditto.
(vpdpwssd_v16si): Ditto.
(vpdpwssds_v16si): Ditto.
(VI48_AVX512VP2VL): Ditto.
(avx512vp2intersect_2intersectv16si): Ditto.
(VF_AVX512BF16VL): Ditto.
(VF1_AVX512_256): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr90096.c: Adjust error message.

Co-authored-by: Hu, Lin1 
---
 gcc/config/i386/sse.md  | 56 +
 gcc/testsuite/gcc.target/i386/pr90096.c |  2 +-
 2 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index e59f6bf4410..a5a95b9de66 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -298,7 +298,7 @@
(V32BF "TARGET_EVEX512") (V16BF "TARGET_AVX512VL") (V8BF 
"TARGET_AVX512VL")])
 
 (define_mode_iterator VI1_AVX512VL
-  [V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")])
+  [(V64QI "TARGET_EVEX512") (V16QI "TARGET_AVX512VL") (V32QI 
"TARGET_AVX512VL")])
 
 ;; All vector modes
 (define_mode_iterator V
@@ -531,7 +531,7 @@
   [(V8DI "TARGET_AVX512F && TARGET_EVEX512") (V4DI "TARGET_AVX") V2DI])
 
 (define_mode_iterator VI8_FVL
-  [(V8DI "TARGET_AVX512F") V4DI (V2DI "TARGET_AVX512VL")])
+  [(V8DI "TARGET_AVX512F && TARGET_EVEX512") V4DI (V2DI "TARGET_AVX512VL")])
 
 (define_mode_iterator VI8_AVX512VL
   [(V8DI "TARGET_EVEX512") (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")])
@@ -546,10 +546,10 @@
   [(V64QI "TARGET_AVX512BW && TARGET_EVEX512") (V32QI "TARGET_AVX2") V16QI])
 
 (define_mode_iterator VI1_AVX512F
-  [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI])
+  [(V64QI "TARGET_AVX512F && TARGET_EVEX512") (V32QI "TARGET_AVX") V16QI])
 
 (define_mode_iterator VI1_AVX512VNNI
-  [(V64QI "TARGET_AVX512VNNI") (V32QI "TARGET_AVX2") V16QI])
+  [(V64QI "TARGET_AVX512VNNI && TARGET_EVEX512") (V32QI "TARGET_AVX2") V16QI])
 
 (define_mode_iterator VI12_256_512_AVX512VL
   [(V64QI "TARGET_EVEX512") (V32QI "TARGET_AVX512VL")
@@ -599,7 +599,7 @@
V8DI ])
 
 (define_mode_iterator VI1_AVX512VL_F
-  [V32QI (V16QI "TARGET_AVX512VL") (V64QI "TARGET_AVX512F")])
+  [V32QI (V16QI "TARGET_AVX512VL") (V64QI "TARGET_AVX512F && TARGET_EVEX512")])
 
 (define_mode_iterator VI8_AVX2_AVX512BW
   [(V8DI "TARGET_AVX512BW && TARGET_EVEX512") (V4DI "TARGET_AVX2") V2DI])
@@ -923,8 +923,8 @@
(V4DI "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL")
(V4SI "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")
(V2DI "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")
-   V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")
-   V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")])
+   (V64QI "TARGET_EVEX512") (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")
+   (V32HI "TARGET_EVEX512") (V16HI "TARGET_AVX512VL") (V8HI 
"TARGET_AVX512VL")])
 
 (define_mode_iterator VI48F_256 [V8SI V8SF V4DI V4DF])
 
@@ -14217,7 +14217,7 @@
 (const_int 26) (const_int 27)
 (const_int 28) (const_int 29)
 (const_int 30) (const_int 31)])))]
-  "TARGET_AVX512VBMI && ix86_pre_reload_split ()"
+  "TARGET_AVX512VBMI && TARGET_EVEX512 && ix86_pre_reload_split ()"
   "#"
   "&& 1"
   [(set (match_dup 0)
@@ -16040,7 +16040,7 @@
   "TARGET_SSE2"
 {
   /* Try with vnni instructions.  */
-  if (( == 64 && TARGET_AVX512VNNI)
+  if (( == 64 && TARGET_AVX512VNNI && TARGET_EVEX512)
   || ( < 64
  && ((TARGET_AVX512VNNI && TARGET_AVX512VL) || TARGET_AVXVNNI)))
 {
@@ -17320,7 +17320,8 @@
(V8DF "TARGET_AVX512F && TARGET_EVEX512")
(V16SI "TARGET_AVX512F && TARGET_EVEX512")
(V8DI "TARGET_AVX512F && TARGET_EVEX512")
-   (V32HI "TARGET_AVX512BW && TARGET_EVEX512") (V64QI "TARGET_AVX512VBMI")
+   (V32HI "TARGET_AVX512BW && TARGET_EVEX512")
+   (V64QI "TARGET_AVX512VBMI && TARGET_EVEX512")
(V32HF "TARGET_AVX512FP16")])
 
 (define_expand "vec_perm"
@@ -26983,7 +26984,8 @@
(V32HI "TARGET_AVX512BW && TARGET_EVEX512")
(V16HI "TARGET_AVX512BW && TARGET_AVX512VL")
(V8HI "TARGET_AVX512BW && TARGET_AVX512VL")
-   (V64QI "TARGET_AVX512VBMI") (V32QI "TARGET_AVX512VBMI && TARGET_AVX512VL")
+   (V64QI "TARGET_AVX512VBMI && TARGET_EVEX512")
+   (V32QI "TARGET_AVX512VBMI && TARGET_AVX512VL")
(V16QI "TARGET_AVX512VBMI && TARGET_AVX512VL")])
 
 (define_mode_iterator VPERMI2I
@@ -26993,7 +26995,8 @@
(V32HI "TARGET_AVX512BW && TARGET_EVEX512")
(V16HI "TARGET_AVX512BW && TARGET_AVX512VL")
(V8HI "TARGET_AVX512BW && TARGET_AVX512VL")
-   (V64QI "TAR

[PATCH 09/18] [PATCH 3/5] Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386-builtin.def (BDESC): Add
OPTION_MASK_ISA2_EVEX512.
---
 gcc/config/i386/i386-builtin.def | 226 +++
 1 file changed, 113 insertions(+), 113 deletions(-)

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 7a0dec9bc8b..167d530a537 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -293,10 +293,10 @@ BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_CMPCCXADD, 
CODE_FOR_cmpccxadd_si,
 BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_CMPCCXADD, 
CODE_FOR_cmpccxadd_di, "__builtin_ia32_cmpccxadd64", IX86_BUILTIN_CMPCCXADD64, 
UNKNOWN, (int) LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT)
 
 /* AVX512BW */
-BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_loadv32hi_mask, 
"__builtin_ia32_loaddquhi512_mask", IX86_BUILTIN_LOADDQUHI512_MASK, UNKNOWN, 
(int) V32HI_FTYPE_PCSHORT_V32HI_USI)
-BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_loadv64qi_mask, 
"__builtin_ia32_loaddquqi512_mask", IX86_BUILTIN_LOADDQUQI512_MASK, UNKNOWN, 
(int) V64QI_FTYPE_PCCHAR_V64QI_UDI)
-BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_storev32hi_mask, 
"__builtin_ia32_storedquhi512_mask", IX86_BUILTIN_STOREDQUHI512_MASK, UNKNOWN, 
(int) VOID_FTYPE_PSHORT_V32HI_USI)
-BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_storev64qi_mask, 
"__builtin_ia32_storedquqi512_mask", IX86_BUILTIN_STOREDQUQI512_MASK, UNKNOWN, 
(int) VOID_FTYPE_PCHAR_V64QI_UDI)
+BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_loadv32hi_mask, "__builtin_ia32_loaddquhi512_mask", 
IX86_BUILTIN_LOADDQUHI512_MASK, UNKNOWN, (int) V32HI_FTYPE_PCSHORT_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_loadv64qi_mask, "__builtin_ia32_loaddquqi512_mask", 
IX86_BUILTIN_LOADDQUQI512_MASK, UNKNOWN, (int) V64QI_FTYPE_PCCHAR_V64QI_UDI)
+BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_storev32hi_mask, "__builtin_ia32_storedquhi512_mask", 
IX86_BUILTIN_STOREDQUHI512_MASK, UNKNOWN, (int) VOID_FTYPE_PSHORT_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_storev64qi_mask, "__builtin_ia32_storedquqi512_mask", 
IX86_BUILTIN_STOREDQUQI512_MASK, UNKNOWN, (int) VOID_FTYPE_PCHAR_V64QI_UDI)
 
 /* AVX512VP2INTERSECT */
 BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, 
"__builtin_ia32_2intersectd512", IX86_BUILTIN_2INTERSECTD512, UNKNOWN, (int) 
VOID_FTYPE_PUHI_PUHI_V16SI_V16SI)
@@ -407,9 +407,9 @@ BDESC (OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, 
0, CODE_FOR_avx512vl
 BDESC (OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_avx512vl_ss_truncatev16hiv16qi2_mask_store, 
"__builtin_ia32_pmovswb256mem_mask", IX86_BUILTIN_PMOVSWB256_MEM, UNKNOWN, 
(int) VOID_FTYPE_PV16QI_V16HI_UHI)
 BDESC (OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_avx512vl_us_truncatev8hiv8qi2_mask_store_2, 
"__builtin_ia32_pmovuswb128mem_mask", IX86_BUILTIN_PMOVUSWB128_MEM, UNKNOWN, 
(int) VOID_FTYPE_PUDI_V8HI_UQI)
 BDESC (OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_avx512vl_us_truncatev16hiv16qi2_mask_store, 
"__builtin_ia32_pmovuswb256mem_mask", IX86_BUILTIN_PMOVUSWB256_MEM, UNKNOWN, 
(int) VOID_FTYPE_PV16QI_V16HI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512BW, 0, 
CODE_FOR_avx512bw_us_truncatev32hiv32qi2_mask_store, 
"__builtin_ia32_pmovuswb512mem_mask", IX86_BUILTIN_PMOVUSWB512_MEM, UNKNOWN, 
(int) VOID_FTYPE_PV32QI_V32HI_USI)
-BDESC (OPTION_MASK_ISA_AVX512BW, 0, 
CODE_FOR_avx512bw_ss_truncatev32hiv32qi2_mask_store, 
"__builtin_ia32_pmovswb512mem_mask", IX86_BUILTIN_PMOVSWB512_MEM, UNKNOWN, 
(int) VOID_FTYPE_PV32QI_V32HI_USI)
-BDESC (OPTION_MASK_ISA_AVX512BW, 0, 
CODE_FOR_avx512bw_truncatev32hiv32qi2_mask_store, 
"__builtin_ia32_pmovwb512mem_mask", IX86_BUILTIN_PMOVWB512_MEM, UNKNOWN, (int) 
VOID_FTYPE_PV32QI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_us_truncatev32hiv32qi2_mask_store, 
"__builtin_ia32_pmovuswb512mem_mask", IX86_BUILTIN_PMOVUSWB512_MEM, UNKNOWN, 
(int) VOID_FTYPE_PV32QI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_ss_truncatev32hiv32qi2_mask_store, 
"__builtin_ia32_pmovswb512mem_mask", IX86_BUILTIN_PMOVSWB512_MEM, UNKNOWN, 
(int) VOID_FTYPE_PV32QI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_truncatev32hiv32qi2_mask_store, 
"__builtin_ia32_pmovwb512mem_mask", IX86_BUILTIN_PMOVWB512_MEM, UNKNOWN, (int) 
VOID_FTYPE_PV32QI_V32HI_USI)
 
 /* AVX512FP16 */
 BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_loadhf_mask, 
"__builtin_ia32_loadsh_mask", IX86_BUILTIN_LOADSH_MASK, UNKNOWN, (int) 
V8HF_FTYPE_PCFLOAT16_V8HF_UQI)
@@ -1590,61 +1590,61 @@ BDESC (OPTION_MASK_ISA_AVX512F, 
OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512f_round
 BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_kashiftq

[PATCH 14/18] Support -mevex512 for AVX512DQ intrins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_expand_sse2_mulvxdi3):
Add TARGET_EVEX512 for 512 bit usage.
* config/i386/i386.cc (standard_sse_constant_opcode): Ditto.
* config/i386/sse.md (VF1_VF2_AVX512DQ): Ditto.
(VF1_128_256VL): Ditto.
(VF2_AVX512VL): Ditto.
(VI8_256_512): Ditto.
(fixuns_trunc2):
Ditto.
(AVX512_VEC): Ditto.
(AVX512_VEC_2): Ditto.
(VI4F_BRCST32x2): Ditto.
(VI8F_BRCST64x2): Ditto.
---
 gcc/config/i386/i386-expand.cc |  2 +-
 gcc/config/i386/i386.cc| 22 --
 gcc/config/i386/sse.md | 24 ++--
 3 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 0705e08d38c..063561e1265 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -24008,7 +24008,7 @@ ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
   machine_mode mode = GET_MODE (op0);
   rtx t1, t2, t3, t4, t5, t6;
 
-  if (TARGET_AVX512DQ && mode == V8DImode)
+  if (TARGET_AVX512DQ && TARGET_EVEX512 && mode == V8DImode)
 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
   else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 635dd85e764..589b29a324d 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -5332,9 +5332,14 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx 
*operands)
  if (EXT_REX_SSE_REG_P (operands[0]))
{
  if (TARGET_AVX512DQ)
-   return (TARGET_AVX512VL
-   ? "vxorpd\t%x0, %x0, %x0"
-   : "vxorpd\t%g0, %g0, %g0");
+   {
+ if (TARGET_AVX512VL)
+   return "vxorpd\t%x0, %x0, %x0";
+ else if (TARGET_EVEX512)
+   return "vxorpd\t%g0, %g0, %g0";
+ else
+   gcc_unreachable ();
+   }
  else
{
  if (TARGET_AVX512VL)
@@ -5356,9 +5361,14 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx 
*operands)
  if (EXT_REX_SSE_REG_P (operands[0]))
{
  if (TARGET_AVX512DQ)
-   return (TARGET_AVX512VL
-   ? "vxorps\t%x0, %x0, %x0"
-   : "vxorps\t%g0, %g0, %g0");
+   {
+ if (TARGET_AVX512VL)
+   return "vxorps\t%x0, %x0, %x0";
+ else if (TARGET_EVEX512)
+   return "vxorps\t%g0, %g0, %g0";
+ else
+   gcc_unreachable ();
+   }
  else
{
  if (TARGET_AVX512VL)
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 8d1b75b43e0..a8f93ceddc5 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -350,7 +350,8 @@
 
 (define_mode_iterator VF1_VF2_AVX512DQ
   [(V16SF "TARGET_AVX512F && TARGET_EVEX512") (V8SF "TARGET_AVX") V4SF
-   (V8DF "TARGET_AVX512DQ") (V4DF "TARGET_AVX512DQ && TARGET_AVX512VL")
+   (V8DF "TARGET_AVX512DQ && TARGET_EVEX512")
+   (V4DF "TARGET_AVX512DQ && TARGET_AVX512VL")
(V2DF "TARGET_AVX512DQ && TARGET_AVX512VL")])
 
 (define_mode_iterator VFH
@@ -392,7 +393,7 @@
   [(V8SF "TARGET_AVX") V4SF])
 
 (define_mode_iterator VF1_128_256VL
-  [V8SF (V4SF "TARGET_AVX512VL")])
+  [(V8SF "TARGET_EVEX512") (V4SF "TARGET_AVX512VL")])
 
 ;; All DFmode vector float modes
 (define_mode_iterator VF2
@@ -467,7 +468,7 @@
(V8DF "TARGET_EVEX512") (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")])
 
 (define_mode_iterator VF2_AVX512VL
-  [V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")])
+  [(V8DF "TARGET_EVEX512") (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")])
 
 (define_mode_iterator VF1_AVX512VL
   [(V16SF "TARGET_EVEX512") (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")])
@@ -534,7 +535,7 @@
   [(V8DI "TARGET_EVEX512") (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")])
 
 (define_mode_iterator VI8_256_512
-  [V8DI (V4DI "TARGET_AVX512VL")])
+  [(V8DI "TARGET_EVEX512") (V4DI "TARGET_AVX512VL")])
 
 (define_mode_iterator VI1_AVX2
   [(V32QI "TARGET_AVX2") V16QI])
@@ -9075,7 +9076,7 @@
 (define_insn "fixuns_trunc2"
   [(set (match_operand: 0 "register_operand" "=v")
(unsigned_fix:
- (match_operand:VF1_128_256VL 1 "nonimmediate_operand" "vm")))]
+ (match_operand:VF1_128_256 1 "nonimmediate_operand" "vm")))]
   "TARGET_AVX512VL"
   "vcvttps2udq\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
@@ -11466,7 +11467,8 @@
(V8SF "32x4") (V8SI "32x4") (V4DF "64x2") (V4DI "64x2")])
 
 (define_mode_iterator AVX512_VEC
-  [(V8DF "TARGET_AVX512DQ") (V8DI "TARGET_AVX512DQ")
+  [(V8DF "TARGET_AVX512DQ && TARGET_EVEX512")
+   (V8DI "TARGET_AVX512DQ && TARGET

[PATCH 18/18] Allow -mno-evex512 usage

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386.opt: Allow -mno-evex512.

gcc/testsuite/ChangeLog:

* gcc.target/i386/noevex512-1.c: New test.
* gcc.target/i386/noevex512-2.c: Ditto.
* gcc.target/i386/noevex512-3.c: Ditto.
---
 gcc/config/i386/i386.opt|  2 +-
 gcc/testsuite/gcc.target/i386/noevex512-1.c | 13 +
 gcc/testsuite/gcc.target/i386/noevex512-2.c | 13 +
 gcc/testsuite/gcc.target/i386/noevex512-3.c | 13 +
 4 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/noevex512-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/noevex512-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/noevex512-3.c

diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 6d8601b1f75..34fc167af82 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1312,5 +1312,5 @@ Target Alias(mtune-ctrl=, use_scatter, ^use_scatter)
 Enable vectorization for scatter instruction.
 
 mevex512
-Target RejectNegative Mask(ISA2_EVEX512) Var(ix86_isa_flags2) Save
+Target Mask(ISA2_EVEX512) Var(ix86_isa_flags2) Save
 Support 512 bit vector built-in functions and code generation.
diff --git a/gcc/testsuite/gcc.target/i386/noevex512-1.c 
b/gcc/testsuite/gcc.target/i386/noevex512-1.c
new file mode 100644
index 000..7fd45f15be6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/noevex512-1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -march=x86-64 -mavx512f -mno-evex512 -Wno-psabi" } */
+/* { dg-final { scan-assembler-not ".%zmm" } } */
+
+typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));
+
+__m512d
+foo ()
+{
+  __m512d a, b;
+  a = a + b;
+  return a;
+}
diff --git a/gcc/testsuite/gcc.target/i386/noevex512-2.c 
b/gcc/testsuite/gcc.target/i386/noevex512-2.c
new file mode 100644
index 000..1c206e385d0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/noevex512-2.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mavx512bw -mno-evex512" } */
+
+#include <immintrin.h>
+
+long long
+foo (long long c)
+{
+  register long long a __asm ("k7") = c;
+  long long b = foo (a);
+  asm volatile ("" : "+k" (b)); /* { dg-error "inconsistent operand constraints in an 'asm'" } */
+  return b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/noevex512-3.c 
b/gcc/testsuite/gcc.target/i386/noevex512-3.c
new file mode 100644
index 000..10e00c2d61c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/noevex512-3.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -Wno-psabi -mavx512f" } */
+/* { dg-final { scan-assembler-not ".%zmm" } } */
+
+typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));
+
+__attribute__ ((target ("no-evex512"))) __m512d
+foo ()
+{
+  __m512d a, b;
+  a = a + b;
+  return a;
+}
-- 
2.31.1



[PATCH 03/18] [PATCH 2/5] Push evex512 target for 512 bit intrins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/avx512dqintrin.h: Add evex512 target for 512 bit
intrins.
---
 gcc/config/i386/avx512dqintrin.h | 1840 +++---
 1 file changed, 926 insertions(+), 914 deletions(-)

diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h
index 93900a0b5c7..b6a1d499e25 100644
--- a/gcc/config/i386/avx512dqintrin.h
+++ b/gcc/config/i386/avx512dqintrin.h
@@ -184,1275 +184,1426 @@ _kandn_mask8 (__mmask8 __A, __mmask8 __B)
   return (__mmask8) __builtin_ia32_kandnqi ((__mmask8) __A, (__mmask8) __B);
 }
 
-extern __inline __m512d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_broadcast_f64x2 (__m128d __A)
-{
-  return (__m512d)
-__builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
-_mm512_undefined_pd (),
-(__mmask8) -1);
-}
-
-extern __inline __m512d
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A)
+_kshiftli_mask8 (__mmask8 __A, unsigned int __B)
 {
-  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df)
-  __A,
-  (__v8df)
-  __O, __M);
+  return (__mmask8) __builtin_ia32_kshiftliqi ((__mmask8) __A, (__mmask8) __B);
 }
 
-extern __inline __m512d
+extern __inline __mmask8
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
+_kshiftri_mask8 (__mmask8 __A, unsigned int __B)
 {
-  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df)
-  __A,
-  (__v8df)
-  _mm512_setzero_ps (),
-  __M);
+  return (__mmask8) __builtin_ia32_kshiftriqi ((__mmask8) __A, (__mmask8) __B);
 }
 
-extern __inline __m512i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_broadcast_i64x2 (__m128i __A)
+_mm_reduce_sd (__m128d __A, __m128d __B, int __C)
 {
-  return (__m512i)
-__builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
-_mm512_undefined_epi32 (),
+  return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
+(__v2df) __B, __C,
+(__v2df) _mm_setzero_pd (),
 (__mmask8) -1);
 }
 
-extern __inline __m512i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A)
+_mm_reduce_round_sd (__m128d __A, __m128d __B, int __C, const int __R)
 {
-  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di)
-  __A,
-  (__v8di)
-  __O, __M);
+  return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A,
+  (__v2df) __B, __C,
+  (__v2df)
+  _mm_setzero_pd (),
+  (__mmask8) -1, __R);
 }
 
-extern __inline __m512i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
+_mm_mask_reduce_sd (__m128d __W,  __mmask8 __U, __m128d __A,
+   __m128d __B, int __C)
 {
-  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di)
-  __A,
-  (__v8di)
-  _mm512_setzero_si512 
(),
-  __M);
+  return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
+(__v2df) __B, __C,
+(__v2df) __W,
+(__mmask8) __U);
 }
 
-extern __inline __m512
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_broadcast_f32x2 (__m128 __A)
+_mm_mask_reduce_round_sd (__m128d __W,  __mmask8 __U, __m128d __A,
+ __m128d __B, int __C, const int __R)
 {
-  return (_

[PATCH 17/18] Support -mevex512 for AVX512FP16 intrins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/sse.md (V48H_AVX512VL): Add TARGET_EVEX512.
(VFH): Ditto.
(VF2H): Ditto.
(VFH_AVX512VL): Ditto.
(VHFBF): Ditto.
(VHF_AVX512VL): Ditto.
(VI2H_AVX512VL): Ditto.
(VI2F_256_512): Ditto.
(VF48_I1248): Remove unused iterator.
(VF48H_AVX512VL): Add TARGET_EVEX512.
(VF_AVX512): Remove unused iterator.
(REDUC_PLUS_MODE): Add TARGET_EVEX512.
(REDUC_SMINMAX_MODE): Ditto.
(FMAMODEM): Ditto.
(VFH_SF_AVX512VL): Ditto.
(VEC_PERM_AVX2): Ditto.

Co-authored-by: Hu, Lin1 
---
 gcc/config/i386/sse.md | 44 --
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a5a95b9de66..25d53e15dce 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -280,7 +280,7 @@
 (define_mode_iterator V48H_AVX512VL
   [(V16SI "TARGET_EVEX512") (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
(V8DI "TARGET_EVEX512") (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")
-   (V32HF "TARGET_AVX512FP16")
+   (V32HF "TARGET_AVX512FP16 && TARGET_EVEX512")
(V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V16SF "TARGET_EVEX512") (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")
@@ -355,7 +355,7 @@
(V2DF "TARGET_AVX512DQ && TARGET_AVX512VL")])
 
 (define_mode_iterator VFH
-  [(V32HF "TARGET_AVX512FP16")
+  [(V32HF "TARGET_AVX512FP16 && TARGET_EVEX512")
(V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V16SF "TARGET_AVX512F && TARGET_EVEX512") (V8SF "TARGET_AVX") V4SF
@@ -401,7 +401,7 @@
 
 ;; All DFmode & HFmode vector float modes
 (define_mode_iterator VF2H
-  [(V32HF "TARGET_AVX512FP16")
+  [(V32HF "TARGET_AVX512FP16 && TARGET_EVEX512")
(V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V8DF "TARGET_AVX512F && TARGET_EVEX512") (V4DF "TARGET_AVX") V2DF])
@@ -463,7 +463,7 @@
   [(V16SF "TARGET_AVX512ER") (V8SF "TARGET_AVX") V4SF])
 
 (define_mode_iterator VFH_AVX512VL
-  [(V32HF "TARGET_AVX512FP16")
+  [(V32HF "TARGET_AVX512FP16 && TARGET_EVEX512")
(V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V16SF "TARGET_EVEX512") (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")
@@ -475,12 +475,14 @@
 (define_mode_iterator VF1_AVX512VL
   [(V16SF "TARGET_EVEX512") (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")])
 
-(define_mode_iterator VHFBF [V32HF V16HF V8HF V32BF V16BF V8BF])
+(define_mode_iterator VHFBF
+  [(V32HF "TARGET_EVEX512") V16HF V8HF
+   (V32BF "TARGET_EVEX512") V16BF V8BF])
 (define_mode_iterator VHFBF_256 [V16HF V16BF])
 (define_mode_iterator VHFBF_128 [V8HF V8BF])
 
 (define_mode_iterator VHF_AVX512VL
-  [V32HF (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")])
+  [(V32HF "TARGET_EVEX512") (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")])
 
 (define_mode_iterator VHFBF_AVX512VL
   [(V32HF "TARGET_EVEX512") (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")
@@ -594,9 +596,9 @@
(V8BF "TARGET_AVX512VL") (V16BF "TARGET_AVX512VL") (V32BF 
"TARGET_EVEX512")])
 
 (define_mode_iterator VI2H_AVX512VL
-  [(V8HI "TARGET_AVX512VL") (V16HI "TARGET_AVX512VL") V32HI
-   (V8SI "TARGET_AVX512VL") V16SI
-   V8DI ])
+  [(V8HI "TARGET_AVX512VL") (V16HI "TARGET_AVX512VL") (V32HI "TARGET_EVEX512")
+   (V8SI "TARGET_AVX512VL") (V16SI "TARGET_EVEX512")
+   (V8DI "TARGET_EVEX512")])
 
 (define_mode_iterator VI1_AVX512VL_F
   [V32QI (V16QI "TARGET_AVX512VL") (V64QI "TARGET_AVX512F && TARGET_EVEX512")])
@@ -883,7 +885,10 @@
(V32BF "TARGET_AVX512BW && TARGET_EVEX512")])
 
 ;; Int-float size matches
-(define_mode_iterator VI2F_256_512 [V16HI V32HI V16HF V32HF V16BF V32BF])
+(define_mode_iterator VI2F_256_512
+  [V16HI (V32HI "TARGET_EVEX512")
+   V16HF (V32HF "TARGET_EVEX512")
+   V16BF (V32BF "TARGET_EVEX512")])
 (define_mode_iterator VI4F_128 [V4SI V4SF])
 (define_mode_iterator VI8F_128 [V2DI V2DF])
 (define_mode_iterator VI4F_256 [V8SI V8SF])
@@ -899,10 +904,8 @@
   (V8DI "TARGET_AVX512F && TARGET_EVEX512")
   (V8DF "TARGET_AVX512F && TARGET_EVEX512")
   (V4DI "TARGET_AVX512VL") (V4DF  "TARGET_AVX512VL")])
-(define_mode_iterator VF48_I1248
-  [V16SI V16SF V8DI V8DF V32HI V64QI])
 (define_mode_iterator VF48H_AVX512VL
-  [V8DF V16SF (V8SF "TARGET_AVX512VL")])
+  [(V8DF "TARGET_EVEX512") (V16SF "TARGET_EVEX512") (V8SF "TARGET_AVX512VL")])
 
 (define_mode_iterator VF48_128
   [V2DF V4SF])
@@ -928,11 +931,6 @@
 
 (define_mode_iterator VI48F_256 [V8SI V8SF V4DI V4DF])
 
-(define_mode_iterator VF_AVX512
-  [(V4SF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")
-   (V8SF "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL")
-   V16SF V8DF])
-
 (define_mode_iterator V8_128 [V8HI V8HF V8BF])
 (define_mode_iterator V16_256 [V16HI V16HF V16BF])
 (define_mode_iterator 

[PATCH 07/18] [PATCH 1/5] Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386-builtin.def (BDESC): Add
OPTION_MASK_ISA2_EVEX512.
* config/i386/i386-builtins.cc
(ix86_init_mmx_sse_builtins): Ditto.
---
 gcc/config/i386/i386-builtin.def | 648 +++
 gcc/config/i386/i386-builtins.cc |  72 ++--
 2 files changed, 372 insertions(+), 348 deletions(-)

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 8738b3b6a8a..0cc526383db 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -200,53 +200,53 @@ BDESC (OPTION_MASK_ISA_AVX2, 0, 
CODE_FOR_avx2_maskstored256, "__builtin_ia32_mas
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_maskstoreq256, 
"__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) 
VOID_FTYPE_PV4DI_V4DI_V4DI)
 
 /* AVX512F */
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressstorev16sf_mask, 
"__builtin_ia32_compressstoresf512_mask", IX86_BUILTIN_COMPRESSPSSTORE512, 
UNKNOWN, (int) VOID_FTYPE_PV16SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressstorev16si_mask, 
"__builtin_ia32_compressstoresi512_mask", IX86_BUILTIN_PCOMPRESSDSTORE512, 
UNKNOWN, (int) VOID_FTYPE_PV16SI_V16SI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressstorev8df_mask, 
"__builtin_ia32_compressstoredf512_mask", IX86_BUILTIN_COMPRESSPDSTORE512, 
UNKNOWN, (int) VOID_FTYPE_PV8DF_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressstorev8di_mask, 
"__builtin_ia32_compressstoredi512_mask", IX86_BUILTIN_PCOMPRESSQSTORE512, 
UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv16sf_mask, 
"__builtin_ia32_expandloadsf512_mask", IX86_BUILTIN_EXPANDPSLOAD512, UNKNOWN, 
(int) V16SF_FTYPE_PCV16SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv16sf_maskz, 
"__builtin_ia32_expandloadsf512_maskz", IX86_BUILTIN_EXPANDPSLOAD512Z, UNKNOWN, 
(int) V16SF_FTYPE_PCV16SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv16si_mask, 
"__builtin_ia32_expandloadsi512_mask", IX86_BUILTIN_PEXPANDDLOAD512, UNKNOWN, 
(int) V16SI_FTYPE_PCV16SI_V16SI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv16si_maskz, 
"__builtin_ia32_expandloadsi512_maskz", IX86_BUILTIN_PEXPANDDLOAD512Z, UNKNOWN, 
(int) V16SI_FTYPE_PCV16SI_V16SI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv8df_mask, 
"__builtin_ia32_expandloaddf512_mask", IX86_BUILTIN_EXPANDPDLOAD512, UNKNOWN, 
(int) V8DF_FTYPE_PCV8DF_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv8df_maskz, 
"__builtin_ia32_expandloaddf512_maskz", IX86_BUILTIN_EXPANDPDLOAD512Z, UNKNOWN, 
(int) V8DF_FTYPE_PCV8DF_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv8di_mask, 
"__builtin_ia32_expandloaddi512_mask", IX86_BUILTIN_PEXPANDQLOAD512, UNKNOWN, 
(int) V8DI_FTYPE_PCV8DI_V8DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv8di_maskz, 
"__builtin_ia32_expandloaddi512_maskz", IX86_BUILTIN_PEXPANDQLOAD512Z, UNKNOWN, 
(int) V8DI_FTYPE_PCV8DI_V8DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv16si_mask, 
"__builtin_ia32_loaddqusi512_mask", IX86_BUILTIN_LOADDQUSI512, UNKNOWN, (int) 
V16SI_FTYPE_PCINT_V16SI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv8di_mask, 
"__builtin_ia32_loaddqudi512_mask", IX86_BUILTIN_LOADDQUDI512, UNKNOWN, (int) 
V8DI_FTYPE_PCINT64_V8DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv8df_mask, 
"__builtin_ia32_loadupd512_mask", IX86_BUILTIN_LOADUPD512, UNKNOWN, (int) 
V8DF_FTYPE_PCDOUBLE_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv16sf_mask, 
"__builtin_ia32_loadups512_mask", IX86_BUILTIN_LOADUPS512, UNKNOWN, (int) 
V16SF_FTYPE_PCFLOAT_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv16sf_mask, 
"__builtin_ia32_loadaps512_mask", IX86_BUILTIN_LOADAPS512, UNKNOWN, (int) 
V16SF_FTYPE_PCV16SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv16si_mask, 
"__builtin_ia32_movdqa32load512_mask", IX86_BUILTIN_MOVDQA32LOAD512, UNKNOWN, 
(int) V16SI_FTYPE_PCV16SI_V16SI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv8df_mask, 
"__builtin_ia32_loadapd512_mask", IX86_BUILTIN_LOADAPD512, UNKNOWN, (int) 
V8DF_FTYPE_PCV8DF_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv8di_mask, 
"__builtin_ia32_movdqa64load512_mask", IX86_BUILTIN_MOVDQA64LOAD512, UNKNOWN, 
(int) V8DI_FTYPE_PCV8DI_V8DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movntv16sf, 
"__builtin_ia32_movntps512", IX86_BUILTIN_MOVNTPS512, UNKNOWN, (int) 
VOID_FTYPE_PFLOAT_V16SF)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movntv8df, 
"__builtin_ia32_movntpd512", IX86_BUILTIN_MOVNTPD512, UNKNOWN, (int) 
VOID_FTYPE_PDOUBLE_V8DF)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movntv8di, 
"__builtin_ia32_movntdq512", IX86_BUILTIN_MOVNTDQ512, UNKNOWN, (i

[PATCH 05/18] [PATCH 4/5] Push evex512 target for 512 bit intrins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config.gcc: Add avx512bitalgvlintrin.h.
* config/i386/avx5124fmapsintrin.h: Add evex512 target for 512 bit
intrins.
* config/i386/avx5124vnniwintrin.h: Ditto.
* config/i386/avx512bf16intrin.h: Ditto.
* config/i386/avx512bitalgintrin.h: Add evex512 target for 512 bit
intrins. Split 128/256 bit intrins to avx512bitalgvlintrin.h.
* config/i386/avx512erintrin.h: Add evex512 target for 512 bit
intrins.
* config/i386/avx512ifmaintrin.h: Ditto.
* config/i386/avx512pfintrin.h: Ditto.
* config/i386/avx512vbmi2intrin.h: Ditto.
* config/i386/avx512vbmiintrin.h: Ditto.
* config/i386/avx512vnniintrin.h: Ditto.
* config/i386/avx512vp2intersectintrin.h: Ditto.
* config/i386/avx512vpopcntdqintrin.h: Ditto.
* config/i386/gfniintrin.h: Ditto.
* config/i386/immintrin.h: Add avx512bitalgvlintrin.h.
* config/i386/vaesintrin.h: Add evex512 target for 512 bit intrins.
* config/i386/vpclmulqdqintrin.h: Ditto.
* config/i386/avx512bitalgvlintrin.h: New.
---
 gcc/config.gcc |  19 +--
 gcc/config/i386/avx5124fmapsintrin.h   |   2 +-
 gcc/config/i386/avx5124vnniwintrin.h   |   2 +-
 gcc/config/i386/avx512bf16intrin.h |  31 ++--
 gcc/config/i386/avx512bitalgintrin.h   | 155 +-
 gcc/config/i386/avx512bitalgvlintrin.h | 180 +
 gcc/config/i386/avx512erintrin.h   |   2 +-
 gcc/config/i386/avx512ifmaintrin.h |   4 +-
 gcc/config/i386/avx512pfintrin.h   |   2 +-
 gcc/config/i386/avx512vbmi2intrin.h|   4 +-
 gcc/config/i386/avx512vbmiintrin.h |   4 +-
 gcc/config/i386/avx512vnniintrin.h |   4 +-
 gcc/config/i386/avx512vp2intersectintrin.h |   4 +-
 gcc/config/i386/avx512vpopcntdqintrin.h|   4 +-
 gcc/config/i386/gfniintrin.h   |  76 +
 gcc/config/i386/immintrin.h|   2 +
 gcc/config/i386/vaesintrin.h   |   4 +-
 gcc/config/i386/vpclmulqdqintrin.h |   4 +-
 18 files changed, 282 insertions(+), 221 deletions(-)
 create mode 100644 gcc/config/i386/avx512bitalgvlintrin.h

diff --git a/gcc/config.gcc b/gcc/config.gcc
index ce5def08e2e..e47e6893e1d 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -425,15 +425,16 @@ i[34567]86-*-* | x86_64-*-*)
   avx512vbmi2vlintrin.h avx512vnniintrin.h
   avx512vnnivlintrin.h vaesintrin.h vpclmulqdqintrin.h
   avx512vpopcntdqvlintrin.h avx512bitalgintrin.h
-  pconfigintrin.h wbnoinvdintrin.h movdirintrin.h
-  waitpkgintrin.h cldemoteintrin.h avx512bf16vlintrin.h
-  avx512bf16intrin.h enqcmdintrin.h serializeintrin.h
-  avx512vp2intersectintrin.h avx512vp2intersectvlintrin.h
-  tsxldtrkintrin.h amxtileintrin.h amxint8intrin.h
-  amxbf16intrin.h x86gprintrin.h uintrintrin.h
-  hresetintrin.h keylockerintrin.h avxvnniintrin.h
-  mwaitintrin.h avx512fp16intrin.h avx512fp16vlintrin.h
-  avxifmaintrin.h avxvnniint8intrin.h avxneconvertintrin.h
+  avx512bitalgvlintrin.h pconfigintrin.h wbnoinvdintrin.h
+  movdirintrin.h waitpkgintrin.h cldemoteintrin.h
+  avx512bf16vlintrin.h avx512bf16intrin.h enqcmdintrin.h
+  serializeintrin.h avx512vp2intersectintrin.h
+  avx512vp2intersectvlintrin.h tsxldtrkintrin.h
+  amxtileintrin.h amxint8intrin.h amxbf16intrin.h
+  x86gprintrin.h uintrintrin.h hresetintrin.h
+  keylockerintrin.h avxvnniintrin.h mwaitintrin.h
+  avx512fp16intrin.h avx512fp16vlintrin.h avxifmaintrin.h
+  avxvnniint8intrin.h avxneconvertintrin.h
   cmpccxaddintrin.h amxfp16intrin.h prfchiintrin.h
   raointintrin.h amxcomplexintrin.h avxvnniint16intrin.h
   sm3intrin.h sha512intrin.h sm4intrin.h"
diff --git a/gcc/config/i386/avx5124fmapsintrin.h 
b/gcc/config/i386/avx5124fmapsintrin.h
index 97dd77c9235..4c884a5c203 100644
--- a/gcc/config/i386/avx5124fmapsintrin.h
+++ b/gcc/config/i386/avx5124fmapsintrin.h
@@ -30,7 +30,7 @@
 
 #ifndef __AVX5124FMAPS__
 #pragma GCC push_options
-#pragma GCC target("avx5124fmaps")
+#pragma GCC target("avx5124fmaps,evex512")
 #define __DISABLE_AVX5124FMAPS__
 #endif /* __AVX5124FMAPS__ */
 
diff --git a/gcc/config/i386/avx5124vnniwintrin.h 
b/gcc/config/i386/avx5124vnniwintrin.h
index fd129589798..795e4814f28 100644
--- a/gcc/config/i386/avx5124vnniwintrin.h
+++ b/gcc/config/i386/avx5124vnniwintrin.h
@@ -30,7 +30,7 @@
 
 #ifndef __AVX5124VNNIW__
 #pragma GCC push_options
-#pragma GCC target("avx5124v

[PATCH 10/18] [PATCH 4/5] Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386-builtin.def (BDESC): Add
OPTION_MASK_ISA2_EVEX512.
---
 gcc/config/i386/i386-builtin.def | 188 +++
 1 file changed, 94 insertions(+), 94 deletions(-)

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 167d530a537..8250e2998cd 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -299,8 +299,8 @@ BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_sto
 BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_storev64qi_mask, "__builtin_ia32_storedquqi512_mask", 
IX86_BUILTIN_STOREDQUQI512_MASK, UNKNOWN, (int) VOID_FTYPE_PCHAR_V64QI_UDI)
 
 /* AVX512VP2INTERSECT */
-BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, 
"__builtin_ia32_2intersectd512", IX86_BUILTIN_2INTERSECTD512, UNKNOWN, (int) 
VOID_FTYPE_PUHI_PUHI_V16SI_V16SI)
-BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, 
"__builtin_ia32_2intersectq512", IX86_BUILTIN_2INTERSECTQ512, UNKNOWN, (int) 
VOID_FTYPE_PUQI_PUQI_V8DI_V8DI)
+BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_nothing, "__builtin_ia32_2intersectd512", IX86_BUILTIN_2INTERSECTD512, 
UNKNOWN, (int) VOID_FTYPE_PUHI_PUHI_V16SI_V16SI)
+BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_nothing, "__builtin_ia32_2intersectq512", IX86_BUILTIN_2INTERSECTQ512, 
UNKNOWN, (int) VOID_FTYPE_PUQI_PUQI_V8DI_V8DI)
 BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, 
"__builtin_ia32_2intersectd256", IX86_BUILTIN_2INTERSECTD256, UNKNOWN, (int) 
VOID_FTYPE_PUQI_PUQI_V8SI_V8SI)
 BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, 
"__builtin_ia32_2intersectq256", IX86_BUILTIN_2INTERSECTQ256, UNKNOWN, (int) 
VOID_FTYPE_PUQI_PUQI_V4DI_V4DI)
 BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, 
"__builtin_ia32_2intersectd128", IX86_BUILTIN_2INTERSECTD128, UNKNOWN, (int) 
VOID_FTYPE_PUQI_PUQI_V4SI_V4SI)
@@ -430,17 +430,17 @@ BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_rdpkru,  
"__builtin_ia32_rdpkru", IX86_B
 BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_wrpkru,  "__builtin_ia32_wrpkru", 
IX86_BUILTIN_WRPKRU, UNKNOWN, (int) VOID_FTYPE_UNSIGNED)
 
 /* VBMI2 */
-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_compressstorev64qi_mask, 
"__builtin_ia32_compressstoreuqi512_mask", IX86_BUILTIN_PCOMPRESSBSTORE512, 
UNKNOWN, (int) VOID_FTYPE_PV64QI_V64QI_UDI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_compressstorev32hi_mask, 
"__builtin_ia32_compressstoreuhi512_mask", IX86_BUILTIN_PCOMPRESSWSTORE512, 
UNKNOWN, (int) VOID_FTYPE_PV32HI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_compressstorev64qi_mask, "__builtin_ia32_compressstoreuqi512_mask", 
IX86_BUILTIN_PCOMPRESSBSTORE512, UNKNOWN, (int) VOID_FTYPE_PV64QI_V64QI_UDI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_compressstorev32hi_mask, "__builtin_ia32_compressstoreuhi512_mask", 
IX86_BUILTIN_PCOMPRESSWSTORE512, UNKNOWN, (int) VOID_FTYPE_PV32HI_V32HI_USI)
 BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_compressstorev32qi_mask, "__builtin_ia32_compressstoreuqi256_mask", 
IX86_BUILTIN_PCOMPRESSBSTORE256, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32QI_USI)
 BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_compressstorev16qi_mask, "__builtin_ia32_compressstoreuqi128_mask", 
IX86_BUILTIN_PCOMPRESSBSTORE128, UNKNOWN, (int) VOID_FTYPE_PV16QI_V16QI_UHI)
 BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_compressstorev16hi_mask, "__builtin_ia32_compressstoreuhi256_mask", 
IX86_BUILTIN_PCOMPRESSWSTORE256, UNKNOWN, (int) VOID_FTYPE_PV16HI_V16HI_UHI)
 BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_compressstorev8hi_mask, "__builtin_ia32_compressstoreuhi128_mask", 
IX86_BUILTIN_PCOMPRESSWSTORE128, UNKNOWN, (int) VOID_FTYPE_PV8HI_V8HI_UQI)
 
-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv64qi_mask, 
"__builtin_ia32_expandloadqi512_mask", IX86_BUILTIN_PEXPANDBLOAD512, UNKNOWN, 
(int) V64QI_FTYPE_PCV64QI_V64QI_UDI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv64qi_maskz, 
"__builtin_ia32_expandloadqi512_maskz", IX86_BUILTIN_PEXPANDBLOAD512Z, UNKNOWN, 
(int) V64QI_FTYPE_PCV64QI_V64QI_UDI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv32hi_mask, 
"__builtin_ia32_expandloadhi512_mask", IX86_BUILTIN_PEXPANDWLOAD512, UNKNOWN, 
(int) V32HI_FTYPE_PCV32HI_V32HI_USI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv32hi_maskz, 
"__builtin_ia32_expandloadhi512_maskz", IX86_BUILTIN_PEXPANDWLOAD512Z, UNKNOWN, 
(int) V32HI_FTYPE_PCV32HI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_expandv64qi_mask, "__builtin_ia32_expandloadqi512_mask", 
IX86_BUILTIN_PEXPANDBLOAD512, UNKNOWN, (int) V64QI_FTYPE_PCV64QI_V6

[PATCH 15/18] Support -mevex512 for AVX512BW intrins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_expand_vector_init_duplicate):
Make sure there is EVEX512 enabled.
(ix86_expand_vecop_qihi2): Refuse V32QI->V32HI when no EVEX512.
* config/i386/i386.cc (ix86_hard_regno_mode_ok): Disable 64 bit mask
when !TARGET_EVEX512.
* config/i386/i386.md (avx512bw_512): New.
(SWI1248_AVX512BWDQ_64): Add TARGET_EVEX512.
(*zero_extendsidi2): Change isa to avx512bw_512.
(kmov_isa): Ditto.
(*anddi_1): Ditto.
(*andn_1): Change isa to kmov_isa.
(*_1): Ditto.
(*notxor_1): Ditto.
(*one_cmpl2_1): Ditto.
(*one_cmplsi2_1_zext): Change isa to avx512bw_512.
(*ashl3_1): Change isa to kmov_isa.
(*lshr3_1): Ditto.
* config/i386/sse.md (VI12HFBF_AVX512VL): Add TARGET_EVEX512.
(VI1248_AVX512VLBW): Ditto.
(VHFBF_AVX512VL): Ditto.
(VI): Ditto.
(VIHFBF): Ditto.
(VI_AVX2): Ditto.
(VI1_AVX512): Ditto.
(VI12_256_512_AVX512VL): Ditto.
(VI2_AVX2_AVX512BW): Ditto.
(VI2_AVX512VNNIBW): Ditto.
(VI2_AVX512VL): Ditto.
(VI2HFBF_AVX512VL): Ditto.
(VI8_AVX2_AVX512BW): Ditto.
(VIMAX_AVX2_AVX512BW): Ditto.
(VIMAX_AVX512VL): Ditto.
(VI12_AVX2_AVX512BW): Ditto.
(VI124_AVX2_24_AVX512F_1_AVX512BW): Ditto.
(VI248_AVX512VL): Ditto.
(VI248_AVX512VLBW): Ditto.
(VI248_AVX2_8_AVX512F_24_AVX512BW): Ditto.
(VI248_AVX512BW): Ditto.
(VI248_AVX512BW_AVX512VL): Ditto.
(VI248_512): Ditto.
(VI124_256_AVX512F_AVX512BW): Ditto.
(VI_AVX512BW): Ditto.
(VIHFBF_AVX512BW): Ditto.
(SWI1248_AVX512BWDQ): Ditto.
(SWI1248_AVX512BW): Ditto.
(SWI1248_AVX512BWDQ2): Ditto.
(*knotsi_1_zext): Ditto.
(define_split for zero_extend + not): Ditto.
(kunpckdi): Ditto.
(REDUC_SMINMAX_MODE): Ditto.
(VEC_EXTRACT_MODE): Ditto.
(*avx512bw_permvar_truncv16siv16hi_1): Ditto.
(*avx512bw_permvar_truncv16siv16hi_1_hf): Ditto.
(truncv32hiv32qi2): Ditto.
(avx512bw_v32hiv32qi2): Ditto.
(avx512bw_v32hiv32qi2_mask): Ditto.
(avx512bw_v32hiv32qi2_mask_store): Ditto.
(usadv64qi): Ditto.
(VEC_PERM_AVX2): Ditto.
(AVX512ZEXTMASK): Ditto.
(SWI24_MASK): New.
(vec_pack_trunc_): Change iterator to SWI24_MASK.
(avx512bw_packsswb): Add TARGET_EVEX512.
(avx512bw_packssdw): Ditto.
(avx512bw_interleave_highv64qi): Ditto.
(avx512bw_interleave_lowv64qi): Ditto.
(avx512bw_pshuflwv32hi): Ditto.
(avx512bw_pshufhwv32hi): Ditto.
(vec_unpacks_lo_di): Ditto.
(SWI48x_MASK): New.
(vec_unpacks_hi_): Change iterator to SWI48x_MASK.
(avx512bw_umulhrswv32hi3): Add TARGET_EVEX512.
(VI1248_AVX512VL_AVX512BW): Ditto.
(avx512bw_v32qiv32hi2): Ditto.
(*avx512bw_zero_extendv32qiv32hi2_1): Ditto.
(*avx512bw_zero_extendv32qiv32hi2_2): Ditto.
(v32qiv32hi2): Ditto.
(pbroadcast_evex_isa): Change isa attribute to avx512bw_512.
(VPERMI2): Add TARGET_EVEX512.
(VPERMI2I): Ditto.
---
 gcc/config/i386/i386-expand.cc |   3 +-
 gcc/config/i386/i386.cc|   4 +-
 gcc/config/i386/i386.md|  54 -
 gcc/config/i386/sse.md | 193 ++---
 4 files changed, 128 insertions(+), 126 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 063561e1265..ff2423f91ed 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -15617,6 +15617,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, 
machine_mode mode,
 case E_V32HFmode:
 case E_V32BFmode:
 case E_V64QImode:
+  gcc_assert (TARGET_EVEX512);
   if (TARGET_AVX512BW)
return ix86_vector_duplicate_value (mode, target, val);
   else
@@ -23512,7 +23513,7 @@ ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, 
rtx op1, rtx op2)
   bool uns_p = code != ASHIFTRT;
 
   if ((qimode == V16QImode && !TARGET_AVX2)
-  || (qimode == V32QImode && !TARGET_AVX512BW)
+  || (qimode == V32QImode && (!TARGET_AVX512BW || !TARGET_EVEX512))
   /* There are no V64HImode instructions.  */
   || qimode == V64QImode)
  return false;
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 589b29a324d..03c96ff048d 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20308,8 +20308,8 @@ ix86_hard_regno_mode_ok (unsigned int regno, 
machine_mode mode)
return MASK_PAIR_REGNO_P(regno);
 
   return ((TARGET_AVX512F && VALID_MASK_REG_MODE (mode))
- || (TARGET_AVX512BW
- && VALID_MASK_AVX512BW_MODE (mode)));
+ || (TARGET_AVX512BW && mode == SImode)
+ || (TARGET_AVX512BW && TARGET_EVEX512 &&

[PATCH 04/18] [PATCH 3/5] Push evex512 target for 512 bit intrins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/avx512bwintrin.h: Add evex512 target for 512 bit
intrins.
---
 gcc/config/i386/avx512bwintrin.h | 291 ---
 1 file changed, 153 insertions(+), 138 deletions(-)

diff --git a/gcc/config/i386/avx512bwintrin.h b/gcc/config/i386/avx512bwintrin.h
index d1cd549ce18..925bae1457c 100644
--- a/gcc/config/i386/avx512bwintrin.h
+++ b/gcc/config/i386/avx512bwintrin.h
@@ -34,16 +34,6 @@
 #define __DISABLE_AVX512BW__
 #endif /* __AVX512BW__ */
 
-/* Internal data types for implementing the intrinsics.  */
-typedef short __v32hi __attribute__ ((__vector_size__ (64)));
-typedef short __v32hi_u __attribute__ ((__vector_size__ (64),  \
-   __may_alias__, __aligned__ (1)));
-typedef char __v64qi __attribute__ ((__vector_size__ (64)));
-typedef char __v64qi_u __attribute__ ((__vector_size__ (64),   \
-  __may_alias__, __aligned__ (1)));
-
-typedef unsigned long long __mmask64;
-
 extern __inline unsigned char
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _ktest_mask32_u8  (__mmask32 __A,  __mmask32 __B, unsigned char *__CF)
@@ -54,229 +44,292 @@ _ktest_mask32_u8  (__mmask32 __A,  __mmask32 __B, 
unsigned char *__CF)
 
 extern __inline unsigned char
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_ktest_mask64_u8  (__mmask64 __A,  __mmask64 __B, unsigned char *__CF)
+_ktestz_mask32_u8 (__mmask32 __A, __mmask32 __B)
 {
-  *__CF = (unsigned char) __builtin_ia32_ktestcdi (__A, __B);
-  return (unsigned char) __builtin_ia32_ktestzdi (__A, __B);
+  return (unsigned char) __builtin_ia32_ktestzsi (__A, __B);
 }
 
 extern __inline unsigned char
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_ktestz_mask32_u8 (__mmask32 __A, __mmask32 __B)
+_ktestc_mask32_u8 (__mmask32 __A, __mmask32 __B)
 {
-  return (unsigned char) __builtin_ia32_ktestzsi (__A, __B);
+  return (unsigned char) __builtin_ia32_ktestcsi (__A, __B);
 }
 
 extern __inline unsigned char
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_ktestz_mask64_u8 (__mmask64 __A, __mmask64 __B)
+_kortest_mask32_u8  (__mmask32 __A,  __mmask32 __B, unsigned char *__CF)
 {
-  return (unsigned char) __builtin_ia32_ktestzdi (__A, __B);
+  *__CF = (unsigned char) __builtin_ia32_kortestcsi (__A, __B);
+  return (unsigned char) __builtin_ia32_kortestzsi (__A, __B);
 }
 
 extern __inline unsigned char
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_ktestc_mask32_u8 (__mmask32 __A, __mmask32 __B)
+_kortestz_mask32_u8 (__mmask32 __A, __mmask32 __B)
 {
-  return (unsigned char) __builtin_ia32_ktestcsi (__A, __B);
+  return (unsigned char) __builtin_ia32_kortestzsi (__A, __B);
 }
 
 extern __inline unsigned char
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_ktestc_mask64_u8 (__mmask64 __A, __mmask64 __B)
+_kortestc_mask32_u8 (__mmask32 __A, __mmask32 __B)
 {
-  return (unsigned char) __builtin_ia32_ktestcdi (__A, __B);
+  return (unsigned char) __builtin_ia32_kortestcsi (__A, __B);
 }
 
-extern __inline unsigned char
+extern __inline __mmask32
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_kortest_mask32_u8  (__mmask32 __A,  __mmask32 __B, unsigned char *__CF)
+_kadd_mask32 (__mmask32 __A, __mmask32 __B)
 {
-  *__CF = (unsigned char) __builtin_ia32_kortestcsi (__A, __B);
-  return (unsigned char) __builtin_ia32_kortestzsi (__A, __B);
+  return (__mmask32) __builtin_ia32_kaddsi ((__mmask32) __A, (__mmask32) __B);
 }
 
-extern __inline unsigned char
+extern __inline unsigned int
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_kortest_mask64_u8  (__mmask64 __A,  __mmask64 __B, unsigned char *__CF)
+_cvtmask32_u32 (__mmask32 __A)
 {
-  *__CF = (unsigned char) __builtin_ia32_kortestcdi (__A, __B);
-  return (unsigned char) __builtin_ia32_kortestzdi (__A, __B);
+  return (unsigned int) __builtin_ia32_kmovd ((__mmask32) __A);
 }
 
-extern __inline unsigned char
+extern __inline __mmask32
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_kortestz_mask32_u8 (__mmask32 __A, __mmask32 __B)
+_cvtu32_mask32 (unsigned int __A)
 {
-  return (unsigned char) __builtin_ia32_kortestzsi (__A, __B);
+  return (__mmask32) __builtin_ia32_kmovd ((__mmask32) __A);
 }
 
-extern __inline unsigned char
+extern __inline __mmask32
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_kortestz_mask64_u8 (__mmask64 __A, __mmask64 __B)
+_load_mask32 (__mmask32 *__A)
 {
-  return (unsigned char) __builtin_ia32_kortestzdi (__A, __B);
+  return (__mmask32) __builtin_ia32_kmovd (*__A);
 }
 
-extern __inline unsigned char
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_kortestc_mask32_u8 (__mmask32 __A, __mmask32 __B)
+_store_mask32 (__mmask32 *__A, __mmask32 __B)
 {
-  return (unsigned char) __builtin_ia32_k

[PATCH 13/18] Support -mevex512 for AVX512F intrins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386-builtins.cc
(ix86_vectorize_builtin_gather): Disable 512 bit gather
when !TARGET_EVEX512.
* config/i386/i386-expand.cc (ix86_valid_mask_cmp_mode):
Add TARGET_EVEX512.
(ix86_expand_int_sse_cmp): Ditto.
(ix86_expand_vector_init_one_nonzero): Disable subroutine
when !TARGET_EVEX512.
(ix86_emit_swsqrtsf): Add TARGET_EVEX512.
(ix86_vectorize_vec_perm_const): Disable subroutine when
!TARGET_EVEX512.
* config/i386/i386.cc
(standard_sse_constant_p): Add TARGET_EVEX512.
(standard_sse_constant_opcode): Ditto.
(ix86_get_ssemov): Ditto.
(ix86_legitimate_constant_p): Ditto.
(ix86_vectorize_builtin_scatter): Disable 512 bit scatter
when !TARGET_EVEX512.
* config/i386/i386.md (avx512f_512): New.
(movxi): Add TARGET_EVEX512.
(*movxi_internal_avx512f): Ditto.
(*movdi_internal): Change alternative 12 to ?Yv. Adjust mode
for alternative 13.
(*movsi_internal): Change alternative 8 to ?Yv. Adjust mode for
alternative 9.
(*movhi_internal): Change alternative 11 to *Yv.
(*movdf_internal): Change alternative 12 to Yv.
(*movsf_internal): Change alternative 5 to Yv. Adjust mode for
alternative 5 and 6.
(*mov_internal): Change alternative 4 to Yv.
(define_split for convert SF to DF): Add TARGET_EVEX512.
(extendbfsf2_1): Ditto.
* config/i386/predicates.md (bcst_mem_operand): Disable predicate
for 512 bit when !TARGET_EVEX512.
* config/i386/sse.md (VMOVE): Add TARGET_EVEX512.
(V48_AVX512VL): Ditto.
(V48_256_512_AVX512VL): Ditto.
(V48H_AVX512VL): Ditto.
(VI12_AVX512VL): Ditto.
(V): Ditto.
(V_512): Ditto.
(V_256_512): Ditto.
(VF): Ditto.
(VF1_VF2_AVX512DQ): Ditto.
(VFH): Ditto.
(VFB): Ditto.
(VF1): Ditto.
(VF1_AVX2): Ditto.
(VF2): Ditto.
(VF2H): Ditto.
(VF2_512_256): Ditto.
(VF2_512_256VL): Ditto.
(VF_512): Ditto.
(VFB_512): Ditto.
(VI48_AVX512VL): Ditto.
(VI1248_AVX512VLBW): Ditto.
(VF_AVX512VL): Ditto.
(VFH_AVX512VL): Ditto.
(VF1_AVX512VL): Ditto.
(VI): Ditto.
(VIHFBF): Ditto.
(VI_AVX2): Ditto.
(VI8): Ditto.
(VI8_AVX512VL): Ditto.
(VI2_AVX512F): Ditto.
(VI4_AVX512F): Ditto.
(VI4_AVX512VL): Ditto.
(VI48_AVX512F_AVX512VL): Ditto.
(VI8_AVX2_AVX512F): Ditto.
(VI8_AVX_AVX512F): Ditto.
(V8FI): Ditto.
(V16FI): Ditto.
(VI124_AVX2_24_AVX512F_1_AVX512BW): Ditto.
(VI248_AVX512VLBW): Ditto.
(VI248_AVX2_8_AVX512F_24_AVX512BW): Ditto.
(VI248_AVX512BW): Ditto.
(VI248_AVX512BW_AVX512VL): Ditto.
(VI48_AVX512F): Ditto.
(VI48_AVX_AVX512F): Ditto.
(VI12_AVX_AVX512F): Ditto.
(VI148_512): Ditto.
(VI124_256_AVX512F_AVX512BW): Ditto.
(VI48_512): Ditto.
(VI_AVX512BW): Ditto.
(VIHFBF_AVX512BW): Ditto.
(VI4F_256_512): Ditto.
(VI48F_256_512): Ditto.
(VI48F): Ditto.
(VI12_VI48F_AVX512VL): Ditto.
(V32_512): Ditto.
(AVX512MODE2P): Ditto.
(STORENT_MODE): Ditto.
(REDUC_PLUS_MODE): Ditto.
(REDUC_SMINMAX_MODE): Ditto.
(*andnot3): Change isa attribute to avx512f_512.
(*andnot3): Ditto.
(3): Ditto.
(tf3): Ditto.
(FMAMODEM): Add TARGET_EVEX512.
(FMAMODE_AVX512): Ditto.
(VFH_SF_AVX512VL): Ditto.
(avx512f_fix_notruncv16sfv16si): Ditto.
(fix_truncv16sfv16si2):
Ditto.
(avx512f_cvtdq2pd512_2): Ditto.
(avx512f_cvtpd2dq512): Ditto.
(fix_truncv8dfv8si2):
Ditto.
(avx512f_cvtpd2ps512): Ditto.
(vec_unpacks_lo_v16sf): Ditto.
(vec_unpacks_hi_v16sf): Ditto.
(vec_unpacks_float_hi_v16si): Ditto.
(vec_unpacks_float_lo_v16si): Ditto.
(vec_unpacku_float_hi_v16si): Ditto.
(vec_unpacku_float_lo_v16si): Ditto.
(vec_pack_sfix_trunc_v8df): Ditto.
(avx512f_vec_pack_sfix_v8df): Ditto.
(avx512f_unpckhps512): Ditto.
(avx512f_unpcklps512): Ditto.
(avx512f_movshdup512): Ditto.
(avx512f_movsldup512): Ditto.
(AVX512_VEC): Ditto.
(AVX512_VEC_2): Ditto.
(vec_extract_lo_v64qi): Ditto.
(vec_extract_hi_v64qi): Ditto.
(VEC_EXTRACT_MODE): Ditto.
(avx512f_unpckhpd512): Ditto.
(avx512f_movddup512): Ditto.
(avx512f_unpcklpd512): Ditto.
(*_vternlog_all): Ditto.
(*_vpternlog_1): Ditto.
(*_vpternlog_2): Ditto.
(*_vpternlog_3): Ditto.
(avx512f_shufps512_mask): Ditto.
(avx512f_shufps512_1): Ditto.
  

[PATCH 12/18] Disable zmm register and 512 bit libmvec call when !TARGET_EVEX512

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_broadcast_from_constant):
Disable zmm broadcast for !TARGET_EVEX512.
* config/i386/i386-options.cc (ix86_option_override_internal):
Do not use PVW_512 when no-evex512.
(ix86_simd_clone_adjust): Add evex512 target into string.
* config/i386/i386.cc (type_natural_mode): Report ABI warning
when using zmm register w/o evex512.
(ix86_return_in_memory): Do not allow zmm when !TARGET_EVEX512.
(ix86_hard_regno_mode_ok): Ditto.
(ix86_set_reg_reg_cost): Ditto.
(ix86_rtx_costs): Ditto.
(ix86_vector_mode_supported_p): Ditto.
(ix86_preferred_simd_mode): Ditto.
(ix86_get_mask_mode): Ditto.
(ix86_simd_clone_compute_vecsize_and_simdlen): Disable 512 bit
libmvec call when !TARGET_EVEX512.
(ix86_simd_clone_usable): Ditto.
* config/i386/i386.h (BIGGEST_ALIGNMENT): Disable 512 alignment
when !TARGET_EVEX512.
(MOVE_MAX): Do not use PVW_512 when !TARGET_EVEX512.
(STORE_MAX_PIECES): Ditto.
---
 gcc/config/i386/i386-expand.cc  |  1 +
 gcc/config/i386/i386-options.cc | 14 +
 gcc/config/i386/i386.cc | 53 ++---
 gcc/config/i386/i386.h  |  7 +++--
 4 files changed, 42 insertions(+), 33 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index e42ff27c6ef..6eedcb384c0 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -611,6 +611,7 @@ ix86_broadcast_from_constant (machine_mode mode, rtx op)
  avx512 embed broadcast is available.  */
   if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
   && (!TARGET_AVX512F
+ || (GET_MODE_SIZE (mode) == 64 && !TARGET_EVEX512)
  || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
 return nullptr;
 
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index a1a7a92da9f..e2a90d7d9e2 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2845,7 +2845,8 @@ ix86_option_override_internal (bool main_args_p,
  opts->x_ix86_move_max = opts->x_prefer_vector_width_type;
  if (opts_set->x_ix86_move_max == PVW_NONE)
{
- if (TARGET_AVX512F_P (opts->x_ix86_isa_flags))
+ if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)
+ && TARGET_EVEX512_P (opts->x_ix86_isa_flags2))
opts->x_ix86_move_max = PVW_AVX512;
  else
opts->x_ix86_move_max = PVW_AVX128;
@@ -2866,7 +2867,8 @@ ix86_option_override_internal (bool main_args_p,
  opts->x_ix86_store_max = opts->x_prefer_vector_width_type;
  if (opts_set->x_ix86_store_max == PVW_NONE)
{
- if (TARGET_AVX512F_P (opts->x_ix86_isa_flags))
+ if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)
+ && TARGET_EVEX512_P (opts->x_ix86_isa_flags2))
opts->x_ix86_store_max = PVW_AVX512;
  else
opts->x_ix86_store_max = PVW_AVX128;
@@ -3145,13 +3147,13 @@ ix86_simd_clone_adjust (struct cgraph_node *node)
 case 'e':
   if (TARGET_PREFER_AVX256)
{
- if (!TARGET_AVX512F)
-   str = "avx512f,prefer-vector-width=512";
+ if (!TARGET_AVX512F || !TARGET_EVEX512)
+   str = "avx512f,evex512,prefer-vector-width=512";
  else
str = "prefer-vector-width=512";
}
-  else if (!TARGET_AVX512F)
-   str = "avx512f";
+  else if (!TARGET_AVX512F || !TARGET_EVEX512)
+   str = "avx512f,evex512";
   break;
 default:
   gcc_unreachable ();
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 477e6cecc38..0df3bf10547 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -1924,7 +1924,8 @@ type_natural_mode (const_tree type, const CUMULATIVE_ARGS 
*cum,
if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
&& GET_MODE_INNER (mode) == innermode)
  {
-   if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
+   if (size == 64 && (!TARGET_AVX512F || !TARGET_EVEX512)
+   && !TARGET_IAMCU)
  {
static bool warnedavx512f;
static bool warnedavx512f_ret;
@@ -4347,7 +4348,7 @@ ix86_return_in_memory (const_tree type, const_tree fntype 
ATTRIBUTE_UNUSED)
 
  /* AVX512F values are returned in ZMM0 if available.  */
  if (size == 64)
-   return !TARGET_AVX512F;
+   return !TARGET_AVX512F || !TARGET_EVEX512;
}
 
   if (mode == XFmode)
@@ -20286,7 +20287,7 @@ ix86_hard_regno_mode_ok (unsigned int regno, 
machine_mode mode)
  - any of 512-bit wide vector mode
  - any scalar mode.  */
   if (TARGET_AVX512F
- && (VALID_AVX512F_REG_OR_XI_MODE (mode)
+  

[PATCH 11/18] [PATCH 5/5] Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386-builtin.def (BDESC): Add
OPTION_MASK_ISA2_EVEX512.
---
 gcc/config/i386/i386-builtin.def | 156 +++
 1 file changed, 78 insertions(+), 78 deletions(-)

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 8250e2998cd..b90d5ccc969 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -1568,9 +1568,9 @@ BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_copysignv8df3
 BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512f_sqrtv8df2, "__builtin_ia32_sqrtpd512", IX86_BUILTIN_SQRTPD512, 
UNKNOWN, (int) V8DF_FTYPE_V8DF)
 BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, CODE_FOR_sqrtv16sf2, 
"__builtin_ia32_sqrtps512", IX86_BUILTIN_SQRTPS_NR512, UNKNOWN, (int) 
V16SF_FTYPE_V16SF)
 BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_exp2v16sf, 
"__builtin_ia32_exp2ps", IX86_BUILTIN_EXP2PS, UNKNOWN, (int) V16SF_FTYPE_V16SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, 
"__builtin_ia32_floorph512", IX86_BUILTIN_FLOORPH512, (enum rtx_code) 
ROUND_FLOOR, (int) V32HF_FTYPE_V32HF_ROUND)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, 
"__builtin_ia32_ceilph512", IX86_BUILTIN_CEILPH512, (enum rtx_code) ROUND_CEIL, 
(int) V32HF_FTYPE_V32HF_ROUND)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, 
"__builtin_ia32_truncph512", IX86_BUILTIN_TRUNCPH512, (enum rtx_code) 
ROUND_TRUNC, (int) V32HF_FTYPE_V32HF_ROUND)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16 | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_floorph512", 
IX86_BUILTIN_FLOORPH512, (enum rtx_code) ROUND_FLOOR, (int) 
V32HF_FTYPE_V32HF_ROUND)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16 | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_ceilph512", 
IX86_BUILTIN_CEILPH512, (enum rtx_code) ROUND_CEIL, (int) 
V32HF_FTYPE_V32HF_ROUND)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16 | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_truncph512", 
IX86_BUILTIN_TRUNCPH512, (enum rtx_code) ROUND_TRUNC, (int) 
V32HF_FTYPE_V32HF_ROUND)
 BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512f_roundps512, "__builtin_ia32_floorps512", 
IX86_BUILTIN_FLOORPS512, (enum rtx_code) ROUND_FLOOR, (int) 
V16SF_FTYPE_V16SF_ROUND)
 BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512f_roundps512, "__builtin_ia32_ceilps512", 
IX86_BUILTIN_CEILPS512, (enum rtx_code) ROUND_CEIL, (int) 
V16SF_FTYPE_V16SF_ROUND)
 BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512f_roundps512, "__builtin_ia32_truncps512", 
IX86_BUILTIN_TRUNCPS512, (enum rtx_code) ROUND_TRUNC, (int) 
V16SF_FTYPE_V16SF_ROUND)
@@ -2874,40 +2874,40 @@ BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_extendbfsf2_1, 
"__builtin_ia32_cvtbf2sf
 /* AVX512FP16.  */
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_addv8hf3_mask, "__builtin_ia32_addph128_mask", 
IX86_BUILTIN_ADDPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_addv16hf3_mask, "__builtin_ia32_addph256_mask", 
IX86_BUILTIN_ADDPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask, 
"__builtin_ia32_addph512_mask", IX86_BUILTIN_ADDPH512_MASK, UNKNOWN, (int) 
V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16 | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_addv32hf3_mask, "__builtin_ia32_addph512_mask", 
IX86_BUILTIN_ADDPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_subv8hf3_mask, "__builtin_ia32_subph128_mask", 
IX86_BUILTIN_SUBPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_subv16hf3_mask, "__builtin_ia32_subph256_mask", 
IX86_BUILTIN_SUBPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask, 
"__builtin_ia32_subph512_mask", IX86_BUILTIN_SUBPH512_MASK, UNKNOWN, (int) 
V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16 | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_subv32hf3_mask, "__builtin_ia32_subph512_mask", 
IX86_BUILTIN_SUBPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_mulv8hf3_mask, "__builtin_ia32_mulph128_mask", 
IX86_BUILTIN_MULPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_mulv16hf3_mask, "__builtin_ia32_mulph256_mask", 
IX86_BUILTIN_MULPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3

[PATCH] PHIOPT: Fix minmax_replacement for three way

2023-09-21 Thread Andrew Pinski
When diamond bb support was added to minmax_replacement in
r13-1950-g9bb19e143cfe, the code assumed that, in the threeway_p case,
alt_middle_bb would also be empty whenever middle_bb was empty.
Once factor_out_conditional_conversion was used to factor out conversions,
that assumption about alt_middle_bb turned out to be wrong: we could end up
with threeway_p being true and middle_bb being empty while alt_middle_bb
was not empty, which causes wrong code in many cases.

This patch fixes the issue by adding a check, in the 2 places that rely on
this assumption, that in the threeway_p case the other bb is also empty.

Note my plan for GCC 15 is to remove minmax_replacement, as match.pd will
catch all cases at that point.

OK? Bootstrapped and tested on x86_64-linux-gnu with no regressions.

PR tree-optimization/111469

gcc/ChangeLog:

* tree-ssa-phiopt.cc (minmax_replacement): Fix
the assumption for the `non-diamond` handling cases
of diamond code.

gcc/testsuite/ChangeLog:

* gcc.c-torture/execute/pr111469-1.c: New test.
---
 .../gcc.c-torture/execute/pr111469-1.c| 38 +++
 gcc/tree-ssa-phiopt.cc| 10 -
 2 files changed, 46 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.c-torture/execute/pr111469-1.c

diff --git a/gcc/testsuite/gcc.c-torture/execute/pr111469-1.c 
b/gcc/testsuite/gcc.c-torture/execute/pr111469-1.c
new file mode 100644
index 000..b68d5989eac
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/pr111469-1.c
@@ -0,0 +1,38 @@
+/* PR tree-optimization/111469 */
+
+long f;
+char *g;
+__attribute__((noinline))
+char o() {
+  char l;
+  while (f)
+;
+  l = *g;
+  return l;
+}
+
+/* factor_out_conditional_conversion is able to remove the casts
+   from the 2 bbs (correctly)
+   but then minmax_replacement should not optimize this to a MIN_EXPR
+   as o has side effects. */
+
+__attribute__((noinline))
+unsigned short gg(unsigned short a, unsigned short b)
+{
+  short d;
+  if (a > b)
+  {
+d= b;
+  }
+  else
+  {
+o();
+d = a;
+  }
+  return d;
+}
+
+int main(void)
+{
+  gg(3, 2);
+}
diff --git a/gcc/tree-ssa-phiopt.cc b/gcc/tree-ssa-phiopt.cc
index 3835d25d08c..96901a40444 100644
--- a/gcc/tree-ssa-phiopt.cc
+++ b/gcc/tree-ssa-phiopt.cc
@@ -1823,7 +1823,9 @@ minmax_replacement (basic_block cond_bb, basic_block 
middle_bb, basic_block alt_
   arg_false = arg0;
 }
 
-  if (empty_block_p (middle_bb))
+  if (empty_block_p (middle_bb)
+  && (!threeway_p
+ || empty_block_p (alt_middle_bb)))
 {
   if ((operand_equal_for_phi_arg_p (arg_true, smaller)
   || (alt_smaller
@@ -2006,7 +2008,9 @@ minmax_replacement (basic_block cond_bb, basic_block 
middle_bb, basic_block alt_
 
   return true;
 }
-  else
+  else if (middle_bb == alt_middle_bb
+   && (!threeway_p
+  || empty_block_p (alt_middle_bb)))
 {
   /* Recognize the following case, assuming d <= u:
 
@@ -2182,6 +2186,8 @@ minmax_replacement (basic_block cond_bb, basic_block 
middle_bb, basic_block alt_
  SSA_OP_DEF));
   gsi_move_before (&gsi_from, &gsi);
 }
+  else
+return false;
 
   /* Emit the statement to compute min/max.  */
   gimple_seq stmts = NULL;
-- 
2.31.1



Re: [PATCH V3] RISC-V: Support combine cond extend and reduce sum to widen reduce sum

2023-09-21 Thread Robin Dapp
Hi Lehua,

> V3 Change: Back to the original method.
Was there an original method even before the first patch?

Anyway, I prefer this v3 over the others even though the large
pattern is not exactly pretty :)

What about the VLS changes?  Are they necessary for the patterns/tests?
I mean they are reasonable in themselves but are they related?
If so, please specify.  If not please split them off into a small
(pre-approved) patch.  OK with this adjusted.

Regards
 Robin


Re: [PATCH] RISC-V: Fix SUBREG move of VLS mode[PR111486]

2023-09-21 Thread Robin Dapp
OK.

This is also the approach I took locally to fix a Fortran ICE
but forgot to send/push it.

Regards
 Robin


[PATCH] RISC-V: Enable undefined support for RVV auto-vectorization[PR110751]

2023-09-21 Thread Juzhe-Zhong
The GCC middle-end can now support an undefined value, which is translated
into (scratch:mode).

This patch enables the RISC-V backend to accept an undefined value as the
ELSE value of COND_LEN_xxx/COND_xxx.

Consider the following case:

  __attribute__((noipa))
  void vrem_int8_t (int8_t * __restrict dst, int8_t * __restrict a, int8_t * 
__restrict b, int n)
  {
for (int i = 0; i < n; i++)
  dst[i] = a[i] % b[i];
  }

Before this patch:

vrem_int8_t:
ble a3,zero,.L5
vsetvli a5,zero,e8,m1,ta,ma
vmv.v.i v4,0  ---> redundant.
.L3:
vsetvli a5,a3,e8,m1,tu,ma ---> should be TA.
vmv1r.v v1,v4 ---> redundant.
vle8.v  v3,0(a1)
vle8.v  v2,0(a2)
sub a3,a3,a5
vrem.vv v1,v3,v2
vse8.v  v1,0(a0)
add a1,a1,a5
add a2,a2,a5
add a0,a0,a5
bne a3,zero,.L3
.L5:
ret

After this patch:

vrem_int8_t:
ble a3,zero,.L5
.L3:
vsetvli a5,a3,e8,m1,ta,ma
vle8.v  v1,0(a1)
vle8.v  v2,0(a2)
sub a3,a3,a5
vrem.vv v1,v1,v2
vse8.v  v1,0(a0)
add a1,a1,a5
add a2,a2,a5
add a0,a0,a5
bne a3,zero,.L3
.L5:
ret


PR target/110751

gcc/ChangeLog:

* config/riscv/autovec.md: Enable scratch rtx in ELSE operand.
* config/riscv/predicates.md (autovec_else_operand): New predicate.
* config/riscv/riscv-v.cc (get_else_operand): New function.
(expand_cond_len_unop): Adapt ELSE value.
(expand_cond_len_binop): Ditto.
(expand_cond_len_ternop): Ditto.
* config/riscv/riscv.cc (riscv_preferred_else_value): New function.
(TARGET_PREFERRED_ELSE_VALUE): New targethook.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv-nofm.c: Adapt test.
* gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv.c: Ditto.
* gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv-nofm.c: Ditto.
* gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv.c: Ditto.
* gcc.target/riscv/rvv/autovec/binop/vrem-rv32gcv.c: Ditto.
* gcc.target/riscv/rvv/autovec/binop/vrem-rv64gcv.c: Ditto.
* gcc.target/riscv/rvv/autovec/ternop/ternop_nofm-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/ternop/ternop_nofm-10.c: Ditto.
* gcc.target/riscv/rvv/autovec/ternop/ternop_nofm-11.c: Ditto.
* gcc.target/riscv/rvv/autovec/ternop/ternop_nofm-12.c: Ditto.
* gcc.target/riscv/rvv/autovec/ternop/ternop_nofm-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/ternop/ternop_nofm-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/ternop/ternop_nofm-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/ternop/ternop_nofm-5.c: Ditto.
* gcc.target/riscv/rvv/autovec/ternop/ternop_nofm-6.c: Ditto.
* gcc.target/riscv/rvv/autovec/ternop/ternop_nofm-7.c: Ditto.
* gcc.target/riscv/rvv/autovec/ternop/ternop_nofm-8.c: Ditto.
* gcc.target/riscv/rvv/autovec/ternop/ternop_nofm-9.c: Ditto.

---
 gcc/config/riscv/autovec.md   | 48 +--
 gcc/config/riscv/predicates.md|  4 ++
 gcc/config/riscv/riscv-v.cc   | 13 +++--
 gcc/config/riscv/riscv.cc | 16 +++
 .../rvv/autovec/binop/vdiv-rv32gcv-nofm.c |  6 +++
 .../riscv/rvv/autovec/binop/vdiv-rv32gcv.c|  6 +++
 .../rvv/autovec/binop/vdiv-rv64gcv-nofm.c |  6 +++
 .../riscv/rvv/autovec/binop/vdiv-rv64gcv.c|  6 +++
 .../riscv/rvv/autovec/binop/vrem-rv32gcv.c|  6 +++
 .../riscv/rvv/autovec/binop/vrem-rv64gcv.c|  6 +++
 .../riscv/rvv/autovec/ternop/ternop_nofm-1.c  |  4 +-
 .../riscv/rvv/autovec/ternop/ternop_nofm-10.c |  4 +-
 .../riscv/rvv/autovec/ternop/ternop_nofm-11.c |  4 +-
 .../riscv/rvv/autovec/ternop/ternop_nofm-12.c |  2 +
 .../riscv/rvv/autovec/ternop/ternop_nofm-2.c  |  6 +--
 .../riscv/rvv/autovec/ternop/ternop_nofm-3.c  |  3 +-
 .../riscv/rvv/autovec/ternop/ternop_nofm-4.c  |  4 +-
 .../riscv/rvv/autovec/ternop/ternop_nofm-5.c  |  4 +-
 .../riscv/rvv/autovec/ternop/ternop_nofm-6.c  |  1 +
 .../riscv/rvv/autovec/ternop/ternop_nofm-7.c  |  4 +-
 .../riscv/rvv/autovec/ternop/ternop_nofm-8.c  |  4 +-
 .../riscv/rvv/autovec/ternop/ternop_nofm-9.c  |  1 +
 22 files changed, 105 insertions(+), 53 deletions(-)

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 55c0a04df3b..f0f1abc4e82 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -1496,7 +1496,7 @@
(match_operand: 1 "vector_mask_operand")
(any_int_unop:VI
  (match_operand:VI 2 "register_operand"))
-   (match_operand:VI 3 "register_operand")]
+   (match_operand:VI 3 "autovec_else_operand")]
   "TARGET_VECTOR"
 {
   /* Normalize into cond_len_* operations.  */
@@ -1512,7 +1512,7 @@
(match_operand: 1 "vector_mask_operand")
(any_int_unop:VI
  (match_operand:VI 2 "regist

Re: [PATCH V3] RISC-V: Support combine cond extend and reduce sum to widen reduce sum

2023-09-21 Thread Lehua Ding

Hi Robin,

On 2023/9/21 16:12, Robin Dapp wrote:

Hi Lehua,


V3 Change: Back to the original method.

Was there an original method even before the first patch?


Yes, this was the method that came to mind at first, and I didn't send a 
patch because I didn't feel like the pattern looked good :)



Anyway, I prefer this v3 over the others even though the large
pattern is not exactly pretty :)

What about the VLS changes?  Are they necessary for the patterns/tests?
I mean they are reasonable in themselves but are they related?
If so, please specify.  If not please split them off into a small
(pre-approved) patch.  OK with this adjusted.


This VLS change is necessary for the patch and is currently only used by
this patch. Currently, VLMAX inside avl_type stands for a VLA mode using
VLMAX. However, we also need to distinguish from NONVLMAX the case where
the vl is the number of units of the mode. This way it is safe and
simple to do the combine. Therefore a new avl_type enumeration value, VLS,
is introduced to distinguish this case.


There's an overlap between VLMAX and VLS here, and it's probably more 
appropriate to use FULL or something similar, like the other patch that 
changes the comments. I'm going to hold off on making any major changes 
for now, though, as the impact is more localized.


--
Best,
Lehua (RiVAI)
lehua.d...@rivai.ai


[PATCH 3/3] build: Regenerate build files

2023-09-21 Thread Arthur Cohen
From: Pierre-Emmanuel Patry 

Resending this patch without most of the diff so it fits on the ML.

-

Regenerate all build files.

ChangeLog:

* Makefile.in:
* configure: Regenerate.
* libgrust/Makefile.in: New file.
* libgrust/aclocal.m4: New file.
* libgrust/configure: New file.
* libgrust/libproc_macro/Makefile.in: New file.

libgm2/ChangeLog:

* Makefile.in: Regenerate.
* aclocal.m4: Regenerate.
* libm2cor/Makefile.in: Regenerate.
* libm2iso/Makefile.in: Regenerate.
* libm2log/Makefile.in: Regenerate.
* libm2min/Makefile.in: Regenerate.
* libm2pim/Makefile.in: Regenerate.

Signed-off-by: Pierre-Emmanuel Patry 
---
 Makefile.in|  1015 +-
 configure  | 3 +-
 libgm2/Makefile.in |10 +-
 libgm2/aclocal.m4  |10 +-
 libgm2/libm2cor/Makefile.in|10 +-
 libgm2/libm2iso/Makefile.in|10 +-
 libgm2/libm2log/Makefile.in|10 +-
 libgm2/libm2min/Makefile.in|10 +-
 libgm2/libm2pim/Makefile.in|10 +-
 libgrust/Makefile.in   |   671 +
 libgrust/aclocal.m4|  1260 ++
 libgrust/configure | 18420 +++
 libgrust/libproc_macro/Makefile.in |   704 +
 13 files changed, 22098 insertions(+), 45 deletions(-)
 create mode 100644 libgrust/Makefile.in
 create mode 100644 libgrust/aclocal.m4
 create mode 100755 libgrust/configure
 create mode 100644 libgrust/libproc_macro/Makefile.in

diff --git a/Makefile.in b/Makefile.in
index 2f136839c35..c1606758a13 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -1127,7 +1127,8 @@ configure-host:  \
 maybe-configure-libcc1 \
 maybe-configure-gotools \
 maybe-configure-libctf \
-maybe-configure-libsframe
+maybe-configure-libsframe \
+maybe-configure-libgrust
 .PHONY: configure-target
 configure-target:  \
 maybe-configure-target-libstdc++-v3 \
@@ -1152,7 +1153,8 @@ configure-target:  \
 maybe-configure-target-libgm2 \
 maybe-configure-target-libgomp \
 maybe-configure-target-libitm \
-maybe-configure-target-libatomic
+maybe-configure-target-libatomic \
+maybe-configure-target-libgrust
 
 # The target built for a native non-bootstrap build.
 .PHONY: all
@@ -1308,6 +1310,7 @@ all-host: maybe-all-libctf
 @if libsframe-no-bootstrap
 all-host: maybe-all-libsframe
 @endif libsframe-no-bootstrap
+all-host: maybe-all-libgrust
 
 .PHONY: all-target
 
@@ -1352,6 +1355,7 @@ all-target: maybe-all-target-libitm
 @if target-libatomic-no-bootstrap
 all-target: maybe-all-target-libatomic
 @endif target-libatomic-no-bootstrap
+all-target: maybe-all-target-libgrust
 
 # Do a target for all the subdirectories.  A ``make do-X'' will do a
 # ``make X'' in all subdirectories (because, in general, there is a
@@ -1422,6 +1426,7 @@ info-host: maybe-info-libcc1
 info-host: maybe-info-gotools
 info-host: maybe-info-libctf
 info-host: maybe-info-libsframe
+info-host: maybe-info-libgrust
 
 .PHONY: info-target
 
@@ -1448,6 +1453,7 @@ info-target: maybe-info-target-libgm2
 info-target: maybe-info-target-libgomp
 info-target: maybe-info-target-libitm
 info-target: maybe-info-target-libatomic
+info-target: maybe-info-target-libgrust
 
 .PHONY: do-dvi
 do-dvi:
@@ -1513,6 +1519,7 @@ dvi-host: maybe-dvi-libcc1
 dvi-host: maybe-dvi-gotools
 dvi-host: maybe-dvi-libctf
 dvi-host: maybe-dvi-libsframe
+dvi-host: maybe-dvi-libgrust
 
 .PHONY: dvi-target
 
@@ -1539,6 +1546,7 @@ dvi-target: maybe-dvi-target-libgm2
 dvi-target: maybe-dvi-target-libgomp
 dvi-target: maybe-dvi-target-libitm
 dvi-target: maybe-dvi-target-libatomic
+dvi-target: maybe-dvi-target-libgrust
 
 .PHONY: do-pdf
 do-pdf:
@@ -1604,6 +1612,7 @@ pdf-host: maybe-pdf-libcc1
 pdf-host: maybe-pdf-gotools
 pdf-host: maybe-pdf-libctf
 pdf-host: maybe-pdf-libsframe
+pdf-host: maybe-pdf-libgrust
 
 .PHONY: pdf-target
 
@@ -1630,6 +1639,7 @@ pdf-target: maybe-pdf-target-libgm2
 pdf-target: maybe-pdf-target-libgomp
 pdf-target: maybe-pdf-target-libitm
 pdf-target: maybe-pdf-target-libatomic
+pdf-target: maybe-pdf-target-libgrust
 
 .PHONY: do-html
 do-html:
@@ -1695,6 +1705,7 @@ html-host: maybe-html-libcc1
 html-host: maybe-html-gotools
 html-host: maybe-html-libctf
 html-host: maybe-html-libsframe
+html-host: maybe-html-libgrust
 
 .PHONY: html-target
 
@@ -1721,6 +1732,7 @@ html-target: maybe-html-target-libgm2
 html-target: maybe-html-target-libgomp
 html-target: maybe-html-target-libitm
 html-target: maybe-html-target-libatomic
+html-target: maybe-html-target-libgrust
 
 .PHONY: do-TAGS
 do-TAGS:
@@ -1786,6 +1798,7 @@ TAGS-host: maybe-TAGS-libcc1
 TAGS-host: maybe-TAGS-gotools
 TAGS-host: maybe-TAGS-libctf
 TAGS-host: maybe-TAGS-libsframe
+TAGS-host: maybe-TAGS-libgrust
 
 .PHONY: TAGS-target
 
@@ -1812,6 +1825,7 @@ TAGS-target: maybe-TAGS-target-libgm2
 TAGS-target: maybe-TAGS-target-libgomp
 TAGS-target: mayb

Re: [PATCH 3/3] build: Regenerate build files

2023-09-21 Thread Jakub Jelinek
On Thu, Sep 21, 2023 at 10:44:30AM +0200, Arthur Cohen wrote:
> From: Pierre-Emmanuel Patry 
> 
> Resending this patch without most of the diff so it fits on the ML.
> 
> -
> 
> Regenerate all build files.
> 
> ChangeLog:
> 
>   * Makefile.in:

Missing Regenerate. above?

>   * configure: Regenerate.
>   * libgrust/Makefile.in: New file.
>   * libgrust/aclocal.m4: New file.
>   * libgrust/configure: New file.
>   * libgrust/libproc_macro/Makefile.in: New file.
> 
> libgm2/ChangeLog:
> 
>   * Makefile.in: Regenerate.
>   * aclocal.m4: Regenerate.
>   * libm2cor/Makefile.in: Regenerate.
>   * libm2iso/Makefile.in: Regenerate.
>   * libm2log/Makefile.in: Regenerate.
>   * libm2min/Makefile.in: Regenerate.
>   * libm2pim/Makefile.in: Regenerate.

Jakub



Re: [PATCH] RISC-V: Adjusting the comments of the emit_vlmax_insn/emit_vlmax_insn_lra/emit_nonvlmax_insn functions

2023-09-21 Thread Robin Dapp
Hi Lehua,

I once had different comments for those but either I never pushed them
or they got buried in the process of refactoring.  The explanatory
comment explaining vlmax is also in "nowhere land" below autovec_use_vlmax_p.
(it says vsetvli instead of vsetvl as well...)  It would be useful
to move it to above the function comments you touch.

> +/* Emit RVV insn which vl is the number of units of the vector mode.
> +   This function can only be used before LRA pass or for VLS_AVL_IMM modes.  
> */

Emit an RVV insn with a vector length that equals the number of units of
the vector mode.  For VLA modes this corresponds to VLMAX.

Unless the vector length can be encoded in the vsetivl[i] instruction this
function must only be used as long as we can create pseudo registers.
This is because it will set a pseudo register to VLMAX using vsetvl and
use this as definition for the vector length.


Besides, we could add a const_vlmax_p () || can_create_pseudo_p assert here?


> +/* Like emit_vlmax_insn but can be only used after LRA pass that can't create
> +   pseudo register.  */

Like emit_vlmax_insn but must only be used when we cannot create pseudo
registers anymore.  This function, however, takes a predefined vector
length from the value in VL.

> +/* Emit RVV insn which vl is the VL argument.  */
> +emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)

I think I renamed this to emit_len_insn or something before but Juzhe didn't
like it ;)

How about something like:
Emit an RVV insn with a predefined vector length.  Contrary to emit_vlmax_insn
the instruction's vector length is not deduced from its mode but taken from 
the value in VL.

Regards
 Robin



Re: [PATCH] RISC-V: Enable undefined support for RVV auto-vectorization[PR110751]

2023-09-21 Thread Robin Dapp
Hi Juzhe,

with the middle-end changes that's a nice improvement.  LGTM.

Regards
 Robin


Re: [PATCHSET] Reintroduce targetrustm hooks

2023-09-21 Thread Arthur Cohen

Pushed the patches to trunk.

Thank you both for the review.

Kindly,

Arthur

On 9/14/23 12:12, Richard Biener via Gcc-rust wrote:

On Wed, Sep 13, 2023 at 10:14 PM Iain Buclaw via Gcc-patches
 wrote:


Excerpts from Arthur Cohen's message of September 7, 2023 3:41 pm:

Alright, was not expecting to mess up this patchset so bad so here we go:

This patchset reintroduces proper targetrustm hooks without the old
problematic mess of macros we had, which had been removed for the first
merge of gccrs upstream.

Tested on x86-64 GNU Linux, and has also been present in our development
repository for a long time - added by this pull-request from Iain [1]
which was merged in October 2022.

Ok for trunk?

[PATCH 01/14] rust: Add skeleton support and documentation for
[PATCH 02/14] rust: Reintroduce TARGET_RUST_CPU_INFO hook
[PATCH 03/14] rust: Reintroduce TARGET_RUST_OS_INFO hook
[PATCH 04/14] rust: Implement TARGET_RUST_CPU_INFO for i[34567]86-*-*
[PATCH 05/14] rust: Implement TARGET_RUST_OS_INFO for *-*-darwin*
[PATCH 06/14] rust: Implement TARGET_RUST_OS_INFO for *-*-freebsd*
[PATCH 07/14] rust: Implement TARGET_RUST_OS_INFO for *-*-netbsd*
[PATCH 08/14] rust: Implement TARGET_RUST_OS_INFO for *-*-openbsd*
[PATCH 09/14] rust: Implement TARGET_RUST_OS_INFO for *-*-solaris2*.
[PATCH 10/14] rust: Implement TARGET_RUST_OS_INFO for *-*-dragonfly*
[PATCH 11/14] rust: Implement TARGET_RUST_OS_INFO for *-*-vxworks*
[PATCH 12/14] rust: Implement TARGET_RUST_OS_INFO for *-*-fuchsia*.
[PATCH 13/14] rust: Implement TARGET_RUST_OS_INFO for
[PATCH 14/14] rust: Implement TARGET_RUST_OS_INFO for *-*-*linux*.



Thanks for eventually getting round to this.

As the co-author of this patch series, I'm not going to look at it.

FWIW, these being Rust-specific target changes isolated to just
Rust-specific files, you should have the autonomy to commit without
needing any request for review - at least this is my understanding from when
I have made D-specific target changes in the past that have not touched
common back-end headers.

I'll let someone else confirm and check over the shared parts touched by
the patch however.


I confirm.  I briefly went over the shared parts and they look OK.

Thanks,
Richard.


For reviewers, this is pretty much a mirror of the D front-end's CPU and
OS-specific target hooks (D has built-in version identifiers, not
built-in attributes, but both Rust and D are otherwise the same in the
kind of information exposed by them).


[1]: https://github.com/Rust-GCC/gccrs/pull/1543



The other GitHub pull request that added these is here.

https://github.com/Rust-GCC/gccrs/pull/1596

Regards,
Iain.


Re: [PATCH] RISC-V: Enable undefined support for RVV auto-vectorization[PR110751]

2023-09-21 Thread Kito Cheng
Cool, LGTM as well!

On Thu, Sep 21, 2023 at 10:11 AM Robin Dapp  wrote:
>
> Hi Juzhe,
>
> with the middle-end changes that's a nice improvement.  LGTM.
>
> Regards
>  Robin


[wwwdocs] OpenMP: gcc-14/changes.html and projects/gomp/ update

2023-09-21 Thread Tobias Burnus

This updates the pages for features implemented very recently (project status)
and for those implemented a while ago (gcc-14/changes/).

Comments? Remarks? Suggestions?

If not, I plan to commit it relatively soon - but follow-up changes are
of course possible :-)

Tobias
-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
OpenMP: gcc-14/changes.html and projects/gomp/ update

* htdocs/gcc-14/changes.html (OpenMP): Update for new features.
* htdocs/projects/gomp/index.html: Likewise, update the impl. status.
diff --git a/htdocs/gcc-14/changes.html b/htdocs/gcc-14/changes.html
index 65382746..2ca05ad0 100644
--- a/htdocs/gcc-14/changes.html
+++ b/htdocs/gcc-14/changes.html
@@ -55,11 +55,30 @@ a work-in-progress.
   The requires directive's unified_address
   requirement is now fulfilled by both AMD GCN and nvptx devices.
 
+
+  OpenMP 5.1: Support was added for collapsing imperfectly nested loops and
+  using present as map-type modifier and in
+  defaultmap.
+
 
   OpenMP 5.2: The OMP_TARGET_OFFLOAD=mandatory handling has
   been updated for the clarifications and changes of the 5.2 specification.
   For Fortran, the list of directives permitted in Fortran pure procedures
-  was extended.
+  was extended. Additionally, the spec change has been implemented for
+  default implicit mapping of C/C++ pointers pointing to unmapped storage.
+
+
+  OpenMP 6.0 preview (TR11): The decl attribute is now
+  supported in C++ 11 attributes.
+
+
+  The https://gcc.gnu.org/onlinedocs/libgomp/";>GNU Offloading and
+  Multi Processing Runtime Library Manual has been updated and extended,
+  improving especially the ICV description, memory allocation, and the
+  description of the environment variables and OpenMP routines. On Linux,
+  the https://github.com/numactl/numactl";>libnuma is now used
+  for allocators requesting the nearest-partition trait as detailed in the
+  manual.
 
   
   
diff --git a/htdocs/projects/gomp/index.html b/htdocs/projects/gomp/index.html
index 04bfd908..7f0b97c3 100644
--- a/htdocs/projects/gomp/index.html
+++ b/htdocs/projects/gomp/index.html
@@ -479,8 +479,8 @@ than listed, depending on resolved corner cases and optimizations.
   
   
 allocate directive
-No
-
+GCC 14
+Only C, only stack variables
   
   
 Discontiguous array section with target update construct
@@ -554,8 +554,8 @@ than listed, depending on resolved corner cases and optimizations.
   
   
 align clause in allocate directive
-No
-
+GCC 14
+Only C (and only stack variables)
   
   
 align modifier in allocate clause
@@ -996,7 +996,7 @@ error.
   
   
 The decl attribute was added to the C++ attribute syntax
-No
+GCC 14
 
   
   


Re: [PATCH] libgomp, nvptx, amdgcn: parallel reverse offload

2023-09-21 Thread Tobias Burnus

Hi Andrew, hi Thomas, hi all,

@Thomas: I wouldn't mind if you could glance at the nvptx/CUDA bits.

On 12.09.23 16:27, Andrew Stubbs wrote:

This patch implements parallel execution of OpenMP reverse offload
kernels.
...
The device threads that sent requests are still blocked waiting for
the completion signal, but any other threads may continue as usual.

Which matches the spec. (Except that, starting with TR12, a user may
also use the 'nowait' clause on the reverse-offload target directive.)

+++ b/libgomp/config/nvptx/target.c
@@ -93,7 +93,6 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t 
mapnum,
   void **hostaddrs, size_t *sizes, unsigned short *kinds,
   unsigned int flags, void **depend, void **args)
  {

...

+  if ((unsigned int) (index + 1) < GOMP_REV_OFFLOAD_VAR->consumed)
+abort ();  /* Overflow.  */


[I assume the idea is that this gets diagnosed by the host (via the
GOMP_PLUGIN_fatal) and that that diagnosis is faster than the
propagation of the abort() from the device to the host such that the
message is always printed.]

Should there be an "Error message is printed via the nvptx plugin on the
host" or something along this line?



+++ b/libgomp/libgomp.texi

...

+* GOMP_REVERSE_OFFLOAD_THREADS:: Set the maximum number of host threads
  @end menu

...

+@node GOMP_REVERSE_OFFLOAD_THREADS
+@section @env{GOMP_REVERSE_OFFLOAD_THREADS} -- Set the maximum number of host 
threads


Thanks but can you also update the gcn/nvptx description? We currently have:
https://gcc.gnu.org/onlinedocs/libgomp/index.html

--
@section AMD Radeon (GCN)
...
@item Reverse offload regions (i.e. @code{target} regions with
  @code{device(ancestor:1)}) are processed serially per @code{target} region
  such that the next reverse offload region is only executed after the 
previous
  one returned.
...
@section nvptx
...
@item Reverse offload regions (i.e. @code{target} regions with
  @code{device(ancestor:1)}) are processed serially per @code{target} region
  such that the next reverse offload region is only executed after the 
previous
  one returned.
--

Possibly by also adding a @ref{GOMP_REVERSE_OFFLOAD_THREADS} such that a user
can find this.

(I wonder whether the "UINTMAX (= 4294967296)" should be documented or whether
that's an implementation detail we do not need to document. But given that a
long-running code taking 4 weeks can still issue 1775 reverse offloads per
second before exhausting the count, that should be a nonissue.)




+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1639,9 +1639,10 @@ nvptx_goacc_asyncqueue_construct (unsigned int flags)
  }

  struct goacc_asyncqueue *
-GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+GOMP_OFFLOAD_openacc_async_construct (int device)
  {
-  return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
+  nvptx_attach_host_thread_to_device (device);
+  return nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
  }


That's not really new and we have plenty of code of this kind, but isn't
this a race if this is called nearly instantaneously for multiple
devices? (Still, the new code is surely better than the previous one.)

@Thomas: ?

I will have another look after the Cauldron, but I think the patch is
otherwise okay.

Tobias

-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955


Re: [PATCH] RISC-V: Enable undefined support for RVV auto-vectorization[PR110751]

2023-09-21 Thread Lehua Ding

Committed, thanks Robin and Kito.

On 2023/9/21 17:19, Kito Cheng wrote:

Cool, LGTM as well!

On Thu, Sep 21, 2023 at 10:11 AM Robin Dapp  wrote:


Hi Juzhe,

with the middle-end changes that's a nice improvement.  LGTM.

Regards
  Robin


--
Best,
Lehua (RiVAI)
lehua.d...@rivai.ai


[PATCH V2] RISC-V: Adjusting the comments of the emit_vlmax_insn/emit_vlmax_insn_lra/emit_nonvlmax_insn functions

2023-09-21 Thread Lehua Ding
V2 Change: Use Robin's comments.

This patch adjusts the comments of the
emit_vlmax_insn/emit_vlmax_insn_lra/emit_nonvlmax_insn functions.
The purpose of the adjustment is to make it clear that vlmax here is not
VLMAX as defined inside the RVV ISA. This is because this function is used
by RVV mode (e.g. RVVM1SImode) in addition to VLS mode (V16QI). For RVV mode,
it means the same thing, for VLS mode, it indicates setting the vl to the
number of units of the mode. Changed the comment because I didn't think of
a better name. If there is a suitable name, feel free to discuss it.

gcc/ChangeLog:

* config/riscv/riscv-v.cc (emit_vlmax_insn): Adjust comments.
(emit_nonvlmax_insn): Adjust comments.
(emit_vlmax_insn_lra): Adjust comments.

Co-Authored-By: Robin Dapp 

---
 gcc/config/riscv/riscv-v.cc | 33 +
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 64a71a128d4..bb08289d39a 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -347,33 +347,42 @@ private:
   expand_operand m_ops[MAX_OPERANDS];
 };

-/* Emit RVV insn which vl is VLMAX.
-   This function can only be used before LRA pass or
-   for VLS_AVL_IMM modes.  */
+/* Emit an RVV insn with a vector length that equals the number of units of the
+   vector mode.  For VLA modes this corresponds to VLMAX.
+
+   Unless the vector length can be encoded in the vsetivl[i] instruction this
+   function must only be used as long as we can create pseudo registers. This 
is
+   because it will set a pseudo register to VLMAX using vsetvl and use this as
+   definition for the vector length.  */
 void
 emit_vlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops)
 {
   insn_expander e (insn_flags, true);
+  gcc_assert (can_create_pseudo_p () || const_vlmax_p (e.get_vtype_mode 
(ops)));
+
   e.emit_insn ((enum insn_code) icode, ops);
 }

-/* Emit RVV insn which vl is VL.  */
+/* Like emit_vlmax_insn but must only be used when we cannot create pseudo
+   registers anymore.  This function, however, takes a predefined vector length
+   from the value in VL. */
 void
-emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
+emit_vlmax_insn_lra (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
 {
-  insn_expander e (insn_flags, false);
+  gcc_assert (!can_create_pseudo_p ());
+
+  insn_expander e (insn_flags, true);
   e.set_vl (vl);
   e.emit_insn ((enum insn_code) icode, ops);
 }

-/* Emit RVV insn which vl is VL but the AVL_TYPE insn attr is VLMAX.
-   This function used after LRA pass that cann't create pseudo register.  */
+/* Emit an RVV insn with a predefined vector length.  Contrary to
+   emit_vlmax_insn the instruction's vector length is not deduced from its mode
+   but taken from  the value in VL.  */
 void
-emit_vlmax_insn_lra (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
+emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
 {
-  gcc_assert (!can_create_pseudo_p ());
-
-  insn_expander e (insn_flags, true);
+  insn_expander e (insn_flags, false);
   e.set_vl (vl);
   e.emit_insn ((enum insn_code) icode, ops);
 }
--
2.36.3



Re: [PATCH] RISC-V: Adjusting the comments of the emit_vlmax_insn/emit_vlmax_insn_lra/emit_nonvlmax_insn functions

2023-09-21 Thread Lehua Ding

Hi Robin,


I once had different comments for those but either I never pushed them
or they got buried in the process of refactoring.  The explanatory
comment explaining vlmax is also in "nowhere land" below autovec_use_vlmax_p.
(it says vsetvli instead of vsetvl as well...)  It would be useful
to move it to above the function comments you touch.


I would like to move this comment into the body of insn_expander::emit_insn,
before the avl is set, in another patch which adds the VLS avl_type.





+/* Emit RVV insn which vl is the number of units of the vector mode.
+   This function can only be used before LRA pass or for VLS_AVL_IMM modes.  */


Emit an RVV insn with a vector length that equals the number of units of
the vector mode.  For VLA modes this corresponds to VLMAX.

Unless the vector length can be encoded in the vsetivl[i] instruction this
function must only be used as long as we can create pseudo registers.
This is because it will set a pseudo register to VLMAX using vsetvl and
use this as definition for the vector length.


Besides, we could add a const_vlmax_p () || can_create_pseudo_p assert here?



+/* Like emit_vlmax_insn but can be only used after LRA pass that can't create
+   pseudo register.  */


Like emit_vlmax_insn but must only be used when we cannot create pseudo
registers anymore.  This function, however, takes a predefined vector
length from the value in VL.


+/* Emit RVV insn which vl is the VL argument.  */
+emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)


I think I renamed this to emit_len_insn or something before but Juzhe didn't
like it ;)

How about something like:
Emit an RVV insn with a predefined vector length.  Contrary to emit_vlmax_insn
the instruction's vector length is not deduced from its mode but taken from
the value in VL.


Thank you very much, I used all of them. Here the V2 patch:
https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631114.html

--
Best,
Lehua (RiVAI)
lehua.d...@rivai.ai


Re: [PATCH V2] RISC-V: Adjusting the comments of the emit_vlmax_insn/emit_vlmax_insn_lra/emit_nonvlmax_insn functions

2023-09-21 Thread Robin Dapp
LGTM.

Regards
 Robin


Re: [PATCH V2] RISC-V: Adjusting the comments of the emit_vlmax_insn/emit_vlmax_insn_lra/emit_nonvlmax_insn functions

2023-09-21 Thread Lehua Ding

Committed, thanks Robin.

On 2023/9/21 17:57, Robin Dapp wrote:

LGTM.

Regards
  Robin


--
Best,
Lehua (RiVAI)
lehua.d...@rivai.ai



Re: [Committed] RISC-V: Support VLS unary floating-point patterns

2023-09-21 Thread Palmer Dabbelt

On Wed, 20 Sep 2023 10:47:23 PDT (-0700), Patrick O'Neill wrote:

Juzhe,

On a more general note, are we expecting #include <math.h> to cause a
testcase to fail?

My motivation is to make the testsuite less noisy when checking for
regressions. For example, a patch like this one:
https://patchwork.sourceware.org/project/gcc/patch/20230920023059.1728132-1-pan2...@intel.com/
is showing 4 new failures on rv32gcv from the {dg-do compile} testcases
that #include <math.h>. I might be wrong, but those don't look like real
failures to me [1][2][3].

On glibc rv64gcv I'm seeing tests like:
gcc.target/riscv/rvv/autovec/unop/vnot-rv32gcv.c
fail with similar missing stubs-ilp32d.h errors.

I want to sanity-check with other people that they are seeing similar
errors and that these errors indicate something wrong with the testsuite.
If nobody else is seeing these errors, I'd like to hear how you're
running the testsuite so I can debug the riscv-gnu-toolchain repo.

Patrick

[1]:
Executing on host:
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/xgcc
-B/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c
-march=rv32gcv -mabi=ilp32d -mcmodel=medlow -fdiagnostics-plain-output 
-O3 -ftree-vectorize -march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize
-fno-vect-cost-model -ffast-math -fno-schedule-insns
-fno-schedule-insns2 -S   -o math-ceil-1.s (timeout = 600)
spawn -ignore SIGHUP
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/xgcc
-B/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c
-march=rv32gcv -mabi=ilp32d -mcmodel=medlow -fdiagnostics-plain-output
-O3 -ftree-vectorize -march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize
-fno-vect-cost-model -ffast-math -fno-schedule-insns
-fno-schedule-insns2 -S -o math-ceil-1.s
In file included from
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/features.h:515,
  from
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/bits/libc-header-start.h:33,
  from
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/math.h:27,
  from
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/test-math.h:1,
  from
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c:5:
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/gnu/stubs.h:17:11:
fatal error: gnu/stubs-lp64d.h: No such file or directory


That looks like a toolchain build/configuration issue, not a test issue.  
IIRC this comes up from time to time, something's probably broken in 
riscv-gnu-toolchain but I'm not sure what's wrong.


I get a working setup with just `./configure --enable-linux 
--disable-multilib` and the latest riscv-gnu-toolchain master.  How are 
you building things?



compilation terminated.
compiler exited with status 1
FAIL: gcc.target/riscv/rvv/autovec/math-ceil-1.c -O3 -ftree-vectorize
(test for excess errors)

[2]:
https://github.com/ewlu/riscv-gnu-toolchain/issues/170

[3]:
This also extends beyond math.h. I'm seeing similar failures for
testcases like
gcc.target/riscv/rvv/autovec/cond/cond_convert_int2float-rv64-1.c that
#include .


On 9/19/23 18:12, Patrick O'Neill wrote:


I'll let it run overnight and see if this helps. Even before this patch,
I was seeing 233 stubs related failures for rv32gcv and 7 for rv64gcv so
this won't fix all the issues.

It's easily replicated using upstream riscv-gnu-toolchain
git clone https://github.com/riscv-collab/riscv-gnu-toolchain
cd riscv-gnu-toolchain
git submodule update --init gcc
cd gcc
git pull master
cd ..
mkdir build
cd build
../configure --prefix=$(pwd) --with-arch=rv32gcv --with-abi=ilp32d
make report-linux -j32

Then search for "stubs" in the debug logs
(/build-gcc-linux-stage2/gcc/testsuite/*.log)

Patrick

On 9/19/23 17:54, juzhe.zh...@rivai.ai wrote:

I think we could remove math.h.

Hi, @Patrick. Could you verify it?

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/def.h
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/def.h
index 2292372d7a3..674098e9ba6 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/def.h
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/def.h
@@ -1,5 +1,4 @@
 #include 
-#include 

and commit it.

Thanks.



[PATCH v2] RISC-V: Support ceil and ceilf auto-vectorization

2023-09-21 Thread pan2 . li
From: Pan Li 

This patch would like to support auto-vectorization for both the
ceil and ceilf of math.h. It depends on the -ffast-math option.

When we would like to call ceil/ceilf like v2 = ceil (v1), we will
convert it into below insn (reference the implementation of llvm).

* vfcvt.x.f v3, v1, RUP
* vfcvt.f.x v2, v3

However, the floating point value may not need the cvt as above if
its mantissa is zero. For example single precision floating point below.

  +-----------+---------------+
  | float     | binary layout |
  +-----------+---------------+
  | 8388607.5 | 0x4affffff    |
  | 8388608.0 | 0x4b000000    |
  | 8388609.0 | 0x4b000001    |
  +-----------+---------------+

All single floating point greater than 8388608.0 will have all zero mantissa.
We leverage vmflt and mask to filter them out in vector and only do the
cvt on mask.

Before this patch:
math-ceil-1.c:21:1: missed: couldn't vectorize loop
  ...
.L3:
  flw fa0,0(s0)
  addis0,s0,4
  addis1,s1,4
  callceilf
  fsw fa0,-4(s1)
  bne s0,s2,.L3

After this patch:
  ...
  fsrmi   3
.L4:
  vfabs.v v0,v1
  vmv1r.v v2,v1
  vmflt.vvv0,v0,v4
  sub a3,a3,a4
  vfcvt.x.f.v v3,v1,v0.t
  vfcvt.f.x.v v2,v3,v0.t
  vfsgnj.vv   v2,v2,v1
  bne .L4
.L14:
  fsrma6
  ret

Please note VLS mode is also involved in this patch and covered by the
test cases.

gcc/ChangeLog:

* config/riscv/autovec.md (ceil2): New pattern.
* config/riscv/riscv-protos.h (enum insn_flags): New enum type.
(enum insn_type): Ditto.
(expand_vec_ceil): New function decl.
* config/riscv/riscv-v.cc (gen_ceil_const_fp): New function impl.
(expand_vec_float_cmp_mask): Ditto.
(expand_vec_copysign): Ditto.
(expand_vec_ceil): Ditto.
* config/riscv/vector-iterators.md: Add VLS mode to VCONVERT.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/math-ceil-1.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-2.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-3.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-4.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-1.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-2.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-3.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-4.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-double.h: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-single.h: New test.
* gcc.target/riscv/rvv/autovec/test-math.h: New test.

Signed-off-by: Pan Li 
---
 gcc/config/riscv/autovec.md   |  16 +++
 gcc/config/riscv/riscv-protos.h   |   5 +
 gcc/config/riscv/riscv-v.cc   | 116 ++
 gcc/config/riscv/vector-iterators.md  |  12 ++
 .../riscv/rvv/autovec/math-ceil-1.c   |  26 
 .../riscv/rvv/autovec/math-ceil-2.c   |  26 
 .../riscv/rvv/autovec/math-ceil-3.c   |  28 +
 .../riscv/rvv/autovec/math-ceil-4.c   |  28 +
 .../riscv/rvv/autovec/math-ceil-run-1.c   |   4 +
 .../riscv/rvv/autovec/math-ceil-run-2.c   |   4 +
 .../riscv/rvv/autovec/math-ceil-run-3.c   |   4 +
 .../riscv/rvv/autovec/math-ceil-run-4.c   |   4 +
 .../riscv/rvv/autovec/math-ceil-run-double.h  |  36 ++
 .../riscv/rvv/autovec/math-ceil-run-single.h  |  36 ++
 .../gcc.target/riscv/rvv/autovec/test-math.h  |  40 ++
 15 files changed, 385 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-4.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-4.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-double.h
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-single.h
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/test-math.h

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 493d5745485..36ed839aa5b 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2374,3 +2374,19 @@ (define_expand "avg3_ceil"
   riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops3);
   DONE;
 })
+
+;; -
+;;  [FP] Math.h.
+;; -
+;; Includes:
+;; - ceil/ceilf
+;; -

Re: [PATCH v2] RISC-V: Support ceil and ceilf auto-vectorization

2023-09-21 Thread juzhe.zh...@rivai.ai
+(define_expand "ceil2"
+  [(match_operand:V_VLSF 0 "register_operand")
+   (match_operand:V_VLSF 1 "register_operand")]
+  "TARGET_VECTOR"
+  {
+riscv_vector::expand_vec_ceil (operands[0], operands[1], mode, 
mode);
+DONE;
+  }

I think you should add !flag_trapping_math && !flag_rounding_math

You can try -ftrapping-math or -frounding-math; LLVM fails to vectorize in those cases.

Like  X86:

(define_expand "round2"
  [(match_operand:X87MODEF 0 "register_operand")
   (match_operand:X87MODEF 1 "nonimmediate_operand")]
  "(TARGET_USE_FANCY_MATH_387
&& (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
  || TARGET_MIX_SSE_I387)
&& flag_unsafe_math_optimizations
&& (flag_fp_int_builtin_inexact || !flag_trapping_math))
   || (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH
   && !flag_trapping_math && !flag_rounding_math)"

Otherwise LGTM.


juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2023-09-21 18:32
To: gcc-patches
CC: juzhe.zhong; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v2] RISC-V: Support ceil and ceilf auto-vectorization
From: Pan Li 
 
This patch would like to support auto-vectorization for both the
ceil and ceilf of math.h. It depends on the -ffast-math option.
 
When we would like to call ceil/ceilf like v2 = ceil (v1), we will
convert it into below insn (reference the implementation of llvm).
 
* vfcvt.x.f v3, v1, RUP
* vfcvt.f.x v2, v3
 
However, the floating point value may not need the cvt as above if
its mantissa is zero. For example single precision floating point below.
 
  +-----------+---------------+
  | float     | binary layout |
  +-----------+---------------+
  | 8388607.5 | 0x4affffff    |
  | 8388608.0 | 0x4b000000    |
  | 8388609.0 | 0x4b000001    |
  +-----------+---------------+
 
All single-precision floating-point values greater than 8388608.0 will have an all-zero mantissa.
We leverage vmflt and mask to filter them out in vector and only do the
cvt on mask.
 
Before this patch:
math-ceil-1.c:21:1: missed: couldn't vectorize loop
  ...
.L3:
  flw fa0,0(s0)
  addi s0,s0,4
  addi s1,s1,4
  call ceilf
  fsw fa0,-4(s1)
  bne s0,s2,.L3
 
After this patch:
  ...
  fsrmi   3
.L4:
  vfabs.v v0,v1
  vmv1r.v v2,v1
  vmflt.vv v0,v0,v4
  sub a3,a3,a4
  vfcvt.x.f.v v3,v1,v0.t
  vfcvt.f.x.v v2,v3,v0.t
  vfsgnj.vv   v2,v2,v1
  bne .L4
.L14:
  fsrm a6
  ret
 
Please note VLS mode is also involved in this patch and covered by the
test cases.
 
gcc/ChangeLog:
 
* config/riscv/autovec.md (ceil2): New pattern.
* config/riscv/riscv-protos.h (enum insn_flags): New enum type.
(enum insn_type): Ditto.
(expand_vec_ceil): New function decl.
* config/riscv/riscv-v.cc (gen_ceil_const_fp): New function impl.
(expand_vec_float_cmp_mask): Ditto.
(expand_vec_copysign): Ditto.
(expand_vec_ceil): Ditto.
* config/riscv/vector-iterators.md: Add VLS mode to VCONVERT.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/math-ceil-1.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-2.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-3.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-4.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-1.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-2.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-3.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-4.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-double.h: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-single.h: New test.
* gcc.target/riscv/rvv/autovec/test-math.h: New test.
 
Signed-off-by: Pan Li 
---
gcc/config/riscv/autovec.md   |  16 +++
gcc/config/riscv/riscv-protos.h   |   5 +
gcc/config/riscv/riscv-v.cc   | 116 ++
gcc/config/riscv/vector-iterators.md  |  12 ++
.../riscv/rvv/autovec/math-ceil-1.c   |  26 
.../riscv/rvv/autovec/math-ceil-2.c   |  26 
.../riscv/rvv/autovec/math-ceil-3.c   |  28 +
.../riscv/rvv/autovec/math-ceil-4.c   |  28 +
.../riscv/rvv/autovec/math-ceil-run-1.c   |   4 +
.../riscv/rvv/autovec/math-ceil-run-2.c   |   4 +
.../riscv/rvv/autovec/math-ceil-run-3.c   |   4 +
.../riscv/rvv/autovec/math-ceil-run-4.c   |   4 +
.../riscv/rvv/autovec/math-ceil-run-double.h  |  36 ++
.../riscv/rvv/autovec/math-ceil-run-single.h  |  36 ++
.../gcc.target/riscv/rvv/autovec/test-math.h  |  40 ++
15 files changed, 385 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-2.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-3.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-4.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-3.c
create m

Re: [Committed] RISC-V: Support VLS unary floating-point patterns

2023-09-21 Thread Kito Cheng
GCC has built in function[1] for those math function stuff, e.g.
__builtin_ceilf, so we don't really need math.h :)

[1] https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html

On Thu, Sep 21, 2023 at 11:20 AM Palmer Dabbelt  wrote:
>
> On Wed, 20 Sep 2023 10:47:23 PDT (-0700), Patrick O'Neill wrote:
> > Juzhe,
> >
> > On a more general note, are we expecting #include <math.h> to cause a
> > testcase to fail?
> >
> > My motivation is to make the testsuite less noisy when checking for
> > regressions. For example, a patch like this one:
> > https://patchwork.sourceware.org/project/gcc/patch/20230920023059.1728132-1-pan2...@intel.com/
> > is showing 4 new failures on rv32gcv from the {dg-do compile} testcases
> > that #include <math.h>. I might be wrong, but those don't look like real
> > failures to me [1][2][3].
> >
> > On glibc rv64gcv I'm seeing tests like:
> > gcc.target/riscv/rvv/autovec/unop/vnot-rv32gcv.c
> > fail with similar missing stubs-ilp32d.h errors.
> >
> > I want to sanity-check with other people that they are seeing similar
> > errors and that these errors indicate something wrong with the testsuite.
> > If nobody else is seeing these errors, I'd like to hear how you're
> > running the testsuite so I can debug the riscv-gnu-toolchain repo.
> >
> > Patrick
> >
> > [1]:
> > Executing on host:
> > /github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/xgcc
> > -B/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/
> > /github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c
> > -march=rv32gcv -mabi=ilp32d -mcmodel=medlow -fdiagnostics-plain-output
> > -O3 -ftree-vectorize -march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize
> > -fno-vect-cost-model -ffast-math -fno-schedule-insns
> > -fno-schedule-insns2 -S   -o math-ceil-1.s (timeout = 600)
> > spawn -ignore SIGHUP
> > /github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/xgcc
> > -B/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/
> > /github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c
> > -march=rv32gcv -mabi=ilp32d -mcmodel=medlow -fdiagnostics-plain-output
> > -O3 -ftree-vectorize -march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize
> > -fno-vect-cost-model -ffast-math -fno-schedule-insns
> > -fno-schedule-insns2 -S -o math-ceil-1.s
> > In file included from
> > /github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/features.h:515,
> >   from
> > /github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/bits/libc-header-start.h:33,
> >   from
> > /github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/math.h:27,
> >   from
> > /github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/test-math.h:1,
> >   from
> > /github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c:5:
> > /github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/gnu/stubs.h:17:11:
> > fatal error: gnu/stubs-lp64d.h: No such file or directory
>
> That looks like a toolchain build/configuration issue, not a test issue.
> IIRC this comes up from time to time, something's probably broken in
> riscv-gnu-toolchain but I'm not sure what's wrong.
>
> I get a working setup with just `./configure --enable-linux
> --disable-multilib` and the latest riscv-gnu-toolchain master.  How are
> you building things?
>
> > compilation terminated.
> > compiler exited with status 1
> > FAIL: gcc.target/riscv/rvv/autovec/math-ceil-1.c -O3 -ftree-vectorize
> > (test for excess errors)
> >
> > [2]:
> > https://github.com/ewlu/riscv-gnu-toolchain/issues/170
> >
> > [3]:
> > This also extends beyond math.h. I'm seeing similar failures for
> > testcases like
> > gcc.target/riscv/rvv/autovec/cond/cond_convert_int2float-rv64-1.c that
> > #include .
> >
> >
> > On 9/19/23 18:12, Patrick O'Neill wrote:
> >>
> >> I'll let it run overnight and see if this helps. Even before this patch,
> >> I was seeing 233 stubs related failures for rv32gcv and 7 for rv64gcv so
> >> this won't fix all the issues.
> >>
> >> It's easily replicated using upstream riscv-gnu-toolchain
> >> git clone https://github.com/riscv-collab/riscv-gnu-toolchain
> >> cd riscv-gnu-toolchain
> >> git submodule update --init gcc
> >> cd gcc
> >> git pull master
> >> cd ..
> >> mkdir build
> >> cd build
> >> ../configure --prefix=$(pwd) --with-arch=rv32gcv --with-abi=ilp32d
> >> make report-linux -j32
> >>
> >> Then search for "stubs" in the debug logs
> >> (/build-gcc-linux-stage2/gcc/testsu

Re: [Committed] RISC-V: Support VLS unary floating-point patterns

2023-09-21 Thread Palmer Dabbelt

On Thu, 21 Sep 2023 04:24:48 PDT (-0700), kito.ch...@sifive.com wrote:

GCC has built in function[1] for those math function stuff, e.g.
__builtin_ceilf, so we don't really need math.h :)

[1] https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html


That's probably the right way to go for the test suite.  Something's 
still wrong somewhere with Patrick's builds, though...




On Thu, Sep 21, 2023 at 11:20 AM Palmer Dabbelt  wrote:


On Wed, 20 Sep 2023 10:47:23 PDT (-0700), Patrick O'Neill wrote:
> Juzhe,
>
> On a more general note, are we expecting #include <math.h> to cause a
> testcase to fail?
>
> My motivation is to make the testsuite less noisy when checking for
> regressions. For example, a patch like this one:
> 
https://patchwork.sourceware.org/project/gcc/patch/20230920023059.1728132-1-pan2...@intel.com/
> is showing 4 new failures on rv32gcv from the {dg-do compile} testcases
> that #include <math.h>. I might be wrong, but those don't look like real
> failures to me [1][2][3].
>
> On glibc rv64gcv I'm seeing tests like:
> gcc.target/riscv/rvv/autovec/unop/vnot-rv32gcv.c
> fail with similar missing stubs-ilp32d.h errors.
>
> I want to sanity-check with other people that they are seeing similar
> errors and that these errors indicate something wrong with the testsuite.
> If nobody else is seeing these errors, I'd like to hear how you're
> running the testsuite so I can debug the riscv-gnu-toolchain repo.
>
> Patrick
>
> [1]:
> Executing on host:
> 
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/xgcc
> 
-B/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/
> 
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c
> -march=rv32gcv -mabi=ilp32d -mcmodel=medlow -fdiagnostics-plain-output
> -O3 -ftree-vectorize -march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize
> -fno-vect-cost-model -ffast-math -fno-schedule-insns
> -fno-schedule-insns2 -S   -o math-ceil-1.s (timeout = 600)
> spawn -ignore SIGHUP
> 
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/xgcc
> 
-B/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/
> 
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c
> -march=rv32gcv -mabi=ilp32d -mcmodel=medlow -fdiagnostics-plain-output
> -O3 -ftree-vectorize -march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize
> -fno-vect-cost-model -ffast-math -fno-schedule-insns
> -fno-schedule-insns2 -S -o math-ceil-1.s
> In file included from
> 
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/features.h:515,
>   from
> 
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/bits/libc-header-start.h:33,
>   from
> 
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/math.h:27,
>   from
> 
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/test-math.h:1,
>   from
> 
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c:5:
> 
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/gnu/stubs.h:17:11:
> fatal error: gnu/stubs-lp64d.h: No such file or directory

That looks like a toolchain build/configuration issue, not a test issue.
IIRC this comes up from time to time, something's probably broken in
riscv-gnu-toolchain but I'm not sure what's wrong.

I get a working setup with just `./configure --enable-linux
--disable-multilib` and the latest riscv-gnu-toolchain master.  How are
you building things?

> compilation terminated.
> compiler exited with status 1
> FAIL: gcc.target/riscv/rvv/autovec/math-ceil-1.c -O3 -ftree-vectorize
> (test for excess errors)
>
> [2]:
> https://github.com/ewlu/riscv-gnu-toolchain/issues/170
>
> [3]:
> This also extends beyond math.h. I'm seeing similar failures for
> testcases like
> gcc.target/riscv/rvv/autovec/cond/cond_convert_int2float-rv64-1.c that
> #include .
>
>
> On 9/19/23 18:12, Patrick O'Neill wrote:
>>
>> I'll let it run overnight and see if this helps. Even before this patch,
>> I was seeing 233 stubs related failures for rv32gcv and 7 for rv64gcv so
>> this won't fix all the issues.
>>
>> It's easily replicated using upstream riscv-gnu-toolchain
>> git clone https://github.com/riscv-collab/riscv-gnu-toolchain
>> cd riscv-gnu-toolchain
>> git submodule update --init gcc
>> cd gcc
>> git pull master
>> cd ..
>> mkdir build
>> cd build
>> ../configure --prefix=$(pwd) --with-arch=rv32gcv --with-abi=ilp32d
>> make report-linux -j32
>>
>> Then search for "stubs" in the debug logs
>> (/build-gc

Re: [PATCH v2 1/2] c++: Initial support for P0847R7 (Deducing This) [PR102609]

2023-09-21 Thread waffl3x
> This seems like a reasonable place for it since 'this' is supposed to
> precede the decl-specifiers, and since we are parsing initial attributes
> here rather than in the caller. You will want to give an error if
> found_decl_spec is set. And elsewhere complain about 'this' on
> parameters after the first (in cp_parser_parameter_declaration_list?),
> or in a non-member/lambda (in grokdeclarator?).

Bringing this back up, I recalled another detail regarding this.

I'm pretty sure that found_decl_spec can be false when parsing the
second or later decl-specifier. I tested it quickly and I believe I am
correct. I raise this as my diagnostics patch introduces another
variable to track whether we are on the first decl-specifier, given the
results of my quick test, I believe that was the correct choice.

This kinda unclear machinery is what makes me really want to refactor
this code, but I've resisted as it would be inappropriate to try to do
so while implementing a feature. Once I am finished implementing
`deducing this` would you be open to me refactoring grokdeclarator and
its various auxiliary functions?

As for where the complaining happens, I believe I implemented this
particular error in cp_parser_decl_specifier_seq, I don't plan to be
stubborn on any of the diagnostic code though as I'm pretty unhappy
with how it got scattered about. I intend to get more input on that
after I finish v2 of the diagnostic patch though.


> That's a good point, but the flag you chose seems even more general purpose.

Yeah, I had to just settle on it because I was bikeshedding it for a
couple hours despite being very unhappy with it.

> A better option might be, instead of putting this flag on the PARM_DECL,
> to put it on the short-lived TREE_LIST which is only used for
> communication between cp_parser_parameter_declaration_list and
> grokparms, and have grokdeclarator grab it from
> declarator->u.function.parameters?

That does sound ideal! I will look into doing it this way.

> Generally the flags that aren't specifically specified to be
> language-specific are reserved for language-independent uses; even if
> only one front-end actually uses the feature, it should be for
> communication to language-independent code rather than communication
> within the particular front-end.

Ah okay, that makes perfect sense to me, understood.

> The patch modified tree-core.h to
> refer to a macro in cp-tree.h.

Yeah, I wasn't sure about doing that, I will refrain from that in the
future, (along with removing it from v3, but the other change you
suggested should eliminate the referred to macro anyway.)

> > Yeah, I separated all the diagnostics out into the second patch. This
> > patch was meant to include the bare minimum of what was necessary to
> > get the feature functional. As for the diagnostics patch, I'm not happy
> > with how scattered about the code base it is, but you'll be able to
> > judge for yourself when I resubmit that patch, hopefully later today.
> > So not to worry, I didn't neglect diagnostics, it's just in a follow
> > up. The v1 of it was submitted on August 31st if you want to find it,
> > but I wouldn't recommend it. I misunderstood how some things were to be
> > formatted so it's probably best you just wait for me to finish a v2 of
> > it.
> 
> 
> Ah, oops, I assumed that v2 completely replaced v1.

I had intended to complete v2 of it quite some time ago, I've just been
busy. Today as well I got sidetracked with some job hunting, but I plan
on finishing v3 of the initial support patch (the one related to this
thread) tonight at the very least. I can't commit to diagnostics v2
tonight, but if it happens it happens. :)

I might even have to leave out communicating that a PARM_DECL is an
xobj parm cp_parser_parameter_declaration_list if I have too hard a
time figuring out how to work it in, if that is the case then I will
make that change in a v4.

Alex






Re: [PATCH v2] RISC-V: Support ceil and ceilf auto-vectorization

2023-09-21 Thread juzhe.zh...@rivai.ai
Also, remove the math.h include.
Instead, please use __builtin_ceil.



juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2023-09-21 18:32
To: gcc-patches
CC: juzhe.zhong; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v2] RISC-V: Support ceil and ceilf auto-vectorization
From: Pan Li 
 
This patch would like to support auto-vectorization for both the
ceil and ceilf of math.h. It depends on the -ffast-math option.
 
When we would like to call ceil/ceilf like v2 = ceil (v1), we will
convert it into below insn (reference the implementation of llvm).
 
* vfcvt.x.f v3, v1, RUP
* vfcvt.f.x v2, v3
 
However, the floating point value may not need the cvt as above if
its mantissa is zero. For example single precision floating point below.
 
  +-----------+---------------+
  | float     | binary layout |
  +-----------+---------------+
  | 8388607.5 | 0x4affffff    |
  | 8388608.0 | 0x4b000000    |
  | 8388609.0 | 0x4b000001    |
  +-----------+---------------+
 
All single-precision floating-point values greater than 8388608.0 will have an all-zero mantissa.
We leverage vmflt and mask to filter them out in vector and only do the
cvt on mask.
 
Before this patch:
math-ceil-1.c:21:1: missed: couldn't vectorize loop
  ...
.L3:
  flw fa0,0(s0)
  addi s0,s0,4
  addi s1,s1,4
  call ceilf
  fsw fa0,-4(s1)
  bne s0,s2,.L3
 
After this patch:
  ...
  fsrmi   3
.L4:
  vfabs.v v0,v1
  vmv1r.v v2,v1
  vmflt.vv v0,v0,v4
  sub a3,a3,a4
  vfcvt.x.f.v v3,v1,v0.t
  vfcvt.f.x.v v2,v3,v0.t
  vfsgnj.vv   v2,v2,v1
  bne .L4
.L14:
  fsrm a6
  ret
 
Please note VLS mode is also involved in this patch and covered by the
test cases.
 
gcc/ChangeLog:
 
* config/riscv/autovec.md (ceil2): New pattern.
* config/riscv/riscv-protos.h (enum insn_flags): New enum type.
(enum insn_type): Ditto.
(expand_vec_ceil): New function decl.
* config/riscv/riscv-v.cc (gen_ceil_const_fp): New function impl.
(expand_vec_float_cmp_mask): Ditto.
(expand_vec_copysign): Ditto.
(expand_vec_ceil): Ditto.
* config/riscv/vector-iterators.md: Add VLS mode to VCONVERT.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/math-ceil-1.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-2.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-3.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-4.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-1.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-2.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-3.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-4.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-double.h: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-single.h: New test.
* gcc.target/riscv/rvv/autovec/test-math.h: New test.
 
Signed-off-by: Pan Li 
---
gcc/config/riscv/autovec.md   |  16 +++
gcc/config/riscv/riscv-protos.h   |   5 +
gcc/config/riscv/riscv-v.cc   | 116 ++
gcc/config/riscv/vector-iterators.md  |  12 ++
.../riscv/rvv/autovec/math-ceil-1.c   |  26 
.../riscv/rvv/autovec/math-ceil-2.c   |  26 
.../riscv/rvv/autovec/math-ceil-3.c   |  28 +
.../riscv/rvv/autovec/math-ceil-4.c   |  28 +
.../riscv/rvv/autovec/math-ceil-run-1.c   |   4 +
.../riscv/rvv/autovec/math-ceil-run-2.c   |   4 +
.../riscv/rvv/autovec/math-ceil-run-3.c   |   4 +
.../riscv/rvv/autovec/math-ceil-run-4.c   |   4 +
.../riscv/rvv/autovec/math-ceil-run-double.h  |  36 ++
.../riscv/rvv/autovec/math-ceil-run-single.h  |  36 ++
.../gcc.target/riscv/rvv/autovec/test-math.h  |  40 ++
15 files changed, 385 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-2.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-3.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-4.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-3.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-4.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-double.h
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-single.h
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/test-math.h
 
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 493d5745485..36ed839aa5b 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2374,3 +2374,19 @@ (define_expand "avg3_ceil"
   riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops3);
   DONE;
})
+
+;; -
+;;  [FP] Math.h.
+;; 

Re: [Committed] RISC-V: Remove math.h import to resolve missing stubs failures

2023-09-21 Thread juzhe.zh...@rivai.ai
Hi, Patrick.

GNU rvv intrinsic api test-generator has been merged:
https://github.com/riscv-non-isa/rvv-intrinsic-doc/commits/main 

Could you include the full RVV intrinsic API test in your test CI?
Currently, we don't include all of the API tests in the GCC testsuite since they are too 
big.



juzhe.zh...@rivai.ai
 
From: Patrick O'Neill
Date: 2023-09-21 01:51
To: Kito Cheng
CC: GCC Patches; Robin Dapp; 钟居哲
Subject: [Committed] RISC-V: Remove math.h import to resolve missing stubs 
failures
Committed. Thanks!
On 9/20/23 10:19, Kito Cheng wrote:
LGTM 

Patrick O'Neill  於 2023年9月20日 週三 18:07 寫道:
Resolves some of the missing stubs failures:
fatal error: gnu/stubs-lp64d.h: No such file or directory
compilation terminated.

2023-09-20 Juzhe Zhong 

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls/def.h: Remove unneeded math.h
import.

Tested-by: Patrick O'Neill 
---
Tested using 590a8bec3ed92118e084b0a1897d3314a666170e
glibc rv64gcv
glibc rv32gcv

glibc rv64gcv
Resolved failures:
FAIL: gcc.target/riscv/rvv/autovec/vls/mov-2.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/mov-4.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/mov-6.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)

glibc rv32gcv
Resolved failures:
FAIL: gcc.target/riscv/rvv/autovec/vls/and-1.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/and-2.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/and-3.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/cmp-1.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/cmp-2.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/cmp-3.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/cmp-4.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/cmp-5.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/cmp-6.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/const-1.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/const-2.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/const-3.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/const-4.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/const-5.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/div-1.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/dup-1.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/dup-2.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/dup-3.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/dup-4.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/dup-5.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/dup-6.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/dup-7.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/extract-1.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/extract-2.c -O3 -ftree-vectorize --param 
riscv-autovec-preference=scalable (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/floating-point-add-1.c -O3 
-ftree-vectorize --param riscv-autovec-preference=scalable (test for excess 
errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/floating-point-add-2.c -O3 
-ftree-vectorize --param riscv-autovec-prefere

[Committed] RISC-V: Support VLS mult high

2023-09-21 Thread Juzhe-Zhong
Regression passed.

Committed.

gcc/ChangeLog:

* config/riscv/vector-iterators.md: Extend VLS modes.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls/def.h: Add VLS mult high.
* gcc.target/riscv/rvv/autovec/vls/mulh-1.c: New test.

---
 gcc/config/riscv/vector-iterators.md  |  47 
 .../gcc.target/riscv/rvv/autovec/vls/def.h|   8 ++
 .../gcc.target/riscv/rvv/autovec/vls/mulh-1.c | 104 ++
 3 files changed, 159 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mulh-1.c

diff --git a/gcc/config/riscv/vector-iterators.md 
b/gcc/config/riscv/vector-iterators.md
index 5c4b433c6bf..4aa64127df7 100644
--- a/gcc/config/riscv/vector-iterators.md
+++ b/gcc/config/riscv/vector-iterators.md
@@ -670,6 +670,53 @@
   RVVM8SI RVVM4SI RVVM2SI RVVM1SI (RVVMF2SI "TARGET_MIN_VLEN > 32")
 
   (RVVM8DI "TARGET_FULL_V") (RVVM4DI "TARGET_FULL_V") (RVVM2DI 
"TARGET_FULL_V") (RVVM1DI "TARGET_FULL_V")
+
+  (V1QI "TARGET_VECTOR_VLS")
+  (V2QI "TARGET_VECTOR_VLS")
+  (V4QI "TARGET_VECTOR_VLS")
+  (V8QI "TARGET_VECTOR_VLS")
+  (V16QI "TARGET_VECTOR_VLS")
+  (V32QI "TARGET_VECTOR_VLS")
+  (V64QI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 64")
+  (V128QI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 128")
+  (V256QI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 256")
+  (V512QI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 512")
+  (V1024QI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 1024")
+  (V2048QI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 2048")
+  (V4096QI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 4096")
+  (V1HI "TARGET_VECTOR_VLS")
+  (V2HI "TARGET_VECTOR_VLS")
+  (V4HI "TARGET_VECTOR_VLS")
+  (V8HI "TARGET_VECTOR_VLS")
+  (V16HI "TARGET_VECTOR_VLS")
+  (V32HI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 64")
+  (V64HI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 128")
+  (V128HI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 256")
+  (V256HI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 512")
+  (V512HI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 1024")
+  (V1024HI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 2048")
+  (V2048HI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 4096")
+  (V1SI "TARGET_VECTOR_VLS")
+  (V2SI "TARGET_VECTOR_VLS")
+  (V4SI "TARGET_VECTOR_VLS")
+  (V8SI "TARGET_VECTOR_VLS")
+  (V16SI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 64")
+  (V32SI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 128")
+  (V64SI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 256")
+  (V128SI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 512")
+  (V256SI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 1024")
+  (V512SI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 2048")
+  (V1024SI "TARGET_VECTOR_VLS && TARGET_MIN_VLEN >= 4096")
+  (V1DI "TARGET_VECTOR_VLS && TARGET_FULL_V")
+  (V2DI "TARGET_VECTOR_VLS && TARGET_FULL_V")
+  (V4DI "TARGET_VECTOR_VLS && TARGET_FULL_V")
+  (V8DI "TARGET_VECTOR_VLS && TARGET_FULL_V && TARGET_MIN_VLEN >= 64")
+  (V16DI "TARGET_VECTOR_VLS && TARGET_FULL_V && TARGET_MIN_VLEN >= 128")
+  (V32DI "TARGET_VECTOR_VLS && TARGET_FULL_V && TARGET_MIN_VLEN >= 256")
+  (V64DI "TARGET_VECTOR_VLS && TARGET_FULL_V && TARGET_MIN_VLEN >= 512")
+  (V128DI "TARGET_VECTOR_VLS && TARGET_FULL_V && TARGET_MIN_VLEN >= 1024")
+  (V256DI "TARGET_VECTOR_VLS && TARGET_FULL_V && TARGET_MIN_VLEN >= 2048")
+  (V512DI "TARGET_VECTOR_VLS && TARGET_FULL_V && TARGET_MIN_VLEN >= 4096")
 ])
 
 (define_mode_iterator VI_QH [
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/def.h 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/def.h
index 74685f8d05e..26671b2975c 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/def.h
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/def.h
@@ -518,3 +518,11 @@ typedef double v512df __attribute__ ((vector_size (4096)));
 for (int i = 0; i < NUM; i++)  
\
   dst[i] = ((TYPE2) a[i] + b[i] + 1) >> 1; 
\
   }
+
+#define DEF_MULH(TYPE, NUM)
\
+  void __attribute__ ((noipa)) 
\
+  mod_##TYPE##_##NUM (TYPE *__restrict dst, TYPE *__restrict src)  
\
+  {
\
+for (int i = 0; i < NUM; ++i)  
\
+  dst[i] = src[i] % 19;
\
+  }
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mulh-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mulh-1.c
new file mode 100644
index 000..47bb40f9828
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mulh-1.c
@@ -0,0 +1,104 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl4096b -mabi=lp64d -O3 
--param=riscv-autovec-lmul=m8 -fdump-tree-optimized" } */
+
+#include "def.h"
+
+DEF_MULH (int8_t, 4)
+DEF_MULH (int8_t, 8)
+DEF_MULH (int8_t, 16)
+DEF_MULH (int8_t, 32)
+DEF_MULH (int8_t, 64)
+DEF_MULH (int8_t, 128)
+DEF_MULH (int8_t, 

[Committed] RISC-V: Add more VLS unary tests

2023-09-21 Thread Juzhe-Zhong
Notice we are missing these tests.

Committed.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls/abs-1.c: New test.
* gcc.target/riscv/rvv/autovec/vls/not-1.c: New test.
* gcc.target/riscv/rvv/autovec/vls/sqrt-1.c: New test.

---
 .../gcc.target/riscv/rvv/autovec/vls/abs-1.c  | 52 ++
 .../gcc.target/riscv/rvv/autovec/vls/not-1.c  | 69 +++
 .../gcc.target/riscv/rvv/autovec/vls/sqrt-1.c | 52 ++
 3 files changed, 173 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/abs-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/not-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/sqrt-1.c

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/abs-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/abs-1.c
new file mode 100644
index 000..7c7a5bd6ac7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/abs-1.c
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 
--param=riscv-autovec-lmul=m8 -ffast-math -fdump-tree-optimized" } */
+
+#include "def.h"
+
+DEF_OP_V (fabs, 2, _Float16, __builtin_fabs)
+DEF_OP_V (fabs, 4, _Float16, __builtin_fabs)
+DEF_OP_V (fabs, 8, _Float16, __builtin_fabs)
+DEF_OP_V (fabs, 16, _Float16, __builtin_fabs)
+DEF_OP_V (fabs, 32, _Float16, __builtin_fabs)
+DEF_OP_V (fabs, 64, _Float16, __builtin_fabs)
+DEF_OP_V (fabs, 128, _Float16, __builtin_fabs)
+DEF_OP_V (fabs, 256, _Float16, __builtin_fabs)
+DEF_OP_V (fabs, 512, _Float16, __builtin_fabs)
+DEF_OP_V (fabs, 1024, _Float16, __builtin_fabs)
+DEF_OP_V (fabs, 2048, _Float16, __builtin_fabs)
+
+DEF_OP_V (fabs, 2, float, __builtin_fabs)
+DEF_OP_V (fabs, 4, float, __builtin_fabs)
+DEF_OP_V (fabs, 8, float, __builtin_fabs)
+DEF_OP_V (fabs, 16, float, __builtin_fabs)
+DEF_OP_V (fabs, 32, float, __builtin_fabs)
+DEF_OP_V (fabs, 64, float, __builtin_fabs)
+DEF_OP_V (fabs, 128, float, __builtin_fabs)
+DEF_OP_V (fabs, 256, float, __builtin_fabs)
+DEF_OP_V (fabs, 512, float, __builtin_fabs)
+DEF_OP_V (fabs, 1024, float, __builtin_fabs)
+
+DEF_OP_V (fabs, 2, double, __builtin_fabs)
+DEF_OP_V (fabs, 4, double, __builtin_fabs)
+DEF_OP_V (fabs, 8, double, __builtin_fabs)
+DEF_OP_V (fabs, 16, double, __builtin_fabs)
+DEF_OP_V (fabs, 32, double, __builtin_fabs)
+DEF_OP_V (fabs, 64, double, __builtin_fabs)
+DEF_OP_V (fabs, 128, double, __builtin_fabs)
+DEF_OP_V (fabs, 256, double, __builtin_fabs)
+DEF_OP_V (fabs, 512, double, __builtin_fabs)
+
+/* { dg-final { scan-assembler-times {vfabs\.v\s+v[0-9]+,\s*v[0-9]+} 30 } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4,4" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "16,16" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32,32" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64,64" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "128,128" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "256,256" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "512,512" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "1024,1024" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2048,2048" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4096,4096" "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/not-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/not-1.c
new file mode 100644
index 000..316bac88fed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/not-1.c
@@ -0,0 +1,69 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl4096b -mabi=lp64d -O3 
--param=riscv-autovec-lmul=m8 -fdump-tree-optimized" } */
+
+#include "def.h"
+
+DEF_OP_V (not, 1, int8_t, ~)
+DEF_OP_V (not, 2, int8_t, ~)
+DEF_OP_V (not, 4, int8_t, ~)
+DEF_OP_V (not, 8, int8_t, ~)
+DEF_OP_V (not, 16, int8_t, ~)
+DEF_OP_V (not, 32, int8_t, ~)
+DEF_OP_V (not, 64, int8_t, ~)
+DEF_OP_V (not, 128, int8_t, ~)
+DEF_OP_V (not, 256, int8_t, ~)
+DEF_OP_V (not, 512, int8_t, ~)
+DEF_OP_V (not, 1024, int8_t, ~)
+DEF_OP_V (not, 2048, int8_t, ~)
+DEF_OP_V (not, 4096, int8_t, ~)
+
+DEF_OP_V (not, 1, int16_t, ~)
+DEF_OP_V (not, 2, int16_t, ~)
+DEF_OP_V (not, 4, int16_t, ~)
+DEF_OP_V (not, 8, int16_t, ~)
+DEF_OP_V (not, 16, int16_t, ~)
+DEF_OP_V (not, 32, int16_t, ~)
+DEF_OP_V (not, 64, int16_t, ~)
+DEF_OP_V (not, 128, int16_t, ~)
+DEF_OP_V (not, 256, int16_t, ~)
+DEF_OP_V (not, 512, int16_t, ~)
+DEF_OP_V (not, 1024, int16_t, ~)
+DEF_OP_V (not, 2048, int16_t, ~)
+
+DEF_OP_V (not, 1, int32_t, ~)
+DEF_OP_V (not, 2, int32_t, ~)
+DEF_OP_V (not, 4, int32_t, ~)
+DEF_OP_V (not, 8, int32_t, ~)
+DEF_OP_V (not, 16, int32_t, ~)
+DEF_OP_V (not, 32, int32_t, ~)
+DEF_OP_V (not, 64, int32_t, ~)
+DEF_OP_V (not, 128, int32_t, ~)
+DEF_OP_V (not, 256, int32_t, ~)
+DEF_OP_V (not, 512, i

[PATCH v3] c++: Catch indirect change of active union member in constexpr [PR101631]

2023-09-21 Thread Nathaniel Shead
I've updated the error messages, and also fixed another bug I found
while retesting (value-initialised unions weren't considered to have any
active member yet).

Bootstrapped and regtested on x86_64-pc-linux-gnu.

-- >8 --

This patch adds checks for attempting to change the active member of a
union by methods other than a member access expression.

To be able to properly distinguish `*(&u.a) = ` from `u.a = `, this
patch redoes the solution for c++/59950 to avoid extraneous *&; it
seems that the only case that needed the workaround was when copying
empty classes.

This patch also ensures that constructors for a union field mark that
field as the active member before entering the call itself; this ensures
that modifications of the field within the constructor's body don't
cause false positives (as these will not appear to be member access
expressions). This means that we no longer need to start the lifetime of
empty union members after the constructor body completes.

As a drive-by fix, this patch also ensures that value-initialised unions
are considered to have activated their initial member for the purpose of
checking stores, which catches some additional mistakes pre-C++20.

PR c++/101631

gcc/cp/ChangeLog:

* call.cc (build_over_call): Fold more indirect refs for trivial
assignment op.
* class.cc (type_has_non_deleted_trivial_default_ctor): Create.
* constexpr.cc (cxx_eval_call_expression): Start lifetime of
union member before entering constructor.
(cxx_eval_store_expression): Activate member for
value-initialised union. Check for accessing inactive union
member indirectly.
* cp-tree.h (type_has_non_deleted_trivial_default_ctor):
Forward declare.

gcc/testsuite/ChangeLog:

* g++.dg/cpp1y/constexpr-89336-3.C: Fix union initialisation.
* g++.dg/cpp1y/constexpr-union6.C: New test.
* g++.dg/cpp2a/constexpr-union2.C: New test.
* g++.dg/cpp2a/constexpr-union3.C: New test.
* g++.dg/cpp2a/constexpr-union4.C: New test.
* g++.dg/cpp2a/constexpr-union5.C: New test.

Signed-off-by: Nathaniel Shead 
---
 gcc/cp/call.cc|  11 +-
 gcc/cp/class.cc   |   8 ++
 gcc/cp/constexpr.cc   | 129 +-
 gcc/cp/cp-tree.h  |   1 +
 .../g++.dg/cpp1y/constexpr-89336-3.C  |   2 +-
 gcc/testsuite/g++.dg/cpp1y/constexpr-union6.C |  13 ++
 gcc/testsuite/g++.dg/cpp2a/constexpr-union2.C |  30 
 gcc/testsuite/g++.dg/cpp2a/constexpr-union3.C |  45 ++
 gcc/testsuite/g++.dg/cpp2a/constexpr-union4.C |  29 
 gcc/testsuite/g++.dg/cpp2a/constexpr-union5.C |  71 ++
 10 files changed, 296 insertions(+), 43 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp1y/constexpr-union6.C
 create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-union2.C
 create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-union3.C
 create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-union4.C
 create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-union5.C

diff --git a/gcc/cp/call.cc b/gcc/cp/call.cc
index e8dafbd8ba6..c1fb8807d3f 100644
--- a/gcc/cp/call.cc
+++ b/gcc/cp/call.cc
@@ -10330,10 +10330,7 @@ build_over_call (struct z_candidate *cand, int flags, 
tsubst_flags_t complain)
   && DECL_OVERLOADED_OPERATOR_IS (fn, NOP_EXPR)
   && trivial_fn_p (fn))
 {
-  /* Don't use cp_build_fold_indirect_ref, op= returns an lvalue even if
-the object argument isn't one.  */
-  tree to = cp_build_indirect_ref (input_location, argarray[0],
-  RO_ARROW, complain);
+  tree to = cp_build_fold_indirect_ref (argarray[0]);
   tree type = TREE_TYPE (to);
   tree as_base = CLASSTYPE_AS_BASE (type);
   tree arg = argarray[1];
@@ -10341,7 +10338,11 @@ build_over_call (struct z_candidate *cand, int flags, 
tsubst_flags_t complain)
 
   if (is_really_empty_class (type, /*ignore_vptr*/true))
{
- /* Avoid copying empty classes.  */
+ /* Avoid copying empty classes, but ensure op= returns an lvalue even
+if the object argument isn't one. This isn't needed in other cases
+since MODIFY_EXPR is always considered an lvalue.  */
+ to = cp_build_addr_expr (to, tf_none);
+ to = cp_build_indirect_ref (input_location, to, RO_ARROW, complain);
  val = build2 (COMPOUND_EXPR, type, arg, to);
  suppress_warning (val, OPT_Wunused);
}
diff --git a/gcc/cp/class.cc b/gcc/cp/class.cc
index b71333af1f8..e31aeb8e68b 100644
--- a/gcc/cp/class.cc
+++ b/gcc/cp/class.cc
@@ -5688,6 +5688,14 @@ type_has_virtual_destructor (tree type)
   return (dtor && DECL_VIRTUAL_P (dtor));
 }
 
+/* True iff class TYPE has a non-deleted trivial default
+   constructor.  */
+
+bool type_has_non_deleted_trivial_default_ctor (tree type)
+{
+  return TYPE_HAS_TRIVIAL_D

[PATCH v2] AArch64: Fix strict-align cpymem/setmem [PR103100]

2023-09-21 Thread Wilco Dijkstra
v2: Use UINTVAL, rename max_mops_size.

The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT.
Clean up the condition when to use MOPS.

Passes regress/bootstrap, OK for commit?

gcc/ChangeLog/
PR target/103100
* config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
(setmemdi): Likewise.
* config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
strict-align.  Cleanup condition for using MOPS.
(aarch64_expand_setmem): Likewise.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
dd6874d13a75f20d10a244578afc355b25c73da2..8a12894d6b80de1031d6e7d02dca680c57bce136
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands)
   int mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode;
+  bool size_p = optimize_function_for_size_p (cfun);
 
-  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
-  if (!CONST_INT_P (operands[2]))
+  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_cpymem_mops (operands);
 
-  unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
-
-  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
-  unsigned HOST_WIDE_INT max_copy_size
-= TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
+  unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
 
-  bool size_p = optimize_function_for_size_p (cfun);
+  /* Try to inline up to 256 bytes.  */
+  unsigned max_copy_size = 256;
+  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
 
-  /* Large constant-sized cpymem should go through MOPS when possible.
- It should be a win even for size optimization in the general case.
- For speed optimization the choice between MOPS and the SIMD sequence
- depends on the size of the copy, rather than number of instructions,
- alignment etc.  */
-  if (size > max_copy_size)
+  /* Large copies use MOPS when available or a library call.  */
+  if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
 return aarch64_expand_cpymem_mops (operands);
 
   int copy_bits = 256;
@@ -25445,12 +25441,13 @@ aarch64_expand_setmem (rtx *operands)
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode, next_mode;
 
-  /* If we don't have SIMD registers or the size is variable use the MOPS
- inlined sequence if possible.  */
-  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
+  || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_setmem_mops (operands);
 
   bool size_p = optimize_function_for_size_p (cfun);
@@ -25458,10 +25455,13 @@ aarch64_expand_setmem (rtx *operands)
   /* Default the maximum to 256-bytes when considering only libcall vs
  SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
+  unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 
-  len = INTVAL (operands[1]);
-  if (len > max_set_size && !TARGET_MOPS)
-return false;
+  len = UINTVAL (operands[1]);
+
+  /* Large memset uses MOPS when available or a library call.  */
+  if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
+return aarch64_expand_setmem_mops (operands);
 
   int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
   /* The MOPS sequence takes:
@@ -25474,12 +25474,6 @@ aarch64_expand_setmem (rtx *operands)
  the arguments + 1 for the call.  */
   unsigned libcall_cost = 4;
 
-  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
- when available.  */
-  if (TARGET_MOPS
-  && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
-return aarch64_expand_setmem_mops (operands);
-
   /* Attempt a sequence with a vector broadcast followed by stores.
  Count the number of operations involved to see if it's worth it
  against the alternatives.  A simple counter simd_ops on the
@@ -25521,10 +25515,8 @@ aarch64_expand_setmem (rtx *operands)
   simd_ops++;
   n -= mode_bits;
 
-  /* Do certain trailing copies as overlapping if it's going to be
-cheaper.  i.e. less instructions to do so.  For instance doing a 15
-byte copy it's more efficient to do two overlapping 8 byte copies than
-8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
+  /* Emit trailing writes using overlapping unaligned accesses
+   (when !STRICT_ALIGNMENT) - th

[PATCH v3] RISC-V: Support ceil and ceilf auto-vectorization

2023-09-21 Thread pan2 . li
From: Pan Li 

This patch would like to support auto-vectorization for both the
ceil and ceilf of math.h. It depends on the -ffast-math option.

When ceil/ceilf is called as v2 = ceil (v1), we will
convert it into the insns below (following the LLVM implementation).

* vfcvt.x.f v3, v1, RUP
* vfcvt.f.x v2, v3

However, the floating point value may not need the cvt as above if
its mantissa is zero. For example single precision floating point below.

  +-----------+---------------+
  | float     | binary layout |
  +-----------+---------------+
  | 8388607.5 | 0x4affffff    |
  | 8388608.0 | 0x4b000000    |
  | 8388609.0 | 0x4b000001    |
  +-----------+---------------+

All single floating point values greater than 8388608.0 will have an all-zero mantissa.
We leverage vmflt and mask to filter them out in vector and only do the
cvt on mask.

Before this patch:
math-ceil-1.c:21:1: missed: couldn't vectorize loop
  ...
.L3:
  flw fa0,0(s0)
  addi s0,s0,4
  addi s1,s1,4
  call ceilf
  fsw fa0,-4(s1)
  bne s0,s2,.L3

After this patch:
  ...
  fsrmi   3
.L4:
  vfabs.v v0,v1
  vmv1r.v v2,v1
  vmflt.vv v0,v0,v4
  sub a3,a3,a4
  vfcvt.x.f.v v3,v1,v0.t
  vfcvt.f.x.v v2,v3,v0.t
  vfsgnj.vv   v2,v2,v1
  bne .L4
.L14:
  fsrm a6
  ret

Please note VLS mode is also involved in this patch and covered by the
test cases.

gcc/ChangeLog:

* config/riscv/autovec.md (ceil<mode>2): New pattern.
* config/riscv/riscv-protos.h (enum insn_flags): New enum type.
(enum insn_type): Ditto.
(expand_vec_ceil): New function decl.
* config/riscv/riscv-v.cc (gen_ceil_const_fp): New function impl.
(expand_vec_float_cmp_mask): Ditto.
(expand_vec_copysign): Ditto.
(expand_vec_ceil): Ditto.
* config/riscv/vector.md: Add VLS mode support.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls/def.h: New macro.
* gcc.target/riscv/rvv/autovec/math-ceil-1.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-2.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-3.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-1.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-2.c: New test.
* gcc.target/riscv/rvv/autovec/test-math.h: New test.
* gcc.target/riscv/rvv/autovec/vls/math-ceil-1.c: New test.

Signed-off-by: Pan Li 
---
 gcc/config/riscv/autovec.md   |  16 +++
 gcc/config/riscv/riscv-protos.h   |   5 +
 gcc/config/riscv/riscv-v.cc   | 116 ++
 gcc/config/riscv/vector.md|   2 +-
 .../riscv/rvv/autovec/math-ceil-1.c   |  26 
 .../riscv/rvv/autovec/math-ceil-2.c   |  26 
 .../riscv/rvv/autovec/math-ceil-3.c   |  28 +
 .../riscv/rvv/autovec/math-ceil-run-1.c   |  39 ++
 .../riscv/rvv/autovec/math-ceil-run-2.c   |  39 ++
 .../gcc.target/riscv/rvv/autovec/test-math.h  |  38 ++
 .../gcc.target/riscv/rvv/autovec/vls/def.h|   8 ++
 .../riscv/rvv/autovec/vls/math-ceil-1.c   |  43 +++
 12 files changed, 385 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/test-math.h
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/math-ceil-1.c

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index f0f1abc4e82..1b4bd82f9ec 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2239,3 +2239,19 @@ (define_expand "avg3_ceil"
   riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops3);
   DONE;
 })
+
+;; -
+;;  [FP] Math.h.
+;; -
+;; Includes:
+;; - ceil/ceilf
+;; -
+(define_expand "ceil2"
+  [(match_operand:V_VLSF 0 "register_operand")
+   (match_operand:V_VLSF 1 "register_operand")]
+  "TARGET_VECTOR && !flag_trapping_math && !flag_rounding_math"
+  {
+riscv_vector::expand_vec_ceil (operands[0], operands[1], mode, 
mode);
+DONE;
+  }
+)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 9ea0bcf15d3..07b4ffe3edf 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -250,6 +250,9 @@ enum insn_flags : unsigned int
   /* flags for the floating-point rounding mode.  */
   /* Means INSN has FRM operand and the value is FRM_DYN.  */
   FRM_DYN_P = 1 <

Re: [PATCH] c++: improve class NTTP object pretty printing [PR111471]

2023-09-21 Thread Jason Merrill

On 9/20/23 10:13, Patrick Palka wrote:

On Tue, 19 Sep 2023, Patrick Palka wrote:


On Tue, 19 Sep 2023, Jason Merrill wrote:


On 9/19/23 12:40, Patrick Palka wrote:

Tested on x86_64-pc-linux-gnu, does this look OK for trunk/13?


OK for trunk.  What's your argument for backporting?


Thanks.  I don't feel strongly about it, but I was thinking that since
we typically backport C++20-only correctness fixes to the most recent
release branch, C++20-only diagnostic improvements might be suitable
too?




-- >8 --

1. Move class NTTP object pretty printing to a more general spot in
 the pretty printer.


FWIW this first change isn't just a refactoring, it means we now pretty
print an NTTP object that appears elsewhere besides in a template
argument list, e.g. in a parameter mapping:

Before:

diagnostic19.C:8:15: note: the expression ‘((const A)V).value [with V = 
_ZTAXtl1AEE]’ evaluated to ‘false’

After:

diagnostic19.C:8:15: note: the expression ‘(V).value [with V = A{false}]’ 
evaluated to ‘false’


Ah, that is a pretty big improvement.  The patch is OK.

Jason



Re: [Committed] RISC-V: Support VLS unary floating-point patterns

2023-09-21 Thread Patrick O'Neill



On 9/21/23 03:20, Palmer Dabbelt wrote:

On Wed, 20 Sep 2023 10:47:23 PDT (-0700), Patrick O'Neill wrote:

...

[1]:
Executing on host:
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/xgcc 

-B/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/ 

/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c 


-march=rv32gcv -mabi=ilp32d -mcmodel=medlow -fdiagnostics-plain-output
-O3 -ftree-vectorize -march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize
-fno-vect-cost-model -ffast-math -fno-schedule-insns
-fno-schedule-insns2 -S   -o math-ceil-1.s (timeout = 600)
spawn -ignore SIGHUP
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/xgcc 

-B/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/ 

/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c 


-march=rv32gcv -mabi=ilp32d -mcmodel=medlow -fdiagnostics-plain-output
-O3 -ftree-vectorize -march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize
-fno-vect-cost-model -ffast-math -fno-schedule-insns
-fno-schedule-insns2 -S -o math-ceil-1.s
In file included from
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/features.h:515, 


  from
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/bits/libc-header-start.h:33, 


  from
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/math.h:27, 


  from
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/test-math.h:1, 


  from
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c:5: 

/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/gnu/stubs.h:17:11: 


fatal error: gnu/stubs-lp64d.h: No such file or directory


That looks like a toolchain build/configuration issue, not a test 
issue.  IIRC this comes up from time to time, something's probably 
broken in riscv-gnu-toolchain but I'm not sure what's wrong.


I get a working setup with just `./configure --enable-linux 
--disable-multilib` and the latest riscv-gnu-toolchain master. How are 
you building things?

I've kicked off a few builds to confirm but I use:

git clone https://github.com/riscv-collab/riscv-gnu-toolchain
cd riscv-gnu-toolchain
git submodule update --init gcc
cd gcc && git checkout master && cd ..
mkdir build && cd build
../configure --prefix=$(pwd) --with-arch=rv32gcv --with-abi=ilp32d

Make sure you bump gcc to tip of tree!
The latest riscv-gnu-toolchain master uses gcc 12.2 which doesn't have
the testcases that are failing (rvv folder):
https://github.com/gcc-mirror/gcc/tree/2ee5e4300186a92ad73f1a1a64cb918dc76c8d67/gcc/testsuite/gcc.target/riscv

The failures only show up for tests in:
gcc.target/riscv/rvv/
gcc.dg/vect/costmodel/riscv/rvv/

After bumping gcc and using your command (with --prefix)
../configure --enable-linux --disable-multilib --prefix=$(pwd)
I still get the missing stubs failures.

I've also tried this with the build directory both inside and outside
the repo and that doesn't make a difference.

Patrick




compilation terminated.
compiler exited with status 1
FAIL: gcc.target/riscv/rvv/autovec/math-ceil-1.c -O3 -ftree-vectorize
(test for excess errors)

[2]:
https://github.com/ewlu/riscv-gnu-toolchain/issues/170

[3]:
This also extends beyond math.h. I'm seeing similar failures for
testcases like
gcc.target/riscv/rvv/autovec/cond/cond_convert_int2float-rv64-1.c that
#include <stdint.h>.


[PATCH] AArch64: Add inline memmove expansion

2023-09-21 Thread Wilco Dijkstra

Add support for inline memmove expansions.  The generated code is identical
to that for memcpy, except that all loads are emitted before stores rather than
being interleaved.  The maximum size is 256 bytes which requires at most 16
registers.

Passes regress/bootstrap, OK for commit?

gcc/ChangeLog/
* config/aarch64/aarch64.opt (aarch64_mops_memmove_size_threshold):
Change default.
* config/aarch64/aarch64.md (cpymemdi): Add a parameter.
(movmemdi): Call aarch64_expand_cpymem.
* config/aarch64/aarch64.cc (aarch64_copy_one_block): Rename function,
simplify, support storing generated loads/stores. 
(aarch64_expand_cpymem): Support expansion of memmove.
* config/aarch64/aarch64-protos.h (aarch64_expand_cpymem): Add bool arg.

gcc/testsuite/ChangeLog/
* gcc.target/aarch64/memmove.c: Add new test.

---

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
e8d91cba30e32e03c4794ccc24254691d135f2dd..e224218600969d9d052128790f1524414bbab5c6
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -766,7 +766,7 @@ bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 tree aarch64_vector_load_decl (tree);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
 bool aarch64_expand_cpymem_mops (rtx *, bool);
-bool aarch64_expand_cpymem (rtx *);
+bool aarch64_expand_cpymem (rtx *, bool);
 bool aarch64_expand_setmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
 bool aarch64_float_const_rtx_p (rtx);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
8a12894d6b80de1031d6e7d02dca680c57bce136..a573e3bded2736f5108ad2d4004f530e0f32c99c
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25191,48 +25191,35 @@ aarch64_progress_pointer (rtx pointer)
MODE bytes.  */
 
 static void
-aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
- machine_mode mode)
+aarch64_copy_one_block (rtx *load, rtx *store, rtx src, rtx dst,
+   int offset, machine_mode mode)
 {
   /* Handle 256-bit memcpy separately.  We do this by making 2 adjacent memory
  address copies using V4SImode so that we can use Q registers.  */
   if (known_eq (GET_MODE_BITSIZE (mode), 256))
 {
   mode = V4SImode;
+  rtx src1 = adjust_address (src, mode, offset);
+  rtx src2 = adjust_address (src, mode, offset + 16);
+  rtx dst1 = adjust_address (dst, mode, offset);
+  rtx dst2 = adjust_address (dst, mode, offset + 16);
   rtx reg1 = gen_reg_rtx (mode);
   rtx reg2 = gen_reg_rtx (mode);
-  /* "Cast" the pointers to the correct mode.  */
-  *src = adjust_address (*src, mode, 0);
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memcpy.  */
-  emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
-   aarch64_progress_pointer (*src)));
-  emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
-aarch64_progress_pointer (*dst), 
reg2));
-  /* Move the pointers forward.  */
-  *src = aarch64_move_pointer (*src, 32);
-  *dst = aarch64_move_pointer (*dst, 32);
+  *load = aarch64_gen_load_pair (mode, reg1, src1, reg2, src2);
+  *store = aarch64_gen_store_pair (mode, dst1, reg1, dst2, reg2);
   return;
 }
 
   rtx reg = gen_reg_rtx (mode);
-
-  /* "Cast" the pointers to the correct mode.  */
-  *src = adjust_address (*src, mode, 0);
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memcpy.  */
-  emit_move_insn (reg, *src);
-  emit_move_insn (*dst, reg);
-  /* Move the pointers forward.  */
-  *src = aarch64_progress_pointer (*src);
-  *dst = aarch64_progress_pointer (*dst);
+  *load = gen_move_insn (reg, adjust_address (src, mode, offset));
+  *store = gen_move_insn (adjust_address (dst, mode, offset), reg);
 }
 
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
rather than memcpy.  Return true iff we succeeded.  */
 bool
-aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
+aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
 {
   if (!TARGET_MOPS)
 return false;
@@ -25251,12 +25238,12 @@ aarch64_expand_cpymem_mops (rtx *operands, bool 
is_memmove = false)
   return true;
 }
 
-/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
-   we succeed, otherwise return false, indicating that a libcall to
-   memcpy should be emitted.  */
-
+/* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
+   OPERANDS are taken from the cpymem/movmem pattern.  IS_MEMMOVE is true
+   if this is a memmove rather than memcpy.  Return true if we succeed,
+   otherwise return false, indicating that a libcall should be emitted.  */
 bool
-aarch64_expand_cpymem (rtx *operands)
+aarch64_expand

Re: RFC: Introduce -fhardened to enable security-related flags

2023-09-21 Thread Hans-Peter Nilsson
> From: Qing Zhao 
> Date: Tue, 19 Sep 2023 14:19:09 +
> > On Sep 17, 2023, at 12:36 PM, Hans-Peter Nilsson via Gcc-patches 
> >  wrote:
> >> From: Sam James 
> >> Date: Sun, 17 Sep 2023 05:00:37 +0100
> >> Did some bug ever get filed for this to see if we can do a bit
> >> better here?
> > 
> > Not that I know of; neither for systemd nor gcc.
> 
> Then, is it convenient to file a bug on this?

A fair request, but I can't commit to analyze it myself to
the usual level, producing a self-contained test-case.

I see Sam James was super fast and has already added you to
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111523
(thanks!)

> That will
> be very helpful for us to locate the issue and fix it.
> 
> Before committing the -ftrivial-auto-var-init patch, I
> have done some performance testing on CPU2017 for x86 and
> aarch64,
> The runtime overhead was quite limited. 

Perhaps it would also make sense to performance-test on
network-facing software and system software such as systemd?

> Which platform was the 35% performance slowdown on?

arm-linux-eabi on ARM Cortex-A9. 

brgds, H-P


Re: [Committed] RISC-V: Remove math.h import to resolve missing stubs failures

2023-09-21 Thread Patrick O'Neill

Hi Juzhe,

I've added this to my TODO once pre-commit patchworks CI is sorted out :)

How often do we want to run these tests?

I was thinking of starting up a once-weekly run of compute-intensive
tasks like --enable-checking=rtl builds/testsuite runs.

If we aren't expecting frequent breakages, it probably makes sense to
run these intrinsic tests weekly too.

Patrick


On 9/21/23 04:46, juzhe.zh...@rivai.ai wrote:

Hi, Patrick.

GNU rvv intrinsic api test-generator has been merged:
https://github.com/riscv-non-isa/rvv-intrinsic-doc/commits/main

Could you include the full RVV intrinsic API test in your test CI?
Currently, we don't include all API test in the GCC testsuite since 
it's too big.



juzhe.zh...@rivai.ai

*From:* Patrick O'Neill 
*Date:* 2023-09-21 01:51
*To:* Kito Cheng 
*CC:* GCC Patches ; Robin Dapp
; 钟居哲 
*Subject:* [Committed] RISC-V: Remove math.h import to resolve
missing stubs failures

Committed. Thanks!

On 9/20/23 10:19, Kito Cheng wrote:

LGTM

Patrick O'Neill  於 2023年9月20日 週三 18:07
寫道:

Resolves some of the missing stubs failures:
fatal error: gnu/stubs-lp64d.h: No such file or directory
compilation terminated.

2023-09-20 Juzhe Zhong 

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/vls/def.h: Remove
unneeded math.h
        import.

Tested-by: Patrick O'Neill 
---
Tested using 590a8bec3ed92118e084b0a1897d3314a666170e
glibc rv64gcv
glibc rv32gcv

glibc rv64gcv
Resolved failures:
FAIL: gcc.target/riscv/rvv/autovec/vls/mov-2.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/mov-4.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/mov-6.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)

glibc rv32gcv
Resolved failures:
FAIL: gcc.target/riscv/rvv/autovec/vls/and-1.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/and-2.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/and-3.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/cmp-1.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/cmp-2.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/cmp-3.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/cmp-4.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/cmp-5.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/cmp-6.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/const-1.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/const-2.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/const-3.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/const-4.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/const-5.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/div-1.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/dup-1.c -O3
-ftree-vectorize --param riscv-autovec-preference=scalable
(test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/vls/dup-2.c -O3
-ftree-vectorize -

Re: [Committed] RISC-V: Support VLS unary floating-point patterns

2023-09-21 Thread Patrick O'Neill

On 9/21/23 09:14, Patrick O'Neill wrote:


On 9/21/23 03:20, Palmer Dabbelt wrote:

On Wed, 20 Sep 2023 10:47:23 PDT (-0700), Patrick O'Neill wrote:

...

[1]:
Executing on host:
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/xgcc 

-B/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/ 

/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c 


-march=rv32gcv -mabi=ilp32d -mcmodel=medlow -fdiagnostics-plain-output
-O3 -ftree-vectorize -march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize
-fno-vect-cost-model -ffast-math -fno-schedule-insns
-fno-schedule-insns2 -S   -o math-ceil-1.s (timeout = 600)
spawn -ignore SIGHUP
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/xgcc 

-B/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/ 

/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c 


-march=rv32gcv -mabi=ilp32d -mcmodel=medlow -fdiagnostics-plain-output
-O3 -ftree-vectorize -march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize
-fno-vect-cost-model -ffast-math -fno-schedule-insns
-fno-schedule-insns2 -S -o math-ceil-1.s
In file included from
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/features.h:515, 


  from
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/bits/libc-header-start.h:33, 


  from
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/math.h:27, 


  from
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/test-math.h:1, 


  from
/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c:5: 

/github/ewlu-runner-2/_work/riscv-gnu-toolchain/riscv-gnu-toolchain/build/sysroot/usr/include/gnu/stubs.h:17:11: 


fatal error: gnu/stubs-lp64d.h: No such file or directory


That looks like a toolchain build/configuration issue, not a test 
issue.  IIRC this comes up from time to time, something's probably 
broken in riscv-gnu-toolchain but I'm not sure what's wrong.


I get a working setup with just `./configure --enable-linux 
--disable-multilib` and the latest riscv-gnu-toolchain master. How 
are you building things?

I've kicked off a few builds to confirm but I use:

The non-multilib runs finished and still have issues.

The missing-stubs failures go away when building with multilib - so at
least now I have something to compare against :)
I'll dig into why and see what's needed for non-multilib tests to pass.

Patrick


git clone https://github.com/riscv-collab/riscv-gnu-toolchain
cd riscv-gnu-toolchain
git submodule update --init gcc
cd gcc && git checkout master && cd ..
mkdir build && cd build
../configure --prefix=$(pwd) --with-arch=rv32gcv --with-abi=ilp32d

Make sure you bump gcc to tip of tree!
The latest riscv-gnu-toolchain master uses gcc 12.2 which doesn't have
the testcases that are failing (rvv folder):
https://github.com/gcc-mirror/gcc/tree/2ee5e4300186a92ad73f1a1a64cb918dc76c8d67/gcc/testsuite/gcc.target/riscv 



The failures only show up for tests in:
gcc.target/riscv/rvv/
gcc.dg/vect/costmodel/riscv/rvv/

After bumping gcc and using your command (with --prefix)
../configure --enable-linux --disable-multilib --prefix=$(pwd)
I still get the missing stubs failures.

I've also tried this with the build directory both inside and outside
the repo and that doesn't make a difference.

Patrick




compilation terminated.
compiler exited with status 1
FAIL: gcc.target/riscv/rvv/autovec/math-ceil-1.c -O3 -ftree-vectorize
(test for excess errors)

[2]:
https://github.com/ewlu/riscv-gnu-toolchain/issues/170

[3]:
This also extends beyond math.h. I'm seeing similar failures for
testcases like
gcc.target/riscv/rvv/autovec/cond/cond_convert_int2float-rv64-1.c that
#include .


Re: [PATCH v3] RISC-V: Support ceil and ceilf auto-vectorization

2023-09-21 Thread 钟居哲
Add FP16 tests:
https://godbolt.org/z/e9vrzKTvn

Like LLVM.


diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/def.h 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/def.h
index 74685f8d05e..ccc1d1d70ab 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/def.h
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/def.h
@@ -518,3 +518,11 @@ typedef double v512df __attribute__ ((vector_size (4096)));
 for (int i = 0; i < NUM; i++)  
\
   dst[i] = ((TYPE2) a[i] + b[i] + 1) >> 1; 
\
   }
+
+#define DEF_CALL_V(PREFIX, NUM, TYPE, CALL)
\
+  void __attribute__ ((noinline, noclone)) 
\
+  PREFIX##_##TYPE##NUM (TYPE *restrict a, TYPE *restrict b)
\
+  {
\
+for (int i = 0; i < NUM; ++i)  
\
+  a[i] = CALL (b[i]);  
\
+  }

You don't need to add this.

Just directly use this in def.h:
#define DEF_OP_V(PREFIX, NUM, TYPE, OP)\
  void __attribute__ ((noinline, noclone)) \
  PREFIX##_##TYPE##NUM (TYPE *restrict a, TYPE *restrict b)\
  {\
for (int i = 0; i < NUM; ++i)  \
  a[i] = OP (b[i]);\
  }





juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2023-09-21 23:18
To: gcc-patches
CC: juzhe.zhong; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v3] RISC-V: Support ceil and ceilf auto-vectorization
From: Pan Li 
 
This patch would like to support auto-vectorization for both the
ceil and ceilf of math.h. It depends on the -ffast-math option.
 
When we would like to call ceil/ceilf like v2 = ceil (v1), we will
convert it into below insn (reference the implementation of llvm).
 
* vfcvt.x.f v3, v1, RUP
* vfcvt.f.x v2, v3
 
However, the floating point value may not need the cvt as above if
its mantissa is zero. For example single precision floating point below.
 
  +-----------+---------------+
  | float     | binary layout |
  +-----------+---------------+
  | 8388607.5 | 0x4affffff    |
  | 8388608.0 | 0x4b000000    |
  | 8388609.0 | 0x4b000001    |
  +-----------+---------------+
 
All single floating point values greater than 8388608.0 will have an all-zero mantissa.
We leverage vmflt and mask to filter them out in vector and only do the
cvt on mask.
 
Before this patch:
math-ceil-1.c:21:1: missed: couldn't vectorize loop
  ...
.L3:
  flw     fa0,0(s0)
  addi    s0,s0,4
  addi    s1,s1,4
  call    ceilf
  fsw     fa0,-4(s1)
  bne     s0,s2,.L3
 
After this patch:
  ...
  fsrmi   3
.L4:
  vfabs.v v0,v1
  vmv1r.v v2,v1
  vmflt.vv    v0,v0,v4
  sub a3,a3,a4
  vfcvt.x.f.v v3,v1,v0.t
  vfcvt.f.x.v v2,v3,v0.t
  vfsgnj.vv   v2,v2,v1
  bne .L4
.L14:
  fsrm    a6
  ret
 
Please note VLS mode is also involved in this patch and covered by the
test cases.
 
gcc/ChangeLog:
 
* config/riscv/autovec.md (ceil2): New pattern.
* config/riscv/riscv-protos.h (enum insn_flags): New enum type.
(enum insn_type): Ditto.
(expand_vec_ceil): New function decl.
* config/riscv/riscv-v.cc (gen_ceil_const_fp): New function impl.
(expand_vec_float_cmp_mask): Ditto.
(expand_vec_copysign): Ditto.
(expand_vec_ceil): Ditto.
* config/riscv/vector.md: Add VLS mode support.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/vls/def.h: New macro.
* gcc.target/riscv/rvv/autovec/math-ceil-1.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-2.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-3.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-1.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-2.c: New test.
* gcc.target/riscv/rvv/autovec/test-math.h: New test.
* gcc.target/riscv/rvv/autovec/vls/math-ceil-1.c: New test.
 
Signed-off-by: Pan Li 
---
gcc/config/riscv/autovec.md   |  16 +++
gcc/config/riscv/riscv-protos.h   |   5 +
gcc/config/riscv/riscv-v.cc   | 116 ++
gcc/config/riscv/vector.md|   2 +-
.../riscv/rvv/autovec/math-ceil-1.c   |  26 
.../riscv/rvv/autovec/math-ceil-2.c   |  26 
.../riscv/rvv/autovec/math-ceil-3.c   |  28 +
.../riscv/rvv/autovec/math-ceil-run-1.c   |  39 ++
.../riscv/rvv/autovec/math-ceil-run-2.c   |  39 ++
.../gcc.target/riscv/rvv/autovec/test-math.h  |  38 ++
.../gcc.target/riscv/rvv/autovec/vls/def.h|   8 ++
.../riscv/rvv/autovec/vls/math-ceil-1.c   |  43 +++
12 files changed, 385 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c
create mode 100644 gcc/

[Committed] RISC-V: Add VLS integer ABS support

2023-09-21 Thread Juzhe-Zhong
Regression passed.

Committed.

gcc/ChangeLog:

* config/riscv/autovec.md: Extend VLS modes.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls/abs-2.c: New test.

---
 gcc/config/riscv/autovec.md   |  6 +-
 .../gcc.target/riscv/rvv/autovec/vls/abs-2.c  | 62 +++
 2 files changed, 65 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/abs-2.c

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index f0f1abc4e82..c895d41376d 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -1004,9 +1004,9 @@
 ;; 
---
 
 (define_insn_and_split "abs2"
-  [(set (match_operand:VI 0 "register_operand")
- (abs:VI
-   (match_operand:VI 1 "register_operand")))]
+  [(set (match_operand:V_VLSI 0 "register_operand")
+ (abs:V_VLSI
+   (match_operand:V_VLSI 1 "register_operand")))]
   "TARGET_VECTOR && can_create_pseudo_p ()"
   "#"
   "&& 1"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/abs-2.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/abs-2.c
new file mode 100644
index 000..e98f5c4bbf8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/abs-2.c
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl4096b -mabi=lp64d -O3 
--param=riscv-autovec-lmul=m8 -fdump-tree-optimized" } */
+
+#include "def.h"
+
+DEF_OP_V (neg, 4, int8_t, __builtin_abs)
+DEF_OP_V (neg, 8, int8_t, __builtin_abs)
+DEF_OP_V (neg, 16, int8_t, __builtin_abs)
+DEF_OP_V (neg, 32, int8_t, __builtin_abs)
+DEF_OP_V (neg, 64, int8_t, __builtin_abs)
+DEF_OP_V (neg, 128, int8_t, __builtin_abs)
+DEF_OP_V (neg, 256, int8_t, __builtin_abs)
+DEF_OP_V (neg, 512, int8_t, __builtin_abs)
+DEF_OP_V (neg, 1024, int8_t, __builtin_abs)
+DEF_OP_V (neg, 2048, int8_t, __builtin_abs)
+DEF_OP_V (neg, 4096, int8_t, __builtin_abs)
+
+DEF_OP_V (neg, 4, int16_t, __builtin_abs)
+DEF_OP_V (neg, 8, int16_t, __builtin_abs)
+DEF_OP_V (neg, 16, int16_t, __builtin_abs)
+DEF_OP_V (neg, 32, int16_t, __builtin_abs)
+DEF_OP_V (neg, 64, int16_t, __builtin_abs)
+DEF_OP_V (neg, 128, int16_t, __builtin_abs)
+DEF_OP_V (neg, 256, int16_t, __builtin_abs)
+DEF_OP_V (neg, 512, int16_t, __builtin_abs)
+DEF_OP_V (neg, 1024, int16_t, __builtin_abs)
+DEF_OP_V (neg, 2048, int16_t, __builtin_abs)
+
+DEF_OP_V (neg, 4, int32_t, __builtin_abs)
+DEF_OP_V (neg, 8, int32_t, __builtin_abs)
+DEF_OP_V (neg, 16, int32_t, __builtin_abs)
+DEF_OP_V (neg, 32, int32_t, __builtin_abs)
+DEF_OP_V (neg, 64, int32_t, __builtin_abs)
+DEF_OP_V (neg, 128, int32_t, __builtin_abs)
+DEF_OP_V (neg, 256, int32_t, __builtin_abs)
+DEF_OP_V (neg, 512, int32_t, __builtin_abs)
+DEF_OP_V (neg, 1024, int32_t, __builtin_abs)
+
+DEF_OP_V (neg, 4, int64_t, __builtin_abs)
+DEF_OP_V (neg, 8, int64_t, __builtin_abs)
+DEF_OP_V (neg, 16, int64_t, __builtin_abs)
+DEF_OP_V (neg, 32, int64_t, __builtin_abs)
+DEF_OP_V (neg, 64, int64_t, __builtin_abs)
+DEF_OP_V (neg, 128, int64_t, __builtin_abs)
+DEF_OP_V (neg, 256, int64_t, __builtin_abs)
+DEF_OP_V (neg, 512, int64_t, __builtin_abs)
+
+/* { dg-final { scan-assembler-times {vneg\.v} 38 } } */
+/* { dg-final { scan-assembler-times {vmslt\.vi} 38 } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4,4" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "16,16" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32,32" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64,64" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "128,128" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "256,256" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "512,512" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "1024,1024" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2048,2048" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4096,4096" "optimized" } } */
-- 
2.36.3



Re: [PATCH 8/8] OpenMP: Fortran "!$omp declare mapper" support

2023-09-21 Thread Bernhard Reutner-Fischer
On 18 September 2023 12:19:17 CEST, Julian Brown  
wrote:
>On Thu, 14 Sep 2023 17:13:02 +0200
>Bernhard Reutner-Fischer via Gcc-patches 
>wrote:
>
>> On Tue, 5 Sep 2023 12:28:28 -0700
>> Julian Brown  wrote:
>> 
>> > +  static bool
>> > +  equal (const omp_name_type &a,
>> > +   const omp_name_type &b)
>> > +  {
>> > +if (a.name == NULL_TREE && b.name == NULL_TREE)
>> > +  return a.type == b.type;  
>> 
>> I'm curious if (and why) the type comparison above is safe and does
>> not use gfc_compare_types () ?
>> 
>> thanks,
>
>Probably ignorance on my part! It works for (derived, class) types which
>are canonicalized to exactly the same gfc_typespec, but you're likely
>right that a more Fortran-ish notion of type equality should be used
>here instead when comparing "declare mapper"s.

Sounds like a word2vec question.

>
>Or maybe using gfc_compare_types would smush too many types together
>into one? E.g. if b.type is an extension of a.type, do we want those
>types to be able to have separate mappers?

I fear this really depends.

>
>I'll have a look at addressing this when it's time to reroll these
>patches.
>
>Thanks,

TIA,

PS: I'm not thrilled about those recent get_identifier("") as temporary 
additions fed into the hasher. I know it's not funny, but please let's avoid 
that. Said the cat.

>
>Julian



[PATCH v4] RISC-V: Support ceil and ceilf auto-vectorization

2023-09-21 Thread pan2 . li
From: Pan Li 

Update in v4:

* Add test for _Float16.
* Remove unnecessary macro in def.h for test.

Original log:

This patch would like to support auto-vectorization for both the
ceil and ceilf of math.h. It depends on the -ffast-math option.

When we would like to call ceil/ceilf like v2 = ceil (v1), we will
convert it into below insn (reference the implementation of llvm).

* vfcvt.x.f v3, v1, RUP
* vfcvt.f.x v2, v3

However, the floating point value may not need the cvt as above if
its mantissa is zero. For example single precision floating point below.

  +-----------+---------------+
  | float     | binary layout |
  +-----------+---------------+
  | 8388607.5 | 0x4affffff    |
  | 8388608.0 | 0x4b000000    |
  | 8388609.0 | 0x4b000001    |
  +-----------+---------------+

All single floating point values greater than 8388608.0 will have an all-zero mantissa.
We leverage vmflt and mask to filter them out in vector and only do the
cvt on mask.

Before this patch:
math-ceil-1.c:21:1: missed: couldn't vectorize loop
  ...
.L3:
  flw     fa0,0(s0)
  addi    s0,s0,4
  addi    s1,s1,4
  call    ceilf
  fsw     fa0,-4(s1)
  bne     s0,s2,.L3

After this patch:
  ...
  fsrmi   3
.L4:
  vfabs.v v0,v1
  vmv1r.v v2,v1
  vmflt.vv    v0,v0,v4
  sub a3,a3,a4
  vfcvt.x.f.v v3,v1,v0.t
  vfcvt.f.x.v v2,v3,v0.t
  vfsgnj.vv   v2,v2,v1
  bne .L4
.L14:
  fsrm    a6
  ret

Please note VLS mode is also involved in this patch and covered by the
test cases.

gcc/ChangeLog:

* config/riscv/autovec.md (ceil2): New pattern.
* config/riscv/riscv-protos.h (enum insn_flags): New enum type.
(enum insn_type): Ditto.
(expand_vec_ceil): New function decl.
* config/riscv/riscv-v.cc (gen_ceil_const_fp): New function impl.
(expand_vec_float_cmp_mask): Ditto.
(expand_vec_copysign): Ditto.
(expand_vec_ceil): Ditto.
* config/riscv/vector.md: Add VLS mode support.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/math-ceil-0.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-1.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-2.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-3.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-0.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-1.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-2.c: New test.
* gcc.target/riscv/rvv/autovec/test-math.h: New test.
* gcc.target/riscv/rvv/autovec/vls/math-ceil-1.c: New test.

Signed-off-by: Pan Li 
---
 gcc/config/riscv/autovec.md   |  16 +++
 gcc/config/riscv/riscv-protos.h   |   5 +
 gcc/config/riscv/riscv-v.cc   | 133 ++
 gcc/config/riscv/vector.md|   2 +-
 .../riscv/rvv/autovec/math-ceil-0.c   |  26 
 .../riscv/rvv/autovec/math-ceil-1.c   |  26 
 .../riscv/rvv/autovec/math-ceil-2.c   |  26 
 .../riscv/rvv/autovec/math-ceil-3.c   |  28 
 .../riscv/rvv/autovec/math-ceil-run-0.c   |  39 +
 .../riscv/rvv/autovec/math-ceil-run-1.c   |  39 +
 .../riscv/rvv/autovec/math-ceil-run-2.c   |  39 +
 .../gcc.target/riscv/rvv/autovec/test-math.h  |  38 +
 .../riscv/rvv/autovec/vls/math-ceil-1.c   |  56 
 13 files changed, 472 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-0.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/test-math.h
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/math-ceil-1.c

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index f0f1abc4e82..1b4bd82f9ec 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2239,3 +2239,19 @@ (define_expand "avg3_ceil"
   riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops3);
   DONE;
 })
+
+;; -
+;;  [FP] Math.h.
+;; -
+;; Includes:
+;; - ceil/ceilf
+;; -
+(define_expand "ceil2"
+  [(match_operand:V_VLSF 0 "register_operand")
+   (match_operand:V_VLSF 1 "register_operand")]
+  "TARGET_VECTOR && !flag_trapping_math && !flag_rounding_math"
+  {
+riscv_vector::expand_vec_ceil (operands[0], operands[1], mode, 
mode);
+DONE

Re: [PATCH v4] RISC-V: Support ceil and ceilf auto-vectorization

2023-09-21 Thread juzhe.zh...@rivai.ai
LGTM



juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2023-09-22 08:12
To: gcc-patches
CC: juzhe.zhong; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v4] RISC-V: Support ceil and ceilf auto-vectorization
From: Pan Li 
 
Update in v4:
 
* Add test for _Float16.
* Remove unnecessary macro in def.h for test.
 
Original log:
 
This patch would like to support auto-vectorization for both the
ceil and ceilf of math.h. It depends on the -ffast-math option.
 
When we would like to call ceil/ceilf like v2 = ceil (v1), we will
convert it into below insn (reference the implementation of llvm).
 
* vfcvt.x.f v3, v1, RUP
* vfcvt.f.x v2, v3
 
However, the floating point value may not need the cvt as above if
its mantissa is zero. For example single precision floating point below.
 
  +-----------+---------------+
  | float     | binary layout |
  +-----------+---------------+
  | 8388607.5 | 0x4affffff    |
  | 8388608.0 | 0x4b000000    |
  | 8388609.0 | 0x4b000001    |
  +-----------+---------------+
 
All single floating point values greater than 8388608.0 will have an all-zero mantissa.
We leverage vmflt and mask to filter them out in vector and only do the
cvt on mask.
 
Before this patch:
math-ceil-1.c:21:1: missed: couldn't vectorize loop
  ...
.L3:
  flw     fa0,0(s0)
  addi    s0,s0,4
  addi    s1,s1,4
  call    ceilf
  fsw     fa0,-4(s1)
  bne     s0,s2,.L3
 
After this patch:
  ...
  fsrmi   3
.L4:
  vfabs.v v0,v1
  vmv1r.v v2,v1
  vmflt.vv    v0,v0,v4
  sub a3,a3,a4
  vfcvt.x.f.v v3,v1,v0.t
  vfcvt.f.x.v v2,v3,v0.t
  vfsgnj.vv   v2,v2,v1
  bne .L4
.L14:
  fsrm    a6
  ret
 
Please note VLS mode is also involved in this patch and covered by the
test cases.
 
gcc/ChangeLog:
 
* config/riscv/autovec.md (ceil2): New pattern.
* config/riscv/riscv-protos.h (enum insn_flags): New enum type.
(enum insn_type): Ditto.
(expand_vec_ceil): New function decl.
* config/riscv/riscv-v.cc (gen_ceil_const_fp): New function impl.
(expand_vec_float_cmp_mask): Ditto.
(expand_vec_copysign): Ditto.
(expand_vec_ceil): Ditto.
* config/riscv/vector.md: Add VLS mode support.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/math-ceil-0.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-1.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-2.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-3.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-0.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-1.c: New test.
* gcc.target/riscv/rvv/autovec/math-ceil-run-2.c: New test.
* gcc.target/riscv/rvv/autovec/test-math.h: New test.
* gcc.target/riscv/rvv/autovec/vls/math-ceil-1.c: New test.
 
Signed-off-by: Pan Li 
---
gcc/config/riscv/autovec.md   |  16 +++
gcc/config/riscv/riscv-protos.h   |   5 +
gcc/config/riscv/riscv-v.cc   | 133 ++
gcc/config/riscv/vector.md|   2 +-
.../riscv/rvv/autovec/math-ceil-0.c   |  26 
.../riscv/rvv/autovec/math-ceil-1.c   |  26 
.../riscv/rvv/autovec/math-ceil-2.c   |  26 
.../riscv/rvv/autovec/math-ceil-3.c   |  28 
.../riscv/rvv/autovec/math-ceil-run-0.c   |  39 +
.../riscv/rvv/autovec/math-ceil-run-1.c   |  39 +
.../riscv/rvv/autovec/math-ceil-run-2.c   |  39 +
.../gcc.target/riscv/rvv/autovec/test-math.h  |  38 +
.../riscv/rvv/autovec/vls/math-ceil-1.c   |  56 
13 files changed, 472 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-0.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-2.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-3.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/test-math.h
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/math-ceil-1.c
 
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index f0f1abc4e82..1b4bd82f9ec 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2239,3 +2239,19 @@ (define_expand "avg3_ceil"
   riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops3);
   DONE;
})
+
+;; -
+;;  [FP] Math.h.
+;; -
+;; Includes:
+;; - ceil/ceilf
+;; -
+(define_expand "ceil2"
+  [(match_operand:V_VLSF 0 "register_operand")
+   (match_operand:V_VLSF 1 "register_operand")]
+  "TARGET_VECTOR && !flag_trapping_math && !flag_rounding_math"
+  {
+riscv_vector::expand_vec

[PATCH v1] RISC-V: Leverage __builtin_xx instead of math.h for test

2023-09-21 Thread pan2 . li
From: Pan Li 

The math.h header may cause problems in some environments; use __builtin_xx
instead for testing.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls/floating-point-max-5.c:
Remove reference to math.h.
* gcc.target/riscv/rvv/autovec/vls/floating-point-min-5.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/floating-point-sgnjx-2.c: Ditto.

Signed-off-by: Pan Li 
---
 .../rvv/autovec/vls/floating-point-max-5.c| 43 +--
 .../rvv/autovec/vls/floating-point-min-5.c| 43 +--
 .../rvv/autovec/vls/floating-point-sgnjx-2.c  | 43 +--
 3 files changed, 63 insertions(+), 66 deletions(-)

diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-max-5.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-max-5.c
index 775ddb1d25e..dd163682396 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-max-5.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-max-5.c
@@ -2,30 +2,29 @@
 /* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 
-fno-schedule-insns -fno-schedule-insns2 --param=riscv-autovec-lmul=m8 
-ffast-math" } */
 
 #include "def.h"
-#include "math.h"
 
-DEF_CALL_VV (max, 1, float, fmaxf)
-DEF_CALL_VV (max, 2, float, fmaxf)
-DEF_CALL_VV (max, 4, float, fmaxf)
-DEF_CALL_VV (max, 8, float, fmaxf)
-DEF_CALL_VV (max, 16, float, fmaxf)
-DEF_CALL_VV (max, 32, float, fmaxf)
-DEF_CALL_VV (max, 64, float, fmaxf)
-DEF_CALL_VV (max, 128, float, fmaxf)
-DEF_CALL_VV (max, 256, float, fmaxf)
-DEF_CALL_VV (max, 512, float, fmaxf)
-DEF_CALL_VV (max, 1024, float, fmaxf)
+DEF_CALL_VV (max, 1, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 2, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 4, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 8, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 16, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 32, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 64, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 128, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 256, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 512, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 1024, float, __builtin_fmaxf)
 
-DEF_CALL_VV (max, 1, double, fmax)
-DEF_CALL_VV (max, 2, double, fmax)
-DEF_CALL_VV (max, 4, double, fmax)
-DEF_CALL_VV (max, 8, double, fmax)
-DEF_CALL_VV (max, 16, double, fmax)
-DEF_CALL_VV (max, 32, double, fmax)
-DEF_CALL_VV (max, 64, double, fmax)
-DEF_CALL_VV (max, 128, double, fmax)
-DEF_CALL_VV (max, 256, double, fmax)
-DEF_CALL_VV (max, 512, double, fmax)
+DEF_CALL_VV (max, 1, double, __builtin_fmax)
+DEF_CALL_VV (max, 2, double, __builtin_fmax)
+DEF_CALL_VV (max, 4, double, __builtin_fmax)
+DEF_CALL_VV (max, 8, double, __builtin_fmax)
+DEF_CALL_VV (max, 16, double, __builtin_fmax)
+DEF_CALL_VV (max, 32, double, __builtin_fmax)
+DEF_CALL_VV (max, 64, double, __builtin_fmax)
+DEF_CALL_VV (max, 128, double, __builtin_fmax)
+DEF_CALL_VV (max, 256, double, __builtin_fmax)
+DEF_CALL_VV (max, 512, double, __builtin_fmax)
 
 /* { dg-final { scan-assembler-times 
{vfmax\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
 /* { dg-final { scan-assembler-not {csrr} } } */
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-min-5.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-min-5.c
index 1e9ff7d5054..0e3cbf2acec 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-min-5.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-min-5.c
@@ -2,30 +2,29 @@
 /* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 
-fno-schedule-insns -fno-schedule-insns2 --param=riscv-autovec-lmul=m8 
-ffast-math" } */
 
 #include "def.h"
-#include "math.h"
 
-DEF_CALL_VV (min, 1, float, fminf)
-DEF_CALL_VV (min, 2, float, fminf)
-DEF_CALL_VV (min, 4, float, fminf)
-DEF_CALL_VV (min, 8, float, fminf)
-DEF_CALL_VV (min, 16, float, fminf)
-DEF_CALL_VV (min, 32, float, fminf)
-DEF_CALL_VV (min, 64, float, fminf)
-DEF_CALL_VV (min, 128, float, fminf)
-DEF_CALL_VV (min, 256, float, fminf)
-DEF_CALL_VV (min, 512, float, fminf)
-DEF_CALL_VV (min, 1024, float, fminf)
+DEF_CALL_VV (min, 1, float, __builtin_fminf)
+DEF_CALL_VV (min, 2, float, __builtin_fminf)
+DEF_CALL_VV (min, 4, float, __builtin_fminf)
+DEF_CALL_VV (min, 8, float, __builtin_fminf)
+DEF_CALL_VV (min, 16, float, __builtin_fminf)
+DEF_CALL_VV (min, 32, float, __builtin_fminf)
+DEF_CALL_VV (min, 64, float, __builtin_fminf)
+DEF_CALL_VV (min, 128, float, __builtin_fminf)
+DEF_CALL_VV (min, 256, float, __builtin_fminf)
+DEF_CALL_VV (min, 512, float, __builtin_fminf)
+DEF_CALL_VV (min, 1024, float, __builtin_fminf)
 
-DEF_CALL_VV (min, 1, double, fmin)
-DEF_CALL_VV (min, 2, double, fmin)
-DEF_CALL_VV (min, 4, double, fmin)
-DEF_CALL_VV (min, 8, double, fmin)
-DEF_CALL_VV (min, 16, double, fmin)
-DEF_CALL_VV (min, 32, double, fmin)
-DEF_CALL_VV (min, 64, double, fmin)
-DEF_CALL_VV (min, 128, double, fmin)
-DEF_CALL_VV (min, 256, double, fmin)
-DEF_CALL_VV (min, 512, dou

Re: [PATCH v1] RISC-V: Leverage __builtin_xx instead of math.h for test

2023-09-21 Thread juzhe.zh...@rivai.ai
LGTM。



juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2023-09-22 09:12
To: gcc-patches
CC: juzhe.zhong; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v1] RISC-V: Leverage __builtin_xx instead of math.h for test
From: Pan Li 
 
math.h may cause problems in some environments, so use __builtin_xx
instead for testing.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/vls/floating-point-max-5.c:
Remove reference to math.h.
* gcc.target/riscv/rvv/autovec/vls/floating-point-min-5.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/floating-point-sgnjx-2.c: Ditto.
 
Signed-off-by: Pan Li 
---
.../rvv/autovec/vls/floating-point-max-5.c| 43 +--
.../rvv/autovec/vls/floating-point-min-5.c| 43 +--
.../rvv/autovec/vls/floating-point-sgnjx-2.c  | 43 +--
3 files changed, 63 insertions(+), 66 deletions(-)
 
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-max-5.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-max-5.c
index 775ddb1d25e..dd163682396 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-max-5.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-max-5.c
@@ -2,30 +2,29 @@
/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 
-fno-schedule-insns -fno-schedule-insns2 --param=riscv-autovec-lmul=m8 
-ffast-math" } */
#include "def.h"
-#include "math.h"
-DEF_CALL_VV (max, 1, float, fmaxf)
-DEF_CALL_VV (max, 2, float, fmaxf)
-DEF_CALL_VV (max, 4, float, fmaxf)
-DEF_CALL_VV (max, 8, float, fmaxf)
-DEF_CALL_VV (max, 16, float, fmaxf)
-DEF_CALL_VV (max, 32, float, fmaxf)
-DEF_CALL_VV (max, 64, float, fmaxf)
-DEF_CALL_VV (max, 128, float, fmaxf)
-DEF_CALL_VV (max, 256, float, fmaxf)
-DEF_CALL_VV (max, 512, float, fmaxf)
-DEF_CALL_VV (max, 1024, float, fmaxf)
+DEF_CALL_VV (max, 1, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 2, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 4, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 8, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 16, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 32, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 64, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 128, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 256, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 512, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 1024, float, __builtin_fmaxf)
-DEF_CALL_VV (max, 1, double, fmax)
-DEF_CALL_VV (max, 2, double, fmax)
-DEF_CALL_VV (max, 4, double, fmax)
-DEF_CALL_VV (max, 8, double, fmax)
-DEF_CALL_VV (max, 16, double, fmax)
-DEF_CALL_VV (max, 32, double, fmax)
-DEF_CALL_VV (max, 64, double, fmax)
-DEF_CALL_VV (max, 128, double, fmax)
-DEF_CALL_VV (max, 256, double, fmax)
-DEF_CALL_VV (max, 512, double, fmax)
+DEF_CALL_VV (max, 1, double, __builtin_fmax)
+DEF_CALL_VV (max, 2, double, __builtin_fmax)
+DEF_CALL_VV (max, 4, double, __builtin_fmax)
+DEF_CALL_VV (max, 8, double, __builtin_fmax)
+DEF_CALL_VV (max, 16, double, __builtin_fmax)
+DEF_CALL_VV (max, 32, double, __builtin_fmax)
+DEF_CALL_VV (max, 64, double, __builtin_fmax)
+DEF_CALL_VV (max, 128, double, __builtin_fmax)
+DEF_CALL_VV (max, 256, double, __builtin_fmax)
+DEF_CALL_VV (max, 512, double, __builtin_fmax)
/* { dg-final { scan-assembler-times 
{vfmax\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
/* { dg-final { scan-assembler-not {csrr} } } */
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-min-5.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-min-5.c
index 1e9ff7d5054..0e3cbf2acec 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-min-5.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-min-5.c
@@ -2,30 +2,29 @@
/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 
-fno-schedule-insns -fno-schedule-insns2 --param=riscv-autovec-lmul=m8 
-ffast-math" } */
#include "def.h"
-#include "math.h"
-DEF_CALL_VV (min, 1, float, fminf)
-DEF_CALL_VV (min, 2, float, fminf)
-DEF_CALL_VV (min, 4, float, fminf)
-DEF_CALL_VV (min, 8, float, fminf)
-DEF_CALL_VV (min, 16, float, fminf)
-DEF_CALL_VV (min, 32, float, fminf)
-DEF_CALL_VV (min, 64, float, fminf)
-DEF_CALL_VV (min, 128, float, fminf)
-DEF_CALL_VV (min, 256, float, fminf)
-DEF_CALL_VV (min, 512, float, fminf)
-DEF_CALL_VV (min, 1024, float, fminf)
+DEF_CALL_VV (min, 1, float, __builtin_fminf)
+DEF_CALL_VV (min, 2, float, __builtin_fminf)
+DEF_CALL_VV (min, 4, float, __builtin_fminf)
+DEF_CALL_VV (min, 8, float, __builtin_fminf)
+DEF_CALL_VV (min, 16, float, __builtin_fminf)
+DEF_CALL_VV (min, 32, float, __builtin_fminf)
+DEF_CALL_VV (min, 64, float, __builtin_fminf)
+DEF_CALL_VV (min, 128, float, __builtin_fminf)
+DEF_CALL_VV (min, 256, float, __builtin_fminf)
+DEF_CALL_VV (min, 512, float, __builtin_fminf)
+DEF_CALL_VV (min, 1024, float, __builtin_fminf)
-DEF_CALL_VV (min, 1, double, fmin)
-DEF_CALL_VV (min, 2, double, fmin)
-DEF_CALL_VV (min, 4, double, fmin)
-DEF_CALL_VV (min, 8, double, fmin)
-DEF_CALL_VV (min, 16, double, fmin)
-DEF_CALL_VV (

RE: [PATCH v1] RISC-V: Leverage __builtin_xx instead of math.h for test

2023-09-21 Thread Li, Pan2
Committed, thanks Juzhe.

Pan

From: juzhe.zh...@rivai.ai 
Sent: Friday, September 22, 2023 9:17 AM
To: Li, Pan2 ; gcc-patches 
Cc: Li, Pan2 ; Wang, Yanzhang ; 
kito.cheng 
Subject: Re: [PATCH v1] RISC-V: Leverage __builtin_xx instead of math.h for test

LGTM。


juzhe.zh...@rivai.ai

From: pan2.li
Date: 2023-09-22 09:12
To: gcc-patches
CC: juzhe.zhong; 
pan2.li; 
yanzhang.wang; 
kito.cheng
Subject: [PATCH v1] RISC-V: Leverage __builtin_xx instead of math.h for test
From: Pan Li <pan2...@intel.com>

math.h may cause problems in some environments, so use __builtin_xx
instead for testing.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls/floating-point-max-5.c:
Remove reference to math.h.
* gcc.target/riscv/rvv/autovec/vls/floating-point-min-5.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/floating-point-sgnjx-2.c: Ditto.

Signed-off-by: Pan Li <pan2...@intel.com>
---
.../rvv/autovec/vls/floating-point-max-5.c| 43 +--
.../rvv/autovec/vls/floating-point-min-5.c| 43 +--
.../rvv/autovec/vls/floating-point-sgnjx-2.c  | 43 +--
3 files changed, 63 insertions(+), 66 deletions(-)

diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-max-5.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-max-5.c
index 775ddb1d25e..dd163682396 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-max-5.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-max-5.c
@@ -2,30 +2,29 @@
/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 
-fno-schedule-insns -fno-schedule-insns2 --param=riscv-autovec-lmul=m8 
-ffast-math" } */
#include "def.h"
-#include "math.h"
-DEF_CALL_VV (max, 1, float, fmaxf)
-DEF_CALL_VV (max, 2, float, fmaxf)
-DEF_CALL_VV (max, 4, float, fmaxf)
-DEF_CALL_VV (max, 8, float, fmaxf)
-DEF_CALL_VV (max, 16, float, fmaxf)
-DEF_CALL_VV (max, 32, float, fmaxf)
-DEF_CALL_VV (max, 64, float, fmaxf)
-DEF_CALL_VV (max, 128, float, fmaxf)
-DEF_CALL_VV (max, 256, float, fmaxf)
-DEF_CALL_VV (max, 512, float, fmaxf)
-DEF_CALL_VV (max, 1024, float, fmaxf)
+DEF_CALL_VV (max, 1, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 2, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 4, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 8, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 16, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 32, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 64, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 128, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 256, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 512, float, __builtin_fmaxf)
+DEF_CALL_VV (max, 1024, float, __builtin_fmaxf)
-DEF_CALL_VV (max, 1, double, fmax)
-DEF_CALL_VV (max, 2, double, fmax)
-DEF_CALL_VV (max, 4, double, fmax)
-DEF_CALL_VV (max, 8, double, fmax)
-DEF_CALL_VV (max, 16, double, fmax)
-DEF_CALL_VV (max, 32, double, fmax)
-DEF_CALL_VV (max, 64, double, fmax)
-DEF_CALL_VV (max, 128, double, fmax)
-DEF_CALL_VV (max, 256, double, fmax)
-DEF_CALL_VV (max, 512, double, fmax)
+DEF_CALL_VV (max, 1, double, __builtin_fmax)
+DEF_CALL_VV (max, 2, double, __builtin_fmax)
+DEF_CALL_VV (max, 4, double, __builtin_fmax)
+DEF_CALL_VV (max, 8, double, __builtin_fmax)
+DEF_CALL_VV (max, 16, double, __builtin_fmax)
+DEF_CALL_VV (max, 32, double, __builtin_fmax)
+DEF_CALL_VV (max, 64, double, __builtin_fmax)
+DEF_CALL_VV (max, 128, double, __builtin_fmax)
+DEF_CALL_VV (max, 256, double, __builtin_fmax)
+DEF_CALL_VV (max, 512, double, __builtin_fmax)
/* { dg-final { scan-assembler-times 
{vfmax\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
/* { dg-final { scan-assembler-not {csrr} } } */
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-min-5.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-min-5.c
index 1e9ff7d5054..0e3cbf2acec 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-min-5.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/floating-point-min-5.c
@@ -2,30 +2,29 @@
/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 
-fno-schedule-insns -fno-schedule-insns2 --param=riscv-autovec-lmul=m8 
-ffast-math" } */
#include "def.h"
-#include "math.h"
-DEF_CALL_VV (min, 1, float, fminf)
-DEF_CALL_VV (min, 2, float, fminf)
-DEF_CALL_VV (min, 4, float, fminf)
-DEF_CALL_VV (min, 8, float, fminf)
-DEF_CALL_VV (min, 16, float, fminf)
-DEF_CALL_VV (min, 32, float, fminf)
-DEF_CALL_VV (min, 64, float, fminf)
-DEF_CALL_VV (min, 128, float, fminf)
-DEF_CALL_VV (min, 256, float, fminf)
-DEF_CALL_VV (min, 512, float, fminf)
-DEF_CALL_VV (min, 1024, float, fminf)
+DEF_CALL_VV (min, 1, float, __builtin_fminf)
+DEF_CALL_VV (min, 2, float, __builtin_fminf)
+DEF_CALL_VV (min, 4, float, __builtin_fminf)
+DEF_CALL_VV (min, 8, float, __builti

[PATCH] RISC-V: Optimization of vrgather.vv into vrgatherei16.vv[PR111451]

2023-09-21 Thread Li Xu
From: xuli 

Consider this following case:

typedef int32_t vnx32si __attribute__ ((vector_size (128)));

  __attribute__ ((noipa)) void permute_##TYPE (TYPE values1, TYPE values2, \
   TYPE *out)  \
  {\
TYPE v \
  = __builtin_shufflevector (values1, values2, MASK_##NUNITS (0, NUNITS)); \
*(TYPE *) out = v; \
  }

  T (vnx32si, 32)  \

TEST_ALL (PERMUTE)

Before this patch:
  li a4,31
  vsetvli   a5,zero,e32,m8,ta,ma
  vl8re32.v v24,0(a0)
  vid.v v8
  vrsub.vx  v8,v8,a4
  vrgather.vv   v16,v24,v8
  vs8r.v v16,0(a2)
  ret

The index vector register "v8" occupies 8 registers.
We should optimize it into vrgatherei16.vv which is
using int16 as the index elements.

After this patch:
  vsetvli   a5,zero,e16,m4,ta,ma
  li a4,31
  vid.v v4
  vl8re32.v v16,0(a0)
  vrsub.vx  v4,v4,a4
  vsetvli   zero,zero,e32,m8,ta,ma
  vrgatherei16.vv   v8,v16,v4
  vs8r.v v8,0(a2)
  ret
With vrgatherei16.vv, the index vector (now v4) occupies 4 registers instead
of 8, which lowers register consumption and register pressure.

gcc/ChangeLog:

* config/riscv/riscv-v.cc (emit_vlmax_gather_insn): Optimization of 
vrgather.vv into vrgatherei16.vv.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c: Adjust case.
* gcc.target/riscv/rvv/autovec/vls/perm-4.c: Ditto.
---
 gcc/config/riscv/riscv-v.cc   | 20 +++
 .../riscv/rvv/autovec/vls-vlmax/perm-4.c  |  3 ++-
 .../gcc.target/riscv/rvv/autovec/vls/perm-4.c |  3 ++-
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 64a71a128d4..271e0ff6dfc 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -783,6 +783,8 @@ emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
   insn_code icode;
   machine_mode data_mode = GET_MODE (target);
   machine_mode sel_mode = GET_MODE (sel);
+  unsigned int data_sew = get_sew (data_mode);
+  enum vlmul_type data_lmul = get_vlmul (data_mode);
   if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
 icode = code_for_pred_gatherei16 (data_mode);
   else if (const_vec_duplicate_p (sel, &elt))
@@ -790,6 +792,24 @@ emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
   icode = code_for_pred_gather_scalar (data_mode);
   sel = elt;
 }
+  else if (CONST_VECTOR_P (sel) && data_sew != 16
+   && data_sew != 8 && (data_lmul == LMUL_2
+   || data_lmul == LMUL_4 || data_lmul == LMUL_8))
+{
+  /* If the inner mode of data is not QI or HI and data_lmul > 1,
+ emitting vrgatherei16.vv instruction will lower register
+ pressure.
+ data_mode  sel_mode  ei16
+ RVVM1QI    RVVM1QI   RVVM2HI  not needed
+ RVVM2QI    RVVM2QI   RVVM4HI  not needed
+ RVVM2HI    RVVM2HI   RVVM2HI  not needed
+ RVVM2SI    RVVM2SI   RVVM1HI  need
+ RVVM4SI    RVVM4SI   RVVM2HI  need
+ RVVM8DI    RVVM8DI   RVVM2HI  need */
+  PUT_MODE (sel, get_vector_mode (HImode,
+GET_MODE_NUNITS (data_mode)).require ());
+  icode = code_for_pred_gatherei16 (data_mode);
+}
   else
 icode = code_for_pred_gather (data_mode);
   rtx ops[] = {target, op, sel};
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
index 9df69a0cc2c..7ab31043547 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
@@ -55,6 +55,7 @@
 
 TEST_ALL (PERMUTE)
 
-/* { dg-final { scan-assembler-times 
{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 31 } } */
+/* { dg-final { scan-assembler-times 
{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
+/* { dg-final { scan-assembler-times 
{vrgatherei16\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 12 } } */
 /* { dg-final { scan-assembler-times {vrsub\.vi} 24 } } */
 /* { dg-final { scan-assembler-times {vrsub\.vx} 7 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
index 46cad8ea2f4..4d6862cf1c0 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
@@ -3,6 +3,7 @@
 
 #include "../vls-vlmax/perm-4.c"
 
-/* { dg-final { scan-assembler-times 
{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 31 } } */
+/* { dg-final { scan-assembler-times 
{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
+/* { dg-final { scan-assembler-times 
{vrgatherei16\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 12

Re: [PATCH] RISC-V: Optimization of vrgather.vv into vrgatherei16.vv[PR111451]

2023-09-21 Thread juzhe.zh...@rivai.ai

+  unsigned int data_sew = get_sew (data_mode);
+  enum vlmul_type data_lmul = get_vlmul (data_mode);

Remove this.

+  else if (CONST_VECTOR_P (sel) && data_sew != 16
+   && data_sew != 8 && (data_lmul == LMUL_2
+   || data_lmul == LMUL_4 || data_lmul == LMUL_8))

change it into:

else if (CONST_VECTOR_P (sel) 
&& GET_MODE_BITSIZE (GET_MODE_INNER (sel_mode)).to_constant () > 16
&& riscv_get_v_regno_alignment (data_mode) > LMUL_1)




juzhe.zh...@rivai.ai
 
From: Li Xu
Date: 2023-09-22 09:33
To: gcc-patches
CC: kito.cheng; palmer; juzhe.zhong; xuli
Subject: [PATCH] RISC-V: Optimization of vrgather.vv into 
vrgatherei16.vv[PR111451]
From: xuli 
 
Consider this following case:
 
typedef int32_t vnx32si __attribute__ ((vector_size (128)));
 
  __attribute__ ((noipa)) void permute_##TYPE (TYPE values1, TYPE values2, \
   TYPE *out)  \
  {\
TYPE v \
  = __builtin_shufflevector (values1, values2, MASK_##NUNITS (0, NUNITS)); \
*(TYPE *) out = v; \
  }
 
  T (vnx32si, 32)  \
 
TEST_ALL (PERMUTE)
 
Before this patch:
  li a4,31
  vsetvli a5,zero,e32,m8,ta,ma
  vl8re32.v v24,0(a0)
  vid.v v8
  vrsub.vx v8,v8,a4
  vrgather.vv v16,v24,v8
  vs8r.v v16,0(a2)
  ret
 
The index vector register "v8" occupies 8 registers.
We should optimize it into vrgatherei16.vv which is
using int16 as the index elements.
 
After this patch:
  vsetvli a5,zero,e16,m4,ta,ma
  li a4,31
  vid.v v4
  vl8re32.v v16,0(a0)
  vrsub.vx v4,v4,a4
  vsetvli zero,zero,e32,m8,ta,ma
  vrgatherei16.vv v8,v16,v4
  vs8r.v v8,0(a2)
  ret
With vrgatherei16.vv, the v8 will occupy 4 registers instead
of 8. Lower the register consuming and register pressure.
 
gcc/ChangeLog:
 
* config/riscv/riscv-v.cc (emit_vlmax_gather_insn): Optimization of vrgather.vv 
into vrgatherei16.vv.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c: Adjust case.
* gcc.target/riscv/rvv/autovec/vls/perm-4.c: Ditto.
---
gcc/config/riscv/riscv-v.cc   | 20 +++
.../riscv/rvv/autovec/vls-vlmax/perm-4.c  |  3 ++-
.../gcc.target/riscv/rvv/autovec/vls/perm-4.c |  3 ++-
3 files changed, 24 insertions(+), 2 deletions(-)
 
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 64a71a128d4..271e0ff6dfc 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -783,6 +783,8 @@ emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
   insn_code icode;
   machine_mode data_mode = GET_MODE (target);
   machine_mode sel_mode = GET_MODE (sel);
+  unsigned int data_sew = get_sew (data_mode);
+  enum vlmul_type data_lmul = get_vlmul (data_mode);
   if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
 icode = code_for_pred_gatherei16 (data_mode);
   else if (const_vec_duplicate_p (sel, &elt))
@@ -790,6 +792,24 @@ emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
   icode = code_for_pred_gather_scalar (data_mode);
   sel = elt;
 }
+  else if (CONST_VECTOR_P (sel) && data_sew != 16
+   && data_sew != 8 && (data_lmul == LMUL_2
+   || data_lmul == LMUL_4 || data_lmul == LMUL_8))
+{
+  /* If the inner mode of data is not QI or HI and data_lmul > 1,
+ emitting vrgatherei16.vv instruction will lower register
+ pressure.
+ data_mode  sel_mode  ei16
+ RVVM1QIRVVM1QI   RVVM2HI  not needed
+ RVVM2QIRVVM2QI   RVVM4HI  not needed
+ RVVM2HIRVVM2HI   RVVM2HI  not needed
+ RVVM2SIRVVM2SI   RVVM1HI  need
+ RVVM4SIRVVM4SI   RVVM2HI  need
+ RVVM8DIRVVM8DI   RVVM2HI  need */
+  PUT_MODE (sel, get_vector_mode (HImode,
+GET_MODE_NUNITS (data_mode)).require ());
+  icode = code_for_pred_gatherei16 (data_mode);
+}
   else
 icode = code_for_pred_gather (data_mode);
   rtx ops[] = {target, op, sel};
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
index 9df69a0cc2c..7ab31043547 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
@@ -55,6 +55,7 @@
TEST_ALL (PERMUTE)
-/* { dg-final { scan-assembler-times 
{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 31 } } */
+/* { dg-final { scan-assembler-times 
{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
+/* { dg-final { scan-assembler-times 
{vrgatherei16\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 12 } } */
/* { dg-final { scan-assembler-times {vrsub\.vi} 24 } } */
/* { dg-final { scan-assembler-times {vrsub\.vx} 7 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c 
b/gcc/testsuite/gcc.targ

Re: Re: [PATCH] RISC-V: Optimization of vrgather.vv into vrgatherei16.vv[PR111451]

2023-09-21 Thread juzhe.zh...@rivai.ai
Sorry. It should be:

else if (CONST_VECTOR_P (sel) 
&& GET_MODE_BITSIZE (GET_MODE_INNER (sel_mode)).to_constant () > 16
&& riscv_get_v_regno_alignment (data_mode) > 1)



juzhe.zh...@rivai.ai
 
From: juzhe.zh...@rivai.ai
Date: 2023-09-22 09:39
To: Li Xu; gcc-patches
CC: kito.cheng; palmer; Li Xu
Subject: Re: [PATCH] RISC-V: Optimization of vrgather.vv into 
vrgatherei16.vv[PR111451]

+  unsigned int data_sew = get_sew (data_mode);
+  enum vlmul_type data_lmul = get_vlmul (data_mode);

Remove this.

+  else if (CONST_VECTOR_P (sel) && data_sew != 16
+   && data_sew != 8 && (data_lmul == LMUL_2
+   || data_lmul == LMUL_4 || data_lmul == LMUL_8))

change it into:

else if (CONST_VECTOR_P (sel) 
&& GET_MODE_BITSIZE (GET_MODE_INNER (sel_mode)).to_constant () > 16
&& riscv_get_v_regno_alignment (data_mode) > LMUL_1)




juzhe.zh...@rivai.ai
 
From: Li Xu
Date: 2023-09-22 09:33
To: gcc-patches
CC: kito.cheng; palmer; juzhe.zhong; xuli
Subject: [PATCH] RISC-V: Optimization of vrgather.vv into 
vrgatherei16.vv[PR111451]
From: xuli 
 
Consider this following case:
 
typedef int32_t vnx32si __attribute__ ((vector_size (128)));
 
  __attribute__ ((noipa)) void permute_##TYPE (TYPE values1, TYPE values2, \
   TYPE *out)  \
  {\
TYPE v \
  = __builtin_shufflevector (values1, values2, MASK_##NUNITS (0, NUNITS)); \
*(TYPE *) out = v; \
  }
 
  T (vnx32si, 32)  \
 
TEST_ALL (PERMUTE)
 
Before this patch:
  li a4,31
  vsetvli a5,zero,e32,m8,ta,ma
  vl8re32.v v24,0(a0)
  vid.v v8
  vrsub.vx v8,v8,a4
  vrgather.vv v16,v24,v8
  vs8r.v v16,0(a2)
  ret
 
The index vector register "v8" occupies 8 registers.
We should optimize it into vrgatherei16.vv which is
using int16 as the index elements.
 
After this patch:
  vsetvli a5,zero,e16,m4,ta,ma
  li a4,31
  vid.v v4
  vl8re32.v v16,0(a0)
  vrsub.vx v4,v4,a4
  vsetvli zero,zero,e32,m8,ta,ma
  vrgatherei16.vv v8,v16,v4
  vs8r.v v8,0(a2)
  ret
With vrgatherei16.vv, the v8 will occupy 4 registers instead
of 8. Lower the register consuming and register pressure.
 
gcc/ChangeLog:
 
* config/riscv/riscv-v.cc (emit_vlmax_gather_insn): Optimization of vrgather.vv 
into vrgatherei16.vv.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c: Adjust case.
* gcc.target/riscv/rvv/autovec/vls/perm-4.c: Ditto.
---
gcc/config/riscv/riscv-v.cc   | 20 +++
.../riscv/rvv/autovec/vls-vlmax/perm-4.c  |  3 ++-
.../gcc.target/riscv/rvv/autovec/vls/perm-4.c |  3 ++-
3 files changed, 24 insertions(+), 2 deletions(-)
 
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 64a71a128d4..271e0ff6dfc 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -783,6 +783,8 @@ emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
   insn_code icode;
   machine_mode data_mode = GET_MODE (target);
   machine_mode sel_mode = GET_MODE (sel);
+  unsigned int data_sew = get_sew (data_mode);
+  enum vlmul_type data_lmul = get_vlmul (data_mode);
   if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
 icode = code_for_pred_gatherei16 (data_mode);
   else if (const_vec_duplicate_p (sel, &elt))
@@ -790,6 +792,24 @@ emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
   icode = code_for_pred_gather_scalar (data_mode);
   sel = elt;
 }
+  else if (CONST_VECTOR_P (sel) && data_sew != 16
+   && data_sew != 8 && (data_lmul == LMUL_2
+   || data_lmul == LMUL_4 || data_lmul == LMUL_8))
+{
+  /* If the inner mode of data is not QI or HI and data_lmul > 1,
+ emitting vrgatherei16.vv instruction will lower register
+ pressure.
+ data_mode  sel_mode  ei16
+ RVVM1QIRVVM1QI   RVVM2HI  not needed
+ RVVM2QIRVVM2QI   RVVM4HI  not needed
+ RVVM2HIRVVM2HI   RVVM2HI  not needed
+ RVVM2SIRVVM2SI   RVVM1HI  need
+ RVVM4SIRVVM4SI   RVVM2HI  need
+ RVVM8DIRVVM8DI   RVVM2HI  need */
+  PUT_MODE (sel, get_vector_mode (HImode,
+GET_MODE_NUNITS (data_mode)).require ());
+  icode = code_for_pred_gatherei16 (data_mode);
+}
   else
 icode = code_for_pred_gather (data_mode);
   rtx ops[] = {target, op, sel};
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
index 9df69a0cc2c..7ab31043547 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
@@ -55,6 +55,7 @@
TEST_ALL (PERMUTE)
-/* { dg-final { scan-assembler-times 
{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 31 } } */
+/* { dg-final { s

[PATCH V2] RISC-V: Optimization of vrgather.vv into vrgatherei16.vv[PR111451]

2023-09-21 Thread Li Xu
From: xuli 

Consider this following case:

typedef int32_t vnx32si __attribute__ ((vector_size (128)));

  __attribute__ ((noipa)) void permute_##TYPE (TYPE values1, TYPE values2, \
   TYPE *out)  \
  {\
TYPE v \
  = __builtin_shufflevector (values1, values2, MASK_##NUNITS (0, NUNITS)); \
*(TYPE *) out = v; \
  }

  T (vnx32si, 32)  \

TEST_ALL (PERMUTE)

Before this patch:
  li a4,31
  vsetvli   a5,zero,e32,m8,ta,ma
  vl8re32.v v24,0(a0)
  vid.v v8
  vrsub.vx  v8,v8,a4
  vrgather.vv   v16,v24,v8
  vs8r.v v16,0(a2)
  ret

The index vector register "v8" occupies 8 registers.
We should optimize it into vrgatherei16.vv which is
using int16 as the index elements.

After this patch:
  vsetvli   a5,zero,e16,m4,ta,ma
  li a4,31
  vid.v v4
  vl8re32.v v16,0(a0)
  vrsub.vx  v4,v4,a4
  vsetvli   zero,zero,e32,m8,ta,ma
  vrgatherei16.vv   v8,v16,v4
  vs8r.v v8,0(a2)
  ret
With vrgatherei16.vv, the index vector (now v4) occupies 4 registers instead
of 8, which lowers register consumption and register pressure.

PR target/111451

gcc/ChangeLog:

* config/riscv/riscv-v.cc (emit_vlmax_gather_insn): Optimization of 
vrgather.vv into vrgatherei16.vv.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c: Adjust case.
* gcc.target/riscv/rvv/autovec/vls/perm-4.c: Ditto.
---
 gcc/config/riscv/riscv-v.cc| 18 ++
 .../riscv/rvv/autovec/vls-vlmax/perm-4.c   |  3 ++-
 .../gcc.target/riscv/rvv/autovec/vls/perm-4.c  |  3 ++-
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 64a71a128d4..455efa7ea8a 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -790,6 +790,24 @@ emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
   icode = code_for_pred_gather_scalar (data_mode);
   sel = elt;
 }
+  else if (CONST_VECTOR_P (sel)
+   && GET_MODE_BITSIZE (GET_MODE_INNER (sel_mode)) > 16
+   && riscv_get_v_regno_alignment (data_mode) > 1)
+{
+  /* If the inner mode of data is not QI or HI and data_lmul > 1,
+ emitting vrgatherei16.vv instruction will lower register
+ pressure.
+ data_mode  sel_mode  ei16
+ RVVM1QI    RVVM1QI   RVVM2HI  not needed
+ RVVM2QI    RVVM2QI   RVVM4HI  not needed
+ RVVM2HI    RVVM2HI   RVVM2HI  not needed
+ RVVM2SI    RVVM2SI   RVVM1HI  need
+ RVVM4SI    RVVM4SI   RVVM2HI  need
+ RVVM8DI    RVVM8DI   RVVM2HI  need */
+  PUT_MODE (sel, get_vector_mode (HImode,
+GET_MODE_NUNITS (data_mode)).require ());
+  icode = code_for_pred_gatherei16 (data_mode);
+}
   else
 icode = code_for_pred_gather (data_mode);
   rtx ops[] = {target, op, sel};
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
index 9df69a0cc2c..7ab31043547 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
@@ -55,6 +55,7 @@
 
 TEST_ALL (PERMUTE)
 
-/* { dg-final { scan-assembler-times 
{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 31 } } */
+/* { dg-final { scan-assembler-times 
{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
+/* { dg-final { scan-assembler-times 
{vrgatherei16\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 12 } } */
 /* { dg-final { scan-assembler-times {vrsub\.vi} 24 } } */
 /* { dg-final { scan-assembler-times {vrsub\.vx} 7 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
index 46cad8ea2f4..4d6862cf1c0 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
@@ -3,6 +3,7 @@
 
 #include "../vls-vlmax/perm-4.c"
 
-/* { dg-final { scan-assembler-times 
{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 31 } } */
+/* { dg-final { scan-assembler-times 
{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
+/* { dg-final { scan-assembler-times 
{vrgatherei16\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 12 } } */
 /* { dg-final { scan-assembler-times {vrsub\.vi} 24 } } */
 /* { dg-final { scan-assembler-times {vrsub\.vx} 7 } } */
-- 
2.17.1



Re: [PATCH V2] RISC-V: Optimization of vrgather.vv into vrgatherei16.vv[PR111451]

2023-09-21 Thread juzhe.zh...@rivai.ai
LGTM. You can commit it after you pass the regression.



juzhe.zh...@rivai.ai
 
From: Li Xu
Date: 2023-09-22 10:37
To: gcc-patches
CC: kito.cheng; palmer; juzhe.zhong; xuli
Subject: [PATCH V2] RISC-V: Optimization of vrgather.vv into 
vrgatherei16.vv[PR111451]
From: xuli 
 
Consider this following case:
 
typedef int32_t vnx32si __attribute__ ((vector_size (128)));
 
  __attribute__ ((noipa)) void permute_##TYPE (TYPE values1, TYPE values2, \
   TYPE *out)  \
  {\
TYPE v \
  = __builtin_shufflevector (values1, values2, MASK_##NUNITS (0, NUNITS)); \
*(TYPE *) out = v; \
  }
 
  T (vnx32si, 32)  \
 
TEST_ALL (PERMUTE)
 
Before this patch:
  li a4,31
  vsetvli a5,zero,e32,m8,ta,ma
  vl8re32.v v24,0(a0)
  vid.v v8
  vrsub.vx v8,v8,a4
  vrgather.vv v16,v24,v8
  vs8r.v v16,0(a2)
  ret
 
The index vector register "v8" occupies 8 registers.
We should optimize it into vrgatherei16.vv which is
using int16 as the index elements.
 
After this patch:
  vsetvli a5,zero,e16,m4,ta,ma
  li a4,31
  vid.v v4
  vl8re32.v v16,0(a0)
  vrsub.vx v4,v4,a4
  vsetvli zero,zero,e32,m8,ta,ma
  vrgatherei16.vv v8,v16,v4
  vs8r.v v8,0(a2)
  ret
With vrgatherei16.vv, the v8 will occupy 4 registers instead
of 8. Lower the register consuming and register pressure.
 
PR target/111451
 
gcc/ChangeLog:
 
* config/riscv/riscv-v.cc (emit_vlmax_gather_insn): Optimization of vrgather.vv 
into vrgatherei16.vv.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c: Adjust case.
* gcc.target/riscv/rvv/autovec/vls/perm-4.c: Ditto.
---
gcc/config/riscv/riscv-v.cc| 18 ++
.../riscv/rvv/autovec/vls-vlmax/perm-4.c   |  3 ++-
.../gcc.target/riscv/rvv/autovec/vls/perm-4.c  |  3 ++-
3 files changed, 22 insertions(+), 2 deletions(-)
 
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 64a71a128d4..455efa7ea8a 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -790,6 +790,24 @@ emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
   icode = code_for_pred_gather_scalar (data_mode);
   sel = elt;
 }
+  else if (CONST_VECTOR_P (sel)
+   && GET_MODE_BITSIZE (GET_MODE_INNER (sel_mode)) > 16
+   && riscv_get_v_regno_alignment (data_mode) > 1)
+{
+  /* If the inner mode of data is not QI or HI and data_lmul > 1,
+ emitting vrgatherei16.vv instruction will lower register
+ pressure.
+ data_mode  sel_mode  ei16
+ RVVM1QIRVVM1QI   RVVM2HI  not needed
+ RVVM2QIRVVM2QI   RVVM4HI  not needed
+ RVVM2HIRVVM2HI   RVVM2HI  not needed
+ RVVM2SIRVVM2SI   RVVM1HI  need
+ RVVM4SIRVVM4SI   RVVM2HI  need
+ RVVM8DIRVVM8DI   RVVM2HI  need */
+  PUT_MODE (sel, get_vector_mode (HImode,
+GET_MODE_NUNITS (data_mode)).require ());
+  icode = code_for_pred_gatherei16 (data_mode);
+}
   else
 icode = code_for_pred_gather (data_mode);
   rtx ops[] = {target, op, sel};
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
index 9df69a0cc2c..7ab31043547 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
@@ -55,6 +55,7 @@
TEST_ALL (PERMUTE)
-/* { dg-final { scan-assembler-times 
{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 31 } } */
+/* { dg-final { scan-assembler-times 
{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
+/* { dg-final { scan-assembler-times 
{vrgatherei16\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 12 } } */
/* { dg-final { scan-assembler-times {vrsub\.vi} 24 } } */
/* { dg-final { scan-assembler-times {vrsub\.vx} 7 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
index 46cad8ea2f4..4d6862cf1c0 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
@@ -3,6 +3,7 @@
#include "../vls-vlmax/perm-4.c"
-/* { dg-final { scan-assembler-times 
{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 31 } } */
+/* { dg-final { scan-assembler-times 
{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
+/* { dg-final { scan-assembler-times 
{vrgatherei16\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 12 } } */
/* { dg-final { scan-assembler-times {vrsub\.vi} 24 } } */
/* { dg-final { scan-assembler-times {vrsub\.vx} 7 } } */
-- 
2.17.1
 
 


[COMMITTED] RISC-V: Split VLS avl_type from NONVLMAX avl_type

2023-09-21 Thread Lehua Ding
This patch splits a VLS avl_type from the NONVLMAX avl_type, denoting
those RVV insns whose length is set to the number of units of VLS modes.

gcc/ChangeLog:

* config/riscv/riscv-protos.h (enum avl_type): New VLS avl_type.
* config/riscv/riscv-v.cc (autovec_use_vlmax_p): Move comments.

---
 gcc/config/riscv/riscv-protos.h | 21 +++--
 gcc/config/riscv/riscv-v.cc | 25 ++---
 2 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 07b4ffe3edf..d8372a7886f 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -368,10 +368,27 @@ enum vlmul_type
   NUM_LMUL = 8
 };

+/* The RISC-V vsetvli pass uses "known vlmax" operations for optimization.
+   Whether or not an instruction actually is a vlmax operation is not
+   recognizable from the length operand alone but the avl_type operand
+   is used instead.  In general, there are two cases:
+
+- Emit a vlmax operation by calling emit_vlmax_insn[_lra].  Here we emit
+  a vsetvli with vlmax configuration and set the avl_type to VLMAX for
+  VLA modes or VLS for VLS modes.
+- Emit an operation that uses the existing (last-set) length and
+  set the avl_type to NONVLMAX.
+
+Sometimes we also need to set the VLMAX or VLS avl_type to an operation 
that
+already uses a given length register.  This can happen during or after
+register allocation when we are not allowed to create a new register.
+For that case we also allow to set the avl_type to VLMAX or VLS.
+*/
 enum avl_type
 {
-  NONVLMAX,
-  VLMAX,
+  NONVLMAX = 0,
+  VLMAX = 1,
+  VLS = 2,
 };
 /* Routines implemented in riscv-vector-builtins.cc.  */
 void init_builtins (void);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index f63dec573ef..e8266218394 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -284,6 +284,7 @@ public:

 /* Add vl operand.  */
 rtx len = m_vl_op;
+bool vls_p = false;
 if (m_vlmax_p)
   {
if (riscv_v_ext_vls_mode_p (vtype_mode))
@@ -294,7 +295,7 @@ public:
len = gen_int_mode (nunits, Pmode);
if (!satisfies_constraint_K (len))
  len = force_reg (Pmode, len);
-   m_vlmax_p = false;
+   vls_p = true;
  }
else if (const_vlmax_p (vtype_mode))
  {
@@ -302,7 +303,7 @@ public:
   the vsetvli to obtain the value of vlmax.  */
poly_uint64 nunits = GET_MODE_NUNITS (vtype_mode);
len = gen_int_mode (nunits, Pmode);
-   m_vlmax_p = false;
+   vls_p = true;
  }
else if (can_create_pseudo_p ())
  {
@@ -318,7 +319,9 @@ public:
 add_policy_operand ();

 /* Add avl_type operand.  */
-add_avl_type_operand (m_vlmax_p ? avl_type::VLMAX : avl_type::NONVLMAX);
+add_avl_type_operand (
+  vls_p ? avl_type::VLS
+   : (m_vlmax_p ? avl_type::VLMAX : avl_type::NONVLMAX));

 /* Add rounding mode operand.  */
 if (m_insn_flags & FRM_DYN_P)
@@ -769,22 +772,6 @@ autovec_use_vlmax_p (void)
  || riscv_autovec_preference == RVV_FIXED_VLMAX);
 }

-/* The RISC-V vsetvli pass uses "known vlmax" operations for optimization.
-   Whether or not an instruction actually is a vlmax operation is not
-   recognizable from the length operand alone but the avl_type operand
-   is used instead.  In general, there are two cases:
-
-- Emit a vlmax operation by passing a NULL length.  Here we emit
-  a vsetvli with vlmax configuration and set the avl_type to VLMAX.
-- Emit an operation that uses the existing (last-set) length and
-  set the avl_type to NONVLMAX.
-
-Sometimes we also need to set the VLMAX avl_type to an operation that
-already uses a given length register.  This can happen during or after
-register allocation when we are not allowed to create a new register.
-For that case we also allow to set the avl_type to VLMAX.
-*/
-
 /* This function emits VLMAX vrgather instruction. Emit vrgather.vx/vi when sel
is a const duplicate vector. Otherwise, emit vrgather.vv.  */
 static void
--
2.36.3



[COMMITTED V4] RISC-V: Support combine cond extend and reduce sum to widen reduce sum

2023-09-21 Thread Lehua Ding
This patch supports combining cond extend and reduce_sum into cond widen reduce_sum,
for example combining the following three insns:
   (set (reg:RVVM2HI 149)
(if_then_else:RVVM2HI
  (unspec:RVVMF8BI [
(const_vector:RVVMF8BI repeat [
  (const_int 1 [0x1])
])
(reg:DI 146)
(const_int 2 [0x2]) repeated x2
(const_int 1 [0x1])
(reg:SI 66 vl)
(reg:SI 67 vtype)
  ] UNSPEC_VPREDICATE)
 (const_vector:RVVM2HI repeat [
   (const_int 0 [0])
 ])
 (unspec:RVVM2HI [
   (reg:SI 0 zero)
 ] UNSPEC_VUNDEF)))
  (set (reg:RVVM2HI 138)
(if_then_else:RVVM2HI
  (reg:RVVMF8BI 135)
  (reg:RVVM2HI 148)
  (reg:RVVM2HI 149)))
  (set (reg:HI 150)
(unspec:HI [
  (reg:RVVM2HI 138)
] UNSPEC_REDUC_SUM))
into one insn:
  (set (reg:SI 147)
(unspec:SI [
  (if_then_else:RVVM2SI
(reg:RVVMF16BI 135)
(sign_extend:RVVM2SI (reg:RVVM1HI 136))
(if_then_else:RVVM2HI
  (unspec:RVVMF8BI [
(const_vector:RVVMF8BI repeat [
  (const_int 1 [0x1])
])
(reg:DI 146)
(const_int 2 [0x2]) repeated x2
(const_int 1 [0x1])
(reg:SI 66 vl)
(reg:SI 67 vtype)
  ] UNSPEC_VPREDICATE)
 (const_vector:RVVM2HI repeat [
   (const_int 0 [0])
 ])
 (unspec:RVVM2HI [
   (reg:SI 0 zero)
 ] UNSPEC_VUNDEF)))
] UNSPEC_REDUC_SUM))

Consider the following C code:

int16_t foo (int8_t *restrict a, int8_t *restrict pred)
{
  int16_t sum = 0;
  for (int i = 0; i < 16; i += 1)
if (pred[i])
  sum += a[i];
  return sum;
}

assembly before this patch:

foo:
vsetivli zero,16,e16,m2,ta,ma
li  a5,0
vmv.v.i v2,0
vsetvli zero,zero,e8,m1,ta,ma
vl1re8.v v0,0(a1)
vmsne.vi v0,v0,0
vsetvli zero,zero,e16,m2,ta,mu
vle8.v  v4,0(a0),v0.t
vmv.s.x v1,a5
vsext.vf2   v2,v4,v0.t
vredsum.vs  v2,v2,v1
vmv.x.s a0,v2
slliw   a0,a0,16
sraiw   a0,a0,16
ret

assembly after this patch:

foo:
li  a5,0
vsetivli zero,16,e16,m1,ta,ma
vmv.s.x v3,a5
vsetivli zero,16,e8,m1,ta,ma
vl1re8.v v0,0(a1)
vmsne.vi v0,v0,0
vle8.v  v2,0(a0),v0.t
vwredsum.vs v1,v2,v3,v0.t
vsetivli zero,0,e16,m1,ta,ma
vmv.x.s a0,v1
slliw   a0,a0,16
sraiw   a0,a0,16
ret

gcc/ChangeLog:

* config/riscv/autovec-opt.md (*cond_widen_reduc_plus_scal_):
New combine patterns.
* config/riscv/riscv-protos.h (enum insn_type): New insn_type.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/cond/cond_widen_reduc-1.c: New test.
* gcc.target/riscv/rvv/autovec/cond/cond_widen_reduc-2.c: New test.
* gcc.target/riscv/rvv/autovec/cond/cond_widen_reduc_run-1.c: New test.
* gcc.target/riscv/rvv/autovec/cond/cond_widen_reduc_run-2.c: New test.

---
 gcc/config/riscv/autovec-opt.md   | 72 +++
 gcc/config/riscv/riscv-protos.h   |  1 +
 .../rvv/autovec/cond/cond_widen_reduc-1.c | 30 
 .../rvv/autovec/cond/cond_widen_reduc-2.c | 30 
 .../rvv/autovec/cond/cond_widen_reduc_run-1.c | 28 
 .../rvv/autovec/cond/cond_widen_reduc_run-2.c | 28 
 6 files changed, 189 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_widen_reduc-1.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_widen_reduc-2.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_widen_reduc_run-1.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_widen_reduc_run-2.c

diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md
index a97a095691c..ed9c0777eb9 100644
--- a/gcc/config/riscv/autovec-opt.md
+++ b/gcc/config/riscv/autovec-opt.md
@@ -1119,6 +1119,78 @@
   }
   [(set_attr "type" "vfwmuladd")])

+;; Combine mask_extend + vredsum to mask_vwredsum[u]
+;; where the merge of mask_extend is vector const 0
+(define_insn_and_split "*cond_widen_reduc_plus_scal_"
+  [(set (match_operand: 0 "register_operand")
+(unspec: [
+  (if_then_else:
+(match_operand: 1 "register_operand")
+(any_extend:
+  (match_operand:VI_QHS_NO_M8 2 "register_operand"))
+(if_then_else:
+  (unspec: [
+(match_operand: 3 "vector_all_trues_mask_operand")
+(match_operand 6 "vector_length_operand")
+(match_operand 7 "const_int_operand")
+(match_operand 8 "const_int_operand")
+(match_operand 9 "const_1_or_2_operand")
+(reg:SI VL_REGNUM)
+   

Re: [PATCH V3] RISC-V: Support combine cond extend and reduce sum to widen reduce sum

2023-09-21 Thread Lehua Ding

Committed as split patches, thanks Robin.

[COMMITTED] RISC-V: Split VLS avl_type from NONVLMAX avl_type
https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631152.html

[COMMITTED V4] RISC-V: Support combine cond extend and reduce sum to 
widen reduce sum

https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631153.html

On 2023/9/21 16:27, Lehua Ding wrote:

Hi Robin,

On 2023/9/21 16:12, Robin Dapp wrote:

Hi Lehua,


V3 Change: Back to the original method.

Was there an original method even before the first patch?


Yes, this was the method that came to mind at first, and I didn't send a 
patch because I didn't feel like the pattern looked good :)



Anyway, I prefer this v3 over the others even though the large
pattern is not exactly pretty :)

What about the VLS changes?  Are they necessary for the patterns/tests?
I mean they are reasonable in themselves but are they related?
If so, please specify.  If not please split them off into a small
(pre-approved) patch.  OK with this adjusted.


This VLS change is necessary for the patch and is currently only used by
this patch. At present, VLMAX inside avl_type stands for a VLA mode using
VLMAX; however, we also need to distinguish from NONVLMAX the case where
the vl is the number of units of the mode. That makes it safe and
simple to do the combine. Therefore a new avl_type enumerator, VLS, is
introduced to make that distinction.


There's an overlap between VLMAX and VLS here, and it's probably more 
appropriate to use FULL or something similar, like the other patch that 
changes the comments. I'm going to hold off on making any major changes 
for now, though, as the impact is more localized.




--
Best,
Lehua (RiVAI)
lehua.d...@rivai.ai


Re: [PATCH 00/18] Support -mevex512 for AVX512

2023-09-21 Thread Hongtao Liu
On Thu, Sep 21, 2023 at 3:22 PM Hu, Lin1  wrote:
>
> Hi all,
>
> After previous discussion, instead of supporting option -mavx10.1, we
> will first introduce option -m[no-]evex512, which will enable/disable
> 512 bit register and 64 bit mask register.
>
> It will not change the current option behavior since if AVX512F is
> enabled with no evex512 option specified, it will automatically enable
> 512 bit register and 64 bit mask register.
>
> The patches are organized as follows:
>
> Patch 1 added initial support for option -mevex512.
>
> Patch 2-6 refined current intrin file to push evex512 target for all
> 512 bit intrins. Those scalar intrins remained untouched.
>
> Patch 7-11 added OPTION_MASK_ISA2_EVEX512 for all related builtins.
>
> Patch 12 disabled zmm register, 512 bit libmvec call for no-evex512,
> also requested evex512 for vectorization when using 512 bit register.
>
> Patch 13-17 supported evex512 in related patterns.
>
> Patch 18 added testcases for -mno-evex512 and allowed its usage.
>
> The patches currently cause scan-asm fail for pr89229-{5,6,7}b.c since
> we will emit scalar vmovss here. When trying to use x/ymm 16+ w/o
> avx512vl but with avx512f+evex512, I suppose we could either emit scalar
> or zmm instructions. It is quite a rare case on HW since there is no
> HW w/o avx512vl but with avx512f, so I prefer not to add maintenance
> effort here to get a slight perf improvement. But it could be changed
> back to the former behavior.
To make it easier for people to test before committing, I pushed the
patch to the vendor branch
refs/vendors/ix86/heads/evex512.
Welcome to try it out.

>
> Discussions are welcomed for all the patches.
>
> Thx,
> Haochen
>
> Haochen Jiang (18):
>   Initial support for -mevex512
>   Push evex512 target for 512 bit intrins
>   Push evex512 target for 512 bit intrins
>   Push evex512 target for 512 bit intrins
>   Push evex512 target for 512 bit intrins
>   Push evex512 target for 512 bit intrins
>   Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
>   Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
>   Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
>   Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
>   Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
>   Disable zmm register and 512 bit libmvec call when !TARGET_EVEX512
>   Support -mevex512 for AVX512F intrins
>   Support -mevex512 for AVX512DQ intrins
>   Support -mevex512 for AVX512BW intrins
>   Support -mevex512 for
> 
> AVX512{IFMA,VBMI,VNNI,BF16,VPOPCNTDQ,VBMI2,BITALG,VP2INTERSECT},VAES,GFNI,VPCLMULQDQ
> intrins
>   Support -mevex512 for AVX512FP16 intrins
>   Allow -mno-evex512 usage
>
>  gcc/common/config/i386/i386-common.cc   |15 +
>  gcc/config.gcc  |19 +-
>  gcc/config/i386/avx5124fmapsintrin.h| 2 +-
>  gcc/config/i386/avx5124vnniwintrin.h| 2 +-
>  gcc/config/i386/avx512bf16intrin.h  |31 +-
>  gcc/config/i386/avx512bitalgintrin.h|   155 +-
>  gcc/config/i386/avx512bitalgvlintrin.h  |   180 +
>  gcc/config/i386/avx512bwintrin.h|   291 +-
>  gcc/config/i386/avx512dqintrin.h|  1840 +-
>  gcc/config/i386/avx512erintrin.h| 2 +-
>  gcc/config/i386/avx512fintrin.h | 19663 +-
>  gcc/config/i386/avx512fp16intrin.h  |  8925 
>  gcc/config/i386/avx512ifmaintrin.h  | 4 +-
>  gcc/config/i386/avx512pfintrin.h| 2 +-
>  gcc/config/i386/avx512vbmi2intrin.h | 4 +-
>  gcc/config/i386/avx512vbmiintrin.h  | 4 +-
>  gcc/config/i386/avx512vnniintrin.h  | 4 +-
>  gcc/config/i386/avx512vp2intersectintrin.h  | 4 +-
>  gcc/config/i386/avx512vpopcntdqintrin.h | 4 +-
>  gcc/config/i386/gfniintrin.h|76 +-
>  gcc/config/i386/i386-builtin.def|  1312 +-
>  gcc/config/i386/i386-builtins.cc|96 +-
>  gcc/config/i386/i386-c.cc   | 2 +
>  gcc/config/i386/i386-expand.cc  |18 +-
>  gcc/config/i386/i386-options.cc |33 +-
>  gcc/config/i386/i386.cc |   168 +-
>  gcc/config/i386/i386.h  | 7 +-
>  gcc/config/i386/i386.md |   127 +-
>  gcc/config/i386/i386.opt| 4 +
>  gcc/config/i386/immintrin.h | 2 +
>  gcc/config/i386/predicates.md   | 3 +-
>  gcc/config/i386/sse.md  |   854 +-
>  gcc/config/i386/vaesintrin.h| 4 +-
>  gcc/config/i386/vpclmulqdqintrin.h  | 4 +-
>  gcc/testsuite/gcc.target/i386/noevex512-1.c |13 +
>  gcc/testsuite/gcc.target/i386/noevex512-2.c |13 +
>  gcc/testsuite/gcc.target/i386/noevex512-3.c |13 +
>  gcc/testsuite/gcc.target/i386/pr89229-5b.c  | 2 +-
>  gcc/testsuite/gcc.target/i386/pr89229-6b.c  | 2 +-
>  gcc/testsuite/gcc.target/i386/pr89229-7b.c  | 2 +-
>  gcc/testsuit

[PATCH v1] RISC-V: Remove arch and abi option for run test case.

2023-09-21 Thread pan2 . li
From: Pan Li 

Remove the -march and -mabi.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/math-ceil-run-0.c: Remove arch and abi.
* gcc.target/riscv/rvv/autovec/math-ceil-run-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/math-ceil-run-2.c: Ditto.

Signed-off-by: Pan Li 
---
 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c | 2 +-
 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c | 2 +-
 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c
index f1946e197cc..67462154018 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c
@@ -1,5 +1,5 @@
 /* { dg-do run { target { riscv_vector } } } */
-/* { dg-additional-options "-march=rv64gcv_zvfh -std=c2x -mabi=lp64d -O3 
-ftree-vectorize -fno-vect-cost-model -ffast-math" } */
+/* { dg-additional-options "-std=c2x -O3 -ftree-vectorize -fno-vect-cost-model 
-ffast-math" } */
 
 #include "test-math.h"
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c
index 202944ddd92..38adff16df9 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c
@@ -1,5 +1,5 @@
 /* { dg-do run { target { riscv_vector } } } */
-/* { dg-additional-options "-march=rv64gcv -std=c99 -mabi=lp64d -O3 
-ftree-vectorize -fno-vect-cost-model -ffast-math" } */
+/* { dg-additional-options "-std=c99 -O3 -ftree-vectorize -fno-vect-cost-model 
-ffast-math" } */
 
 #include "test-math.h"
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c
index f0ff9bca0af..6f22842ebdb 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c
@@ -1,5 +1,5 @@
 /* { dg-do run { target { riscv_vector } } } */
-/* { dg-additional-options "-march=rv64gcv -std=c99 -mabi=lp64d -O3 
-ftree-vectorize -fno-vect-cost-model -ffast-math" } */
+/* { dg-additional-options "-std=c99 -O3 -ftree-vectorize -fno-vect-cost-model 
-ffast-math" } */
 
 #include "test-math.h"
 
-- 
2.34.1



Re: [PATCH v1] RISC-V: Remove arch and abi option for run test case.

2023-09-21 Thread juzhe.zh...@rivai.ai
LGTM



juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2023-09-22 11:39
To: gcc-patches
CC: juzhe.zhong; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v1] RISC-V: Remove arch and abi option for run test case.
From: Pan Li 
 
Remove the -march and -mabi.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/math-ceil-run-0.c: Remove arch and abi.
* gcc.target/riscv/rvv/autovec/math-ceil-run-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/math-ceil-run-2.c: Ditto.
 
Signed-off-by: Pan Li 
---
gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c | 2 +-
gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c | 2 +-
gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c
index f1946e197cc..67462154018 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c
@@ -1,5 +1,5 @@
/* { dg-do run { target { riscv_vector } } } */
-/* { dg-additional-options "-march=rv64gcv_zvfh -std=c2x -mabi=lp64d -O3 
-ftree-vectorize -fno-vect-cost-model -ffast-math" } */
+/* { dg-additional-options "-std=c2x -O3 -ftree-vectorize -fno-vect-cost-model 
-ffast-math" } */
#include "test-math.h"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c
index 202944ddd92..38adff16df9 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c
@@ -1,5 +1,5 @@
/* { dg-do run { target { riscv_vector } } } */
-/* { dg-additional-options "-march=rv64gcv -std=c99 -mabi=lp64d -O3 
-ftree-vectorize -fno-vect-cost-model -ffast-math" } */
+/* { dg-additional-options "-std=c99 -O3 -ftree-vectorize -fno-vect-cost-model 
-ffast-math" } */
#include "test-math.h"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c
index f0ff9bca0af..6f22842ebdb 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c
@@ -1,5 +1,5 @@
/* { dg-do run { target { riscv_vector } } } */
-/* { dg-additional-options "-march=rv64gcv -std=c99 -mabi=lp64d -O3 
-ftree-vectorize -fno-vect-cost-model -ffast-math" } */
+/* { dg-additional-options "-std=c99 -O3 -ftree-vectorize -fno-vect-cost-model 
-ffast-math" } */
#include "test-math.h"
-- 
2.34.1
 
 


[PATCH v1] RISC-V: Rename the test macro for math autovec test

2023-09-21 Thread pan2 . li
From: Pan Li 

Rename TEST_CEIL to TEST_UNARY_CALL for the underlying function
autovec patch testing.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/test-math.h: Rename.
* gcc.target/riscv/rvv/autovec/math-ceil-0.c: Ditto.
* gcc.target/riscv/rvv/autovec/math-ceil-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/math-ceil-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/math-ceil-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/math-ceil-run-0.c: Ditto.
* gcc.target/riscv/rvv/autovec/math-ceil-run-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/math-ceil-run-2.c: Ditto.

Signed-off-by: Pan Li 
---
 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-0.c | 2 +-
 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c | 2 +-
 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-2.c | 2 +-
 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-3.c | 2 +-
 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c | 2 +-
 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c | 2 +-
 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c | 2 +-
 gcc/testsuite/gcc.target/riscv/rvv/autovec/test-math.h   | 4 ++--
 8 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-0.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-0.c
index 88a2ac4b338..0959afd57d6 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-0.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-0.c
@@ -23,4 +23,4 @@
 **   fsrm\s+[atx][0-9]+
 **   ...
 */
-TEST_CEIL(_Float16, __builtin_ceilf16)
+TEST_UNARY_CALL (_Float16, __builtin_ceilf16)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c
index 0908ef269bd..142705b7eed 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-1.c
@@ -23,4 +23,4 @@
 **   fsrm\s+[atx][0-9]+
 **   ...
 */
-TEST_CEIL(float, __builtin_ceilf)
+TEST_UNARY_CALL (float, __builtin_ceilf)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-2.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-2.c
index 65d4807edef..d232e36e1db 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-2.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-2.c
@@ -23,4 +23,4 @@
 **   fsrm\s+[atx][0-9]+
 **   ...
 */
-TEST_CEIL(double, __builtin_ceil)
+TEST_UNARY_CALL (double, __builtin_ceil)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-3.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-3.c
index 416698a753e..82e4f89a82a 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-3.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-3.c
@@ -25,4 +25,4 @@
 **   fsrm\s+[atx][0-9]+
 **   ...
 */
-TEST_COND_CEIL(float, __builtin_ceilf)
+TEST_COND_UNARY_CALL (float, __builtin_ceilf)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c
index f1946e197cc..699eaf364e1 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c
@@ -9,7 +9,7 @@ _Float16 in[ARRAY_SIZE];
 _Float16 out[ARRAY_SIZE];
 _Float16 ref[ARRAY_SIZE];
 
-TEST_CEIL (_Float16, __builtin_ceilf16)
+TEST_UNARY_CALL (_Float16, __builtin_ceilf16)
 TEST_ASSERT (_Float16)
 
 TEST_INIT (_Float16, 1.2, 2.0, 1)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c
index 202944ddd92..98a412979b5 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c
@@ -9,7 +9,7 @@ float in[ARRAY_SIZE];
 float out[ARRAY_SIZE];
 float ref[ARRAY_SIZE];
 
-TEST_CEIL (float, __builtin_ceilf)
+TEST_UNARY_CALL (float, __builtin_ceilf)
 TEST_ASSERT (float)
 
 TEST_INIT (float, 1.2, 2.0, 1)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c
index f0ff9bca0af..22a4d8ab2b9 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c
@@ -9,7 +9,7 @@ double in[ARRAY_SIZE];
 double out[ARRAY_SIZE];
 double ref[ARRAY_SIZE];
 
-TEST_CEIL (double, __builtin_ceil)
+TEST_UNARY_CALL (double, __builtin_ceil)
 TEST_ASSERT (double)
 
 TEST_INIT (double, 1.2, 2.0, 1)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/test-math.h 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/test-math.h
index 6e913da37f4..d035835f370 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/test-math.h
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/test-math.h
@@ -1,11 +1,11 @@
-#define TEST_CEIL(TYPE, CALL) \
+#define TEST_UNARY_CALL(TYPE, CALL)  

RE: [PATCH v1] RISC-V: Remove arch and abi option for run test case.

2023-09-21 Thread Li, Pan2
Committed, thanks Juzhe.

Pan

From: juzhe.zh...@rivai.ai 
Sent: Friday, September 22, 2023 11:45 AM
To: Li, Pan2 ; gcc-patches 
Cc: Li, Pan2 ; Wang, Yanzhang ; 
kito.cheng 
Subject: Re: [PATCH v1] RISC-V: Remove arch and abi option for run test case.

LGTM


juzhe.zh...@rivai.ai

From: pan2.li
Date: 2023-09-22 11:39
To: gcc-patches
CC: juzhe.zhong; 
pan2.li; 
yanzhang.wang; 
kito.cheng
Subject: [PATCH v1] RISC-V: Remove arch and abi option for run test case.
From: Pan Li <pan2...@intel.com>

Remove the -march and -mabi.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/math-ceil-run-0.c: Remove arch and abi.
* gcc.target/riscv/rvv/autovec/math-ceil-run-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/math-ceil-run-2.c: Ditto.

Signed-off-by: Pan Li <pan2...@intel.com>
---
gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c | 2 +-
gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c | 2 +-
gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c
index f1946e197cc..67462154018 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-0.c
@@ -1,5 +1,5 @@
/* { dg-do run { target { riscv_vector } } } */
-/* { dg-additional-options "-march=rv64gcv_zvfh -std=c2x -mabi=lp64d -O3 
-ftree-vectorize -fno-vect-cost-model -ffast-math" } */
+/* { dg-additional-options "-std=c2x -O3 -ftree-vectorize -fno-vect-cost-model 
-ffast-math" } */
#include "test-math.h"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c
index 202944ddd92..38adff16df9 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-1.c
@@ -1,5 +1,5 @@
/* { dg-do run { target { riscv_vector } } } */
-/* { dg-additional-options "-march=rv64gcv -std=c99 -mabi=lp64d -O3 
-ftree-vectorize -fno-vect-cost-model -ffast-math" } */
+/* { dg-additional-options "-std=c99 -O3 -ftree-vectorize -fno-vect-cost-model 
-ffast-math" } */
#include "test-math.h"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c
index f0ff9bca0af..6f22842ebdb 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/math-ceil-run-2.c
@@ -1,5 +1,5 @@
/* { dg-do run { target { riscv_vector } } } */
-/* { dg-additional-options "-march=rv64gcv -std=c99 -mabi=lp64d -O3 
-ftree-vectorize -fno-vect-cost-model -ffast-math" } */
+/* { dg-additional-options "-std=c99 -O3 -ftree-vectorize -fno-vect-cost-model 
-ffast-math" } */
#include "test-math.h"
--
2.34.1




Re: [PATCH v1] RISC-V: Rename the test macro for math autovec test

2023-09-21 Thread juzhe.zhong
ok

 Replied Message 
From: pan2...@intel.com
Date: 09/22/2023 11:47
To: gcc-patches@gcc.gnu.org
Cc: juzhe.zh...@rivai.ai, pan2...@intel.com, yanzhang.w...@intel.com, kito.ch...@gmail.com
Subject: [PATCH v1] RISC-V: Rename the test macro for math autovec test


Re: Re: [PATCH V2] RISC-V: Optimization of vrgather.vv into vrgatherei16.vv[PR111451]

2023-09-21 Thread Li Xu
Committed, thanks Juzhe.
--
Li Xu
>LGTM. You can commit it after you pass the regression.
>
>
>
>juzhe.zh...@rivai.ai
>
>From: Li Xu
>Date: 2023-09-22 10:37
>To: gcc-patches
>CC: kito.cheng; palmer; juzhe.zhong; xuli
>Subject: [PATCH V2] RISC-V: Optimization of vrgather.vv into 
>vrgatherei16.vv[PR111451]
>From: xuli 
>
>Consider this following case:
>
>typedef int32_t vnx32si __attribute__ ((vector_size (128)));
>
>  __attribute__ ((noipa)) void permute_##TYPE (TYPE values1, TYPE values2, 
>\
>   TYPE *out)  \
>  {    
>\
>    TYPE v 
>\
>  = __builtin_shufflevector (values1, values2, MASK_##NUNITS (0, NUNITS)); 
>\
>    *(TYPE *) out = v; 
>\
>  }
>
>  T (vnx32si, 32)  
>\
>
>TEST_ALL (PERMUTE)
>
>Before this patch:
>  li a4,31
>  vsetvli a5,zero,e32,m8,ta,ma
>  vl8re32.v v24,0(a0)
>  vid.v v8
>  vrsub.vx v8,v8,a4
>  vrgather.vv v16,v24,v8
>  vs8r.v v16,0(a2)
>  ret
>
>The index vector register "v8" occupies 8 registers.
>We should optimize it into vrgatherei16.vv which is
>using int16 as the index elements.
>
>After this patch:
>  vsetvli a5,zero,e16,m4,ta,ma
>  li a4,31
>  vid.v v4
>  vl8re32.v v16,0(a0)
>  vrsub.vx v4,v4,a4
>  vsetvli zero,zero,e32,m8,ta,ma
>  vrgatherei16.vv v8,v16,v4
>  vs8r.v v8,0(a2)
>  ret
>With vrgatherei16.vv, the index vector (now v4) occupies 4 registers
>instead of 8, which lowers register consumption and register pressure.
>
>PR target/111451
>
>gcc/ChangeLog:
>
>* config/riscv/riscv-v.cc (emit_vlmax_gather_insn): Optimization of 
>vrgather.vv into vrgatherei16.vv.
>
>gcc/testsuite/ChangeLog:
>
>* gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c: Adjust case.
>* gcc.target/riscv/rvv/autovec/vls/perm-4.c: Ditto.
>---
>gcc/config/riscv/riscv-v.cc    | 18 ++
>.../riscv/rvv/autovec/vls-vlmax/perm-4.c   |  3 ++-
>.../gcc.target/riscv/rvv/autovec/vls/perm-4.c  |  3 ++-
>3 files changed, 22 insertions(+), 2 deletions(-)
>
>diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
>index 64a71a128d4..455efa7ea8a 100644
>--- a/gcc/config/riscv/riscv-v.cc
>+++ b/gcc/config/riscv/riscv-v.cc
>@@ -790,6 +790,24 @@ emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
>   icode = code_for_pred_gather_scalar (data_mode);
>   sel = elt;
> }
>+  else if (CONST_VECTOR_P (sel)
>+   && GET_MODE_BITSIZE (GET_MODE_INNER (sel_mode)) > 16
>+   && riscv_get_v_regno_alignment (data_mode) > 1)
>+    {
>+  /* If the inner mode of data is not QI or HI and data_lmul > 1,
>+ emitting vrgatherei16.vv instruction will lower register
>+ pressure.
>+ data_mode  sel_mode  ei16
>+ RVVM1QI    RVVM1QI   RVVM2HI  not needed
>+ RVVM2QI    RVVM2QI   RVVM4HI  not needed
>+ RVVM2HI    RVVM2HI   RVVM2HI  not needed
>+ RVVM2SI    RVVM2SI   RVVM1HI  need
>+ RVVM4SI    RVVM4SI   RVVM2HI  need
>+ RVVM8DI    RVVM8DI   RVVM2HI  need */
>+  PUT_MODE (sel, get_vector_mode (HImode,
>+    GET_MODE_NUNITS (data_mode)).require ());
>+  icode = code_for_pred_gatherei16 (data_mode);
>+    }
>   else
> icode = code_for_pred_gather (data_mode);
>   rtx ops[] = {target, op, sel};
>diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c 
>b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
>index 9df69a0cc2c..7ab31043547 100644
>--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
>+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
>@@ -55,6 +55,7 @@
>TEST_ALL (PERMUTE)
>-/* { dg-final { scan-assembler-times 
>{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 31 } } */
>+/* { dg-final { scan-assembler-times 
>{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
>+/* { dg-final { scan-assembler-times 
>{vrgatherei16\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 12 } } */
>/* { dg-final { scan-assembler-times {vrsub\.vi} 24 } } */
>/* { dg-final { scan-assembler-times {vrsub\.vx} 7 } } */
>diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c 
>b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
>index 46cad8ea2f4..4d6862cf1c0 100644
>--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
>+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
>@@ -3,6 +3,7 @@
>#include "../vls-vlmax/perm-4.c"
>-/* { dg-final { scan-assembler-times 
>{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 31 } } */
>+/* { dg-final { scan-assembler-times 
>{vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
>+/* { dg-final { scan-assembler-times 
>{vrgatherei16\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 12 } } */
>/* { dg-final { scan-assembler-times {vrsub\.vi} 24 } } */
>/* { dg-final { scan-assembler-times {vrsub\.vx} 7 } } */
>--
>2.

RE: [PATCH v1] RISC-V: Rename the test macro for math autovec test

2023-09-21 Thread Li, Pan2
Committed, thanks Juzhe.

Pan

From: juzhe.zhong 
Sent: Friday, September 22, 2023 12:18 PM
To: Li, Pan2 
Cc: gcc-patches@gcc.gnu.org; Li, Pan2 ; Wang, Yanzhang 
; kito.ch...@gmail.com
Subject: Re: [PATCH v1] RISC-V: Rename the test macro for math autovec test

ok
 Replied Message 
From
pan2...@intel.com
Date
09/22/2023 11:47
To
gcc-patches@gcc.gnu.org
Cc
juzhe.zh...@rivai.ai,
pan2...@intel.com,
yanzhang.w...@intel.com,
kito.ch...@gmail.com
Subject
[PATCH v1] RISC-V: Rename the test macro for math autovec test



[PATCH v1] RISC-V: Support FP floor auto-vectorization

2023-09-21 Thread pan2 . li
From: Pan Li 

This patch would like to support auto-vectorization for the
floor API in math.h. It depends on the -ffast-math option.

When we would like to call floor/floorf like v2 = floor (v1), we will
convert it into below insns (reference the implementation of llvm).

* vfcvt.x.f v3, v1, RDN
* vfcvt.f.x v2, v3

However, a floating point value does not need the cvt above if it has no
fractional part. For example, consider the single precision floating point
values below.

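  +-----------+---------------+-------------+
  | raw float | binary layout | after floor |
  +-----------+---------------+-------------+
  | 8388607.5 | 0x4affffff    | 8388607.0   |
  | 8388608.0 | 0x4b000000    | 8388608.0   |
  | 8388609.0 | 0x4b000001    | 8388609.0   |
  +-----------+---------------+-------------+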

All single precision floating point values greater than or equal to
8388608.0 (2^23) have no fractional part. We leverage vmflt and a mask to
filter them out of the vector and only do the cvt on the elements selected
by the mask.

Before this patch:
math-floor-1.c:21:1: missed: couldn't vectorize loop
  ...
.L3:
  flw     fa0,0(s0)
  addi    s0,s0,4
  addi    s1,s1,4
  call    floorf
  fsw     fa0,-4(s1)
  bne     s0,s2,.L3

After this patch:
  ...
  fsrmi   2   // Rounding Down
.L4:
  vfabs.v     v0,v1
  vmv1r.v     v2,v1
  vmflt.vv    v0,v0,v4
  sub         a3,a3,a4
  vfcvt.x.f.v v3,v1,v0.t
  vfcvt.f.x.v v2,v3,v0.t
  vfsgnj.vv   v2,v2,v1
  bne         .L4
.L14:
  fsrm        a6
  ret

Please note VLS mode is also involved in this patch and covered by the
test cases.

gcc/ChangeLog:

* config/riscv/autovec.md (floor2): New pattern.
* config/riscv/riscv-protos.h (enum insn_flags): New enum type.
(enum insn_type): Ditto.
(expand_vec_floor): New function decl.
* config/riscv/riscv-v.cc (gen_floor_const_fp): New function impl.
(expand_vec_floor): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/math-floor-0.c: New test.
* gcc.target/riscv/rvv/autovec/math-floor-1.c: New test.
* gcc.target/riscv/rvv/autovec/math-floor-2.c: New test.
* gcc.target/riscv/rvv/autovec/math-floor-3.c: New test.
* gcc.target/riscv/rvv/autovec/math-floor-run-0.c: New test.
* gcc.target/riscv/rvv/autovec/math-floor-run-1.c: New test.
* gcc.target/riscv/rvv/autovec/math-floor-run-2.c: New test.
* gcc.target/riscv/rvv/autovec/vls/math-floor-1.c: New test.

Signed-off-by: Pan Li 
---
 gcc/config/riscv/autovec.md   | 11 
 gcc/config/riscv/riscv-protos.h   |  5 ++
 gcc/config/riscv/riscv-v.cc   | 36 +++-
 .../riscv/rvv/autovec/math-floor-0.c  | 26 +
 .../riscv/rvv/autovec/math-floor-1.c  | 26 +
 .../riscv/rvv/autovec/math-floor-2.c  | 26 +
 .../riscv/rvv/autovec/math-floor-3.c  | 28 ++
 .../riscv/rvv/autovec/math-floor-run-0.c  | 39 +
 .../riscv/rvv/autovec/math-floor-run-1.c  | 39 +
 .../riscv/rvv/autovec/math-floor-run-2.c  | 39 +
 .../riscv/rvv/autovec/vls/math-floor-1.c  | 56 +++
 11 files changed, 329 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-floor-0.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-floor-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-floor-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/math-floor-3.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/math-floor-run-0.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/math-floor-run-1.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/math-floor-run-2.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/math-floor-1.c

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index b92cb7a5d0f..9ba20e27cf1 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2245,6 +2245,7 @@ (define_expand "avg3_ceil"
 ;; -
 ;; Includes:
 ;; - ceil/ceilf
+;; - floor/floorf
 ;; -
 (define_expand "ceil2"
   [(match_operand:V_VLSF 0 "register_operand")
@@ -2255,3 +2256,13 @@ (define_expand "ceil2"
 DONE;
   }
 )
+
+(define_expand "floor2"
+  [(match_operand:V_VLSF 0 "register_operand")
+   (match_operand:V_VLSF 1 "register_operand")]
+  "TARGET_VECTOR && !flag_trapping_math && !flag_rounding_math"
+  {
+riscv_vector::expand_vec_floor (operands[0], operands[1], mode, 
mode);
+DONE;
+  }
+)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 07b4ffe3edf..04e26c957d7 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -253,6 +253,9 @@ enum insn_flags : unsigned int
 
   /* Means INSN has FRM operand and the value is FRM_RUP.  */
   FRM_RUP_P = 1 << 16