Re: [PATCH v2 2/2] LoongArch: When the code model is extreme, the symbol address is obtained through macro instructions regardless of the value of -mexplicit-relocs.

2024-01-05 Thread Xi Ruoyao
On Fri, 2024-01-05 at 11:40 +0800, Lulu Cheng wrote:
>  bool
>  loongarch_explicit_relocs_p (enum loongarch_symbol_type type)
>  {
> +  /* Instructions pcalau12i, addi.d, lu32i.d and lu52i.d must be adjacent
> + so that the linker can infer the PC of pcalau12i to apply relocations
> + to lu32i.d and lu52i.d.  Otherwise, the results would be incorrect if
> + these four instructions are not in the same 4KiB page.
> + Therefore, macro instructions are used when cmodel=extreme.  */
> +  if (loongarch_symbol_extreme_p (type))
> +    return false;

I think this is a bit strange.  With -mexplicit-relocs={auto,always}
we should still use explicit relocs, but coding all 4 instructions
altogether as

"pcalau12i.d\t%1,%pc64_hi12(%2)\n\taddi.d\t%0,$r0,%pclo12(%2)\n\tlu32i.d\t%0,%pc64_lo20(%2)\n\tlu52i.d\t%0,%0,%pc64_hi12(%2)"

Give me several hours to try to implement this...
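
For context, the relocation arithmetic at play can be sketched as follows
(illustrative only, with simplified field extraction; the exact encodings
are defined by the LoongArch psABI):

// All four fields are derived from the page of the pcalau12i's own PC.
// If lu32i.d/lu52i.d are relocated against a different PC (because the
// instructions drifted apart across a 4 KiB page boundary), the
// reconstructed address is off by one page.
#include <cstdint>

void split_fields (uint64_t pc, uint64_t sym)
{
  uint64_t delta = (sym & ~0xfffULL) - (pc & ~0xfffULL); // page delta
  uint32_t pc_hi20   = (delta >> 12) & 0xfffff;  // pcalau12i
  uint32_t pc_lo12   = sym & 0xfff;              // addi.d
  uint32_t pc64_lo20 = (delta >> 32) & 0xfffff;  // lu32i.d
  uint32_t pc64_hi12 = (delta >> 52) & 0xfff;    // lu52i.d
  (void) pc_hi20; (void) pc_lo12; (void) pc64_lo20; (void) pc64_hi12;
}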

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University


[committed] RISC-V: Clean up testsuite for multi-lib testing [NFC]

2024-01-05 Thread Kito Cheng
- Drop unnecessary includes of stdlib.h and math.h
- Drop assert.h / assert, use __builtin_abort instead.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/binop/shift-scalar-template.h:
Use __builtin_abort instead of assert.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax-1.c: Drop math.h.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmin-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmin-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmin-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmin-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax_zvfh-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax_zvfh-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax_zvfh-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax_zvfh-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmin_zvfh-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmin_zvfh-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmin_zvfh-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmin_zvfh-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/partial/single_rgroup-2.h: Use
__builtin_abort instead of assert.
* gcc.target/riscv/rvv/autovec/pr112694-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/partial/single_rgroup-3.h: Ditto.
* gcc.target/riscv/rvv/autovec/unop/abs-template.h: Drop stdlib.h.
* gcc.target/riscv/rvv/autovec/unop/vneg-template.h: Ditto.
* gcc.target/riscv/rvv/autovec/unop/vnot-template.h: Ditto.
---
 .../rvv/autovec/binop/shift-scalar-template.h |  5 +-
 .../riscv/rvv/autovec/cond/cond_fmax-1.c  |  1 -
 .../riscv/rvv/autovec/cond/cond_fmax-2.c  |  1 -
 .../riscv/rvv/autovec/cond/cond_fmax-3.c  |  1 -
 .../riscv/rvv/autovec/cond/cond_fmax-4.c  |  1 -
 .../riscv/rvv/autovec/cond/cond_fmax_zvfh-1.c |  1 -
 .../riscv/rvv/autovec/cond/cond_fmax_zvfh-2.c |  1 -
 .../riscv/rvv/autovec/cond/cond_fmax_zvfh-3.c |  1 -
 .../riscv/rvv/autovec/cond/cond_fmax_zvfh-4.c |  1 -
 .../riscv/rvv/autovec/cond/cond_fmin-1.c  |  1 -
 .../riscv/rvv/autovec/cond/cond_fmin-2.c  |  1 -
 .../riscv/rvv/autovec/cond/cond_fmin-3.c  |  1 -
 .../riscv/rvv/autovec/cond/cond_fmin-4.c  |  1 -
 .../riscv/rvv/autovec/cond/cond_fmin_zvfh-1.c |  1 -
 .../riscv/rvv/autovec/cond/cond_fmin_zvfh-2.c |  1 -
 .../riscv/rvv/autovec/cond/cond_fmin_zvfh-3.c |  1 -
 .../riscv/rvv/autovec/cond/cond_fmin_zvfh-4.c |  1 -
 .../rvv/autovec/partial/single_rgroup-2.h |  9 ++-
 .../rvv/autovec/partial/single_rgroup-3.h | 73 ++-
 .../gcc.target/riscv/rvv/autovec/pr112694-1.c |  4 +-
 .../riscv/rvv/autovec/unop/abs-template.h |  1 -
 .../riscv/rvv/autovec/unop/vneg-template.h|  1 -
 .../riscv/rvv/autovec/unop/vnot-template.h|  1 -
 23 files changed, 63 insertions(+), 47 deletions(-)

diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-scalar-template.h 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-scalar-template.h
index 8d1cefdca85..2cf645af26e 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-scalar-template.h
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-scalar-template.h
@@ -3,7 +3,6 @@
 /* { dg-additional-options "-std=c99 --param=riscv-autovec-preference=scalable 
-fno-vect-cost-model --save-temps" } */
 
 #include <stdint.h>
-#include <assert.h>
 
 #define SHIFTL(TYPE,VAL)   \
   __attribute__ ((noipa))   \
@@ -64,7 +63,7 @@ TEST_ALL()
 a##TYPE##VAL[i] = 2;   \
   vsll_##TYPE_##VAL (a##TYPE##VAL, SZ);\
   for (int i = 0; i < SZ; i++) \
-assert (a##TYPE##VAL[i] == (2ll << VAL));
+if (a##TYPE##VAL[i] != (2ll << VAL)) __builtin_abort ();
 
 __attribute__((noipa))
 void vsllvx (uint32_t *dst, int val, int n)
@@ -79,7 +78,7 @@ void vsllvx (uint32_t *dst, int val, int n)
 a[i] = 2;  \
   vsllvx (a, 17, SZ);  \
   for (int i = 0; i < SZ; i++) \
-assert (a[i] == (2 << 17));
+if (a[i] != (2 << 17)) __builtin_abort ();
 
 int main ()
 {
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_fmax-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_fmax-1.c
index 25c35cf0607..fedee13aab8 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_fmax-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_fmax-1.c
@@ -2,7 +2,6 @@
 /* { dg-additional-options "-march=rv32gcv_zvfh -mabi=ilp32d 
--param=riscv-autovec-preference=scalable -fno-vect-cost-model 
-fno-signaling-nans" } */
 
 #include <stdint.h>
-#include <math.h>
 
 #ifndef FN
 #define FN(X) __builtin_fmax##X
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_fmax-2.c 

[committed] RISC-V: Clean up unused variable [NFC]

2024-01-05 Thread Kito Cheng
gcc/ChangeLog:

* config/riscv/riscv-v.cc (expand_load_store):
Remove `value`.
(expand_cond_len_op): Ditto.
(expand_gather_scatter): Ditto.
(expand_lanes_load_store): Ditto.
(expand_fold_extract_last): Ditto.
---
 gcc/config/riscv/riscv-v.cc | 5 -
 1 file changed, 5 deletions(-)

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index b7727b2b3e6..ec859645415 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3780,7 +3780,6 @@ expand_select_vl (rtx *ops)
 void
 expand_load_store (rtx *ops, bool is_load)
 {
-  poly_int64 value;
   rtx mask = ops[2];
   rtx len = ops[3];
   machine_mode mode = GET_MODE (ops[0]);
@@ -3849,7 +3848,6 @@ expand_cond_len_op (unsigned icode, insn_flags op_type, 
rtx *ops, rtx len)
   rtx mask = ops[1];
   machine_mode mode = GET_MODE (dest);
   machine_mode mask_mode = GET_MODE (mask);
-  poly_int64 value;
   bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode));
   bool is_vlmax_len = is_vlmax_len_p (mode, len);
 
@@ -4025,7 +4023,6 @@ expand_gather_scatter (rtx *ops, bool is_load)
   scalar_mode inner_idx_mode = GET_MODE_INNER (idx_mode);
   unsigned inner_offsize = GET_MODE_BITSIZE (inner_idx_mode);
   poly_int64 nunits = GET_MODE_NUNITS (vec_mode);
-  poly_int64 value;
   bool is_vlmax = is_vlmax_len_p (vec_mode, len);
 
   /* Extend the offset element to address width.  */
@@ -4206,7 +4203,6 @@ prepare_ternary_operands (rtx *ops)
 void
 expand_lanes_load_store (rtx *ops, bool is_load)
 {
-  poly_int64 value;
   rtx mask = ops[2];
   rtx len = ops[3];
   rtx addr = is_load ? XEXP (ops[1], 0) : XEXP (ops[0], 0);
@@ -4259,7 +4255,6 @@ expand_fold_extract_last (rtx *ops)
   rtx else_label = gen_label_rtx ();
   rtx end_label = gen_label_rtx ();
   rtx len = ops[4];
-  poly_int64 value;
   machine_mode mode = GET_MODE (vect);
   machine_mode mask_mode = GET_MODE (mask);
   rtx compress_vect = gen_reg_rtx (mode);
-- 
2.34.1



Re: [PATCH v2 2/2] LoongArch: When the code model is extreme, the symbol address is obtained through macro instructions regardless of the value of -mexplicit-relocs.

2024-01-05 Thread chenglulu



On 2024/1/5 at 4:37 PM, Xi Ruoyao wrote:

On Fri, 2024-01-05 at 11:40 +0800, Lulu Cheng wrote:

  bool
  loongarch_explicit_relocs_p (enum loongarch_symbol_type type)
  {
+  /* Instructions pcalau12i, addi.d, lu32i.d and lu52i.d must be adjacent
+ so that the linker can infer the PC of pcalau12i to apply relocations
+ to lu32i.d and lu52i.d.  Otherwise, the results would be incorrect if
+ these four instructions are not in the same 4KiB page.
+ Therefore, macro instructions are used when cmodel=extreme.  */
+  if (loongarch_symbol_extreme_p (type))
+    return false;

I think this is a bit strange.  With -mexplicit-relocs={auto,always}
we should still use explicit relocs, but coding all 4 instructions
altogether as

"pcalau12i.d\t%1,%pc64_hi12(%2)\n\taddi.d\t%0,$r0,%pclo12(%2)\n\tlu32i.d\t%0,%pc64_lo20(%2)\n\tlu52i.d\t%0,%0,%pc64_hi12(%2)"

Give me several hours to try to implement this...


You mean to take the last add instruction out separately?



[PATCH v7 1/2] RISC-V: Add crypto vector builtin function.

2024-01-05 Thread Feng Wang
Patch v7: Fix avl_type operand index of zvbc ins.
Patch v6: Remove unused code.
Patch v5: Rebase.
Patch v4: Merge crypto vector function.def into vector.
Patch v3: Define a shape for vaesz and merge vector-crypto-types.def
 into riscv-vector-builtins-types.def.
Patch v2: Optimize function_shape class for crypto_vector.

This patch adds the intrinsic functions of the crypto vector extension based
on the intrinsic doc (https://github.com/riscv-non-isa/rvv-intrinsic-doc/blob
/eopc/vector-crypto/auto-generated/vector-crypto/intrinsic_funcs.md).

Co-Authored by: Songhe Zhu 
Co-Authored by: Ciyan Pan 
gcc/ChangeLog:

* config/riscv/riscv-vector-builtins-bases.cc (class vandn):
Add new function_base for crypto vector.
(class bitmanip): Ditto.
(class b_reverse):Ditto.
(class vwsll):   Ditto.
(class clmul):   Ditto.
(class vg_nhab):  Ditto.
(class crypto_vv):Ditto.
(class crypto_vi):Ditto.
(class vaeskf2_vsm3c):Ditto.
(class vsm3me): Ditto.
(BASE): Add BASE declaration for crypto vector.
* config/riscv/riscv-vector-builtins-bases.h: Ditto.
* config/riscv/riscv-vector-builtins-functions.def 
(REQUIRED_EXTENSIONS):
Add crypto vector intrinsic definition.
(vbrev): Ditto.
(vclz): Ditto.
(vctz): Ditto.
(vwsll): Ditto.
(vandn): Ditto.
(vbrev8): Ditto.
(vrev8): Ditto.
(vrol): Ditto.
(vror): Ditto.
(vclmul): Ditto.
(vclmulh): Ditto.
(vghsh): Ditto.
(vgmul): Ditto.
(vaesef): Ditto.
(vaesem): Ditto.
(vaesdf): Ditto.
(vaesdm): Ditto.
(vaesz): Ditto.
(vaeskf1): Ditto.
(vaeskf2): Ditto.
(vsha2ms): Ditto.
(vsha2ch): Ditto.
(vsha2cl): Ditto.
(vsm4k): Ditto.
(vsm4r): Ditto.
(vsm3me): Ditto.
(vsm3c): Ditto.
* config/riscv/riscv-vector-builtins-shapes.cc (struct crypto_vv_def):
Add new function_shape for crypto vector.
(struct crypto_vi_def): Ditto.
(struct crypto_vv_no_op_type_def): Ditto.
(SHAPE): Add SHAPE declaration of crypto vector.
* config/riscv/riscv-vector-builtins-shapes.h: Ditto.
* config/riscv/riscv-vector-builtins-types.def 
(DEF_RVV_CRYPTO_SEW32_OPS):
Add new data type for crypto vector.
(DEF_RVV_CRYPTO_SEW64_OPS): Ditto.
(vuint32mf2_t): Ditto.
(vuint32m1_t): Ditto.
(vuint32m2_t): Ditto.
(vuint32m4_t): Ditto.
(vuint32m8_t): Ditto.
(vuint64m1_t): Ditto.
(vuint64m2_t): Ditto.
(vuint64m4_t): Ditto.
(vuint64m8_t): Ditto.
* config/riscv/riscv-vector-builtins.cc (DEF_RVV_CRYPTO_SEW32_OPS):
Add new data struct for crypto vector.
(DEF_RVV_CRYPTO_SEW64_OPS): Ditto.
(registered_function::overloaded_hash): Processing size_t uimm for C 
overloaded func.
* config/riscv/riscv-vector-builtins.def (vi): Add vi OP_TYPE.
* config/riscv/vector.md: Modify avl_type operand index of zvbc ins.
---
 .../riscv/riscv-vector-builtins-bases.cc  | 264 +-
 .../riscv/riscv-vector-builtins-bases.h   |  28 ++
 .../riscv/riscv-vector-builtins-functions.def |  94 +++
 .../riscv/riscv-vector-builtins-shapes.cc |  87 +-
 .../riscv/riscv-vector-builtins-shapes.h  |   4 +
 .../riscv/riscv-vector-builtins-types.def |  25 ++
 gcc/config/riscv/riscv-vector-builtins.cc | 133 -
 gcc/config/riscv/riscv-vector-builtins.def|   1 +
 gcc/config/riscv/vector.md|   4 +-
 9 files changed, 635 insertions(+), 5 deletions(-)

diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc 
b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index 810783bff4c..fba98124473 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -2127,6 +2127,212 @@ public:
   }
 };
 
+/* Below implements are vector crypto */
+/* Implements vandn.[vv,vx] */
+class vandn : public function_base
+{
+public:
+  rtx expand (function_expander &e) const override
+  {
+switch (e.op_info->op)
+  {
+  case OP_TYPE_vv:
+return e.use_exact_insn (code_for_pred_vandn (e.vector_mode ()));
+  case OP_TYPE_vx:
+return e.use_exact_insn (code_for_pred_vandn_scalar (e.vector_mode 
()));
+  default:
+gcc_unreachable ();
+  }
+  }
+};
+
+/* Implements vrol/vror/clz/ctz.  */
+template<rtx_code CODE>
+class bitmanip : public function_base
+{
+public:
+  bool apply_tail_policy_p () const override
+  {
+return (CODE == CLZ || CODE == CTZ) ? false : true;
+  }
+  bool apply_mask_policy_p () const override
+  {
+return (CODE == CLZ || CODE == CTZ) ? false : true;
+  }
+  bool has_merge_operand_p () const override
+  {
+

Re: [PATCH v7 1/2] RISC-V: Add crypto vector builtin function.

2024-01-05 Thread juzhe.zh...@rivai.ai
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -864,9 +864,9 @@
  vnclip,vicmp,vfalu,vfmul,vfminmax,vfdiv,vfwalu,vfwmul,\
  vfsgnj,vfcmp,vslideup,vslidedown,vislide1up,\
  vislide1down,vfslide1up,vfslide1down,vgather,viwmuladd,vfwmuladd,\
-   vlsegds,vlsegdux,vlsegdox,vandn,vrol,vror,vwsll")
+   vlsegds,vlsegdux,vlsegdox,vandn,vrol,vror,vclmul,vclmulh,vwsll")
   (const_int 8)
- (eq_attr "type" "vstux,vstox,vssegts,vssegtux,vssegtox,vclmul,vclmulh")
+ (eq_attr "type" "vstux,vstox,vssegts,vssegtux,vssegtox")
   (const_int 5)


Ah, I knew something would go wrong in case of attribute bugs.

I think this should be a separate patch: "Fix vlmax type attribute bugs
of vclmul and vclmulh instructions".



juzhe.zh...@rivai.ai
 

Re: Re: [PATCH v7 1/2] RISC-V: Add crypto vector builtin function.

2024-01-05 Thread Feng Wang
2024-01-05 16:55 juzhe.zhong  wrote:

>--- a/gcc/config/riscv/vector.md
>+++ b/gcc/config/riscv/vector.md
>@@ -864,9 +864,9 @@
>  vnclip,vicmp,vfalu,vfmul,vfminmax,vfdiv,vfwalu,vfwmul,\
>  vfsgnj,vfcmp,vslideup,vslidedown,vislide1up,\
>  vislide1down,vfslide1up,vfslide1down,vgather,viwmuladd,vfwmuladd,\
>-   vlsegds,vlsegdux,vlsegdox,vandn,vrol,vror,vwsll")
>+   vlsegds,vlsegdux,vlsegdox,vandn,vrol,vror,vclmul,vclmulh,vwsll")
>   (const_int 8)
>- (eq_attr "type" "vstux,vstox,vssegts,vssegtux,vssegtox,vclmul,vclmulh")
>+ (eq_attr "type" "vstux,vstox,vssegts,vssegtux,vssegtox")
>   (const_int 5)
>
>Ah, I knew something would go wrong in case of attribute bugs.
>
>I think this should be a separate patch: "Fix vlmax type attribute bugs
>of vclmul and vclmulh instructions".
>
>juzhe.zh...@rivai.ai

OK. Will separate it.

[PATCH] RISC-V: Fix avl-type operand index error for ZVBC

2024-01-05 Thread Feng Wang
This patch fixes an RTL-checking error for the crypto vector patch.  The
root cause is that the avl_type operand index of the ZVBC insns is wrong:
it should be operand[8], not operand[5].
gcc/ChangeLog:

* config/riscv/vector.md: Modify avl_type operand index of zvbc ins.
---
 gcc/config/riscv/vector.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 3d2c1c3ce8f..e99a312b5b6 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -864,9 +864,9 @@
  vnclip,vicmp,vfalu,vfmul,vfminmax,vfdiv,vfwalu,vfwmul,\
  vfsgnj,vfcmp,vslideup,vslidedown,vislide1up,\
  vislide1down,vfslide1up,vfslide1down,vgather,viwmuladd,vfwmuladd,\
-   vlsegds,vlsegdux,vlsegdox,vandn,vrol,vror,vwsll")
+   vlsegds,vlsegdux,vlsegdox,vandn,vrol,vror,vclmul,vclmulh,vwsll")
   (const_int 8)
- (eq_attr "type" "vstux,vstox,vssegts,vssegtux,vssegtox,vclmul,vclmulh")
+ (eq_attr "type" "vstux,vstox,vssegts,vssegtux,vssegtox")
   (const_int 5)
 
 (eq_attr "type" "vimuladd,vfmuladd")
-- 
2.17.1



Re: [PATCH] RISC-V: Fix avl-type operand index error for ZVBC

2024-01-05 Thread juzhe.zh...@rivai.ai
LGTM.



juzhe.zh...@rivai.ai
 
 
 


Re: [PATCH] RISC-V: Teach liveness computation loop invariant shift amount[Dynamic LMUL]

2024-01-05 Thread Robin Dapp
> 1). We not only have vashl_optab,vashr_optab,vlshr_optab which vectorize 
> shift with vector shift amount,
> that is, vectorization of 'a[i] >> x[i]', the shift amount is loop variant.
> 2). But also, we have ashl_optab, ashr_optab, lshr_optab which can vectorize 
> shift with scalar shift amount,
> that is, vectorization of 'a[i] >> x', the shift amount is loop invariant.
> 

> +static bool
> +loop_invariant_op_p (class loop *loop,
> +  tree op)
> +{
> +  if (is_gimple_min_invariant (op))
> +return true;
> +  if (SSA_NAME_IS_DEFAULT_DEF (op)
> +  || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (op))))
> +return true;
> +  return gimple_uid (SSA_NAME_DEF_STMT (op)) & 1;
> +}
> +

Looks like this is straight from tree-ssa-loop-ch.  Do we need
is_gimple_min_invariant (is_gimple_constant could be sufficient?)
and DEFAULT_DEF for our case?  The rhs of a shift should never contain
a default def?

I'm not entirely happy about the "loop invariant" heuristic/proxy
of the shift amount being vectorizable.  That seems like something
that could bite us in the future in case we do slp-like vectorization
on loop-invariant (but varying) data.

As it helps for now and is not a correctness issue I'd still tend to
go forward with it.
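
As a concrete illustration of the two cases juzhe describes (a sketch, not
from the patch):

// Case 1: loop-variant shift amount -> vashl/vashr/vlshr optabs.
void shift_variant (int *a, int *x, int n)
{
  for (int i = 0; i < n; i++)
    a[i] >>= x[i];
}

// Case 2: loop-invariant shift amount -> ashl/ashr/lshr optabs.
void shift_invariant (int *a, int x, int n)
{
  for (int i = 0; i < n; i++)
    a[i] >>= x;
}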

Regards
 Robin


Re: [PATCH v2 2/2] LoongArch: When the code model is extreme, the symbol address is obtained through macro instructions regardless of the value of -mexplicit-relocs.

2024-01-05 Thread chenglulu



On 2024/1/5 at 4:37 PM, Xi Ruoyao wrote:

On Fri, 2024-01-05 at 11:40 +0800, Lulu Cheng wrote:

  bool
  loongarch_explicit_relocs_p (enum loongarch_symbol_type type)
  {
+  /* Instructions pcalau12i, addi.d, lu32i.d and lu52i.d must be adjacent
+ so that the linker can infer the PC of pcalau12i to apply relocations
+ to lu32i.d and lu52i.d.  Otherwise, the results would be incorrect if
+ these four instructions are not in the same 4KiB page.
+ Therefore, macro instructions are used when cmodel=extreme.  */
+  if (loongarch_symbol_extreme_p (type))
+    return false;

I think this is a bit strange.  With -mexplicit-relocs={auto,always}
we should still use explicit relocs, but coding all 4 instructions
altogether as

"pcalau12i.d\t%1,%pc64_hi12(%2)\n\taddi.d\t%0,$r0,%pclo12(%2)\n\tlu32i.d\t%0,%pc64_lo20(%2)\n\tlu52i.d\t%0,%0,%pc64_hi12(%2)"

Give me several hours to try to implement this...

I think there is no difference between macros and these instructions put
together. If we implement it in a split form, I think I can try it through
TARGET_SCHED_MACRO_FUSION_PAIR_P.
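
For reference, a minimal sketch of that hook (the fusion predicate is a
hypothetical helper, not from this thread): returning true asks the
scheduler to keep the two instructions adjacent.

static bool
loongarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
{
  /* Hypothetical predicate: recognize pcalau12i followed by the
     addi.d/lu32i.d/lu52i.d of an extreme-cmodel address sequence.  */
  return loongarch_extreme_addr_seq_p (prev, curr); /* assumed helper */
}

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P loongarch_macro_fusion_pair_p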




Re: Ping^3: [PATCH] Add a late-combine pass [PR106594]

2024-01-05 Thread YunQiang Su
I have tested this patch on mips64el: No regression.


[committed] libstdc++: Use if-constexpr in std::__try_use_facet [PR113099]

2024-01-05 Thread Jonathan Wakely
Tested x86_64-linux. Pushed to trunk.

-- >8 --

As noted in the PR, we can use if-constexpr for the explicit
instantantiation definitions that are compiled with -std=gnu++11. We
just need to disable the -Wc++17-extensions diagnostics.

libstdc++-v3/ChangeLog:

PR libstdc++/113099
* include/bits/locale_classes.tcc (__try_use_facet): Use
if-constexpr for C++11 and up.
---
 libstdc++-v3/include/bits/locale_classes.tcc | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/libstdc++-v3/include/bits/locale_classes.tcc 
b/libstdc++-v3/include/bits/locale_classes.tcc
index 2a6176fb4de..63097582dec 100644
--- a/libstdc++-v3/include/bits/locale_classes.tcc
+++ b/libstdc++-v3/include/bits/locale_classes.tcc
@@ -87,6 +87,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__s2.data(), __s2.data() + __s2.length()) < 0);
 }
 
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wc++17-extensions"
  template<typename _Facet>
 inline const _Facet*
 __try_use_facet(const locale& __loc) _GLIBCXX_NOTHROW
@@ -97,7 +99,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   // We know these standard facets are always installed in every locale
   // so dynamic_cast always succeeds, just use static_cast instead.
 #define _GLIBCXX_STD_FACET(...) \
-  if _GLIBCXX17_CONSTEXPR (__is_same(_Facet, __VA_ARGS__)) \
+  if _GLIBCXX_CONSTEXPR (__is_same(_Facet, __VA_ARGS__)) \
return static_cast<const _Facet*>(__facets[__i])
 
   _GLIBCXX_STD_FACET(ctype<char>);
@@ -145,6 +147,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   return static_cast<const _Facet*>(__facets[__i]);
 #endif
 }
+#pragma GCC diagnostic pop
 
   /**
*  @brief  Test for the presence of a facet.
-- 
2.43.0



Re: [PATCH v2 2/2] LoongArch: When the code model is extreme, the symbol address is obtained through macro instructions regardless of the value of -mexplicit-relocs.

2024-01-05 Thread Xi Ruoyao
On Fri, 2024-01-05 at 17:57 +0800, chenglulu wrote:
> 
> 在 2024/1/5 下午4:37, Xi Ruoyao 写道:
> > On Fri, 2024-01-05 at 11:40 +0800, Lulu Cheng wrote:
> > >   bool
> > >   loongarch_explicit_relocs_p (enum loongarch_symbol_type type)
> > >   {
> > > +  /* Instructions pcalau12i, addi.d, lu32i.d and lu52i.d must be 
> > > adjacent
> > > + so that the linker can infer the PC of pcalau12i to apply 
> > > relocations
> > > + to lu32i.d and lu52i.d.  Otherwise, the results would be incorrect 
> > > if
> > > + these four instructions are not in the same 4KiB page.
> > > + Therefore, macro instructions are used when cmodel=extreme.  */
> > > +  if (loongarch_symbol_extreme_p (type))
> > > +    return false;
> > I think this is a bit strange.  With -mexplicit-relocs={auto,always}
> > we should still use explicit relocs, but coding all 4 instructions
> > altogether as
> > 
> > "pcalau12i.d\t%1,%pc64_hi12(%2)\n\taddi.d\t%0,$r0,%pclo12(%2)\n\tlu32i.d\t%0,%pc64_lo20(%2)\n\tlu52i.d\t%0,%0,%pc64_hi12(%2)"
> > 
> > Give me several hours to try to implement this...
> > 
> I think there is no difference between macros and these instructions put
> together. If we implement it in a split form, I think I can try it through
> TARGET_SCHED_MACRO_FUSION_PAIR_P.

There is a difference:

int x;
int t() { return x; }

pcalau12i.d t0, %pc_hi20(x)
addi.d t1, r0, %pc_lo12(x)
lu32i.d t1, %pc64_lo20(x)
lu52i.d t1, t1, %pc64_hi12(x)
ldx.w a0, t0, t1

is slightly better than

pcalau12i.d t0, %pc_hi20(x)
addi.d t1, r0, %pc_lo12(x)
lu32i.d t1, %pc64_lo20(x)
lu52i.d t1, t1, %pc64_hi12(x)
add.d t0, t0, t1
ld.w a0, t0, 0

And generating macros when -mexplicit-relocs=always can puzzle people
(it says "always" :-\ ).

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University


[committed] libstdc++: Remove UB from month and weekday additions and subtractions.

2024-01-05 Thread Jonathan Wakely
From: Cassio Neri 

Tested x86_64-linux. Pushed to trunk.

This seems suitable for backporting too, at least to gcc-13.

-- >8 --

The following invoke signed integer overflow (UB) [1]:

  month   + months{MAX} // where MAX is the maximum value of months::rep
  month   + months{MIN} // where MIN is the minimum value of months::rep
  month   - months{MIN} // where MIN is the minimum value of months::rep
  weekday + days  {MAX} // where MAX is the maximum value of days::rep
  weekday - days  {MIN} // where MIN is the minimum value of days::rep

For the additions to MAX, the crux of the problem is that, in libstdc++,
months::rep and days::rep are int64_t. Other implementations use int32_t, cast
operands to int64_t and perform arithmetic operations without risk of
overflowing.

For month + months{MIN}, the implementation follows the Standard's "returns
clause" and evaluates:

   modulo(static_cast<long long>(unsigned{__x}) + (__y.count() - 1), 12);

Overflow occurs when MIN - 1 is evaluated. Casting to a larger type could help
but, unfortunately again, this is not possible for libstdc++.

For the subtraction of MIN, the problem is that -MIN is not representable.

It's fair to say that the intention is for these additions/subtractions to
be performed in modulus (12 or 7) arithmetic so that no overflow is expected.

To fix these UB, this patch implements:

  template <unsigned __d, typename _T>
  unsigned __add_modulo(unsigned __x, _T __y);

  template <unsigned __d, typename _T>
  unsigned __sub_modulo(unsigned __x, _T __y);

which, respectively, return the remainder of the Euclidean division of
__x + __y and __x - __y by __d without overflowing.  These functions replace

  constexpr unsigned __modulo(long long __n, unsigned __d);

which also calculates the remainder of __n, where __n is the result of the
addition or subtraction. Hence, these operations might invoke UB before __modulo
is called and thus, __modulo can't do anything to remediate the issue.

In addition to solve the UB issues, __add_modulo and __sub_modulo allow better
codegen (shorter and branchless) on x86-64 and ARM [2].

[1] https://godbolt.org/z/a9YfWdn57
[2] https://godbolt.org/z/Gh36cr7E4
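
For intuition, a deliberately simple reference implementation of the
operation these helpers compute (a sketch, not the libstdc++ code; per the
above, the in-tree version is additionally branchless):

#include <cstdint>

// Euclidean remainder of x + y mod d, overflow-free: reduce y into [0, d)
// first so that the addition stays small.
constexpr unsigned add_modulo_ref (unsigned x, std::int64_t y, unsigned d)
{
  std::int64_t r = y % static_cast<std::int64_t>(d); // in (-d, d), no UB
  unsigned ry = static_cast<unsigned>(r < 0 ? r + d : r);
  return (x + ry) % d;
}

// E.g. for months: add_modulo_ref(11, INT64_MIN, 12) == 3, with no overflow.
static_assert(add_modulo_ref(0, -9223372036854775807LL - 1, 12) == 4, "");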

libstdc++-v3/ChangeLog:

* include/std/chrono: Fix + and - for months and weekdays.
* testsuite/std/time/month/1.cc: Add constexpr tests against overflow.
* testsuite/std/time/month/2.cc: New test for extreme values.
* testsuite/std/time/weekday/1.cc: Add constexpr tests against overflow.
* testsuite/std/time/weekday/2.cc: New test for extreme values.
---
 libstdc++-v3/include/std/chrono  | 81 +---
 libstdc++-v3/testsuite/std/time/month/1.cc   | 19 +
 libstdc++-v3/testsuite/std/time/month/2.cc   | 32 
 libstdc++-v3/testsuite/std/time/weekday/1.cc | 16 +++-
 libstdc++-v3/testsuite/std/time/weekday/2.cc | 32 
 5 files changed, 152 insertions(+), 28 deletions(-)
 create mode 100644 libstdc++-v3/testsuite/std/time/month/2.cc
 create mode 100644 libstdc++-v3/testsuite/std/time/weekday/2.cc

diff --git a/libstdc++-v3/include/std/chrono b/libstdc++-v3/include/std/chrono
index b3ad2a0b1ac..a59af34567c 100644
--- a/libstdc++-v3/include/std/chrono
+++ b/libstdc++-v3/include/std/chrono
@@ -501,18 +501,47 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
 namespace __detail
 {
-  // Compute the remainder of the Euclidean division of __n divided by __d.
-  // Euclidean division truncates toward negative infinity and always
-  // produces a remainder in the range of [0,__d-1] (whereas standard
-  // division truncates toward zero and yields a nonpositive remainder
-  // for negative __n).
-  constexpr unsigned
-  __modulo(long long __n, unsigned __d)
+  // Helper to __add_modulo and __sub_modulo.
+  template <unsigned __d, typename _Tp>
+  consteval auto
+  __modulo_offset()
   {
-   if (__n >= 0)
- return __n % __d;
-   else
- return (__d + (__n % __d)) % __d;
+   using _Up = make_unsigned_t<_Tp>;
+   auto constexpr __a = _Up(-1) - _Up(255 + __d - 2);
+   auto constexpr __b = _Up(__d * (__a / __d) - 1);
+   // Notice: b <= a - 1 <= _Up(-1) - (255 + d - 1) and b % d = d - 1.
+   return _Up(-1) - __b; // >= 255 + d - 1
+  }
+
+  // Compute the remainder of the Euclidean division of __x + __y divided by
+  // __d without overflowing.  Typically, __x <= 255 + d - 1 is sum of
+  // weekday/month with a shift in [0, d - 1] and __y is a duration count.
+  template <unsigned __d, typename _Tp>
+  constexpr unsigned
+  __add_modulo(unsigned __x, _Tp __y)
+  {
+   using _Up = make_unsigned_t<_Tp>;
+   // For __y >= 0, _Up(__y) has the same mathematical value as __y and
+   // this function simply returns (__x + _Up(__y)) % d.  Typically, this
+   // doesn't overflow since the range of _Up contains many more positive
+   // values than _Tp's.  For __y < 0, _Up(__y) has a mathematical value in
+   // the upper-half range of _Up so that adding a positive value to it
+   /

Ping #2 [patch,avr] PR112944: Support .rodata in RAM for AVR64* and AVR128* devices

2024-01-05 Thread Georg-Johann Lay

Ping #2

https://gcc.gnu.org/pipermail/gcc-patches/2023-December/640140.html

FYI, Binutils https://sourceware.org/PR31124 is upstream.

Johann


This is a patch that locates .rodata in flash for some AVR
devices that can support it.  All new functionality depends
on Binutils PR31124 and is switched on by configure checks
for the new emulations.

https://sourceware.org/PR31124

For explanation of the gcc part see commit message below.

Most of the patch is adjusting device-specs generation.

Ok for master?

--

avr: Support .rodata in Flash for AVR64* and AVR128* Devices.

These devices see a 32 KiB block of their program memory (flash) in
the RAM address space.  This can be used to support .rodata in flash
provided Binutils supports PR31124 (Add new emulations which locate
.rodata in flash).  This patch does the following:

* configure checks availability of Binutils PR31124.

* Add new command line options -mrodata-in-ram and -flmap.
While -flmap is for internal usage (communicate hardware properties
to the compiler proper), -mrodata-in-ram is a user space option that
allows to return to the current rodata-in-ram layout.

* Adjust gen-avr-mmcu-specs.cc so that specs are generated that sanity
check options, and that translate -m[no-]rodata-in-ram to its emulation.

* Objects in .rodata don't drag __do_copy_data.

* Document new options and built-in macros.
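
As an illustration of the mapping described above (register and bit names
are simplifications, not from the patch):

#include <stdint.h>

/* FLMAP selects which 32 KiB flash block appears at 0x8000 in the RAM
   address space; block n starts at n * 32 KiB in program memory.  */
static inline uint32_t flmap_window_start (uint8_t flmap)
{
  return (uint32_t) (flmap & 0x3) * 0x8000u;
}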

PR target/112944

gcc/
* configure.ac [target=avr]: Check availability of emulations
avrxmega2_flmap and avrxmega4_flmap, resulting in new config vars
HAVE_LD_AVR_AVRXMEGA2_FLMAP and HAVE_LD_AVR_AVRXMEGA4_FLMAP.
* configure: Regenerate.
* config.in: Regenerate.
* doc/invoke.texi (AVR Options): Document -mflmap, -mrodata-in-ram,
__AVR_HAVE_FLMAP__, __AVR_RODATA_IN_RAM__.
* doc/avr-mmcu.texi: Regenerate.

* gcc/config/avr/avr.opt (-mflmap, -mrodata-in-ram): New options.
* config/avr/avr-arch.h (enum avr_device_specific_features):
Add AVR_ISA_FLMAP.
* config/avr/avr-mcus.def (AVR_MCU) [avr64*, avr128*]: Set isa flag
AVR_ISA_FLMAP.
* gcc/config/avr/avr.cc (avr_arch_index, avr_has_rodata_p): New vars.
(avr_set_core_architecture): Set avr_arch_index.
(have_avrxmega2_flmap, have_avrxmega4_flmap)
(have_avrxmega3_rodata_in_flash): Set new static const bool according
to configure results.
(avr_rodata_in_flash_p): New function.
(avr_asm_init_sections): Let readonly_data_section->unnamed.callback
track avr_need_copy_data_p only if not avr_rodata_in_flash_p().
(avr_asm_named_section): Track avr_has_rodata_p.
(avr_file_end): Emit __do_copy_data also when avr_has_rodata_p
and not avr_rodata_in_flash_p ().
* config/avr/specs.h (CC1_SPEC): Add %(cc1_rodata_in_ram).
(LINK_SPEC): Add %(link_rodata_in_ram).
(LINK_ARCH_SPEC): Remove.
* gcc/config/avr/gen-avr-mmcu-specs.cc (have_avrxmega3_rodata_in_flash)
(have_avrxmega2_flmap, have_avrxmega4_flmap): Set new static
const bool according to configure results.
(diagnose_mrodata_in_ram): New function.
(print_mcu): Generate specs with the following changes:
<*cc1_misc, *asm_misc, *link_misc>: New specs so that we don't
need to extend avr/specs.h each time we add a new bell or whistle.
<*cc1_rodata_in_ram, *link_rodata_in_ram>: New specs to diagnose
-m[no-]rodata-in-ram.
<*cpp_rodata_in_ram>: New. Does -D__AVR_RODATA_IN_RAM__=0/1.
<*cpp_mcu>: Add -D__AVR_AVR_FLMAP__ if it applies.
<*cpp>: Add %(cpp_rodata_in_ram).
<*link_arch>: Use emulation avrxmega2_flmap, avrxmega4_flmap as needed.
<*self_spec>: Add -mflmap or % 
 
+/* Define if your linker supports emulation avrxmega2_flmap. */

+#ifndef USED_FOR_TARGET
+#undef HAVE_LD_AVR_AVRXMEGA2_FLMAP
+#endif
+
+
 /* Define if your default avr linker script for avrxmega3 leaves .rodata in
flash. */
 #ifndef USED_FOR_TARGET
@@ -1680,6 +1686,12 @@
 #endif
 
 
+/* Define if your linker supports emulation avrxmega4_flmap. */

+#ifndef USED_FOR_TARGET
+#undef HAVE_LD_AVR_AVRXMEGA4_FLMAP
+#endif
+
+
 /* Define if your linker supports -z bndplt */
 #ifndef USED_FOR_TARGET
 #undef HAVE_LD_BNDPLT_SUPPORT
diff --git a/gcc/config/avr/avr-arch.h b/gcc/config/avr/avr-arch.h
index 79445fe7987..9ef187a1fd1 100644
--- a/gcc/config/avr/avr-arch.h
+++ b/gcc/config/avr/avr-arch.h
@@ -166,7 +166,35 @@ AVR_ISA_RCALL
   assume these instructions are not available and we set the built-in
   macro __AVR_HAVE_JMP_CALL__ accordingly.  This macro is used to
   determine a rough estimate of flash size in libgcc, and AVR-LibC uses
-  this macro to determine vector sizes.  */
+  this macro to determine vector sizes.
+
+AVR_ISA_FLMAP
+  The device has the NVMCTRL_CTRLB.FLMAP bitfield.  The value of FLMAP
+  determines which 32 KiB segment of the program memory (flash) is visible
+  in the RAM address space at 0x8000.
+
+  If Binutils support emulations avrxmega2_flmap resp. avrxmega4_flmap
+  (PR31124), then  the location of the .rodata section can b

[committed] libstdc++: Fix std::char_traits::move [PR113200]

2024-01-05 Thread Jonathan Wakely
Tested x86_64-linux. Pushed to trunk. Backports needed too.

-- >8 --

The current constexpr implementation of std::char_traits::move relies
on being able to compare the pointer parameters, which is not allowed
for unrelated pointers. We can use __builtin_constant_p to determine
whether it's safe to compare the pointers directly. If not, then we know
the ranges must be disjoint and so we can use char_traits::copy to
copy forwards from the first character to the last. If the pointers can
be compared directly, then we can simplify the condition for copying
backwards to just two pointer comparisons.
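
The detection idiom, reduced to its core (a sketch, simplified from the
patch):

// During constant evaluation, an ordering comparison of unrelated pointers
// is not a constant expression, so __builtin_constant_p(p < q) yields false
// exactly in the cases where the two ranges cannot overlap.
constexpr bool ptrs_comparable (const char* p, const char* q)
{
  return __builtin_constant_p (p < q);
}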

libstdc++-v3/ChangeLog:

PR libstdc++/113200
* include/bits/char_traits.h (__gnu_cxx::char_traits::move): Use
__builtin_constant_p to check for unrelated pointers that cannot
be compared during constant evaluation.
* testsuite/21_strings/char_traits/requirements/113200.cc: New
test.
---
 libstdc++-v3/include/bits/char_traits.h   | 16 +++
 .../char_traits/requirements/113200.cc| 20 +++
 2 files changed, 23 insertions(+), 13 deletions(-)
 create mode 100644 
libstdc++-v3/testsuite/21_strings/char_traits/requirements/113200.cc

diff --git a/libstdc++-v3/include/bits/char_traits.h 
b/libstdc++-v3/include/bits/char_traits.h
index 018eac28d0d..3074e9bb77e 100644
--- a/libstdc++-v3/include/bits/char_traits.h
+++ b/libstdc++-v3/include/bits/char_traits.h
@@ -227,19 +227,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #if __cplusplus >= 202002L
   if (std::__is_constant_evaluated())
{
- if (__s1 == __s2) // unlikely, but saves a lot of work
-   return __s1;
- const auto __end = __s2 + __n - 1;
- bool __overlap = false;
- for (std::size_t __i = 0; __i < __n - 1; ++__i)
-   {
- if (__s1 + __i == __end)
-   {
- __overlap = true;
- break;
-   }
-   }
- if (__overlap)
+ // Use __builtin_constant_p to avoid comparing unrelated pointers.
+ if (__builtin_constant_p(__s2 < __s1)
+   && __s1 > __s2 && __s1 < (__s2 + __n))
{
  do
{
diff --git 
a/libstdc++-v3/testsuite/21_strings/char_traits/requirements/113200.cc 
b/libstdc++-v3/testsuite/21_strings/char_traits/requirements/113200.cc
new file mode 100644
index 000..0fe765d53bc
--- /dev/null
+++ b/libstdc++-v3/testsuite/21_strings/char_traits/requirements/113200.cc
@@ -0,0 +1,20 @@
+// { dg-do compile { target c++20 } }
+
+// PR libstdc++/113200
+// char_traits::move is not constexpr when the argument is a string literal
+
+#include <string>
+
+template<std::size_t N> struct S
+{
+  char data_[ N ];
+
+  constexpr S( char const* p ): data_{}
+  {
+    std::char_traits<char>::move( data_, p, N );
+  }
+};
+
+template<std::size_t N> S( char const(&)[N] ) -> S<N>;
+
+constexpr S s( "test" );
-- 
2.43.0



Re: [PATCH v3] AArch64: Cleanup memset expansion

2024-01-05 Thread Richard Sandiford
Wilco Dijkstra  writes:
> v3: rebased to latest trunk
>
> Cleanup memset implementation.  Similar to memcpy/memmove, use an offset and
> bytes throughout.  Simplify the complex calculations when optimizing for size
> by using a fixed limit.
>
> Passes regress & bootstrap.
>
> gcc/ChangeLog:
> * config/aarch64/aarch64.h (MAX_SET_SIZE): New define.
> * config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove 
> function.
> (aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
> (aarch64_expand_setmem): Clean up implementation, use byte offsets,
> simplify size calculation.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index 
> 3ae42be770400da96ea3d9d25d6e1b2d393d034d..dd3b7988d585277181c478cd022fd7b6285929d0
>  100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -1178,6 +1178,10 @@ typedef struct
> mode that should actually be used.  We allow pairs of registers.  */
>  #define MAX_FIXED_MODE_SIZE GET_MODE_BITSIZE (TImode)
>
> +/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
> +   and 1 MOVI/DUP (same size as a call).  */
> +#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
> +

Since this isn't (AFAIK) a standard macro, there doesn't seem to be
any need to put it in the header file.  It could just go at the head
of aarch64.cc instead.

>  /* Maximum bytes moved by a single instruction (load/store pair).  */
>  #define MOVE_MAX (UNITS_PER_WORD * 2)
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 
> f9850320f61c5ddccf47e6583d304e5f405a484f..0909b319d16b9a1587314bcfda0a8112b42a663f
>  100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -26294,15 +26294,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
> next, amount);
>  }
>
> -/* Return a new RTX holding the result of moving POINTER forward by the
> -   size of the mode it points to.  */
> -
> -static rtx
> -aarch64_progress_pointer (rtx pointer)
> -{
> -  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
> -}
> -
>  typedef auto_vec, 12> copy_ops;
>
>  /* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
> @@ -26457,45 +26448,21 @@ aarch64_expand_cpymem (rtx *operands, bool 
> is_memmove)
>return true;
>  }
>
> -/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
> -   SRC is a register we have created with the duplicated value to be set.  */
> +/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
>  static void
> -aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
> -   machine_mode mode)
> +aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
>  {
> -  /* If we are copying 128bits or 256bits, we can do that straight from
> - the SIMD register we prepared.  */
> -  if (known_eq (GET_MODE_BITSIZE (mode), 256))
> -{
> -  mode = GET_MODE (src);
> -  /* "Cast" the *dst to the correct mode.  */
> -  *dst = adjust_address (*dst, mode, 0);
> -  /* Emit the memset.  */
> -  emit_insn (aarch64_gen_store_pair (*dst, src, src));
> -
> -  /* Move the pointers forward.  */
> -  *dst = aarch64_move_pointer (*dst, 32);
> -  return;
> -}
> -  if (known_eq (GET_MODE_BITSIZE (mode), 128))
> +  /* Emit explict store pair instructions for 32-byte writes.  */
> +  if (known_eq (GET_MODE_SIZE (mode), 32))
>  {
> -  /* "Cast" the *dst to the correct mode.  */
> -  *dst = adjust_address (*dst, GET_MODE (src), 0);
> -  /* Emit the memset.  */
> -  emit_move_insn (*dst, src);
> -  /* Move the pointers forward.  */
> -  *dst = aarch64_move_pointer (*dst, 16);
> +  mode = V16QImode;
> +  rtx dst1 = adjust_address (dst, mode, offset);
> +  emit_insn (aarch64_gen_store_pair (dst1, src, src));
>return;
>  }
> -  /* For copying less, we have to extract the right amount from src.  */
> -  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
> -
> -  /* "Cast" the *dst to the correct mode.  */
> -  *dst = adjust_address (*dst, mode, 0);
> -  /* Emit the memset.  */
> -  emit_move_insn (*dst, reg);
> -  /* Move the pointer forward.  */
> -  *dst = aarch64_progress_pointer (*dst);
> +  if (known_lt (GET_MODE_SIZE (mode), 16))
> +src = lowpart_subreg (mode, src, GET_MODE (src));
> +  emit_move_insn (adjust_address (dst, mode, offset), src);
>  }
>
>  /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
> @@ -26524,7 +26491,7 @@ aarch64_expand_setmem_mops (rtx *operands)
>  bool
>  aarch64_expand_setmem (rtx *operands)
>  {
> -  int n, mode_bits;
> +  int mode_bytes;
>unsigned HOST_WIDE_INT len;
>rtx dst = operands[0];
>rtx val = operands[2], src;
> @@ -26537,11 +26504,9 @@ aarch64_expand_setmem (rtx *operands)
>|| (STRICT_

RE: [PATCH]middle-end: Don't apply copysign optimization if target does not implement optab [PR112468]

2024-01-05 Thread Tamar Christina
> -Original Message-
> From: Xi Ruoyao 
> Sent: Thursday, January 4, 2024 10:39 PM
> To: Palmer Dabbelt ; Tamar Christina
> 
> Cc: gcc-patches@gcc.gnu.org; nd ; rguent...@suse.de; Jeff Law
> 
> Subject: Re: [PATCH]middle-end: Don't apply copysign optimization if target 
> does
> not implement optab [PR112468]
> 
> On Thu, 2024-01-04 at 14:32 -0800, Palmer Dabbelt wrote:
> > > +proc check_effective_target_ifn_copysign { } {
> > > +    return [check_cached_effective_target_indexed ifn_copysign {
> > > +  expr {
> > > +  (([istarget i?86-*-*] || [istarget x86_64-*-*])
> > > +    && [is-effective-target sse])
> > > +  || ([istarget loongarch*-*-*] && [check_effective_target_loongarch_sx])
> 
> LoongArch has [scalar FP copysign][1] too.
> 
> [1]:https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-
> EN.html#_fscaleblogbcopysign_sd

Ok, so something like:

|| ([istarget loongarch*-*-*] && ([check_effective_target_loongarch_sx] ||  
[check_effective_target_hard_float]))
?

> 
> > > +  || ([istarget powerpc*-*-*]
> > > +  && ![istarget powerpc-*-linux*paired*])
> > > +  || [istarget alpha*-*-*]
> > > +  || [istarget aarch64*-*-*]
> > > +  || [is-effective-target arm_neon]
> > > +  || ([istarget s390*-*-*]
> > > +  && [check_effective_target_s390_vx])
> > > +  || ([istarget riscv*-*-*]
> > > +  && [check_effective_target_riscv_v])
> >
> > Unless I'm missing something, we have copysign in the scalar
> > floating-point ISAs as well.  So I think this should be
> >
> >   || ([istarget riscv*-*-*]
> >   && [check_effective_target_hard_float])
> 

Ah cool, will update it in next version. 

Thanks,
Tamar

> --
> Xi Ruoyao 
> School of Aerospace Science and Technology, Xidian University


Re: [PATCH v3 1/3] libatomic: atomic_16.S: Improve ENTRY, END and ALIAS macro interface

2024-01-05 Thread Richard Sandiford
Victor Do Nascimento  writes:
> The introduction of further architectural-feature dependent ifuncs
> for AArch64 makes hard-coding ifunc `_i' suffixes to functions
> cumbersome to work with.  It is awkward to remember which ifunc maps
> onto which arch feature and makes the code harder to maintain when new
> ifuncs are added and their suffixes possibly altered.
>
> This patch uses pre-processor `#define' statements to map each suffix to
> a descriptive feature name macro, for example:
>
>   #define LSE2 _i1
>
> and reconstructs function names with the pre-processor's token
> concatenation feature, such that for `MACRO(_i)', we would
> now have `MACRO_FEAT(name, feature)' and in the macro definition body
> we replace `name` with `name##feature`.

FWIW, another way of doing this would be to have:

#define CORE(NAME) NAME
#define LSE2(NAME) NAME##_i1

and use feature(name) instead of name##feature.  This has the slight
advantage of not using ## on empty tokens, and the maybe slightly
better advantage of not needing the extra forwarding step in:

#define ENTRY_FEAT(name, feat)  \
ENTRY_FEAT1(name, feat)

#define ENTRY_FEAT1(name, feat) \
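
To make that concrete (a sketch; only the CORE/LSE2 definitions are from
the suggestion above, FUNC stands in for ENTRY_FEAT):

#define CORE(NAME) NAME
#define LSE2(NAME) NAME##_i1
#define FUNC(name, feat) feat(name)

// A function-like feature macro expands its argument before substitution,
// so no ENTRY_FEAT1-style forwarding macro is needed:
//   FUNC(libat_store_16, LSE2) -> libat_store_16_i1
//   FUNC(libat_store_16, CORE) -> libat_store_16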

WDYT?

Richard

> Consequently, for base functionality, where the ifunc suffix is
> absent, the macro interface remains the same.  For example, the entry
> and endpoints of `libat_store_16' remain defined by:
>
>   - ENTRY (libat_store_16)
> and
>   - END (libat_store_16)
>
> For the LSE2 implementation of the same 16-byte atomic store, we now
> have:
>
>   - ENTRY_FEAT (libat_store_16, LSE2)
> and
>   - END_FEAT (libat_store_16, LSE2)
>
> For the alising of ifunc names, we define the following new
> implementation of the ALIAS macro:
>
>   - ALIAS (FN_BASE_NAME, FROM_SUFFIX, TO_SUFFIX)
>
> Defining the base feature name macro to map `CORE' to the empty string,
> mapping LSE2 to the base implementation, we'd alias the LSE2
> `libat_exchange_16' to it base implementation with:
>
>   - ALIAS (libat_exchange_16, LSE2, CORE)
>
> libatomic/ChangeLog:
>   * config/linux/aarch64/atomic_16.S (CORE): New macro.
>   (LSE2): Likewise.
>   (ENTRY_FEAT): Likewise.
>   (END_FEAT): Likewise.
>   (ENTRY_FEAT1): Likewise.
>   (END_FEAT1): Likewise.
>   (ALIAS): Modify macro to take in `arch' arguments.
> ---
>  libatomic/config/linux/aarch64/atomic_16.S | 83 +-
>  1 file changed, 49 insertions(+), 34 deletions(-)
>
> diff --git a/libatomic/config/linux/aarch64/atomic_16.S 
> b/libatomic/config/linux/aarch64/atomic_16.S
> index a099037179b..eb8e749b8a2 100644
> --- a/libatomic/config/linux/aarch64/atomic_16.S
> +++ b/libatomic/config/linux/aarch64/atomic_16.S
> @@ -40,22 +40,38 @@
>  
>   .arch   armv8-a+lse
>  
> -#define ENTRY(name)  \
> - .global name;   \
> - .hidden name;   \
> - .type name,%function;   \
> - .p2align 4; \
> -name:\
> - .cfi_startproc; \
> +#define ENTRY(name) ENTRY_FEAT (name, CORE)
> +
> +#define ENTRY_FEAT(name, feat)   \
> + ENTRY_FEAT1(name, feat)
> +
> +#define ENTRY_FEAT1(name, feat)  \
> + .global name##feat; \
> + .hidden name##feat; \
> + .type name##feat,%function; \
> + .p2align 4; \
> +name##feat:  \
> + .cfi_startproc; \
>   hint34  // bti c
>  
> -#define END(name)\
> - .cfi_endproc;   \
> - .size name, .-name;
> +#define END(name) END_FEAT (name, CORE)
>  
> -#define ALIAS(alias,name)\
> - .global alias;  \
> - .set alias, name;
> +#define END_FEAT(name, feat) \
> + END_FEAT1(name, feat)
> +
> +#define END_FEAT1(name, feat)\
> + .cfi_endproc;   \
> + .size name##feat, .-name##feat;
> +
> +#define ALIAS(alias, from, to)   \
> + ALIAS1(alias,from,to)
> +
> +#define ALIAS1(alias, from, to)  \
> + .global alias##from;\
> + .set alias##from, alias##to;
> +
> +#define CORE
> +#define LSE2 _i1
>  
>  #define res0 x0
>  #define res1 x1
> @@ -108,7 +124,7 @@ ENTRY (libat_load_16)
>  END (libat_load_16)
>  
>  
> -ENTRY (libat_load_16_i1)
> +ENTRY_FEAT (libat_load_16, LSE2)
>   cbnzw1, 1f
>  
>   /* RELAXED.  */
> @@ -128,7 +144,7 @@ ENTRY (libat_load_16_i1)
>   ldp res0, res1, [x0]
>   dmb ishld
>   ret
> -END (libat_load_16_i1)
> +END_FEAT (libat_load_16, LSE2)
>  
>  
>  ENTRY (libat_store_16)
> @@ -148,7 +164,7 @@ ENTRY (libat_store_16)
>  END (libat_store_16)
>  
>  
> -ENTRY (libat_store_16_i1)
> +ENTRY_FEAT (libat_store_16, LSE2)
>   cbnzw4, 1f
>  
>   /* RELAXED.  */
> @@ -160,7 +176,7 @@ ENTRY (libat_store_16_i1)
>   stlxp   w4, in0, in1, [x0]
>   cbnzw4, 1b
>   ret
> -END (libat_store_16_i1)
> +END_FEAT (libat_store_16, LSE2)
>  
>  
>  ENTRY (li

Re: [PATCH v3 2/3] libatomic: Enable LSE128 128-bit atomics for armv9.4-a

2024-01-05 Thread Richard Sandiford
Victor Do Nascimento  writes:
> The armv9.4-a architectural revision adds three new atomic operations
> associated with the LSE128 feature:
>
>   * LDCLRP - Atomic AND NOT (bitclear) of a location with 128-bit
>   value held in a pair of registers, with original data loaded into
>   the same 2 registers.
>   * LDSETP - Atomic OR (bitset) of a location with 128-bit value held
>   in a pair of registers, with original data loaded into the same 2
>   registers.
>   * SWPP - Atomic swap of one 128-bit value with 128-bit value held
>   in a pair of registers.
>
> This patch adds the logic required to make use of these when the
> architectural feature is present and a suitable assembler available.
>
> In order to do this, the following changes are made:
>
>   1. Add a configure-time check to check for LSE128 support in the
>   assembler.
>   2. Edit host-config.h so that when N == 16, nifunc = 2.
>   3. Where available due to LSE128, implement the second ifunc, making
>   use of the novel instructions.
>   4. For atomic functions unable to make use of these new
>   instructions, define a new alias which causes the _i1 function
>   variant to point ahead to the corresponding _i2 implementation.
>
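(Editor's note: for point 4, presumably something along these lines; this
sketch assumes a `#define LSE128 _i2' suffix mapping in the spirit of the
earlier CORE/LSE2 macros, which is not visible in the quoted hunks:

    ALIAS (libat_load_16, LSE128, LSE2)

which would expand to

    .global libat_load_16_i2
    .set    libat_load_16_i2, libat_load_16_i1

so the _i1 variant forwards to the _i2 implementation.)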
> libatomic/ChangeLog:
>
>   * Makefile.am (AM_CPPFLAGS): add conditional setting of
>   -DHAVE_FEAT_LSE128.
>   * acinclude.m4 (LIBAT_TEST_FEAT_LSE128): New.
>   * config/linux/aarch64/atomic_16.S (LSE128): New macro
>   definition.
>   (libat_exchange_16): New LSE128 variant.
>   (libat_fetch_or_16): Likewise.
>   (libat_or_fetch_16): Likewise.
>   (libat_fetch_and_16): Likewise.
>   (libat_and_fetch_16): Likewise.
>   * config/linux/aarch64/host-config.h (IFUNC_COND_2): New.
>   (IFUNC_NCOND): Add operand size checking.
>   (has_lse2): Renamed from `ifunc1`.
>   (has_lse128): New.
>   (HAS_LSE128): Likewise.
>   * libatomic/configure.ac: Add call to LIBAT_TEST_FEAT_LSE128.
>   * configure (ac_subst_vars): Regenerated via autoreconf.
>   * libatomic/Makefile.in: Likewise.
>   * libatomic/auto-config.h.in: Likewise.
> ---
>  libatomic/Makefile.am|   3 +
>  libatomic/Makefile.in|   1 +
>  libatomic/acinclude.m4   |  19 +++
>  libatomic/auto-config.h.in   |   3 +
>  libatomic/config/linux/aarch64/atomic_16.S   | 170 ++-
>  libatomic/config/linux/aarch64/host-config.h |  29 +++-
>  libatomic/configure  |  59 ++-
>  libatomic/configure.ac   |   1 +
>  8 files changed, 276 insertions(+), 9 deletions(-)
>
> [...]
> diff --git a/libatomic/acinclude.m4 b/libatomic/acinclude.m4
> index f35ab5b60a5..4197db8f404 100644
> --- a/libatomic/acinclude.m4
> +++ b/libatomic/acinclude.m4
> @@ -83,6 +83,25 @@ AC_DEFUN([LIBAT_TEST_ATOMIC_BUILTIN],[
>])
>  ])
>  
> +dnl
> +dnl Test if the host assembler supports armv9.4-a LSE128 insns.
> +dnl
> +AC_DEFUN([LIBAT_TEST_FEAT_LSE128],[
> +  AC_CACHE_CHECK([for armv9.4-a LSE128 insn support],
> +[libat_cv_have_feat_lse128],[
> +AC_LANG_CONFTEST([AC_LANG_PROGRAM([],[asm(".arch armv9-a+lse128")])])
> +if AC_TRY_EVAL(ac_link); then

ac_compile should be enough for this.  The link step isn't really
adding anything.
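For instance, using the stock Autoconf compile-only check (editor's sketch,
untested):

    AC_COMPILE_IFELSE(
      [AC_LANG_PROGRAM([],[asm(".arch armv9-a+lse128")])],
      [libat_cv_have_feat_lse128=yes],
      [libat_cv_have_feat_lse128=no])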

> +  eval libat_cv_have_feat_lse128=yes
> +else
> +  eval libat_cv_have_feat_lse128=no
> +fi
> +rm -f conftest*
> +  ])
> +  LIBAT_DEFINE_YESNO([HAVE_FEAT_LSE128], [$libat_cv_have_feat_lse128],
> + [Have LSE128 support for 16 byte integers.])
> +  AM_CONDITIONAL([ARCH_AARCH64_HAVE_LSE128], [test 
> x$libat_cv_have_feat_lse128 = xyes])
> +])
> +
>  dnl
>  dnl Test if we have __atomic_load and __atomic_store for mode $1, size $2
>  dnl
> [...]
> @@ -206,6 +211,31 @@ ENTRY (libat_exchange_16)
>  END (libat_exchange_16)
>  
>  
> +#if HAVE_FEAT_LSE128
> +ENTRY_FEAT (libat_exchange_16, LSE128)
> + mov tmp0, x0
> + mov res0, in0
> + mov res1, in1
> +	cbnz	w4, 1f
> +
> + /* RELAXED.  */
> +	swpp	res0, res1, [tmp0]
> + ret
> +1:
> + cmp w4, ACQUIRE
> +	b.hi	2f
> +
> + /* ACQUIRE/CONSUME.  */
> + swppa   res0, res1, [tmp0]
> + ret
> +
> + /* RELEASE/ACQ_REL/SEQ_CST.  */
> +2:   swppal  res0, res1, [tmp0]
> + ret
> +END_FEAT (libat_exchange_16, LSE128)
> +#endif

Is there no benefit to using SWPPL for RELEASE here?  Similarly for the
others.
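Editor's sketch of what a RELEASE-only path could look like, using SWPPL.
The label numbering is illustrative, and whether the extra compare/branch
pays off is exactly the open question here:

	cmp	w4, RELEASE
	b.ne	3f
	/* RELEASE.  */
	swppl	res0, res1, [tmp0]
	ret
	/* ACQ_REL/SEQ_CST.  */
3:	swppal	res0, res1, [tmp0]
	ret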

Looks good otherwise.

Thanks,
Richard


Re: [PATCH]middle-end: Don't apply copysign optimization if target does not implement optab [PR112468]

2024-01-05 Thread Xi Ruoyao
On Fri, 2024-01-05 at 11:02 +, Tamar Christina wrote:
> Ok, so something like:
> 
> > > ([istarget loongarch*-*-*] &&
> > > ([check_effective_target_loongarch_sx] || 
> > > [check_effective_target_hard_float]))
> ?

We don't need "[check_effective_target_loongarch_sx] ||" because SIMD
requires hard float.
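So the condition would reduce to (sketch):

    ([istarget loongarch*-*-*]
     && [check_effective_target_hard_float])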

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University


[PATCH] aarch64: Further fix for throwing insns in ldp/stp pass [PR113217]

2024-01-05 Thread Alex Coplan
As the PR shows, the fix in
r14-6916-g057dc349021660c40699fb5c98fd9cac8e168653 was not complete.
That fix was enough to stop us trying to move throwing accesses above
nondebug insns, but due to this code in try_fuse_pair:

  // Placement strategy: push loads down and pull stores up, this should
  // help register pressure by reducing live ranges.
  if (load_p)
range.first = range.last;
  else
range.last = range.first;

we would still try to move stores up above any debug insns that occurred
immediately after the previous nondebug insn.  This patch fixes that by
narrowing the move range in the case that the second access is throwing
to exactly the range of that insn.

Note that we still need the fix to latest_hazard_before mentioned above
so as to ensure we select a suitable base and reject pairs if it isn't
viable to form the pair at the end of the BB.

Bootstrapped/regtested on aarch64-linux-gnu, OK for trunk?

Thanks,
Alex

gcc/ChangeLog:

PR target/113217
* config/aarch64/aarch64-ldp-fusion.cc
(ldp_bb_info::try_fuse_pair): If the second access can throw,
narrow the move range to exactly that insn.

gcc/testsuite/ChangeLog:

PR target/113217
* g++.dg/pr113217.C: New test.
diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 25f9b2d01c5..2fe1b1d4d84 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -2195,6 +2195,15 @@ ldp_bb_info::try_fuse_pair (bool load_p, unsigned 
access_size,
   if (base->hazards[0])
 range.last = base->hazards[0]->prev_nondebug_insn ();
 
+  // If the second insn can throw, narrow the move range to exactly that insn.
+  // This prevents us trying to move the second insn from the end of the BB.
+  if (cfun->can_throw_non_call_exceptions
+  && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX))
+{
+  gcc_assert (range.includes (insns[1]));
+  range = insn_range_info (insns[1]);
+}
+
   // Placement strategy: push loads down and pull stores up, this should
   // help register pressure by reducing live ranges.
   if (load_p)
diff --git a/gcc/testsuite/g++.dg/pr113217.C b/gcc/testsuite/g++.dg/pr113217.C
new file mode 100644
index 000..ec861543930
--- /dev/null
+++ b/gcc/testsuite/g++.dg/pr113217.C
@@ -0,0 +1,15 @@
+// { dg-do compile }
+// { dg-options "-O -g -fnon-call-exceptions" }
+struct _Vector_base {
+  int _M_end_of_storage;
+};
+struct vector : _Vector_base {
+  vector() : _Vector_base() {}
+  ~vector();
+};
+struct LoadGraph {
+  LoadGraph();
+  vector colors;
+  vector data_block;
+};
+LoadGraph::LoadGraph() {}


Re: [PATCH v2 2/2] LoongArch: When the code model is extreme, the symbol address is obtained through macro instructions regardless of the value of -mexplicit-relocs.

2024-01-05 Thread Xi Ruoyao
On Fri, 2024-01-05 at 18:25 +0800, Xi Ruoyao wrote:
> On Fri, 2024-01-05 at 17:57 +0800, chenglulu wrote:
> > 
> > On 2024/1/5 at 4:37 PM, Xi Ruoyao wrote:
> > > On Fri, 2024-01-05 at 11:40 +0800, Lulu Cheng wrote:
> > > >   bool
> > > >   loongarch_explicit_relocs_p (enum loongarch_symbol_type type)
> > > >   {
> > > > +  /* Instructions pcalau12i, addi.d, lu32i.d and lu52i.d must be 
> > > > adjacent
> > > > + so that the linker can infer the PC of pcalau12i to apply 
> > > > relocations
> > > > + to lu32i.d and lu52i.d.  Otherwise, the results would be 
> > > > incorrect if
> > > > + these four instructions are not in the same 4KiB page.
> > > > + Therefore, macro instructions are used when cmodel=extreme.  */
> > > > +  if (loongarch_symbol_extreme_p (type))
> > > > +    return false;
> > > I think this is a bit of strange.  With -mexplicit-relocs={auto,always}
> > > we should still use explicit relocs, but coding all 4 instructions
> > > altogether as
> > > 
> > > "pcalau12i.d\t%1,%pc64_hi12(%2)\n\taddi.d\t%0,$r0,%pclo12(%2)\n\tlu32i.d\t%0,%pc64_lo20(%2)\n\tlu52i.d\t%0,%0,%pc64_hi12(%2)"
> > > 
> > > Give me several hours trying to implement this...
> > > 
> > I think there is no difference between macros and these instructions put 
> > together. If we implement it in a split form, I think I can try it through 
> > TARGET_SCHED_MACRO_FUSION_PAIR_P

We don't need to split the insn.  We can just add a "large insn"
containing the assembly output we want.
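Editor's sketch of such a two-output pattern; the operand predicates,
constraints and unspec names here are assumptions based on the ChangeLog of
the attached patch, not the committed code:

    (define_insn "la_pcrel64_two_parts"
      [(set (match_operand:DI 0 "register_operand" "=r")
	    (unspec:DI [(match_operand:DI 2 "") (pc)]
		       UNSPEC_LA_PCREL_64_PART1))
       (set (match_operand:DI 1 "register_operand" "=r")
	    (unspec:DI [(match_dup 2) (pc)]
		       UNSPEC_LA_PCREL_64_PART2))]
      "TARGET_CMODEL_EXTREME"
      "pcalau12i\t%0,%%pc_hi20(%2)\n\taddi.d\t%1,$r0,%%pc_lo12(%2)\n\tlu32i.d\t%1,%%pc64_lo20(%2)\n\tlu52i.d\t%1,%1,%%pc64_hi12(%2)")

The real address is then the sum of operands 0 and 1, so the RTL passes can
still fold it into ldx/stx addressing as noted in the attached commit message.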

See the attached patch.  Note that TLS LE/LD/GD needs a fix too because
they are basically an variation of GOT addressing.

I've ran some small tests and now trying to bootstrap GCC with -
mcmodel=extreme in BOOT_CFLAGS...

> 
> There is a difference:
> 
> int x;
> int t() { return x; }
> 
> pcalau12i t0, %pc_hi20(x)
> addi.d t1, r0, %pc_lo12(x)
> lu32i.d t1, %pc64_lo20(x)
> lu52i.d t1, t1, %pc64_hi12(x)
> ldx.w a0, t0, t1
> 
> is slightly better than
> 
> pcalau12i t0, %pc_hi20(x)
> addi.d t1, r0, %pc_lo12(x)
> lu32i.d t1, %pc64_lo20(x)
> lu52i.d t1, t1, %pc64_hi12(x)
> addi.d t0, t0, t1
> ld.w a0, t0, 0
> 
> And generating macros when -mexplicit-relocs=always can puzzle people
> (it says "always" :-\ ).
> 

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University
From f6f75b1fd2dbd30255f127f59d16a2683fa22d58 Mon Sep 17 00:00:00 2001
From: Xi Ruoyao 
Date: Fri, 5 Jan 2024 18:40:06 +0800
Subject: [PATCH] LoongArch: Don't split the instructions containing relocs for
 extreme code model

The ABI mandates the pcalau12i/addi.d/lu32i.d/lu52i.d instructions for
addressing a symbol to be adjacent.  So model them as "one large
instruction", i.e. define_insn, with two output registers.  The real
address is the sum of these two registers.

The advantage of this approach is the RTL passes can still use ldx/stx
instructions to skip an addi.d instruction.

gcc/ChangeLog:

	* config/loongarch/loongarch.md (unspec): Add
	UNSPEC_LA_PCREL_64_PART1 and UNSPEC_LA_PCREL_64_PART2.
	(la_pcrel64_two_parts): New define_insn.
	* config/loongarch/loongarch.cc (loongarch_tls_symbol): Fix a
	typo in the comment.
	(loongarch_call_tls_get_addr): If TARGET_CMODEL_EXTREME, use
	la_pcrel64_two_parts for addressing the TLS symbol and
	__tls_get_addr.
	(loongarch_legitimize_tls_address): If TARGET_CMODEL_EXTREME,
	address TLS IE symbols with la_pcrel64_two_parts.
	(loongarch_split_symbol): If TARGET_CMODEL_EXTREME, address
	symbols with la_pcrel64_two_parts.

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/func-call-extreme-1.c (dg-options):
	Use -O2 instead of -O0 to ensure the pcalau12i/addi/lu32i/lu52i
	instruction sequences are not reordered by the compiler.
	(NOIPA): Disallow interprocedural optimizations.
	* gcc.target/loongarch/func-call-extreme-2.c: Remove the content
	duplicated from func-call-extreme-1.c, include it instead.
	(dg-options): Likewise.
	* gcc.target/loongarch/func-call-extreme-3.c (dg-options):
	Likewise.
	* gcc.target/loongarch/func-call-extreme-4.c (dg-options):
	Likewise.
	* gcc.target/loongarch/cmodel-extreme-1.c: New test.
	* gcc.target/loongarch/cmodel-extreme-2.c: New test.
---
 gcc/config/loongarch/loongarch.cc | 100 +-
 gcc/config/loongarch/loongarch.md |  21 
 .../gcc.target/loongarch/cmodel-extreme-1.c   |  18 
 .../gcc.target/loongarch/cmodel-extreme-2.c   |   7 ++
 .../loongarch/func-call-extreme-1.c   |  14 +--
 .../loongarch/func-call-extreme-2.c   |  29 +
 .../loongarch/func-call-extreme-3.c   |   2 +-
 .../loongarch/func-call-extreme-4.c   |   2 +-
 8 files changed, 109 insertions(+), 84 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/cmodel-extreme-1.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/cmodel-extreme-2.c

diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index db83232884f..7c01169b422 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/lo

Re: [PATCH v3 3/3] aarch64: Add explicit checks for implicit LSE/LSE2 requirements.

2024-01-05 Thread Richard Sandiford
Victor Do Nascimento  writes:
> At present, evaluation of both `has_lse2(hwcap)' and
> `has_lse128(hwcap)' may require issuing an `mrs' instruction to query
> a system register.  This instruction, when issued from user-space,
> results in a trap by the kernel, which then returns the value read
> from the system register.  Given the undesirable nature of the
> computational expense associated with the context switch, it is
> important to implement mechanisms to, wherever possible, forgo the
> operation.
>
> In light of this, given how other architectural requirements serving
> as prerequisites have long been assigned HWCAP bits by the kernel, we
> can inexpensively query for their availability before attempting to
> read any system registers.  Where one of these early tests fails, we
> can assert that the main feature of interest (be it LSE2 or LSE128)
> cannot be present, allowing us to return from the function early and
> skip the unnecessary expensive kernel-mediated access to system
> registers.
>
> libatomic/ChangeLog:
>
>   * config/linux/aarch64/host-config.h (has_lse2): Add test for LSE.
>   (has_lse128): Add test for LSE2.
> ---
>  libatomic/config/linux/aarch64/host-config.h | 13 ++---
>  1 file changed, 10 insertions(+), 3 deletions(-)
>
> diff --git a/libatomic/config/linux/aarch64/host-config.h 
> b/libatomic/config/linux/aarch64/host-config.h
> index c5485d63855..3be4db6e5f8 100644
> --- a/libatomic/config/linux/aarch64/host-config.h
> +++ b/libatomic/config/linux/aarch64/host-config.h
> @@ -53,8 +53,13 @@
>  static inline bool
>  has_lse2 (unsigned long hwcap)
>  {
> +  /* Check for LSE2.  */
>if (hwcap & HWCAP_USCAT)
>  return true;
> +  /* No point checking further for atomic 128-bit load/store if LSE
> + prerequisite not met.  */
> +  if (!(hwcap & HWCAP_ATOMICS))
> +return false;

This part is OK.

>if (!(hwcap & HWCAP_CPUID))
>  return false;
>  
> @@ -76,12 +81,14 @@ has_lse2 (unsigned long hwcap)
>  static inline bool
>  has_lse128 (unsigned long hwcap)
>  {
> -  if (!(hwcap & HWCAP_CPUID))
> -return false;
> +  /* In the absence of HWCAP_CPUID, we are unable to check for LSE128, 
> return.
> + If feature check available, check LSE2 prerequisite before proceeding.  
> */
> +  if (!(hwcap & HWCAP_CPUID) || !(hwcap & HWCAP_USCAT))
> + return false;

The inconsistency feels wrong here.  If we're saying that HWCAP_USCAT
is now so old that we don't need to fall back on CPUID, then it feels
like we should have the courage of our convictions and do the same for
has_lse2.  If instead we still want to support libcs that predate
HWCAP_USCAT, we should do the same here too.

>unsigned long isar0;
>asm volatile ("mrs %0, ID_AA64ISAR0_EL1" : "=r" (isar0));
>if (AT_FEAT_FIELD (isar0) >= 3)
> -return true;
> +  return true;

The original formatting was correct.

Thanks,
Richard

>return false;
>  }


Re: [PATCH v3 3/3] aarch64: Add explicit checks for implicit LSE/LSE2 requirements.

2024-01-05 Thread Richard Sandiford
Richard Sandiford  writes:
> Victor Do Nascimento  writes:
>> At present, evaluation of both `has_lse2(hwcap)' and
>> `has_lse128(hwcap)' may require issuing an `mrs' instruction to query
>> a system register.  This instruction, when issued from user-space,
>> results in a trap by the kernel, which then returns the value read
>> from the system register.  Given the undesirable nature of the
>> computational expense associated with the context switch, it is
>> important to implement mechanisms to, wherever possible, forgo the
>> operation.
>>
>> In light of this, given how other architectural requirements serving
>> as prerequisites have long been assigned HWCAP bits by the kernel, we
>> can inexpensively query for their availability before attempting to
>> read any system registers.  Where one of these early tests fails, we
>> can assert that the main feature of interest (be it LSE2 or LSE128)
>> cannot be present, allowing us to return from the function early and
>> skip the unnecessary expensive kernel-mediated access to system
>> registers.
>>
>> libatomic/ChangeLog:
>>
>>  * config/linux/aarch64/host-config.h (has_lse2): Add test for LSE.
>>  (has_lse128): Add test for LSE2.
>> ---
>>  libatomic/config/linux/aarch64/host-config.h | 13 ++---
>>  1 file changed, 10 insertions(+), 3 deletions(-)
>>
>> diff --git a/libatomic/config/linux/aarch64/host-config.h 
>> b/libatomic/config/linux/aarch64/host-config.h
>> index c5485d63855..3be4db6e5f8 100644
>> --- a/libatomic/config/linux/aarch64/host-config.h
>> +++ b/libatomic/config/linux/aarch64/host-config.h
>> @@ -53,8 +53,13 @@
>>  static inline bool
>>  has_lse2 (unsigned long hwcap)
>>  {
>> +  /* Check for LSE2.  */
>>if (hwcap & HWCAP_USCAT)
>>  return true;
>> +  /* No point checking further for atomic 128-bit load/store if LSE
>> + prerequisite not met.  */
>> +  if (!(hwcap & HWCAP_ATOMICS))
>> +return false;
>
> This part is OK.
>
>>if (!(hwcap & HWCAP_CPUID))
>>  return false;
>>  
>> @@ -76,12 +81,14 @@ has_lse2 (unsigned long hwcap)
>>  static inline bool
>>  has_lse128 (unsigned long hwcap)
>>  {
>> -  if (!(hwcap & HWCAP_CPUID))
>> -return false;
>> +  /* In the absence of HWCAP_CPUID, we are unable to check for LSE128, 
>> return.
>> + If feature check available, check LSE2 prerequisite before proceeding. 
>>  */
>> +  if (!(hwcap & HWCAP_CPUID) || !(hwcap & HWCAP_USCAT))
>> + return false;
>
> The inconsistency feels wrong here.  If we're saying that HWCAP_USCAT
> is now so old that we don't need to fall back on CPUID, then it feels
> like we should have the courage of our convictions and do the same for
> has_lse2.  If instead we still want to support libcs that predate
> HWCAP_USCAT, we should do the same here too.

Sorry, scratch that, I'd misread has_lse2.  The CPUID fallback there is
only for Neoverse N1, which we know doesn't support LSE128.

So the patch is OK with the formatting fixed: the returns should be
indented by their original amount.

Thanks,
Richard

>>unsigned long isar0;
>>asm volatile ("mrs %0, ID_AA64ISAR0_EL1" : "=r" (isar0));
>>if (AT_FEAT_FIELD (isar0) >= 3)
>> -return true;
>> +  return true;
>
> The original formatting was correct.
>
> Thanks,
> Richard
>
>>return false;
>>  }


Re: [PATCH 1/4] LoongArch: Handle ISA evolution switches along with other options

2024-01-05 Thread Xi Ruoyao
On Fri, 2024-01-05 at 14:55 +0800, Yang Yujie wrote:
> +#define ISA_HAS_FRECIPE \
> +  (la_target.isa.evolution & OPTION_MASK_ISA_FRECIPE)
> +#define ISA_HAS_DIV32 \
> +  (la_target.isa.evolution & OPTION_MASK_ISA_DIV32)
> +#define ISA_HAS_LAM_BH \
> +  (la_target.isa.evolution & OPTION_MASK_ISA_LAM_BH)
> +#define ISA_HAS_LAMCAS \
> +  (la_target.isa.evolution & OPTION_MASK_ISA_LAMCAS)
> +#define ISA_HAS_LD_SEQ_SA \
> +  (la_target.isa.evolution & OPTION_MASK_ISA_LD_SEQ_SA)

Should every occurrence of TARGET_DIV32 etc. be replaced with
ISA_HAS_DIV32 etc. in the code base?  It seems some of them are not
replaced.
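For instance, a quick way to spot leftovers (editor's note; hypothetical
command, adjust the pattern as needed):

    grep -rn 'TARGET_\(FRECIPE\|DIV32\|LAM_BH\|LAMCAS\|LD_SEQ_SA\)' \
        gcc/config/loongarch/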

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University


Re: [PATCH] aarch64: Further fix for throwing insns in ldp/stp pass [PR113217]

2024-01-05 Thread Richard Sandiford
Alex Coplan  writes:
> As the PR shows, the fix in
> r14-6916-g057dc349021660c40699fb5c98fd9cac8e168653 was not complete.
> That fix was enough to stop us trying to move throwing accesses above
> nondebug insns, but due to this code in try_fuse_pair:
>
>   // Placement strategy: push loads down and pull stores up, this should
>   // help register pressure by reducing live ranges.
>   if (load_p)
> range.first = range.last;
>   else
> range.last = range.first;
>
> we would still try to move stores up above any debug insns that occurred
> immediately after the previous nondebug insn.  This patch fixes that by
> narrowing the move range in the case that the second access is throwing
> to exactly the range of that insn.
>
> Note that we still need the fix to latest_hazard_before mentioned above
> so as to ensure we select a suitable base and reject pairs if it isn't
> viable to form the pair at the end of the BB.
>
> Bootstrapped/regtested on aarch64-linux-gnu, OK for trunk?
>
> Thanks,
> Alex
>
> gcc/ChangeLog:
>
> PR target/113217
> * config/aarch64/aarch64-ldp-fusion.cc
> (ldp_bb_info::try_fuse_pair): If the second access can throw,
> narrow the move range to exactly that insn.
>
> gcc/testsuite/ChangeLog:
>
> PR target/113217
> * g++.dg/pr113217.C: New test.

OK, thanks.

Richard

> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
> b/gcc/config/aarch64/aarch64-ldp-fusion.cc
> index 25f9b2d01c5..2fe1b1d4d84 100644
> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
> @@ -2195,6 +2195,15 @@ ldp_bb_info::try_fuse_pair (bool load_p, unsigned 
> access_size,
>if (base->hazards[0])
>  range.last = base->hazards[0]->prev_nondebug_insn ();
>  
> +  // If the second insn can throw, narrow the move range to exactly that 
> insn.
> +  // This prevents us trying to move the second insn from the end of the BB.
> +  if (cfun->can_throw_non_call_exceptions
> +  && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX))
> +{
> +  gcc_assert (range.includes (insns[1]));
> +  range = insn_range_info (insns[1]);
> +}
> +
>// Placement strategy: push loads down and pull stores up, this should
>// help register pressure by reducing live ranges.
>if (load_p)
> diff --git a/gcc/testsuite/g++.dg/pr113217.C b/gcc/testsuite/g++.dg/pr113217.C
> new file mode 100644
> index 000..ec861543930
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/pr113217.C
> @@ -0,0 +1,15 @@
> +// { dg-do compile }
> +// { dg-options "-O -g -fnon-call-exceptions" }
> +struct _Vector_base {
> +  int _M_end_of_storage;
> +};
> +struct vector : _Vector_base {
> +  vector() : _Vector_base() {}
> +  ~vector();
> +};
> +struct LoadGraph {
> +  LoadGraph();
> +  vector colors;
> +  vector data_block;
> +};
> +LoadGraph::LoadGraph() {}


Re: [pushed][PATCH v2 0/7] LoongArch:Enable testing for common

2024-01-05 Thread chenglulu

Pushed 2-7 to r14-6955...r14-6961.

On 2024/1/5 at 11:43 AM, chenxiaolong wrote:

v1->v2:
   Compared with v1, the reasoning behind the analysis of the problem is
described in detail.

When binutils does not support vectorization but the GCC compiler toolchain
does, the following two types of failures occur in GCC regression testing.

1. Failures of common tests in the gcc.dg/vect directory.

GCC regression testing has found that vect-bic-bitmask-{12,23}.c fails at
compile time, and similar problems exist on various architectures (e.g. x86,
aarch64, riscv).  The reason is that these tests only run to the assembly
stage, where the vector instructions cannot be recognized, so an error
occurs.

2. FAIL items in common vectorization tests once vectorization is supported.

When the LoongArch architecture supports the common vector test cases, GCC
regression testing shows many failures.  Reasons include missing target
detection rules, missing vectorization options, missing specific compilation
options, instruction set differences, and the test behavior configured for
each program.  For details, see the following patches:

chenxiaolong (7):
   LoongArch: testsuite:Added support for vector object detection.
   LoongArch: testsuite:Modify the test behavior of the
 vect-bic-bitmask-{12,23}.c file.
   LoongArch: testsuite:Added detection support for LoongArch
 architecture in vect-{82,83}.c.
   LoongArch: testsuite:Fix FAIL in file bind_c_array_params_2.f90.
   LoongArch: testsuite:Delete the default run behavior in pr60510.f.
   LoongArch: testsuite:Added additional vectorization "-mlasx"
 compilation option.
   LoongArch: testsuite:Give up the detection of the
 gcc.dg/fma-{3,4,6,7}.c file.

  gcc/testsuite/gcc.dg/fma-3.c  |   2 +-
  gcc/testsuite/gcc.dg/fma-4.c  |   2 +-
  gcc/testsuite/gcc.dg/fma-6.c  |   2 +-
  gcc/testsuite/gcc.dg/fma-7.c  |   2 +-
  gcc/testsuite/gcc.dg/vect/bb-slp-pattern-1.c  |   1 +
  .../gcc.dg/vect/slp-widen-mult-half.c |   1 +
  gcc/testsuite/gcc.dg/vect/vect-82.c   |   2 +-
  gcc/testsuite/gcc.dg/vect/vect-83.c   |   2 +-
  .../gcc.dg/vect/vect-bic-bitmask-12.c |   2 +-
  .../gcc.dg/vect/vect-bic-bitmask-23.c |   2 +-
  .../gcc.dg/vect/vect-widen-mult-const-s16.c   |   1 +
  .../gcc.dg/vect/vect-widen-mult-const-u16.c   |   1 +
  .../gcc.dg/vect/vect-widen-mult-half-u8.c |   1 +
  .../gcc.dg/vect/vect-widen-mult-half.c|   1 +
  .../gcc.dg/vect/vect-widen-mult-u16.c |   1 +
  .../gcc.dg/vect/vect-widen-mult-u8-s16-s32.c  |   1 +
  .../gcc.dg/vect/vect-widen-mult-u8-u32.c  |   1 +
  .../gcc.dg/vect/vect-widen-mult-u8.c  |   1 +
  .../gfortran.dg/bind_c_array_params_2.f90 |   4 +-
  gcc/testsuite/gfortran.dg/vect/pr60510.f  |   1 -
  gcc/testsuite/lib/target-supports.exp | 217 +-
  21 files changed, 183 insertions(+), 65 deletions(-)





Re: [pushed][PATCH v3] LoongArch: testsuite:Added support for vector object detection.

2024-01-05 Thread chenglulu

pushed to r14-6954.
On 2024/1/5 at 2:05 PM, chenxiaolong wrote:

- Change the default vectorization "-mlasx" option to "-mlsx" because there
are many unaligned memory accesses when using 256-bit vectorization.

- The following detection procedures are added to the target-supports.exp file:

1.check_effective_target_scalar_all_fma
2.check_effective_target_vect_int
3.check_effective_target_vect_intfloat_cvt
4.check_effective_target_vect_doubleint_cvt
5.check_effective_target_vect_intdouble_cvt
6.check_effective_target_vect_uintfloat_cvt
7.check_effective_target_vect_floatint_cvt
8.check_effective_target_vect_floatuint_cvt
9.check_effective_target_vect_shift
10.check_effective_target_vect_var_shift
11.check_effective_target_whole_vector_shift
12.check_effective_target_vect_bswap
13.check_effective_target_vect_bool_cmp
14.check_effective_target_vect_char_add
15.check_effective_target_vect_shift_char
16.check_effective_target_vect_long
17.check_effective_target_vect_float
18.check_effective_target_vect_double
19.check_effective_target_vect_long_long
20.check_effective_target_vect_perm
21.check_effective_target_vect_perm_byte
22.check_effective_target_vect_perm_short
23.check_effective_target_vect_widen_sum_hi_to_si
24.check_effective_target_vect_widen_sum_qi_to_hi
25.check_effective_target_vect_widen_sum_qi_to_hi
26.check_effective_target_vect_widen_mult_qi_to_hi
27.check_effective_target_vect_widen_mult_hi_to_si
28.check_effective_target_vect_widen_mult_qi_to_hi_pattern
29.check_effective_target_vect_widen_mult_hi_to_si_pattern
30.check_effective_target_vect_widen_mult_si_to_di_pattern
31.check_effective_target_vect_sdot_qi
32.check_effective_target_vect_udot_qi
33.check_effective_target_vect_sdot_hi
34.check_effective_target_vect_udot_hi
35.check_effective_target_vect_usad_char
36.check_effective_target_vect_avg_qi
37.check_effective_target_vect_pack_trunc
38.check_effective_target_vect_unpack
39.check_effective_target_vect_hw_misalign
40.check_effective_target_vect_gather_load_ifn
40.check_effective_target_vect_condition
42.check_effective_target_vect_cond_mixed
43.check_effective_target_vect_char_mult
44.check_effective_target_vect_short_mult
45.check_effective_target_vect_int_mult
46.check_effective_target_vect_long_mult
47.check_effective_target_vect_int_mod
48.check_effective_target_vect_extract_even_odd
49.check_effective_target_vect_interleave
50.check_effective_target_vect_call_copysignf
51.check_effective_target_vect_call_sqrtf
52.check_effective_target_vect_call_lrint
53.check_effective_target_vect_call_btrunc
54.check_effective_target_vect_call_btruncf
55.check_effective_target_vect_call_ceil
56.check_effective_target_vect_call_ceilf
57.check_effective_target_vect_call_floor
58.check_effective_target_vect_call_floorf
59.check_effective_target_vect_call_lceil
60.check_effective_target_vect_call_lfloor
61.check_effective_target_vect_logical_reduc
62.check_effective_target_section_anchors
63.check_vect_support_and_set_flags
64.check_effective_target_vect_max_reduc
65.check_effective_target_loongarch_sx
66.check_effective_target_loongarch_sx_hw

gcc/testsuite/ChangeLog:

* lib/target-supports.exp: Add LoongArch to the list of supported
targets.
---
  gcc/testsuite/lib/target-supports.exp | 217 +++---
  1 file changed, 162 insertions(+), 55 deletions(-)

diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 167e630f5a5..9addf35ade4 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3815,7 +3815,11 @@ proc add_options_for_bfloat16 { flags } {
  # (fma, fms, fnma, and fnms) for both float and double.
  
  proc check_effective_target_scalar_all_fma { } {

-return [istarget aarch64*-*-*]
+if { [istarget aarch64*-*-*]
+|| [istarget loongarch*-*-*]} {
+   return 1
+}
+return 0
  }
  
  # Return 1 if the target supports compiling fixed-point,

@@ -4051,6 +4055,8 @@ proc check_effective_target_vect_int { } {
 && [check_effective_target_s390_vx])
 || ([istarget riscv*-*-*]
 && [check_effective_target_riscv_v])
+|| ([istarget loongarch*-*-*]
+&& [check_effective_target_loongarch_sx])
}}]
  }
  
@@ -4218,7 +4224,9 @@ proc check_effective_target_vect_intfloat_cvt { } {

 || ([istarget s390*-*-*]
 && [check_effective_target_s390_vxe2])
 || ([istarget riscv*-*-*]
-&& [check_effective_target_riscv_v]) }}]
+&& [check_effective_target_riscv_v])
+|| ([istarget loongarch*-*-*]
+&& [check_effective_target_loongarch_sx]) }}]
  }
  
  # Return 1 if the target supports signed double->int conversion

@@ -4239,7 +4247,9 @@ proc check_effective_target_vect_doubleint_cvt { } {
 || ([istarget s390*-*-*]
 && [check_effective_target_s390_vx])
 || ([istarget riscv*-*-*]
-&& [

Re: [pushed][PATCH] LoongArch: Fixed the problem of incorrect judgment of the immediate field of the [x]vld/[x]vst instruction.

2024-01-05 Thread chenglulu

Pushed to r14-6955.

On 2024/1/4 at 10:37 AM, Lulu Cheng wrote:

The [x]vld/[x]vst instructions are defined as follows:
   [x]vld/[x]vst {x/v}d, rj, si12

Before this change, the immediate field of [x]vld/[x]vst was treated as 10
to 14 bits depending on the type.  Since loongarch_valid_offset_p restricts
the immediate field first, no errors resulted, but in some cases redundant
instructions were generated; see the test cases.  Now modify it according to
the description in the instruction manual.

gcc/ChangeLog:

* config/loongarch/lasx.md (lasx_mxld_):
Modify the method of determining the memory offset of [x]vld/[x]vst.
(lasx_mxst_): Likewise.
* config/loongarch/loongarch.cc (loongarch_valid_offset_p): Delete.
(loongarch_address_insns): Likewise.
* config/loongarch/lsx.md (lsx_ld_): Likewise.
(lsx_st_): Likewise.
* config/loongarch/predicates.md (aq10b_operand): Likewise.
(aq10h_operand): Likewise.
(aq10w_operand): Likewise.
(aq10d_operand): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/loongarch/vect-ld-st-imm12.c: New test.
---
  gcc/config/loongarch/lasx.md  | 26 ---
  gcc/config/loongarch/loongarch.cc | 19 +++---
  gcc/config/loongarch/lsx.md   | 26 ---
  gcc/config/loongarch/predicates.md| 16 
  .../gcc.target/loongarch/vect-ld-st-imm12.c   | 15 +++
  5 files changed, 19 insertions(+), 83 deletions(-)
  create mode 100644 gcc/testsuite/gcc.target/loongarch/vect-ld-st-imm12.c

diff --git a/gcc/config/loongarch/lasx.md b/gcc/config/loongarch/lasx.md
index dbbf5a136b7..95c6bae20ae 100644
--- a/gcc/config/loongarch/lasx.md
+++ b/gcc/config/loongarch/lasx.md
@@ -846,32 +846,6 @@ (define_split
DONE;
  })
  
-;; Offset load

-(define_expand "lasx_mxld_"
-  [(match_operand:LASX 0 "register_operand")
-   (match_operand 1 "pmode_register_operand")
-   (match_operand 2 "aq10_operand")]
-  "ISA_HAS_LASX"
-{
-  rtx addr = plus_constant (GET_MODE (operands[1]), operands[1],
- INTVAL (operands[2]));
-  loongarch_emit_move (operands[0], gen_rtx_MEM (mode, addr));
-  DONE;
-})
-
-;; Offset store
-(define_expand "lasx_mxst_"
-  [(match_operand:LASX 0 "register_operand")
-   (match_operand 1 "pmode_register_operand")
-   (match_operand 2 "aq10_operand")]
-  "ISA_HAS_LASX"
-{
-  rtx addr = plus_constant (GET_MODE (operands[1]), operands[1],
-   INTVAL (operands[2]));
-  loongarch_emit_move (gen_rtx_MEM (mode, addr), operands[0]);
-  DONE;
-})
-
  ;; LASX
  (define_insn "add3"
[(set (match_operand:ILASX 0 "register_operand" "=f,f,f")
diff --git a/gcc/config/loongarch/loongarch.cc 
b/gcc/config/loongarch/loongarch.cc
index db83232884f..b82ef1a7c0a 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -2126,21 +2126,11 @@ loongarch_valid_offset_p (rtx x, machine_mode mode)
  
  /* We may need to split multiword moves, so make sure that every word
     is accessible.  */
-  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD
+  if (!(LSX_SUPPORTED_MODE_P (mode) || LASX_SUPPORTED_MODE_P (mode))
+  && GET_MODE_SIZE (mode) > UNITS_PER_WORD
&& !IMM12_OPERAND (INTVAL (x) + GET_MODE_SIZE (mode) - UNITS_PER_WORD))
  return false;
  
-  /* LSX LD.* and ST.* supports 10-bit signed offsets.  */

-  if (LSX_SUPPORTED_MODE_P (mode)
-  && !loongarch_signed_immediate_p (INTVAL (x), 10,
-   loongarch_ldst_scaled_shift (mode)))
-return false;
-
-  /* LASX XVLD.B and XVST.B supports 10-bit signed offsets without shift.  */
-  if (LASX_SUPPORTED_MODE_P (mode)
-  && !loongarch_signed_immediate_p (INTVAL (x), 10, 0))
-return false;
-
return true;
  }
  
@@ -2376,9 +2366,8 @@ loongarch_address_insns (rtx x, machine_mode mode, bool might_split_p)

case ADDRESS_REG:
if (lsx_p)
  {
-   /* LSX LD.* and ST.* supports 10-bit signed offsets.  */
-   if (loongarch_signed_immediate_p (INTVAL (addr.offset), 10,
- loongarch_ldst_scaled_shift 
(mode)))
+   /* LSX LD.* and ST.* supports 12-bit signed offsets.  */
+   if (IMM12_OPERAND (INTVAL (addr.offset)))
  return 1;
else
  return 0;
diff --git a/gcc/config/loongarch/lsx.md b/gcc/config/loongarch/lsx.md
index 3e3248ef499..02e89247bdf 100644
--- a/gcc/config/loongarch/lsx.md
+++ b/gcc/config/loongarch/lsx.md
@@ -812,32 +812,6 @@ (define_split
DONE;
  })
  
-;; Offset load

-(define_expand "lsx_ld_"
-  [(match_operand:LSX 0 "register_operand")
-   (match_operand 1 "pmode_register_operand")
-   (match_operand 2 "aq10_operand")]
-  "ISA_HAS_LSX"
-{
-  rtx addr = plus_constant (GET_MODE (operands[1]), operands[1],
-   INTVAL (operands[2]));
-  loongarch_emit_move (opera

Re: Patch: Remove unneeded double operation in libstdc++-v3/src/c++17/fs_path.cc

2024-01-05 Thread Jonathan Wakely

On 18/12/23 09:36 +0100, Martin Küttler wrote:

This is a small change to libstdc++ which does not change any behavior.


Please CC the libstd...@gcc.gnu.org list on all libstdc++ patches, as
documented at https://gcc.gnu.org/lists.html

Otherwise I won't see the patches unless I happen to glance at the
gcc-patches archive by chance.


This change has two, imho positive, implications:

- The implicit conversion from double to int is avoided (avoiding a
  warning).


I don't see any warning here. What do you see?


- No floating point number is used at all, which could be significant
  in some scenarios.


Yes, it seems worth doing for this reason. I'll test+push the patch,
thanks.

Looking at path::_List::reserve now, we probably also want to avoid
overflow. Although a path with INT_MAX/1.5 components seems
implausible for 32-bit and 64-bit targets, it could be a problem for
16-bit targets. I'll take care of that too.
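One overflow-safe formulation (editor's sketch, not the committed fix;
assumes INT_MAX from <climits>):

    const int nextcap = curcap <= INT_MAX - curcap / 2
			  ? curcap + curcap / 2 : INT_MAX;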




diff --git a/libstdc++-v3/src/c++17/fs_path.cc 
b/libstdc++-v3/src/c++17/fs_path.cc
index d65b5482e8b..b47ed0aa7aa 100644
--- a/libstdc++-v3/src/c++17/fs_path.cc
+++ b/libstdc++-v3/src/c++17/fs_path.cc
@@ -447,8 +447,9 @@ path::_List::reserve(int newcap, bool exact = false)

  if (curcap < newcap)
{
-  if (!exact && newcap < int(1.5 * curcap))
-   newcap = 1.5 * curcap;
+  const int nextcap = curcap + curcap / 2;
+  if (!exact && newcap < nextcap)
+   newcap = nextcap;

  void* p = ::operator new(sizeof(_Impl) + newcap * sizeof(value_type));
  std::unique_ptr<_Impl, _Impl_deleter> newptr(::new(p) _Impl{newcap});




Re: [PATCH v2 2/2] LoongArch: When the code model is extreme, the symbol address is obtained through macro instructions regardless of the value of -mexplicit-relocs.

2024-01-05 Thread chenglulu



On 2024/1/5 at 7:55 PM, Xi Ruoyao wrote:

On Fri, 2024-01-05 at 18:25 +0800, Xi Ruoyao wrote:

On Fri, 2024-01-05 at 17:57 +0800, chenglulu wrote:

On 2024/1/5 at 4:37 PM, Xi Ruoyao wrote:

On Fri, 2024-01-05 at 11:40 +0800, Lulu Cheng wrote:

   bool
   loongarch_explicit_relocs_p (enum loongarch_symbol_type type)
   {
+  /* Instructions pcalau12i, addi.d, lu32i.d and lu52i.d must be adjacent
+ so that the linker can infer the PC of pcalau12i to apply relocations
+ to lu32i.d and lu52i.d.  Otherwise, the results would be incorrect if
+ these four instructions are not in the same 4KiB page.
+ Therefore, macro instructions are used when cmodel=extreme.  */
+  if (loongarch_symbol_extreme_p (type))
+    return false;

I think this is a bit of strange.  With -mexplicit-relocs={auto,always}
we should still use explicit relocs, but coding all 4 instructions
altogether as

"pcalau12i.d\t%1,%pc64_hi12(%2)\n\taddi.d\t%0,$r0,%pclo12(%2)\n\tlu32i.d\t%0,%pc64_lo20(%2)\n\tlu52i.d\t%0,%0,%pc64_hi12(%2)"

Give me several hours trying to implement this...


I think there is no difference between macros and these instructions put
together. If we implement it in a split form, I think I can try it through
TARGET_SCHED_MACRO_FUSION_PAIR_P

We don't need to split the insn.  We can just add a "large insn"
containing the assembly output we want.

See the attached patch.  Note that TLS LE/LD/GD needs a fix too because
they are basically an variation of GOT addressing.

I've ran some small tests and now trying to bootstrap GCC with -
mcmodel=extreme in BOOT_CFLAGS...


There is a difference:

int x;
int t() { return x; }

pcalau12i t0, %pc_hi20(x)
addi.d t1, r0, %pc_lo12(x)
lu32i.d t1, %pc64_lo20(x)
lu52i.d t1, t1, %pc64_hi12(x)
ldx.w a0, t0, t1

is slightly better than

pcalau12i t0, %pc_hi20(x)
addi.d t1, r0, %pc_lo12(x)
lu32i.d t1, %pc64_lo20(x)
lu52i.d t1, t1, %pc64_hi12(x)
addi.d t0, t0, t1
ld.w a0, t0, 0

And generating macros when -mexplicit-relocs=always can puzzle people
(it says "always" :-\ ).

Thumbs up! This method is much better than mine; I learned something.
Grateful!

But I still have to test it for correctness.



[PATCH] LoongArch: Implement option save/restore

2024-01-05 Thread Yang Yujie
LTO option streaming and target attributes both require per-function
target configuration, which is achieved via option save/restore.

We implement TARGET_OPTION_{SAVE,RESTORE} to switch the la_target
context in addition to other automatically maintained option states
(via the "Save" option property in the .opt files).

Tested on loongarch64-linux-gnu without regression.

PR target/113233

gcc/ChangeLog:

* config/loongarch/genopts/loongarch.opt.in: Mark options with
the "Save" property.
* config/loongarch/loongarch.opt: Same.
* config/loongarch/loongarch-opts.cc: Refresh -mcmodel= state
according to la_target.
* config/loongarch/loongarch.cc: Implement TARGET_OPTION_{SAVE,
RESTORE} for the la_target structure; Rename option conditions
to have the same "la_" prefix.
* config/loongarch/loongarch.h: Same.
---
 gcc/config/loongarch/genopts/loongarch.opt.in | 38 
 gcc/config/loongarch/loongarch-opts.cc|  7 ++
 gcc/config/loongarch/loongarch.cc | 90 +++
 gcc/config/loongarch/loongarch.h  |  2 +-
 gcc/config/loongarch/loongarch.opt| 38 
 5 files changed, 121 insertions(+), 54 deletions(-)

diff --git a/gcc/config/loongarch/genopts/loongarch.opt.in 
b/gcc/config/loongarch/genopts/loongarch.opt.in
index 1dbd3ad1e3f..02f918053f5 100644
--- a/gcc/config/loongarch/genopts/loongarch.opt.in
+++ b/gcc/config/loongarch/genopts/loongarch.opt.in
@@ -50,7 +50,7 @@ EnumValue
 Enum(isa_ext_fpu) String(@@STR_ISA_EXT_FPU64@@) Value(ISA_EXT_FPU64)
 
 m@@OPTSTR_ISA_EXT_FPU@@=
-Target RejectNegative Joined ToLower Enum(isa_ext_fpu) Var(la_opt_fpu) 
Init(M_OPT_UNSET)
+Target RejectNegative Joined ToLower Enum(isa_ext_fpu) Var(la_opt_fpu) 
Init(M_OPT_UNSET) Save
 -m@@OPTSTR_ISA_EXT_FPU@@=FPU   Generate code for the given FPU.
 
 m@@OPTSTR_ISA_EXT_FPU@@=@@STR_ISA_EXT_FPU0@@
@@ -82,7 +82,7 @@ EnumValue
 Enum(isa_ext_simd) String(@@STR_ISA_EXT_LASX@@) Value(ISA_EXT_SIMD_LASX)
 
 m@@OPTSTR_ISA_EXT_SIMD@@=
-Target RejectNegative Joined ToLower Enum(isa_ext_simd) Var(la_opt_simd) 
Init(M_OPT_UNSET)
+Target RejectNegative Joined ToLower Enum(isa_ext_simd) Var(la_opt_simd) 
Init(M_OPT_UNSET) Save
 -m@@OPTSTR_ISA_EXT_SIMD@@=SIMD Generate code for the given SIMD extension.
 
 m@@STR_ISA_EXT_LSX@@
@@ -114,11 +114,11 @@ EnumValue
 Enum(cpu_type) String(@@STR_CPU_LA664@@) Value(CPU_LA664)
 
 m@@OPTSTR_ARCH@@=
-Target RejectNegative Joined Enum(cpu_type) Var(la_opt_cpu_arch) 
Init(M_OPT_UNSET)
+Target RejectNegative Joined Enum(cpu_type) Var(la_opt_cpu_arch) 
Init(M_OPT_UNSET) Save
 -m@@OPTSTR_ARCH@@=PROCESSORGenerate code for the given PROCESSOR ISA.
 
 m@@OPTSTR_TUNE@@=
-Target RejectNegative Joined Enum(cpu_type) Var(la_opt_cpu_tune) 
Init(M_OPT_UNSET)
+Target RejectNegative Joined Enum(cpu_type) Var(la_opt_cpu_tune) 
Init(M_OPT_UNSET) Save
 -m@@OPTSTR_TUNE@@=PROCESSORGenerate optimized code for PROCESSOR.
 
 
@@ -149,31 +149,31 @@ Variable
 int la_opt_abi_ext = M_OPT_UNSET
 
 mbranch-cost=
-Target RejectNegative Joined UInteger Var(loongarch_branch_cost)
+Target RejectNegative Joined UInteger Var(la_branch_cost) Save
 -mbranch-cost=COST Set the cost of branches to roughly COST instructions.
 
 mcheck-zero-division
-Target Mask(CHECK_ZERO_DIV)
+Target Mask(CHECK_ZERO_DIV) Save
 Trap on integer divide by zero.
 
 mcond-move-int
-Target Var(TARGET_COND_MOVE_INT) Init(1)
+Target Mask(COND_MOVE_INT) Save
 Conditional moves for integral are enabled.
 
 mcond-move-float
-Target Var(TARGET_COND_MOVE_FLOAT) Init(1)
+Target Mask(COND_MOVE_FLOAT) Save
 Conditional moves for float are enabled.
 
 mmemcpy
-Target Mask(MEMCPY)
+Target Mask(MEMCPY) Save
 Prevent optimizing block moves, which is also the default behavior of -Os.
 
 mstrict-align
-Target Var(TARGET_STRICT_ALIGN) Init(0)
+Target Mask(STRICT_ALIGN) Save
 Do not generate unaligned memory accesses.
 
 mmax-inline-memcpy-size=
-Target Joined RejectNegative UInteger Var(loongarch_max_inline_memcpy_size) 
Init(1024)
+Target Joined RejectNegative UInteger Var(la_max_inline_memcpy_size) 
Init(1024) Save
 -mmax-inline-memcpy-size=SIZE  Set the max size of memcpy to inline, default 
is 1024.
 
 Enum
@@ -198,11 +198,11 @@ Target Alias(mexplicit-relocs=, always, none)
 Use %reloc() assembly operators (for backward compatibility).
 
 mrecip
-Target RejectNegative Var(loongarch_recip)
+Target RejectNegative Var(la_recip) Save
 Generate approximate reciprocal divide and square root for better throughput.
 
 mrecip=
-Target RejectNegative Joined Var(loongarch_recip_name)
+Target RejectNegative Joined Var(la_recip_name) Save
 Control generation of reciprocal estimates.
 
 ; The code model option names for -mcmodel.
@@ -229,29 +229,29 @@ EnumValue
 Enum(cmodel) String(@@STR_CMODEL_EXTREME@@) Value(CMODEL_EXTREME)
 
 mcmodel=
-Target RejectNegative Joined Enum(cmodel) Var(la_opt_cmodel) Init(M_OPT_UNSET)
+Target RejectNegative Joined Enum(cmodel) 

Re: Patch: Remove unneeded double operation in libstdc++-v3/src/c++17/fs_path.cc

2024-01-05 Thread Martin Küttler


>>This is a small change to libstdc++ which does not change any behavior.
>
> Please CC the libstd...@gcc.gnu.org list on all libstdc++ patches, as
> documented at https://gcc.gnu.org/lists.html

Acknowledged. Sorry.

>>This change has two, imho positive, implications:
>>
>> - The implicit conversion from double to int is avoided (avoiding a
>>   warning).
>
> I don't see any warning here. What do you see?

I see "warning: conversion from ‘double’ to ‘int’ may change value
[-Wfloat-conversion]" This appears to be a specifically enabled warning.
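A minimal illustration (editor's note: -Wfloat-conversion is enabled by
-Wconversion rather than by -Wall, which matches the "specifically enabled"
observation):

    double d = 1.5;
    int i = d;  // warning: conversion from 'double' to 'int' may change value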

> Looking at path::_List::reserve now, we probably also want to avoid
> overflow. Although a path with INT_MAX/1.5 components seems
> implausible for 32-bit and 64-bit targets, it could be a problem for
> 16-bit targets. I'll take care of that too.

Nice catch.

Martin

--
Kernkonzept GmbH at Dresden, Germany, HRB 31129, CEO Dr.-Ing. Michael Hohmuth


Re: Patch: Remove unneeded double operation in libstdc++-v3/src/c++17/fs_path.cc

2024-01-05 Thread Jonathan Wakely
On Fri, 5 Jan 2024 at 13:00, Martin Küttler
 wrote:
>
>
> >>This is a small change to libstdc++ which does not change any behavior.
> >
> > Please CC the libstd...@gcc.gnu.org list on all libstdc++ patches, as
> > documented at https://gcc.gnu.org/lists.html
>
> Acknowledged. Sorry.
>
> >>This change has two, imho positive, implications:
> >>
> >> - The implicit conversion from double to int is avoided (avoiding a
> >>   warning).
> >
> > I don't see any warning here. What do you see?
>
> I see "warning: conversion from ‘double’ to ‘int’ may change value
> [-Wfloat-conversion]" This appears to be a specifically enabled warning.
>
> > Looking at path::_List::reserve now, we probably also want to avoid
> > overflow. Although a path with INT_MAX/1.5 components seems
> > implausible for 32-bit and 64-bit targets, it could be a problem for
> > 16-bit targets. I'll take care of that too.
>
> Nice catch.


We also have some redundant code in path::operator/= which can just be
removed, because _List::reserve does it anyway:

  if (orig_type == _Type::_Multi)
{
  const int curcap = _M_cmpts._M_impl->capacity();
  if (capacity > curcap)
capacity = std::max(capacity, (int) (curcap * 1.5));
}



Re: [PATCH 1/4] LoongArch: Handle ISA evolution switches along with other options

2024-01-05 Thread Yang Yujie
On Fri, Jan 05, 2024 at 08:12:08PM +0800, Xi Ruoyao wrote:
> On Fri, 2024-01-05 at 14:55 +0800, Yang Yujie wrote:
> > +#define ISA_HAS_FRECIPE \
> > +  (la_target.isa.evolution & OPTION_MASK_ISA_FRECIPE)
> > +#define ISA_HAS_DIV32 \
> > +  (la_target.isa.evolution & OPTION_MASK_ISA_DIV32)
> > +#define ISA_HAS_LAM_BH \
> > +  (la_target.isa.evolution & OPTION_MASK_ISA_LAM_BH)
> > +#define ISA_HAS_LAMCAS \
> > +  (la_target.isa.evolution & OPTION_MASK_ISA_LAM_BH)
> > +#define ISA_HAS_LD_SEQ_SA \
> > +  (la_target.isa.evolution & OPTION_MASK_ISA_LD_SEQ_SA)
> 
> Should every occurrence of TARGET_DIV32 etc. be replaced with
> ISA_HAS_DIV32 etc. in the code base?  It seems some of them are not
> replaced.

Thanks! I will fix that soon.

Yujie



RE: [PATCH]middle-end: Don't apply copysign optimization if target does not implement optab [PR112468]

2024-01-05 Thread Tamar Christina
> On Fri, 2024-01-05 at 11:02 +, Tamar Christina wrote:
> > Ok, so something like:
> >
> > > > ([istarget loongarch*-*-*] &&
> > > > ([check_effective_target_loongarch_sx] ||
> > > > [check_effective_target_hard_float]))
> > ?
> 
> We don't need "[check_effective_target_loongarch_sx] ||" because SIMD
> requires hard float.
> 

Cool, thanks! 

--

Hi All,

Currently GCC does not treat IFN_COPYSIGN the same as the copysign tree expr:
the latter has a libcall fallback, while the IFN can only use optabs.

Because of this the change I made to optimize copysign only works if the
target has implemented the optab, but it should work for those that have the
libcall too.

More annoyingly, if a target has vector versions of ABS and NEG but not
COPYSIGN, then the change made them lose vectorization.

The proper fix for this is to treat the IFN the same as the tree EXPR and to
enhance expand_COPYSIGN to also support vector calls.

I have such a patch for GCC 15 but it's quite big and too invasive for stage-4.
As such this is a minimal fix, just don't apply the transformation and leave
targets which don't have the optab unoptimized.

The target list for check_effective_target_ifn_copysign was obtained by
grepping for copysign and looking at the optab.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
Tests ran in x86_64-pc-linux-gnu -m32 and tests no longer fail.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

PR tree-optimization/112468
* doc/sourcebuild.texi: Document ifn_copysign.
* match.pd: Only apply transformation if target supports the IFN.

gcc/testsuite/ChangeLog:

PR tree-optimization/112468
* gcc.dg/fold-copysign-1.c: Modify tests based on if target supports
IFN_COPYSIGN.
* gcc.dg/pr55152-2.c: Likewise.
* gcc.dg/tree-ssa/abs-4.c: Likewise.
* gcc.dg/tree-ssa/backprop-6.c: Likewise.
* gcc.dg/tree-ssa/copy-sign-2.c: Likewise.
* gcc.dg/tree-ssa/mult-abs-2.c: Likewise.
* lib/target-supports.exp (check_effective_target_ifn_copysign): New.

--- inline copy of patch ---

diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index 
4be67daedb20d394857c02739389cabf23c0d533..f4847dafe65cbbf8c9de34905f614ef6957658b4
 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -2664,6 +2664,10 @@ Target requires a command line argument to enable a SIMD 
instruction set.
 @item xorsign
 Target supports the xorsign optab expansion.
 
+@item ifn_copysign
+Target supports the IFN_COPYSIGN optab expansion for both scalar and vector
+types.
+
 @end table
 
 @subsubsection Environment attributes
diff --git a/gcc/match.pd b/gcc/match.pd
index 
d57e29bfe1d68afd4df4dda20fecc2405ff05332..87d13e7e3e1aa6d89119142b614890dc4729b521
 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1159,13 +1159,22 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (simplify
   (copysigns @0 REAL_CST@1)
   (if (!REAL_VALUE_NEGATIVE (TREE_REAL_CST (@1)))
-   (abs @0
+   (abs @0)
+#if GIMPLE
+   (if (!direct_internal_fn_supported_p (IFN_COPYSIGN, type,
+OPTIMIZE_FOR_BOTH))
+(negate (abs @0)))
+#endif
+   )))
 
+#if GIMPLE
 /* Transform fneg (fabs (X)) -> copysign (X, -1).  */
 (simplify
  (negate (abs @0))
- (IFN_COPYSIGN @0 { build_minus_one_cst (type); }))
-
+ (if (direct_internal_fn_supported_p (IFN_COPYSIGN, type,
+ OPTIMIZE_FOR_BOTH))
+   (IFN_COPYSIGN @0 { build_minus_one_cst (type); })))
+#endif
 /* copysign(copysign(x, y), z) -> copysign(x, z).  */
 (for copysigns (COPYSIGN_ALL)
  (simplify
diff --git a/gcc/testsuite/gcc.dg/fold-copysign-1.c 
b/gcc/testsuite/gcc.dg/fold-copysign-1.c
index 
f9cafd14ab05f5e8ab2f6f68e62801d21c2df6a6..96b80c733794fffada1b08274ef39cc8f6e442ce
 100644
--- a/gcc/testsuite/gcc.dg/fold-copysign-1.c
+++ b/gcc/testsuite/gcc.dg/fold-copysign-1.c
@@ -1,5 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-O -fdump-tree-cddce1" } */
+/* { dg-additional-options "-msse -mfpmath=sse" { target { { i?86-*-* 
x86_64-*-* } && ilp32 } } } */
 
 double foo (double x)
 {
@@ -12,5 +13,7 @@ double bar (double x)
   return __builtin_copysign (x, minuszero);
 }
 
-/* { dg-final { scan-tree-dump-times "__builtin_copysign" 1 "cddce1" } } */
-/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 1 "cddce1" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_copysign" 1 "cddce1" { target 
ifn_copysign } } } */
+/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 1 "cddce1" { target 
ifn_copysign } } } */
+/* { dg-final { scan-tree-dump-times "= -" 1 "cddce1" { target { ! 
ifn_copysign } } } } */
+/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 2 "cddce1" { target { ! 
ifn_copysign } } } } */
diff --git a/gcc/testsuite/gcc.dg/pr55152-2.c b/gcc/testsuite/gcc.dg/pr55152-2.c
index 
605f202ed6bc7aa8fe921457b02ff0b88cc63ce6..24068cffa4a8e2807ba7d16c4ed3def4f736e797
 100644
--- a/gcc/testsuite/gcc.dg/pr55152-2.c
+++ b/gcc/testsuite/gcc.dg/pr55152-2.c
@@ -

Re: [PATCH] RISC-V: Allow simplification non-vlmax with len = NUNITS reg to reg move

2024-01-05 Thread Robin Dapp
> +/* Return true if this is a whole register-register move.  */
> +bool
> +whole_reg_to_reg_move_p (rtx *ops, machine_mode mode)
> +{
> +  if (register_operand (ops[0], mode)
> +  && register_operand (ops[3], mode)
> +  && satisfies_constraint_vu (ops[2])
> +  && satisfies_constraint_Wc1 (ops[1]))
> +{
> +  int vlmax_index = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL ? 5 : 7;
> +  if (INTVAL (ops[vlmax_index]) == VLMAX)
> + return true;

Is that indent correct?  Looks odd on my screen but I didn't verify.

> +  /* AVL propagation PASS will transform FIXED-VLMAX with NUNITS < 32
> +  into NON-VLMAX with LEN = NUNITS.  */
> +  else if (CONST_INT_P (ops[4])
> +&& known_eq (INTVAL (ops[4]), GET_MODE_NUNITS (mode)))
> + return true;
> +}
> +  return false;
> +}

I would prefer having the vlmax_index as a parameter.  Even though
it's clear that a mask set operation has two operands less I don't
find it particularly intuitive to check that in the function.

Also explain both cases in the function-level comment and mention
the preconditions for calling the function.  Something like:
 "An operation is a whole-register move if either
   (1) Its vlmax operand equals VLMAX
   (2) Its vl operand equals the number of units of its mode."

Maybe some more asserts or checks wouldn't hurt either so the function
can't accidentally be called on other operations than vlde/vste/vimov.
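Concretely, something like this (editor's sketch of the suggested interface;
the names follow the quoted patch):

    /* Return true if OPS describes a whole-register move, i.e. either
       (1) its vlmax operand, OPS[VLMAX_INDEX], equals VLMAX, or
       (2) its vl operand, OPS[4], equals the number of units of MODE.
       Should only be called on vlde/vste/vimov operations.  */
    bool
    whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int vlmax_index)
    {
      gcc_assert (vlmax_index == 5 || vlmax_index == 7);
      ...
    }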

Regards
 Robin



Re: [PATCH] libstdc++: Fix testsuite with -Wformat

2024-01-05 Thread Jonathan Wakely

On 06/12/23 15:34 +0100, Gwenole Beauchesne wrote:

Tested on x86_64-pc-linux-gnu with --enable-languages=c,c++ and
additional -Wformat to CXXFLAGS.


Please CC the libstd...@gcc.gnu.org list on all libstdc++ patches, as
documented at https://gcc.gnu.org/lists.html

Otherwise I won't see the patches unless I happen to glance at the
gcc-patches archive by chance.

The patch seems OK, but what exactly is it fixing? I don't see any
warning when adding -Wformat to the test flags.


-- >8 --

Fix testsuite when compiling with -Wformat. Use nonnull arguments so
that -Wformat does not cause extraneous output to be reported as an
error.

FAIL: tr1/8_c_compatibility/cinttypes/functions.cc (test for excess errors)

libstdc++-v3/ChangeLog:

   * testsuite/tr1/8_c_compatibility/cinttypes/functions.cc: Use
   nonnull arguments to strtoimax() and wcstoimax() functions.

Signed-off-by: Gwenole Beauchesne 
---
.../testsuite/tr1/8_c_compatibility/cinttypes/functions.cc| 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git 
a/libstdc++-v3/testsuite/tr1/8_c_compatibility/cinttypes/functions.cc 
b/libstdc++-v3/testsuite/tr1/8_c_compatibility/cinttypes/functions.cc
index 518ddf49875..21f5263b5cc 100644
--- a/libstdc++-v3/testsuite/tr1/8_c_compatibility/cinttypes/functions.cc
+++ b/libstdc++-v3/testsuite/tr1/8_c_compatibility/cinttypes/functions.cc
@@ -29,10 +29,10 @@ void test01()
#if _GLIBCXX_USE_C99_INTTYPES_TR1

  std::tr1::intmax_t i = 0, numer = 0, denom = 0, base = 0;
-  const char* s = 0;
+  const char* s = "0";
  char** endptr = 0;
#if defined(_GLIBCXX_USE_WCHAR_T) && _GLIBCXX_USE_C99_INTTYPES_WCHAR_T_TR1
-  const wchar_t* ws = 0;
+  const wchar_t* ws = L"0";
  wchar_t** wendptr = 0;
#endif





Re: [PATCH v2 2/2] LoongArch: When the code model is extreme, the symbol address is obtained through macro instructions regardless of the value of -mexplicit-relocs.

2024-01-05 Thread Xi Ruoyao
On Fri, 2024-01-05 at 20:45 +0800, chenglulu wrote:
> 
> On 2024/1/5 at 7:55 PM, Xi Ruoyao wrote:
> > On Fri, 2024-01-05 at 18:25 +0800, Xi Ruoyao wrote:
> > > On Fri, 2024-01-05 at 17:57 +0800, chenglulu wrote:
> > > > On 2024/1/5 at 4:37 PM, Xi Ruoyao wrote:
> > > > > On Fri, 2024-01-05 at 11:40 +0800, Lulu Cheng wrote:
> > > > > >    bool
> > > > > >    loongarch_explicit_relocs_p (enum loongarch_symbol_type type)
> > > > > >    {
> > > > > > +  /* Instructions pcalau12i, addi.d, lu32i.d and lu52i.d must be 
> > > > > > adjacent
> > > > > > + so that the linker can infer the PC of pcalau12i to apply 
> > > > > > relocations
> > > > > > + to lu32i.d and lu52i.d.  Otherwise, the results would be 
> > > > > > incorrect if
> > > > > > + these four instructions are not in the same 4KiB page.
> > > > > > + Therefore, macro instructions are used when cmodel=extreme.  
> > > > > > */
> > > > > > +  if (loongarch_symbol_extreme_p (type))
> > > > > > +    return false;
> > > > > I think this is a bit of strange.  With 
> > > > > -mexplicit-relocs={auto,always}
> > > > > we should still use explicit relocs, but coding all 4 instructions
> > > > > altogether as
> > > > > 
> > > > > "pcalau12i.d\t%1,%pc64_hi12(%2)\n\taddi.d\t%0,$r0,%pclo12(%2)\n\tlu32i.d\t%0,%pc64_lo20(%2)\n\tlu52i.d\t%0,%0,%pc64_hi12(%2)"
> > > > > 
> > > > > Give me several hours trying to implement this...
> > > > > 
> > > > I think there is no difference between macros and these instructions put
> > > > together. If implement it in a split form, I think I can try it through
> > > > TARGET_SCHED_MACRO_FUSION_PAIR_P
> > We don't need to split the insn.  We can just add a "large insn"
> > containing the assembly output we want.
> > 
> > See the attached patch.  Note that TLS LE/LD/GD needs a fix too because
> > they are basically an variation of GOT addressing.
> > 
> > I've ran some small tests and now trying to bootstrap GCC with -
> > mcmodel=extreme in BOOT_CFLAGS...
> > 
> > > There is a difference:
> > > 
> > > int x;
> > > int t() { return x; }
> > > 
> > > pcalau12i t0, %pc_hi20(x)
> > > addi.d t1, r0, %pc_lo12(x)
> > > lu32i.d t1, %pc64_lo20(x)
> > > lu52i.d t1, t1, %pc64_hi12(x)
> > > ldx.w a0, t0, t1
> > > 
> > > is slightly better than
> > > 
> > > pcalau12i.d t0, %pc_hi20(x)
> > > addi.d t1, r0, %pc_lo12(x)
> > > lu32i.d t1, %pc64_lo20(x)
> > > lu52i.d t1, t1, %pc64_hi12(x)
> > > addi.d t0, t0, t1
> > > ld.w a0, t0, 0
> > > 
> > > And generating macros when -mexplicit-relocs=always can puzzle people
> > > (it says "always" :-\ ).
> > > 
> Thumbs up! This method is much better than my method, I learned 
> something! grateful!
> But I still have to test the accuracy.

I found an issue bootstrapping GCC with -mcmodel=extreme in BOOT_CFLAGS:
we need a target hook to tell the generic code
UNSPEC_LA_PCREL_64_PART{1,2} are just a wrapper around symbols, or we'll
see millions of lines of messages like

../../gcc/gcc/tree.h:4171:1: note: non-delegitimized UNSPEC
UNSPEC_LA_PCREL_64_PART1 (42) found in variable location

diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index 4f89c4af323..410e1b5e693 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -10868,6 +10868,24 @@ loongarch_asm_code_end (void)
 #undef DUMP_FEATURE
 }
 
+static rtx loongarch_delegitimize_address (rtx op)
+{
+  if (GET_CODE (op) == UNSPEC)
+  {
+    int unspec = XINT (op, 1);
+    switch (unspec)
+      {
+      case UNSPEC_LA_PCREL_64_PART1:
+      case UNSPEC_LA_PCREL_64_PART2:
+        return XVECEXP (op, 0, 0);
+      default:
+        return op;
+      }
+  }
+
+  return op;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -11129,6 +11147,10 @@ loongarch_asm_code_end (void)
 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
   loongarch_builtin_support_vector_misalignment
 
+#undef TARGET_DELEGITIMIZE_ADDRESS
+#define TARGET_DELEGITIMIZE_ADDRESS \
+  loongarch_delegitimize_address
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-loongarch.h"

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University


[committed] libstdc++: Do not use __is_convertible unconditionally [PR113241]

2024-01-05 Thread Jonathan Wakely
Tested x86_64-linux. Pushed to trunk, backport to gcc-13 needed too.

-- >8 --

The new __is_convertible built-in should only be used after checking
that it's supported.

libstdc++-v3/ChangeLog:

PR libstdc++/113241
* include/std/type_traits (is_convertible_v): Guard use of
built-in with preprocessor check.
---
 libstdc++-v3/include/std/type_traits | 5 +
 1 file changed, 5 insertions(+)

diff --git a/libstdc++-v3/include/std/type_traits b/libstdc++-v3/include/std/type_traits
index a71162b33ec..3b1b419 100644
--- a/libstdc++-v3/include/std/type_traits
+++ b/libstdc++-v3/include/std/type_traits
@@ -3477,8 +3477,13 @@ template 
 #endif
template <typename _Base, typename _Derived>
  inline constexpr bool is_base_of_v = __is_base_of(_Base, _Derived);
+#if _GLIBCXX_USE_BUILTIN_TRAIT(__is_convertible)
template <typename _From, typename _To>
  inline constexpr bool is_convertible_v = __is_convertible(_From, _To);
+#else
+template <typename _From, typename _To>
+  inline constexpr bool is_convertible_v = is_convertible<_From, _To>::value;
+#endif
template <typename _Fn, typename... _Args>
  inline constexpr bool is_invocable_v = is_invocable<_Fn, _Args...>::value;
template <typename _Fn, typename... _Args>
-- 
2.43.0



Re: [middle-end PATCH take #2] Only call targetm.truly_noop_truncation for truncations.

2024-01-05 Thread Richard Sandiford
"Roger Sayle"  writes:
> Very many thanks (and a Happy New Year) to the pre-commit
> patch testing folks at linaro.org.   Their testing has revealed that
> although my patch is clean on x86_64, it triggers some problems
> on aarch64 and arm.  The issue (with the previous version of my
> patch) is that these platforms require a paradoxical subreg to be
> generated by the middle-end, where we were previously checking
> for truly_noop_truncation.
>
> This has been fixed (in revision 2) below.  Where previously I had:
>
> @@ -66,7 +66,9 @@ gen_lowpart_general (machine_mode mode, rtx x)
>scalar_int_mode xmode;
>    if (is_a <scalar_int_mode> (GET_MODE (x), &xmode)
>   && GET_MODE_SIZE (xmode) <= UNITS_PER_WORD
> - && TRULY_NOOP_TRUNCATION_MODES_P (mode, xmode)
> + && (known_lt (GET_MODE_SIZE (mode), GET_MODE_SIZE (xmode))
> + ? TRULY_NOOP_TRUNCATION_MODES_P (mode, xmode)
> + : known_eq (GET_MODE_SIZE (mode), GET_MODE_SIZE (xmode)))
>   && !reload_completed)
> return gen_lowpart_general (mode, force_reg (xmode, x));
>
> the correct change is:
>
>scalar_int_mode xmode;
>    if (is_a <scalar_int_mode> (GET_MODE (x), &xmode)
>   && GET_MODE_SIZE (xmode) <= UNITS_PER_WORD
> - && TRULY_NOOP_TRUNCATION_MODES_P (mode, xmode)
> + && (known_ge (GET_MODE_SIZE (mode), GET_MODE_SIZE (xmode))
> + || TRULY_NOOP_TRUNCATION_MODES_P (mode, xmode))
>   && !reload_completed)
> return gen_lowpart_general (mode, force_reg (xmode, x));
>
> i.e. we only call TRULY_NOOP_TRUNCATION_MODES_P when we
> know we have a truncation, but the behaviour of non-truncations
> is preserved (no longer depends upon unspecified behaviour) and
> gen_lowpart_general is called to create the paradoxical SUBREG.
>
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures.  Ok for mainline?
>
> Hopefully this revision tests cleanly on the linaro.org CI pipeline.
>
> 2023-12-31  Roger Sayle  
>
> gcc/ChangeLog
> * combine.cc (make_extraction): Confirm that OUTPREC is less than
> INPREC before calling TRULY_NOOP_TRUNCATION_MODES_P.
> * expmed.cc (store_bit_field_using_insv): Likewise.
> (extract_bit_field_using_extv): Likewise.
> (extract_bit_field_as_subreg): Likewise.
> * optabs-query.cc (get_best_extraction_insn): Likewise.
> * optabs.cc (expand_parity): Likewise.
> * rtlhooks.cc (gen_lowpart_general): Likewise.
> * simplify-rtx.cc (simplify_truncation): Disallow truncations
> to the same precision.
> (simplify_unary_operation_1) <TRUNCATE>: Move optimization
> of truncations to the same mode earlier.
>
>
>> -Original Message-
>> From: Roger Sayle 
>> Sent: 28 December 2023 15:35
>> To: 'gcc-patches@gcc.gnu.org' 
>> Cc: 'Jeff Law' 
>> Subject: [middle-end PATCH] Only call targetm.truly_noop_truncation for
>> truncations.
>> 
>> 
>> The truly_noop_truncation target hook is documented, in target.def, as
> "true if it
>> is safe to convert a value of inprec bits to one of outprec bits (where
> outprec is
>> smaller than inprec) by merely operating on it as if it had only outprec
> bits", i.e.
>> the middle-end can use a SUBREG instead of a TRUNCATE.
>> 
>> What's perhaps potentially a little ambiguous in the above description is
> whether
>> it is the caller or the callee that's responsible for ensuring or checking
> whether
>> "outprec < inprec".  The name TRULY_NOOP_TRUNCATION_P, like
>> SUBREG_PROMOTED_P, may be prone to being understood as a predicate that
>> confirms that something is a no-op truncation or a promoted subreg, when
> in fact
>> the caller must first confirm this is a truncation/subreg and only then
> call the
>> "classification" macro.
>> 
>> Alas making the following minor tweak (for testing) to the i386 backend:
>> 
>> static bool
>> ix86_truly_noop_truncation (poly_uint64 outprec, poly_uint64 inprec) {
>>   gcc_assert (outprec < inprec);
>>   return true;
>> }
>> 
>> #undef TARGET_TRULY_NOOP_TRUNCATION
>> #define TARGET_TRULY_NOOP_TRUNCATION ix86_truly_noop_truncation
>> 
>> reveals that there are numerous callers in middle-end that rely on the
> default
>> behaviour of silently returning true for any (invalid) input.
>> These are fixed below.
>> 
>> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap and
>> make -k check, both with and without --target_board=unix{-m32} with no new
>> failures.  Ok for mainline?
>> 
>> 
>> 2023-12-28  Roger Sayle  
>> 
>> gcc/ChangeLog
>> * combine.cc (make_extraction): Confirm that OUTPREC is less than
>> INPREC before calling TRULY_NOOP_TRUNCATION_MODES_P.
>> * expmed.cc (store_bit_field_using_insv): Likewise.
>> (extract_bit_field_using_extv): Likewise.
>> (extract_bit_field_as_subreg): Likewise.
>> * optabs-query.cc (get_best_extract

[committed] libstdc++: Avoid overflow when appending to std::filesystem::path

2024-01-05 Thread Jonathan Wakely
Tested x86_64-linux. Pushed to trunk.

-- >8 --

This prevents a std::filesystem::path from exceeding INT_MAX/4
components (which is unlikely to ever be a problem except on 16-bit
targets). That limit ensures that the capacity*1.5 calculation doesn't
overflow. We should also check that we don't exceed SIZE_MAX when
calculating how many bytes to allocate. That only needs to be checked
when int is at least as large as size_t, because otherwise we know that
the product INT_MAX/4 * sizeof(value_type) will fit in SIZE_MAX. For
targets where size_t is twice as wide as int this obviously holds. For
msp430-elf we have 16-bit int and 20-bit size_t, so the condition holds
as long as sizeof(value_type) fits in 7 bits, which it does.

We can also remove some floating-point arithmetic in operator/= which
ensures exponential growth of the buffer. That's redundant because
path::_List::reserve does that anyway (and does so more efficiently
since the commit immediately before this one).
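
As a standalone illustration of the checked computation (a minimal sketch;
checked_bytes, header and elem_size are hypothetical names standing in for
the real _Impl/value_type machinery):

#include <stddef.h>

/* Sketch of the overflow-checked allocation size, assuming newcap has
   already been capped below INT_MAX/4 as described above.  Returns 0
   where the real code would throw std::bad_alloc.  */
static size_t
checked_bytes (int newcap, size_t header, size_t elem_size)
{
  size_t components, bytes;
  if (__builtin_mul_overflow ((size_t) newcap, elem_size, &components)
      || __builtin_add_overflow (header, components, &bytes))
    return 0;
  return bytes;
}

For msp430-elf the unchecked branch applies: INT_MAX/4 is 8191, and 8191
one-byte components plus the header fits easily in a 20-bit size_t.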

libstdc++-v3/ChangeLog:

* src/c++17/fs_path.cc (path::_List::reserve): Limit maximum
size and check for overflows in arithmetic.
(path::operator/=(const path&)): Remove redundant exponential
growth calculation.
---
 libstdc++-v3/src/c++17/fs_path.cc | 35 +--
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/libstdc++-v3/src/c++17/fs_path.cc b/libstdc++-v3/src/c++17/fs_path.cc
index a2d3c54a88a..d33b8d96663 100644
--- a/libstdc++-v3/src/c++17/fs_path.cc
+++ b/libstdc++-v3/src/c++17/fs_path.cc
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include <ext/numeric_traits.h> // __gnu_cxx::__int_traits
 
 namespace fs = std::filesystem;
 using fs::path;
@@ -447,11 +448,30 @@ path::_List::reserve(int newcap, bool exact = false)
 
   if (curcap < newcap)
 {
-  const int nextcap = curcap + curcap / 2;
-  if (!exact && newcap < nextcap)
-   newcap = nextcap;
+  if (!exact)
+   {
+ const int nextcap = curcap + curcap / 2;
+ if (newcap < nextcap)
+   newcap = nextcap;
+   }
 
-  void* p = ::operator new(sizeof(_Impl) + newcap * sizeof(value_type));
+  using __gnu_cxx::__int_traits;
+  // Nobody should need paths with this many components.
+  if (newcap >= __int_traits<int>::__max / 4)
+   std::__throw_bad_alloc();
+
+  size_t bytes;
+  if constexpr (__int_traits<int>::__max >= __int_traits<size_t>::__max)
+   {
+ size_t components;
+ if (__builtin_mul_overflow(newcap, sizeof(value_type), &components)
+   || __builtin_add_overflow(sizeof(_Impl), components, &bytes))
+   std::__throw_bad_alloc();
+   }
+  else // This won't overflow, even for 20-bit size_t on msp430.
+   bytes = sizeof(_Impl) + newcap * sizeof(value_type);
+
+  void* p = ::operator new(bytes);
   std::unique_ptr<_Impl, _Impl_deleter> newptr(::new(p) _Impl{newcap});
   const int cursize = curptr ? curptr->size() : 0;
   if (cursize)
@@ -588,13 +608,6 @@ path::operator/=(const path& __p)
 ++capacity; // Need to insert root-directory after root-name
 #endif
 
-  if (orig_type == _Type::_Multi)
-{
-  const int curcap = _M_cmpts._M_impl->capacity();
-  if (capacity > curcap)
-   capacity = std::max(capacity, (int) (curcap * 1.5));
-}
-
   _M_pathname.reserve(_M_pathname.length() + sep.length()
  + __p._M_pathname.length());
 
-- 
2.43.0



Re: [EXTERNAL] Re: Fw: [RFC] Either fix or disable SME feature for `aarch64-w64-mingw32` target?

2024-01-05 Thread Radek Barton
Hello, Andrew.

Thank you for your input. I've updated the "fixing" patch according to your 
feedback. Please let me know if I understood it correctly.

Radek


From: Andrew Pinski 
Sent: Thursday, January 4, 2024 8:11 PM
To: Radek Barton ; Andrew Pinski (QUIC) 

Cc: gcc-patches@gcc.gnu.org 
Subject: [EXTERNAL] Re: Fw: [RFC] Either fix or disable SME feature for 
`aarch64-w64-mingw32` target?


On Thu, Jan 4, 2024 at 5:51 AM Radek Barton  wrote:
>
> Hello, everyone.
>
>
> Our "Arm64 on Windows Ecosystem" team is currently working on adding 
> aarch64-w64-mingw32 target and we've noticed that recent commit adding SME 
> support (https://gcc.gnu.org/pipermail/gcc-cvs/2023-December/394915.html)
>  is using .hidden and .size pseudo-ops that are not supported by this target 
> yet. We'd like to hear your opinion what would be the most acceptable fix for 
> the community:
>
> Wrap the unsupported pseudo-ops using macros and #ifdef them for the target. 
> The attached 0001-Ifdef-.hidden-and-.size-pseudo-ops-for-aarch64-w64-m.patch 
> is demonstrating this option.
> Move SME related sources to a separate config, t-sme, that won't be included 
> by the aarch64-w64-mingw32 target config. The attached 
> 0001-Exclude-SME-feature-from-libgcc-for-aarch64-w64-ming.patch  by Evgeny 
> Karpov is a proposal of this change.
> Do you have any other proposal?

For the .type issue you should use the following define instead:
```
#ifdef __ELF__
#define TYPE(x) .type x,function
#else
#define TYPE(x)
#endif
```
Which comes directly from config/aarch64/crti.S .
HIDDEN should be handled similarly.
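
For illustration, the analogous wrapper for .hidden might look like this
(a sketch assuming the same __ELF__ guard as the TYPE macro; not the final
patch):

#ifdef __ELF__
#define HIDDEN(x) .hidden x
#else
#define HIDDEN(x)
#endif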

We really should still have SME support for GCC for windows.

Thanks,
Andrew Pinski

>
>
> Best regards,
>
> Radek Bartoň


0001-Ifdef-.hidden-.type-and-.size-pseudo-ops-for-aarch64.patch
Description: 0001-Ifdef-.hidden-.type-and-.size-pseudo-ops-for-aarch64.patch


Re: [committed] RISC-V: Add crypto vector builtin function.

2024-01-05 Thread Jeff Law

On 1/4/24 20:24, Palmer Dabbelt wrote:

On Thu, 04 Jan 2024 19:17:21 PST (-0800), juzhe.zh...@rivai.ai wrote:

Hi, Wang Feng.

Your patch has some ICEs:
FAIL: gcc.target/riscv/rvv/base/zvbc-intrinsic.c (internal compiler 
error: RTL check: expected code 'const_int', have 'reg' in 
vlmax_avl_type_p, at config/riscv/riscv-v.cc:4930)

FAIL: gcc.target/riscv/rvv/base/zvbc-intrinsic.c (test for excess errors)
FAIL: gcc.target/riscv/rvv/base/zvbc_vx_constraint-1.c (internal 
compiler error: RTL check: expected code 'const_int', have 'reg' in 
vlmax_avl_type_p, at config/riscv/riscv-v.cc:4930)
FAIL: gcc.target/riscv/rvv/base/zvbc_vx_constraint-1.c (test for 
excess errors)
FAIL: gcc.target/riscv/rvv/base/zvbc_vx_constraint-2.c (internal 
compiler error: RTL check: expected code 'const_int', have 'reg' in 
vlmax_avl_type_p, at config/riscv/riscv-v.cc:4930)
FAIL: gcc.target/riscv/rvv/base/zvbc_vx_constraint-2.c (test for 
excess errors)


So let's just revert it, it doesn't even look like it was reviewed. 
We've set a really bad precedent here where we're just merging a bunch 
of unreviewed code and sorting out the regressions in trunk, that's not 
the right way to do things.


I suspect you didn't enable rtl check in the regression:

../../configure --enable-checking=rtl.
Plz enable rtl check in the regression tests.
We haven't ever required folks to test with RTL checking enabled due to 
its compile-time cost.  So I don't think Feng did anything wrong here.


IIRC, Jakub's standard practice over in the x86 world is to do a 
bootstrap and regression test with RTL checking enabled in the spring as 
we get closer to the release to weed out these kinds of things that can 
slip through.


Clearly there's a bug and we should fix it, but it's not a sign that 
anything has gone terribly wrong.


jeff


Re: [RFA] [V3] new pass for sign/zero extension elimination

2024-01-05 Thread Jeff Law

On 1/4/24 13:44, Xi Ruoyao wrote:

I have successfully bootstrapped and regtested the patch on loongarch64-
linux-gnu.  The test cases in the patch (intended for RISC-V) also works
on LoongArch per my manual testing.
I find myself wondering if we should create some kind of target-supports 
test and make the tests generic.  Worst case we could just opt-in each 
target where they are expected to work in that target-supports test.

jeff


[pushed] aarch64: Extend VECT_COMPARE_COSTS to !SVE [PR113104]

2024-01-05 Thread Richard Sandiford
When SVE is enabled, we try vectorising with multiple different SVE and
Advanced SIMD approaches and use the cost model to pick the best one.
Until now, we've not done that for Advanced SIMD, since "the first mode
that works should always be the best".

The testcase is a counterexample.  Each iteration of the scalar loop
vectorises naturally with 64-bit input vectors and 128-bit output
vectors.  We do try that for SVE, and choose it as the best approach.
But the first approach we try is instead to use:

- a vectorisation factor of 2
- 1 128-bit vector for the inputs
- 2 128-bit vectors for the outputs

But since the stride is variable, the cost of marshalling the input
vector from two iterations outweighs the benefit of doing two iterations
at once.
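
A loop of roughly this shape triggers it (a hypothetical reduction, not the
actual pr113104.c testcase): each inner iteration widens a 64-bit vector of
chars to a 128-bit vector of shorts, but the variable stride between outer
iterations makes marshalling two iterations' inputs into one 128-bit vector
expensive:

void
f (short *dst, unsigned char *src, long stride, int n)
{
  for (int i = 0; i < n; i++)
    for (int j = 0; j < 8; j++)
      dst[i * 8 + j] = src[i * stride + j];  /* stride-separated inputs */
}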

This patch therefore generalises aarch64-sve-compare-costs to
aarch64-vect-compare-costs and applies it to non-SVE compilations.

Pushed after testing on aarch64-linux-gnu and aarch64_be-elf.

Richard


gcc/
PR target/113104
* doc/invoke.texi (aarch64-sve-compare-costs): Replace with...
(aarch64-vect-compare-costs): ...this.
* config/aarch64/aarch64.opt (-param=aarch64-sve-compare-costs=):
Replace with...
(-param=aarch64-vect-compare-costs=): ...this new param.
* config/aarch64/aarch64.cc (aarch64_override_options_internal):
Don't disable it when vectorizing for Advanced SIMD only.
(aarch64_autovectorize_vector_modes): Apply VECT_COMPARE_COSTS
whenever aarch64_vect_compare_costs is true.

gcc/testsuite/
PR target/113104
* gcc.target/aarch64/pr113104.c: New test.
* gcc.target/aarch64/sve/cond_arith_1.c: Update for new parameter
names.
* gcc.target/aarch64/sve/cond_arith_1_run.c: Likewise.
* gcc.target/aarch64/sve/cond_arith_3.c: Likewise.
* gcc.target/aarch64/sve/cond_arith_3_run.c: Likewise.
* gcc.target/aarch64/sve/gather_load_6.c: Likewise.
* gcc.target/aarch64/sve/gather_load_7.c: Likewise.
* gcc.target/aarch64/sve/load_const_offset_2.c: Likewise.
* gcc.target/aarch64/sve/load_const_offset_3.c: Likewise.
* gcc.target/aarch64/sve/mask_gather_load_6.c: Likewise.
* gcc.target/aarch64/sve/mask_gather_load_7.c: Likewise.
* gcc.target/aarch64/sve/mask_load_slp_1.c: Likewise.
* gcc.target/aarch64/sve/mask_struct_load_1.c: Likewise.
* gcc.target/aarch64/sve/mask_struct_load_2.c: Likewise.
* gcc.target/aarch64/sve/mask_struct_load_3.c: Likewise.
* gcc.target/aarch64/sve/mask_struct_load_4.c: Likewise.
* gcc.target/aarch64/sve/mask_struct_store_1.c: Likewise.
* gcc.target/aarch64/sve/mask_struct_store_1_run.c: Likewise.
* gcc.target/aarch64/sve/mask_struct_store_2.c: Likewise.
* gcc.target/aarch64/sve/mask_struct_store_2_run.c: Likewise.
* gcc.target/aarch64/sve/pack_1.c: Likewise.
* gcc.target/aarch64/sve/reduc_4.c: Likewise.
* gcc.target/aarch64/sve/scatter_store_6.c: Likewise.
* gcc.target/aarch64/sve/scatter_store_7.c: Likewise.
* gcc.target/aarch64/sve/strided_load_3.c: Likewise.
* gcc.target/aarch64/sve/strided_store_3.c: Likewise.
* gcc.target/aarch64/sve/unpack_fcvt_signed_1.c: Likewise.
* gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c: Likewise.
* gcc.target/aarch64/sve/unpack_signed_1.c: Likewise.
* gcc.target/aarch64/sve/unpack_unsigned_1.c: Likewise.
* gcc.target/aarch64/sve/unpack_unsigned_1_run.c: Likewise.
* gcc.target/aarch64/sve/vcond_11.c: Likewise.
* gcc.target/aarch64/sve/vcond_11_run.c: Likewise.
---
 gcc/config/aarch64/aarch64.cc | 13 +-
 gcc/config/aarch64/aarch64.opt|  7 +++---
 gcc/doc/invoke.texi   | 25 +--
 gcc/testsuite/gcc.target/aarch64/pr113104.c   | 25 +++
 .../gcc.target/aarch64/sve/cond_arith_1.c |  2 +-
 .../gcc.target/aarch64/sve/cond_arith_1_run.c |  2 +-
 .../gcc.target/aarch64/sve/cond_arith_3.c |  2 +-
 .../gcc.target/aarch64/sve/cond_arith_3_run.c |  2 +-
 .../gcc.target/aarch64/sve/gather_load_6.c|  2 +-
 .../gcc.target/aarch64/sve/gather_load_7.c|  2 +-
 .../aarch64/sve/load_const_offset_2.c |  2 +-
 .../aarch64/sve/load_const_offset_3.c |  2 +-
 .../aarch64/sve/mask_gather_load_6.c  |  2 +-
 .../aarch64/sve/mask_gather_load_7.c  |  2 +-
 .../gcc.target/aarch64/sve/mask_load_slp_1.c  |  2 +-
 .../aarch64/sve/mask_struct_load_1.c  |  2 +-
 .../aarch64/sve/mask_struct_load_2.c  |  2 +-
 .../aarch64/sve/mask_struct_load_3.c  |  2 +-
 .../aarch64/sve/mask_struct_load_4.c  |  2 +-
 .../aarch64/sve/mask_struct_store_1.c |  2 +-
 .../aarch64/sve/mask_struct_store_1_run.c |  2 +-
 .../aarch64/sve/mask_struct_store_2.c |  2 +-
 .../aarch64/sve/mask_struct_store_2_run.c |  2 +-
 gcc/t

[PATCH] Keep track of the FUNCTION_BEG note

2024-01-05 Thread Richard Sandiford
function.cc emits a NOTE_FUNCTION_BEG after all arguments have
been copied to pseudos.  It then records this note in parm_birth_insn.
Various other pieces of code use this insn as a convenient place to
insert things at the start of the function.

However, cfgexpand later changes parm_birth_insn as follows:

  /* If we emitted any instructions for setting up the variables,
 emit them before the FUNCTION_START note.  */
  if (var_seq)
{
  emit_insn_before (var_seq, parm_birth_insn);

  /* In expand_function_end we'll insert the alloca save/restore
 before parm_birth_insn.  We've just insertted an alloca call.
 Adjust the pointer to match.  */
  parm_birth_insn = var_seq;
}

But the FUNCTION_BEG note is still useful for things that aren't
sensitive to stack allocation, and it has the advantage that
(unlike the var_seq above) it is never deleted or combined.
This patch adds a separate variable to track it.
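
For illustration, a pass that wants a stack-allocation-insensitive
insertion point could then write (hypothetical usage, not part of this
patch):

  /* Emit SEQ just after the FUNCTION_BEG note; unlike parm_birth_insn,
     this anchor is never deleted or combined.  */
  emit_insn_after (seq, function_beg_insn);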

Tested on aarch64-linux-gnu, where it's needed for fixing PR113196.
OK to install?

Richard


gcc/
* emit-rtl.h (rtl_data::x_function_beg_note): New member variable.
(function_beg_insn): New macro.
* function.cc (expand_function_start): Initialize function_beg_insn.
---
 gcc/emit-rtl.h  | 4 
 gcc/function.cc | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/gcc/emit-rtl.h b/gcc/emit-rtl.h
index f749ca9f2a0..34f44cb2990 100644
--- a/gcc/emit-rtl.h
+++ b/gcc/emit-rtl.h
@@ -141,6 +141,9 @@ struct GTY(()) rtl_data {
  If stack grows up, this is the address for the next slot.  */
   poly_int64 x_frame_offset;
 
+  /* The function's FUNCTION_BEG note.  */
+  rtx_insn *x_function_beg_insn;
+
   /* Insn after which register parms and SAVE_EXPRs are born, if nonopt.  */
   rtx_insn *x_parm_birth_insn;
 
@@ -323,6 +326,7 @@ struct GTY(()) rtl_data {
 #define return_label (crtl->x_return_label)
 #define naked_return_label (crtl->x_naked_return_label)
 #define stack_slot_list (crtl->x_stack_slot_list)
+#define function_beg_insn (crtl->x_function_beg_insn)
 #define parm_birth_insn (crtl->x_parm_birth_insn)
 #define frame_offset (crtl->x_frame_offset)
 #define stack_check_probe_note (crtl->x_stack_check_probe_note)
diff --git a/gcc/function.cc b/gcc/function.cc
index de356f7fba3..5ffd438475e 100644
--- a/gcc/function.cc
+++ b/gcc/function.cc
@@ -5202,7 +5202,7 @@ expand_function_start (tree subr)
 
   gcc_assert (NOTE_P (get_last_insn ()));
 
-  parm_birth_insn = get_last_insn ();
+  function_beg_insn = parm_birth_insn = get_last_insn ();
 
   /* If the function receives a non-local goto, then store the
  bits we need to restore the frame pointer.  */
-- 
2.25.1



[PATCH] aarch64: Rework uxtl->zip optimisation [PR113196]

2024-01-05 Thread Richard Sandiford
g:f26f92b534f9 implemented unsigned extensions using ZIPs rather than
UXTL{,2}, since the former has a higher throughput than the latter on
many cores.  The optimisation worked by lowering directly to ZIP during
expand, so that the zero input could be hoisted and shared.

However, changing to ZIP means that zero extensions no longer benefit
from some existing combine patterns.  The patch included new patterns
for UADDW and USUBW, but the PR shows that other patterns were affected
as well.

This patch instead introduces the ZIPs during a pre-reload split
and forcibly hoists the zero move to the outermost scope.  This has
the disadvantage of executing the move even for a shrink-wrapped
function, which I suppose could be a problem if it causes a kernel
to trap and enable Advanced SIMD unnecessarily.  In other circumstances,
an unused move shouldn't affect things much.

Also, the RA should be able to rematerialise the move at an
appropriate point if necessary, such as if there is an intervening
call.  uxtl-combine-13.c contains a test for this.

The patch then tries to ensure that the post-RA late-combine pass
can recombine zeros and ZIPs back into UXTLs if there wasn't
sufficient use of the zero to make it worthwhile.  The cut-off
used by the patch is that 1 UXTL is better than 1 MOVI + 1 ZIP,
but that 1 MOVI + 2 ZIPs are better than 2 UXTLs (assuming all
instructions have equal execution frequency).  Any other uses of the
shared zero would count in its favour too; it's not limited to ZIPs.
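
In instruction terms the cut-off compares sequences like these (illustrative
Advanced SIMD assembly, assuming a 16-byte input in v0):

	// 2 UXTLs: two instructions, nothing shared
	uxtl	v1.8h, v0.8b
	uxtl2	v2.8h, v0.16b

	// MOVI + 2 ZIPs: three instructions, but the zero is shareable
	movi	v31.16b, #0
	zip1	v1.16b, v0.16b, v31.16b
	zip2	v2.16b, v0.16b, v31.16b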

In order to do that, the patch relaxes the ZIP patterns so that
the inputs can have any mode.  This allows the V4SI zero to be
propagated into any kind of ZIP, rather than just V4SI ones.
I think that's logically consistent, since it's the mode of
the unspec that ultimately determines the mode of the operation.
(And we don't need to be overly defensive about which modes are
acceptable, since ZIPs are only generated by code that knows/ought
to know what it's doing.)

Also, the original optimisation contained a big-endian correction
that I don't think is needed/correct.  Even on big-endian targets,
we want the ZIP to take the low half of an element from the input
vector and the high half from the zero vector.  And the patterns
map directly to the underlying Advanced SIMD instructions: the use
of unspecs means that there's no need to adjust for the difference
between GCC and Arm lane numbering.

Tested on aarch64-linux-gnu and aarch64_be-elf (fixing some execution
failures for the latter).  The patch depends on the late-combine pass
and on the FUNCTION_BEG patch that I just posted.  I'll commit once
those are in, if there are no objections.

Richard


gcc/
PR target/113196
* config/aarch64/aarch64.h (machine_function::advsimd_zero_insn):
New member variable.
* config/aarch64/iterators.md (Vnarrowq2): New mode attribute.
* config/aarch64/predicates.md (aarch64_any_register_operand):
Accept subregs too.
* config/aarch64/aarch64-simd.md
(aarch64_): Change the
input operand predicates to aarch64_any_register_operand.
(vec_unpacku_hi_, vec_unpacks_hi_): Recombine into...
(vec_unpack_hi_): ...this.  Move the generation of
zip2 for zero-extends to...
(aarch64_simd_vec_unpack_hi_): ...a split of this
instruction.  Fix big-endian handling.
(*aarch64_zip2_uxtl2): New pattern.
(vec_unpacku_lo_, vec_unpacks_lo_): Recombine into...
(vec_unpack_lo_): ...this.  Move the generation of
zip1 for zero-extends to...
(2): ...a split of this instruction.
Fix big-endian handling.
(*aarch64_zip1_uxtl): New pattern.
(aarch64_usubw_lo_zip, aarch64_uaddw_lo_zip): Delete
(aarch64_usubw_hi_zip, aarch64_uaddw_hi_zip): Likewise.
* config/aarch64/aarch64.cc (aarch64_rtx_costs): Recognize ZIP1s
and ZIP2s that can be implemented using UXTL{,2}.  Make them
half an instruction more expensive than a normal zip.
(aarch64_get_shareable_reg): New function.
(aarch64_gen_shareable_zero): Use it.

gcc/testsuite/
PR target/113196
* gcc.target/aarch64/pr103350-1.c: Disable split1.
* gcc.target/aarch64/pr103350-2.c: Likewise.
* gcc.target/aarch64/simd/vmovl_high_1.c: Remove double include.
Expect uxtl2 rather than zip2.
* gcc.target/aarch64/vect_mixed_sizes_8.c: Expect zip1 rather
than uxtl.
* gcc.target/aarch64/vect_mixed_sizes_9.c: Likewise.
* gcc.target/aarch64/vect_mixed_sizes_10.c: Likewise.
* gcc.target/aarch64/uxtl-combine-7.c: New test.
* gcc.target/aarch64/uxtl-combine-8.c: Likewise.
* gcc.target/aarch64/uxtl-combine-9.c: Likewise.
* gcc.target/aarch64/uxtl-combine-10.c: Likewise.
* gcc.target/aarch64/uxtl-combine-11.c: Likewise.
* gcc.target/aarch64/uxtl-combine-12.c: Likewise.
* gcc.target/aarch64/uxtl-combine-13.c

[PATCH] c++: address of NTTP object as targ [PR113242]

2024-01-05 Thread Patrick Palka
Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK
for trunk and perhaps 13?

-- >8 --

invalid_tparm_referent_p was rejecting using the address of a class NTTP
object as a template argument, but this should be fine.

PR c++/113242

gcc/cp/ChangeLog:

* pt.cc (invalid_tparm_referent_p) <VAR_DECL>: Suppress
DECL_ARTIFICIAL rejection test for class NTTP objects.

gcc/testsuite/ChangeLog:

* g++.dg/cpp2a/nontype-class61.C: New test.
---
 gcc/cp/pt.cc |  3 ++-
 gcc/testsuite/g++.dg/cpp2a/nontype-class61.C | 27 
 2 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp2a/nontype-class61.C

diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index 154ac76cb65..8c7d178328d 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -7219,7 +7219,8 @@ invalid_tparm_referent_p (tree type, tree expr, 
tsubst_flags_t complain)
   * a string literal (5.13.5),
   * the result of a typeid expression (8.2.8), or
   * a predefined __func__ variable (11.4.1).  */
-   else if (VAR_P (decl) && DECL_ARTIFICIAL (decl))
+   else if (VAR_P (decl) && !DECL_NTTP_OBJECT_P (decl)
+&& DECL_ARTIFICIAL (decl))
  {
if (complain & tf_error)
  error ("the address of %qD is not a valid template argument",
diff --git a/gcc/testsuite/g++.dg/cpp2a/nontype-class61.C 
b/gcc/testsuite/g++.dg/cpp2a/nontype-class61.C
new file mode 100644
index 000..90805a05ecf
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp2a/nontype-class61.C
@@ -0,0 +1,27 @@
+// PR c++/113242
+// { dg-do compile { target c++20 } }
+
+struct wrapper {
+  int n;
+};
+
+template<wrapper X>
+void f1() {
+  static_assert(X.n == 42);
+}
+
+template<const wrapper* X>
+void f2() {
+  static_assert(X->n == 42);
+}
+
+template<wrapper X>
+void g() {
+  f1<X>();
+  f2<&X>();
+}
+
+int main() {
+  constexpr wrapper X = {42};
+  g<X>();
+}
-- 
2.43.0.254.ga26002b628



[PATCH] libgccjit: Add support for setting the comment ident

2024-01-05 Thread Antoni Boucher
Hi.
This patch adds support for setting the comment ident (analogous to
#ident "comment" in C).
Thanks for the review.
From 1af4e77540001cce8c30e86040c1da785e435810 Mon Sep 17 00:00:00 2001
From: Antoni Boucher 
Date: Fri, 27 Oct 2023 17:36:03 -0400
Subject: [PATCH] libgccjit: Add support for setting the comment ident

gcc/jit/ChangeLog:

	* docs/topics/compatibility.rst (LIBGCCJIT_ABI_26): New ABI tag.
	* docs/topics/contexts.rst: Document gcc_jit_context_set_output_ident.
	* jit-playback.cc (set_output_ident): New method.
	* jit-playback.h (set_output_ident): New method.
	* jit-recording.cc (recording::context::set_output_ident,
	recording::output_ident::output_ident,
	recording::output_ident::~output_ident,
	recording::output_ident::replay_into,
	recording::output_ident::make_debug_string,
	recording::output_ident::write_reproducer): New methods.
	* jit-recording.h (class output_ident): New class.
	* libgccjit.cc (gcc_jit_context_set_output_ident): New function.
	* libgccjit.h (gcc_jit_context_set_output_ident): New function.
	* libgccjit.map: New function.

gcc/testsuite/ChangeLog:

	* jit.dg/all-non-failing-tests.h: New test.
	* jit.dg/test-output-ident.c: New test.
---
 gcc/jit/docs/topics/compatibility.rst|  7 +++
 gcc/jit/docs/topics/contexts.rst | 22 
 gcc/jit/jit-playback.cc  |  7 +++
 gcc/jit/jit-playback.h   |  3 ++
 gcc/jit/jit-recording.cc | 53 
 gcc/jit/jit-recording.h  | 22 
 gcc/jit/libgccjit.cc | 15 ++
 gcc/jit/libgccjit.h  |  6 +++
 gcc/jit/libgccjit.map|  5 ++
 gcc/testsuite/jit.dg/all-non-failing-tests.h |  3 ++
 gcc/testsuite/jit.dg/test-output-ident.c | 23 +
 11 files changed, 166 insertions(+)
 create mode 100644 gcc/testsuite/jit.dg/test-output-ident.c

diff --git a/gcc/jit/docs/topics/compatibility.rst b/gcc/jit/docs/topics/compatibility.rst
index ebede440ee4..c4de996506b 100644
--- a/gcc/jit/docs/topics/compatibility.rst
+++ b/gcc/jit/docs/topics/compatibility.rst
@@ -378,3 +378,10 @@ alignment of a variable:
 
 ``LIBGCCJIT_ABI_25`` covers the addition of
 :func:`gcc_jit_type_get_restrict`
+
+.. _LIBGCCJIT_ABI_26:
+
+``LIBGCCJIT_ABI_26``
+
+``LIBGCCJIT_ABI_26`` covers the addition of
+:func:`gcc_jit_context_set_output_ident`
diff --git a/gcc/jit/docs/topics/contexts.rst b/gcc/jit/docs/topics/contexts.rst
index b22eb2aa983..c51cf5a82ea 100644
--- a/gcc/jit/docs/topics/contexts.rst
+++ b/gcc/jit/docs/topics/contexts.rst
@@ -599,3 +599,25 @@ Additional command-line options
.. code-block:: c
 
   #ifdef LIBGCCJIT_HAVE_gcc_jit_context_add_driver_option
+
+Output options
+**
+
+.. function:: void gcc_jit_context_set_output_ident (gcc_jit_context *ctxt,\
+ const char* output_ident)
+
+   Set the identifier to write in the .comment section of the output file to
+   ``output_ident``. Analogous to:
+
+   .. code-block:: c
+
+  #ident "My comment"
+
+   in C.
+
+   This entrypoint was added in :ref:`LIBGCCJIT_ABI_26`; you can test for
+   its presence using
+
+   .. code-block:: c
+
+  #ifdef LIBGCCJIT_HAVE_gcc_jit_context_set_output_ident
diff --git a/gcc/jit/jit-playback.cc b/gcc/jit/jit-playback.cc
index 537f3b1..243a9fdf972 100644
--- a/gcc/jit/jit-playback.cc
+++ b/gcc/jit/jit-playback.cc
@@ -319,6 +319,13 @@ get_type (enum gcc_jit_types type_)
   return new type (type_node);
 }
 
+void
+playback::context::
+set_output_ident (const char* ident)
+{
+  targetm.asm_out.output_ident (ident);
+}
+
 /* Construct a playback::type instance (wrapping a tree) for the given
array type.  */
 
diff --git a/gcc/jit/jit-playback.h b/gcc/jit/jit-playback.h
index b0166f8f6ce..c6ed15517e9 100644
--- a/gcc/jit/jit-playback.h
+++ b/gcc/jit/jit-playback.h
@@ -66,6 +66,9 @@ public:
   type *
   get_type (enum gcc_jit_types type);
 
+  void
+  set_output_ident (const char* ident);
+
   type *
   new_array_type (location *loc,
 		  type *element_type,
diff --git a/gcc/jit/jit-recording.cc b/gcc/jit/jit-recording.cc
index 9b5b8005ebe..d86616f45ef 100644
--- a/gcc/jit/jit-recording.cc
+++ b/gcc/jit/jit-recording.cc
@@ -1346,6 +1346,13 @@ recording::context::set_str_option (enum gcc_jit_str_option opt,
   log_str_option (opt);
 }
 
+void
+recording::context::set_output_ident (const char *ident)
+{
+  recording::output_ident *memento = new output_ident (this, ident);
+  record (memento);
+}
+
 /* Set the given integer option for this context, or add an error if
it's not recognized.
 
@@ -2185,6 +2192,52 @@ recording::string::write_reproducer (reproducer &)
   /* Empty.  */
 }
 
+/* The implementation of class gcc::jit::recording::output_ident.  */
+
+/* Constructor for gcc::jit::recording::output_ident, allocating a
+   copy of the given text using new char[].  */

Re: [PATCH] Add a late-combine pass [PR106594]

2024-01-05 Thread Richard Sandiford
Jeff Law  writes:
> On 10/24/23 12:49, Richard Sandiford wrote:
>> This patch adds a combine pass that runs late in the pipeline.
>> There are two instances: one between combine and split1, and one
>> after postreload.
> So have you done any investigation on cases caught by your new pass 
> between combine and split1 to characterize them?  In particular do they 
> point at solvable problems in combine?  Or do you forsee this subsuming 
> the old combiner pass at some point in the distant future?

Examples like the PR are the main motivation for the pre-RA pass.
There we had an extension that could be combined into an address,
but no longer was after GCC 13.

The PR itself could in principle be fixed in combine (various
approaches were suggested, but not accepted).  But the same problem
applies to multiple uses of extensions.  fwprop can't handle it because
individual propagations are not a win in isolation.  And combine has
a limit of combining 4 insns (with a maximum of 2 output insns, IIRC).
So I don't think either of the existing passes scale to the general case.

FWIW, an example of this is gcc.c-torture/compile/pr33133.c:

@@ -20,7 +20,7 @@
bne .L3
mov x3, 0
 .L7:
-   mov w8, w3
+   mov w9, w3
add x1, x5, x3
ldrb w6, [x1, 256]
ubfx x1, x6, 1, 7
@@ -28,15 +28,14 @@
ldrb w1, [x7, x1]
lsr w1, w1, 4
 .L5:
-   asr w6, w8, 1
-   sxtw x9, w6
-   ldrb w6, [x2, w6, sxtw]
+   asr w8, w9, 1
+   ldrb w6, [x2, w8, sxtw]
sxtb w1, w1
-   tbz x8, 0, .L6
+   tbz x9, 0, .L6
sbfiz   w1, w1, 4, 4
 .L6:
orr w1, w6, w1
-   strb w1, [x2, x9]
+   strb w1, [x2, w8, sxtw]
add x3, x3, 1
cmp x3, 16
bne .L7

I hope that the new pass could be extended to do new things in future,
especially if they're things that would make sense in both a pre-RA
and post-RA pass.  And there's some benefit to having a cleanish slate,
especially without the baggage of the expand_compound_operation/
make_compound_operation wrangler.  But I don't think we can realistically
expect to replace old combine, or even that it's something worth aiming for.

> rth and I sketched out an SSA based RTL combine at some point in the 
> deep past.  The key goal we were trying to achieve was combining across 
> blocks.  We didn't have a functioning RTL SSA form at the time, so it 
> never went to any implementation work.  It looks like yours would solve 
> the class of problems rth and I were considering.

Yeah, I do see some cases where combining across blocks helps.
The case above is one example of that.  Another is:

@@ -8,9 +8,8 @@
 .LFB0:
.cfi_startproc
ldr w1, [x0, 4]
-   cmp w1, 0
cbz w1, .L3
-   blt .L4
+   tbnz w1, #31, .L4
ldr w2, [x0]
ldr w0, [x0, 8]
add w1, w1, w2

And, as a similar example for multiple uses:

@@ -144,8 +144,7 @@
neg x21, x21
mov w26, 255
 .L18:
-   cmp x19, 0
-   bge .L19
+   tbz x19, #63, .L19
mvn w26, w26
and w26, w26, 255
neg x19, x19
@@ -160,7 +159,7 @@
mov w26, 0
b   .L18
 .L19:
-   bne .L20
+   cbnz x19, .L20
mov x19, 9223372036854775807
b   .L21

>> The patch therefore enables the pass by default only on AArch64.
>> However, I did test the patch with it enabled on x86_64-linux-gnu
>> as well, which was useful for debugging.
>> 
>> Bootstrapped & regression-tested on aarch64-linux-gnu and
>> x86_64-linux-gnu (as posted, with no regressions, and with the
>> pass enabled by default, with some gcc.target/i386 regressions).
>> OK to install?
> I'm going to adjust this slightly so that it's enabled across the board 
> and throw it into the tester tomorrow (tester is busy tonight).  Even if 
> we make it opt-in on a per-port basis, the alternate target testing does 
> seems to find stuff that needs fixing ;-)

Thanks!  As per our off-list discussion, the cris-elf failures showed
up a bug in the handling of call arguments.  Here's an updated version
with that fixed.

Unfortunately (also as per the off-list discussion), it seems like some
ports have latent assumptions that certain combinations don't happen
between RA and the first post-RA split.  They should be easy enough
to iron out in most cases, although as mentioned, the bad interaction
with the x86 passes seems harder to fix.

> > +  // Don't substitute into a non-local goto, since it can then be
> > +  // treated as a jump to local label, e.g. in shorten_branches.
> > +  // ??? But this shouldn't be necessary.
> > +  if (use_insn->is_jump ()
> > + && find_reg_note (use_insn->rtl (), REG_NON_LOCAL_GOTO, NULL_RTX))
> > +   return false;
> Agreed that this shouldn't be necessary.  In fact, if you can subst

[PATCH v2 0/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2024-01-05 Thread Andre Vieira
Hi,

Resending series version 2 addressing comments on the first version; I also
moved parts of the first patch to the second so that the first can be built
without the second patch.

Andre Vieira (2):
  arm: Add define_attr to create a mapping between MVE predicated and
unpredicated insns
  arm: Add support for MVE Tail-Predicated Low Overhead Loops


-- 
2.17.1


[PATCH v2 1/2] arm: Add define_attr to create a mapping between MVE predicated and unpredicated insns

2024-01-05 Thread Andre Vieira
Respin of first version to address comments and make it buildable on its own.

diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index a9c2752c0ea..f0b01b7461f 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -2375,6 +2375,21 @@ extern int making_const_table;
   else if (TARGET_THUMB1)\
 thumb1_final_prescan_insn (INSN)
 
+/* These defines are useful to refer to the value of the mve_unpredicated_insn
+   insn attribute.  Note that, because these use the get_attr_* function, these
+   will change recog_data if (INSN) isn't current_insn.  */
+#define MVE_VPT_PREDICABLE_INSN_P(INSN)	\
+  (recog_memoized (INSN) >= 0		\
+   && get_attr_mve_unpredicated_insn (INSN) != CODE_FOR_nothing)
+
+#define MVE_VPT_PREDICATED_INSN_P(INSN)	\
+  (MVE_VPT_PREDICABLE_INSN_P (INSN)	\
+   && recog_memoized (INSN) != get_attr_mve_unpredicated_insn (INSN))
+
+#define MVE_VPT_UNPREDICATED_INSN_P(INSN)\
+  (MVE_VPT_PREDICABLE_INSN_P (INSN)	\
+   && recog_memoized (INSN) == get_attr_mve_unpredicated_insn (INSN))
+
 #define ARM_SIGN_EXTEND(x)  ((HOST_WIDE_INT)			\
   (HOST_BITS_PER_WIDE_INT <= 32 ? (unsigned HOST_WIDE_INT) (x)	\
: unsigned HOST_WIDE_INT)(x)) & (unsigned HOST_WIDE_INT) 0x) |\
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 07eaf06cdea..296212be33f 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -124,6 +124,12 @@ (define_attr "fpu" "none,vfp"
 ; and not all ARM insns do.
 (define_attr "predicated" "yes,no" (const_string "no"))
 
+; An attribute that encodes the CODE_FOR_ of the MVE VPT unpredicated
+; version of a VPT-predicated instruction.  For unpredicated instructions
+; that are predicable, encode the same pattern's CODE_FOR_ as a way to
+; encode that it is a predicable instruction.
+(define_attr "mve_unpredicated_insn" "" (symbol_ref "CODE_FOR_nothing"))
+
 ; LENGTH of an instruction (in bytes)
 (define_attr "length" ""
   (const_int 4))
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index a9803538101..5ea2d9e8668 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -2305,6 +2305,7 @@ (define_int_attr simd32_op [(UNSPEC_QADD8 "qadd8") (UNSPEC_QSUB8 "qsub8")
 
 (define_int_attr mmla_sfx [(UNSPEC_MATMUL_S "s8") (UNSPEC_MATMUL_U "u8")
 			   (UNSPEC_MATMUL_US "s8")])
+
 ;;MVE int attribute.
 (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
 		   (VREV16Q_U "u") (VMVNQ_N_S "s") (VMVNQ_N_U "u")
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index b0d3443da9c..b1862d7977e 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -17,7 +17,7 @@
 ;; along with GCC; see the file COPYING3.  If not see
 ;; .
 
-(define_insn "*mve_mov"
+(define_insn "mve_mov"
   [(set (match_operand:MVE_types 0 "nonimmediate_operand" "=w,w,r,w   , w,   r,Ux,w")
 	(match_operand:MVE_types 1 "general_operand"  " w,r,w,DnDm,UxUi,r,w, Ul"))]
   "TARGET_HAVE_MVE || TARGET_HAVE_MVE_FLOAT"
@@ -81,18 +81,27 @@ (define_insn "*mve_mov"
   return "";
 }
 }
-  [(set_attr "type" "mve_move,mve_move,mve_move,mve_move,mve_load,multiple,mve_store,mve_load")
+   [(set_attr_alternative "mve_unpredicated_insn" [(symbol_ref "CODE_FOR_mve_mov")
+		   (symbol_ref "CODE_FOR_nothing")
+		   (symbol_ref "CODE_FOR_nothing")
+		   (symbol_ref "CODE_FOR_mve_mov")
+		   (symbol_ref "CODE_FOR_mve_mov")
+		   (symbol_ref "CODE_FOR_nothing")
+		   (symbol_ref "CODE_FOR_mve_mov")
+		   (symbol_ref "CODE_FOR_nothing")])
+   (set_attr "type" "mve_move,mve_move,mve_move,mve_move,mve_load,multiple,mve_store,mve_load")
(set_attr "length" "4,8,8,4,4,8,4,8")
(set_attr "thumb2_pool_range" "*,*,*,*,1018,*,*,*")
(set_attr "neg_pool_range" "*,*,*,*,996,*,*,*")])
 
-(define_insn "*mve_vdup"
+(define_insn "mve_vdup"
   [(set (match_operand:MVE_vecs 0 "s_register_operand" "=w")
 	(vec_duplicate:MVE_vecs
 	  (match_operand: 1 "s_register_operand" "r")))]
   "TARGET_HAVE_MVE || TARGET_HAVE_MVE_FLOAT"
   "vdup.\t%q0, %1"
-  [(set_attr "length" "4")
+ [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vdup"))
+  (set_attr "length" "4")
(set_attr "type" "mve_move")])
 
 ;;
@@ -145,7 +154,8 @@ (define_insn "@mve_q_f"
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
   ".f%#\t%q0, %q1"
-  [(set_attr "type" "mve_move")
+ [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_q_f"))
+  (set_attr "type" "mve_move")
 ])
 
 ;;
@@ -159,7 +169,8 @@ (define_insn "@mve_q_f"
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
   ".%#\t%q0, %q1"
-  [(set_attr "type" "mve_move")
+ [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_q_f"))
+  (set_attr "type" "mve_move")
 ])
 
 ;;
@@ -173,7 +184,8 @@ (define_insn "mve_vq_f"
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
   "v.f%#\t%q0, %q1"
-  [(set_attr "type" "mve_move")
+ [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve

[PATCH v2 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2024-01-05 Thread Andre Vieira
Respin after comments on first version.
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 2f5ca79ed8d..4f164c54740 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -65,8 +65,8 @@ extern void arm_emit_speculation_barrier_function (void);
 extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *);
 extern bool arm_q_bit_access (void);
 extern bool arm_ge_bits_access (void);
-extern bool arm_target_insn_ok_for_lob (rtx);
-
+extern bool arm_target_bb_ok_for_lob (basic_block);
+extern rtx arm_attempt_dlstp_transform (rtx);
 #ifdef RTX_CODE
 enum reg_class
 arm_mode_base_reg_class (machine_mode);
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 0c0cb14a8a4..1ee72bcb7ec 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -668,6 +668,12 @@ static const scoped_attribute_specs *const arm_attribute_table[] =
 #undef TARGET_HAVE_CONDITIONAL_EXECUTION
 #define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution
 
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST arm_loop_unroll_adjust
+
+#undef TARGET_PREDICT_DOLOOP_P
+#define TARGET_PREDICT_DOLOOP_P arm_predict_doloop_p
+
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P arm_legitimate_constant_p
 
@@ -34483,19 +34489,1147 @@ arm_invalid_within_doloop (const rtx_insn *insn)
 }
 
 bool
-arm_target_insn_ok_for_lob (rtx insn)
+arm_target_bb_ok_for_lob (basic_block bb)
 {
-  basic_block bb = BLOCK_FOR_INSN (insn);
   /* Make sure the basic block of the target insn is a simple latch
  having as single predecessor and successor the body of the loop
  itself.  Only simple loops with a single basic block as body are
  supported for 'low over head loop' making sure that LE target is
  above LE itself in the generated code.  */
-
   return single_succ_p (bb)
-&& single_pred_p (bb)
-&& single_succ_edge (bb)->dest == single_pred_edge (bb)->src
-&& contains_no_active_insn_p (bb);
+	 && single_pred_p (bb)
+	 && single_succ_edge (bb)->dest == single_pred_edge (bb)->src;
+}
+
+/* Utility fuction: Given a VCTP or a VCTP_M insn, return the number of MVE
+   lanes based on the machine mode being used.  */
+
+static int
+arm_mve_get_vctp_lanes (rtx_insn *insn)
+{
+  rtx insn_set = single_set (insn);
+  if (insn_set
+  && GET_CODE (SET_SRC (insn_set)) == UNSPEC
+  && (XINT (SET_SRC (insn_set), 1) == VCTP
+	  || XINT (SET_SRC (insn_set), 1) == VCTP_M))
+{
+  machine_mode mode = GET_MODE (SET_SRC (insn_set));
+  return (VECTOR_MODE_P (mode) && VALID_MVE_PRED_MODE (mode))
+	 ? GET_MODE_NUNITS (mode) : 0;
+}
+  return 0;
+}
+
+/* Check if INSN requires the use of the VPR reg, if it does, return the
+   sub-rtx of the VPR reg.  The TYPE argument controls whether
+   this function should:
+   * For TYPE == 0, check all operands, including the OUT operands,
+ and return the first occurrence of the VPR reg.
+   * For TYPE == 1, only check the input operands.
+   * For TYPE == 2, only check the output operands.
+   (INOUT operands are considered both as input and output operands)
+*/
+static rtx
+arm_get_required_vpr_reg (rtx_insn *insn, unsigned int type = 0)
+{
+  gcc_assert (type < 3);
+  if (!NONJUMP_INSN_P (insn))
+return NULL_RTX;
+
+  bool requires_vpr;
+  extract_constrain_insn (insn);
+  int n_operands = recog_data.n_operands;
+  if (recog_data.n_alternatives == 0)
+return NULL_RTX;
+
+  /* Fill in recog_op_alt with information about the constraints of
+ this insn.  */
+  preprocess_constraints (insn);
+
+  for (int op = 0; op < n_operands; op++)
+{
+  requires_vpr = true;
+  if (type == 1 && recog_data.operand_type[op] == OP_OUT)
+	continue;
+  else if (type == 2 && recog_data.operand_type[op] == OP_IN)
+	continue;
+
+  /* Iterate through alternatives of operand "op" in recog_op_alt and
+	 identify if the operand is required to be the VPR.  */
+  for (int alt = 0; alt < recog_data.n_alternatives; alt++)
+	{
+	  const operand_alternative *op_alt
+	  = &recog_op_alt[alt * n_operands];
+	  /* Fetch the reg_class for each entry and check it against the
+	 VPR_REG reg_class.  */
+	  if (alternative_class (op_alt, op) != VPR_REG)
+	requires_vpr = false;
+	}
+  /* If all alternatives of the insn require the VPR reg for this operand,
+	 it means that either this is VPR-generating instruction, like a vctp,
+	 vcmp, etc., or it is a VPT-predicated insruction.  Return the subrtx
+	 of the VPR reg operand.  */
+  if (requires_vpr)
+	return recog_data.operand[op];
+}
+  return NULL_RTX;
+}
+
+/* Wrapper function of arm_get_required_vpr_reg with TYPE == 1, so return
+   something only if the VPR reg is an input operand to the insn.  */
+
+static rtx
+arm_get_required_vpr_reg_param (rtx_insn *insn)
+{
+  return arm_get_required_vpr_reg (insn, 1);
+}
+
+/* Wrapper function of arm_get_required_vpr_reg with TYPE == 2, so return
+  

[patch] omp_target_is_accessible (was: [patch] libgomp.texi: Document omp_display_env)

2024-01-05 Thread Tobias Burnus

Hi all,

updated patch attached - which also fixes some additional issues and
adds omp_target_is_accessible.


On 03.01.24 23:35, Sandra Loosemore wrote:

On 1/3/24 11:31, Tobias Burnus wrote: [...]
I'm not sure about the usability issues, except I think it's generally
easier to change an undocumented interface.  :-)


I have now removed some details, which are not really needed and can be
gathered from the example. That makes it easier to change things in the
future and makes it more readable.

BTW: I kept the ' -- '; if we want to change this to '---', it should be
done globally for the whole file as the latter isn't used at all and the
former all the time.


+Routine to display the OpenMP number and the initial value of ICVs.


I'm not sure what an "OpenMP number" is.  Below it says "GCC version
number" and "OpenMP version"


OpenMP number is used in the OpenMP spec but it does not really make
sense. "GCC version number" is bogus as it should be OpenMP's.

There are actually two version numbers of the OpenMP spec, the "real"
one everyone uses like 4.5 and an integer one like 201511 denoting the
year and month (= November as Supercomputing Conf is in November). The
latter value permits '>=' comparisons and is available both as
preprocessor macro (_OPENMP) and as Fortran named constant (openmp_version).
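
For example, user code can test the integer form directly (illustrative C):

  #if _OPENMP >= 201511  /* at least OpenMP 4.5 */
    /* ... use 4.5 features ... */
  #endif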

I am not sure how to distinguish 'version' from 'version' in words, but
'OpenMP version number' should be okayish for the 20yymm number.

(I used 4.5 & 201511 above as that's the highest OpenMP version for
which GCC claims full support; I hope that we can bump it to 5.0 or 5.1
during the GCC 15 development. For 5.0, only some existing features have
to be merged - most prominently missing are metadirectives but also a
few smaller features that do exist on the OG13 branch)



I'd rephrase this a bit:

...is printed on @code{stderr}.  The displayed values are those at
startup
after evaluating the environment variables; later calls to API routines
or clauses used in enclosing constructs do not affect the output.


Applied. Thanks for the suggestion!


This is the documentation for the GCC implementation, so it's not
necessary to say "in GCC"


Based on the real-world use of more comprehensive documentation, I think
it makes sense to distinguish between spec-based behavior and
implementation choice. However, I removed some level of less important
details, avoiding 'in GCC'.

Tobias
libgomp.texi: Document omp_display_env + omp_target_is_accessible

Additionally, it fixes a typo and changes the OpenMP 5.2 section
references (18.8.x) to OpenMP 5.1 ones (3.8.x) matching the mentioned
OpenMP number.

libgomp/ChangeLog:

	* libgomp.texi (OpenMP Technical Report 12): Fix a typo.
	(Device Memory Routines): Fix OpenMP 5.1 spec refs; add
	omp_target_is_accessible.
	(Environment Display Routine): Uncomment and add
	omp_display_env description.
	(OMP_DISPLAY_ENV): Update wording, add 'see also'.

 libgomp/libgomp.texi | 169 +++
 1 file changed, 145 insertions(+), 24 deletions(-)

diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index c727850397d..30f69ee412b 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -501,7 +501,7 @@ Technical Report (TR) 12 is the second preview for OpenMP 6.0.
   modifiers of the @code{init} clause
   @tab N @tab
 @item @code{interop} clause to @code{dispatch} @tab N @tab
-@item @code{message} and @code{severity} calauses to @code{parallel} directive
+@item @code{message} and @code{severity} clauses to @code{parallel} directive
   @tab N @tab
 @item @code{self} clause to @code{requires} directive @tab N @tab
 @item @code{no_openmp_constructs} assumptions clause @tab N @tab
@@ -570,7 +570,7 @@ specification in version 5.2.
 @c * Interoperability Routines::
 * Memory Management Routines::
 @c * Tool Control Routine::
-@c * Environment Display Routine::
+* Environment Display Routine::
 @end menu
 
 
@@ -1719,7 +1719,7 @@ pointers on devices. They have C linkage and do not throw exceptions.
 * omp_target_alloc:: Allocate device memory
 * omp_target_free:: Free device memory
 * omp_target_is_present:: Check whether storage is mapped
-@c * omp_target_is_accessible:: 
+* omp_target_is_accessible:: Check whether memory is device accessible
 @c * omp_target_memcpy:: 
 @c * omp_target_memcpy_rect:: 
 @c * omp_target_memcpy_async:: 
@@ -1768,7 +1768,7 @@ is not supported.
 @ref{omp_target_free}, @ref{omp_target_associate_ptr}
 
 @item @emph{Reference}:
-@uref{https://www.openmp.org, OpenMP specification v5.1}, Section 18.8.1
+@uref{https://www.openmp.org, OpenMP specification v5.1}, Section 3.8.1
 @end table
 
 
@@ -1802,7 +1802,7 @@ is not supported.
 @ref{omp_targe

Re: [PATCH] libgccjit: Add support for setting the comment ident

2024-01-05 Thread David Malcolm
On Fri, 2024-01-05 at 12:09 -0500, Antoni Boucher wrote:
> Hi.
> This patch adds support for setting the comment ident (analogous to
> #ident "comment" in C).
> Thanks for the review.

Thanks for the patch.

This may sound like a silly question, but what does #ident do and what
is it used for?

FWIW I found this in our documentation:
  https://gcc.gnu.org/onlinedocs/cpp/Other-Directives.html

[...snip...]

> +Output options
> +**
> +
> +.. function:: void gcc_jit_context_set_output_ident (gcc_jit_context *ctxt,\
> + const char* output_ident)
> +
> +   Set the identifier to write in the .comment section of the output file to
> +   ``output_ident``. Analogous to:

...but only on some targets, according to the link above.  Maybe add
that link here?

> +
> +   .. code-block:: c
> +
> +  #ident "My comment"
> +
> +   in C.
> +
> +   This entrypoint was added in :ref:`LIBGCCJIT_ABI_26`; you can test for
> +   its presence using

Can the param "output_ident" be NULL?  It isn't checked for NULL in the
patch's implementation of gcc_jit_context_set_output_ident, and
recording::output_ident's constructor does check for it being NULL...

> +
> +   .. code-block:: c
> +
> +  #ifdef LIBGCCJIT_HAVE_gcc_jit_context_set_output_ident

> diff --git a/gcc/jit/jit-playback.cc b/gcc/jit/jit-playback.cc
> index 537f3b1..243a9fdf972 100644
> --- a/gcc/jit/jit-playback.cc
> +++ b/gcc/jit/jit-playback.cc
> @@ -319,6 +319,13 @@ get_type (enum gcc_jit_types type_)
>return new type (type_node);
>  }
>  
> +void
> +playback::context::
> +set_output_ident (const char* ident)
> +{
> +  targetm.asm_out.output_ident (ident);
> +}
> +

...but looking at varasm.cc's default_asm_output_ident_directive it
looks like the param must be non-NULL.

So this should either be conditionalized here to:

  if (ident)
    targetm.asm_out.output_ident (ident);

or else we should ensure that "ident" is non-NULL at the API boundary
and document this.

My guess is that it doesn't make sense to have a NULL ident, so we
should go with the latter approach.
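
Something like this sketch at the API boundary would do (the usual
entrypoint checks; the exact macro arguments may need adjusting):

  void
  gcc_jit_context_set_output_ident (gcc_jit_context *ctxt,
				    const char *output_ident)
  {
    RETURN_IF_FAIL (ctxt, NULL, NULL, "NULL context");
    JIT_LOG_FUNC (ctxt->get_logger ());
    RETURN_IF_FAIL (output_ident, ctxt, NULL, "NULL output_ident");

    ctxt->set_output_ident (output_ident);
  }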

Can you have more than one #ident directive?  Presumably each one just
adds another line to the generated asm, right?
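
E.g. (on an ELF target) I'd expect

  #ident "first"
  #ident "second"

to emit two directives into the .comment section:

  .ident "first"
  .ident "second"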

[...snip...]

> @@ -2185,6 +2192,52 @@ recording::string::write_reproducer (reproducer &)
>/* Empty.  */
>  }
>  
> +/* The implementation of class gcc::jit::recording::output_ident.  */
> +
> +/* Constructor for gcc::jit::recording::output_ident, allocating a
> +   copy of the given text using new char[].  */
> +
> +recording::output_ident::output_ident (context *ctxt, const char *ident)
> +: memento (ctxt)
> +{
> +  m_ident = ident ? xstrdup (ident) : NULL;
> +}
> +
> +/* Destructor for gcc::jit::recording::output_ident.  */
> +
> +recording::output_ident::~output_ident ()
> +{
> +  delete[] m_ident;

m_ident is allocated above using xstrdup, so it must be cleaned up with
"free"; I don't think it's safe to use "delete[]" here.

[...snip...]

> +/* Implementation of recording::memento::write_reproducer for output_ident.  
> */
> +
> +void
> +recording::output_ident::write_reproducer (reproducer &r)
> +{
> +  r.write ("  gcc_jit_context_set_output_ident (%s, \"%s\");",
> +r.get_identifier (get_context ()),
> +m_ident);

It isn't safe on all implementations to use %s with m_ident being NULL.

[...snip...]

Thanks again for the patch; hope this is constructive
Dave



[PATCH] c++: reference variable as default targ [PR101463]

2024-01-05 Thread Patrick Palka
Bootstrapped and regtested on x86_64-pc-linux-gnu, does this
look OK for trunk?

-- >8 --

Here during default template argument substitution we wrongly consider
the (substituted) default arguments v and vt as value-dependent[1],
which ultimately leads to deduction failure for the calls.

The bogus value_dependent_expression_p result aside, I noticed
type_unification_real during default targ substitution keeps track of
whether all previous targs are known and non-dependent, as is the case
for these calls.  And in such cases it should be safe to avoid checking
dependence of the substituted default targ and just assume it's not.
This patch implements this optimization, which lets us accept both
testcases by sidestepping the value_dependent_expression_p issue
altogether.
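
For reference, the shape of the code at issue is roughly the following
(an illustrative reduction, not the actual testcase):

  int x;
  int& v = x;            // reference variable

  template<int& R = v>   // used as a default template argument
  void f () { }

  int main () { f (); }  // deduction for this call bogusly failed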

[1]: The reason we consider these reference variables value-dependent is
due to a workaround in value_dependent_expression_p:

  case VAR_DECL:
...
else if (TYPE_REF_P (TREE_TYPE (expression)))
  /* FIXME cp_finish_decl doesn't fold reference initializers.  */
  return true;
...

added by r5-5022-g51d72abe5ea04e.  I'm not sure if this workaround
is needed anymore, but naively removing it seems safe as far as
bootstrap+regtest is concerned (the only change is that we issue more
-Wmissing-braces warnings ahead of time in cpp0x/initlist123.C), and
lets us accept the first testcase.

Unfortunately we still reject the second testcase (in which v and vt are
additionally constexpr) for the same reason (bogus value dependence) due
to the subsequent check in v_d_e_p:

  ...
  /* We have a constexpr variable and we're processing a template.  When
 there's lifetime extension involved (for which finish_compound_literal
 used to create a temporary), we'll not be able to evaluate the
 variable until instantiating, so pretend it's value-dependent.  */
  else if (DECL_DECLARED_CONSTEXPR_P (expression)
   && !TREE_CONSTANT (expression))
return true;

And TREE_CONSTANT isn't set for v and vt because of a workaround in
cp_finish_decl:

  if (decl_maybe_constant_var_p (decl)
  /* FIXME setting TREE_CONSTANT on refs breaks the back end.  */
  && !TYPE_REF_P (type))
TREE_CONSTANT (decl) = true;

Naively removing this workaround lets us accept the second testcase, but
it re-introduces an ICE in g++.dg/opt/pr78373.C.

PR c++/101463

gcc/cp/ChangeLog:

* pt.cc (type_unification_real): Avoid checking dependence of
a substituted default template argument if we can assume it's
non-dependent.

gcc/testsuite/ChangeLog:

* g++.dg/cpp1z/nontype6.C: New test.
* g++.dg/cpp1z/nontype6a.C: New test.
---
 gcc/cp/pt.cc   |  9 +++--
 gcc/testsuite/g++.dg/cpp1z/nontype6.C  | 24 
 gcc/testsuite/g++.dg/cpp1z/nontype6a.C | 25 +
 3 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp1z/nontype6.C
 create mode 100644 gcc/testsuite/g++.dg/cpp1z/nontype6a.C

diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index 7208c721b0b..b801ce1f18c 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -23304,6 +23304,7 @@ type_unification_real (tree tparms,
 might be instantiation-dependent like access (87480).  */
  processing_template_decl_sentinel s (!any_dependent_targs);
  tree substed = NULL_TREE;
+ tristate dependent_p = tristate::unknown ();
  if (saw_undeduced == 1 && !any_dependent_targs)
{
  /* First instatiate in template context, in case we still
@@ -23312,8 +23313,9 @@ type_unification_real (tree tparms,
  substed = tsubst_template_arg (arg, full_targs, complain,
 NULL_TREE);
  --processing_template_decl;
+ dependent_p = uses_template_parms (substed);
  if (substed != error_mark_node
- && !uses_template_parms (substed))
+ && dependent_p.is_false ())
/* We replaced all the tparms, substitute again out of
   template context.  */
substed = NULL_TREE;
@@ -23321,8 +23323,11 @@ type_unification_real (tree tparms,
  if (!substed)
substed = tsubst_template_arg (arg, full_targs, complain,
   NULL_TREE);
+ if (dependent_p.is_unknown ())
+   dependent_p = (processing_template_decl
+  && uses_template_parms (substed));
 
- if (!uses_template_parms (substed))
+ if (dependent_p.is_false ())
arg = convert_template_argument (parm, substed, full_targs,
 complain, i, NULL_TREE);
  else if (saw_undeduced == 1)
diff --

[PATCH] Fortran: bogus warnings with REPEAT intrinsic and -Wconversion-extra [PR96724]

2024-01-05 Thread Harald Anlauf
Dear all,

the attached patch picks up a submission by Jose that was never reviewed:

  https://gcc.gnu.org/pipermail/fortran/2020-August/054902.html

The original patch was unnecessarily complex, as it could invoke
more conversions than necessary.  I chose to only convert to the
essential - and common - gfc_charlen_int_kind.  It's almost trivial
now.

Regtested on x86_64-pc-linux-gnu.  OK for mainline?

Thanks,
Harald

From 18f212aaca8a13fbd2f40cc7636b1a20185cc01e Mon Sep 17 00:00:00 2001
From: Harald Anlauf 
Date: Fri, 5 Jan 2024 22:24:25 +0100
Subject: [PATCH] Fortran: bogus warnings with REPEAT intrinsic and
 -Wconversion-extra [PR96724]
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

gcc/fortran/ChangeLog:

	PR fortran/96724
	* iresolve.cc (gfc_resolve_repeat): Force conversion of NCOPIES to
	gfc_charlen_int_kind before call to gfc_multiply.

gcc/testsuite/ChangeLog:

	PR fortran/96724
	* gfortran.dg/repeat_8.f90: New test.

Co-authored-by: José Rui Faustino de Sousa 
---
 gcc/fortran/iresolve.cc|   7 +-
 gcc/testsuite/gfortran.dg/repeat_8.f90 | 123 +
 2 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gfortran.dg/repeat_8.f90

diff --git a/gcc/fortran/iresolve.cc b/gcc/fortran/iresolve.cc
index 5bb0bbc477b..f25a5ba3aef 100644
--- a/gcc/fortran/iresolve.cc
+++ b/gcc/fortran/iresolve.cc
@@ -2363,7 +2363,12 @@ gfc_resolve_repeat (gfc_expr *f, gfc_expr *string,
 }
 
   if (tmp)
-f->ts.u.cl->length = gfc_multiply (tmp, gfc_copy_expr (ncopies));
+{
+  /* Force-convert ncopies to gfc_charlen_int_kind.  */
+  gfc_expr *e = gfc_copy_expr (ncopies);
+  gfc_convert_type_warn (e, &tmp->ts, 2, 0);
+  f->ts.u.cl->length = gfc_multiply (tmp, e);
+}
 }
 
 
diff --git a/gcc/testsuite/gfortran.dg/repeat_8.f90 b/gcc/testsuite/gfortran.dg/repeat_8.f90
new file mode 100644
index 000..9dd379ac96b
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/repeat_8.f90
@@ -0,0 +1,123 @@
+! { dg-do compile }
+! { dg-additional-options "-Wconversion-extra" }
+!
+! Test fix for PR fortran/96724
+!
+! Contributed by José Rui Faustino de Sousa 
+
+program repeat_p
+  use, intrinsic :: iso_fortran_env, only: int8, int16, int32, int64
+  implicit none
+
+  integer, parameter :: n = 20
+  integer, parameter :: ucs4 = selected_char_kind ('ISO_10646')
+
+  integer(kind=int8),  parameter :: p08 = int(n, kind=int8)
+  integer(kind=int16), parameter :: p16 = int(n, kind=int16)
+  integer(kind=int16), parameter :: p32 = int(n, kind=int32)
+  integer(kind=int16), parameter :: p64 = int(n, kind=int64)
+
+  integer(kind=int8)  :: i08
+  integer(kind=int16) :: i16
+  integer(kind=int32) :: i32
+  integer(kind=int64) :: i64
+
+  character(len=n,kind=1):: c
+  character(len=n,kind=ucs4) :: d
+
+  i08 = p08
+  c = repeat('X', 20_int8)
+  c = repeat('X', i08)
+  c = repeat('X', p08)
+  c = repeat('X', len08(c))
+  d = repeat(ucs4_'X', 20_int8)
+  d = repeat(ucs4_'X', i08)
+  d = repeat(ucs4_'X', p08)
+  d = repeat(ucs4_'X', len08(c))
+  i16 = p16
+  c = repeat('X', 20_int16)
+  c = repeat('X', i16)
+  c = repeat('X', p16)
+  c = repeat('X', len16(c))
+  d = repeat(ucs4_'X', 20_int16)
+  d = repeat(ucs4_'X', i16)
+  d = repeat(ucs4_'X', p16)
+  d = repeat(ucs4_'X', len16(c))
+  i32 = p32
+  c = repeat('X', 20_int32)
+  c = repeat('X', i32)
+  c = repeat('X', p32)
+  c = repeat('X', len32(c))
+  d = repeat(ucs4_'X', 20_int32)
+  d = repeat(ucs4_'X', i32)
+  d = repeat(ucs4_'X', p32)
+  d = repeat(ucs4_'X', len32(c))
+  i64 = p64
+  c = repeat('X', 20_int64)
+  c = repeat('X', i64)
+  c = repeat('X', p64)
+  c = repeat('X', len64(c))
+  d = repeat(ucs4_'X', 20_int64)
+  d = repeat(ucs4_'X', i64)
+  d = repeat(ucs4_'X', p64)
+  d = repeat(ucs4_'X', len64(c))
+
+contains
+
+  function len08(x) result(l)
+character(len=*), intent(in) :: x
+integer(kind=int8) :: l
+
+l = int(len(x), kind=int8)
+  end function len08
+
+  function len16(x) result(l)
+character(len=*), intent(in) :: x
+integer(kind=int16) :: l
+
+l = int(len(x), kind=int16)
+  end function len16
+
+  function len32(x) result(l)
+character(len=*), intent(in) :: x
+integer(kind=int32) :: l
+
+l = int(len(x), kind=int32)
+  end function len32
+
+  function len64(x) result(l)
+character(len=*), intent(in) :: x
+integer(kind=int64) :: l
+
+l = int(len(x), kind=int64)
+  end function len64
+
+  function ulen08(x) result(l)
+character(len=*,kind=ucs4), intent(in) :: x
+integer(kind=int8) :: l
+
+l = int(len(x), kind=int8)
+  end function ulen08
+
+  function ulen16(x) result(l)
+character(len=*,kind=ucs4), intent(in) :: x
+integer(kind=int16) :: l
+
+l = int(len(x), kind=int16)
+  end function ulen16
+
+  function ulen32(x) result(l)
+character(len=*,kind=ucs4), intent(in) :: x
+integer(kind=int32) :: l
+
+l = int(len(x), kind=int32)
+  end function ulen32
+
+  funct

[PATCH] PR target/112886, Add %S to print_operand for vector pair support

2024-01-05 Thread Michael Meissner
In looking at support for load vector pair and store vector pair for the
PowerPC in GCC, I noticed that we were missing a print_operand output
modifier to print the 2nd register of a vector pair.

If the instruction inside of the asm used the Altivec encoding, then we could
use the %L modifier:

__vector_pair *p, *q, *r;
// ...
__asm__ ("vaddudm %0,%1,%2\n\tvaddudm %L0,%L1,%L2"
 : "=v" (*p)
 : "v" (*q), "v" (*r));

Likewise if we know the value to be in a traditional FPR register, %L will
work for instructions that use the VSX encoding:

__vector_pair *p, *q, *r;
// ...
__asm__ ("xvadddp %x0,%x1,%x2\n\txvadddp %L0,%L1,%L2"
 : "=f" (*p)
 : "f" (*q), "f" (*r));

But if we have a value that is in a traditional Altivec register, and the
instruction uses the VSX encoding, %L will give a value between 0 and 31, when
it should give a value between 32 and 63.

This patch adds %S that acts like %x, except that it adds 1 to the
register number.
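
With %S, the Altivec register case above can then be written as (this is
the same form the new test below uses):

__vector_pair *p, *q, *r;
// ...
__asm__ ("xvadddp %x0,%x1,%x2\n\txvadddp %S0,%S1,%S2"
	 : "=v" (*p)
	 : "v" (*q), "v" (*r));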

I have tested this on power10 and power9 little endian systems and on a power9
big endian system.  There were no regressions in the patch.  Can I apply it to
the trunk?

It would be nice if I could apply it to the open branches.  Can I backport it
after a burn-in period?

2024-01-04  Michael Meissner  

gcc/

PR target/112886
* config/rs6000/rs6000.cc (print_operand): Add %S output modifier.
* doc/md.texi (Modifiers): Mention %S can be used like %x.

gcc/testsuite/

PR target/112886
* gcc.target/powerpc/pr112886.c: New test.
---
 gcc/config/rs6000/rs6000.cc | 10 +++---
 gcc/doc/md.texi |  5 +++--
 gcc/testsuite/gcc.target/powerpc/pr112886.c | 19 +++
 3 files changed, 29 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr112886.c

diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 5a7e00b03d1..ba89377c9ec 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -14504,13 +14504,17 @@ print_operand (FILE *file, rtx x, int code)
print_operand (file, x, 0);
   return;
 
+case 'S':
 case 'x':
-  /* X is a FPR or Altivec register used in a VSX context.  */
+  /* X is a FPR or Altivec register used in a VSX context.  %x prints
+the VSX register number, %S prints the 2nd register number for
+vector pair, decimal 128-bit floating and IBM 128-bit binary floating
+values.  */
   if (!REG_P (x) || !VSX_REGNO_P (REGNO (x)))
-   output_operand_lossage ("invalid %%x value");
+   output_operand_lossage ("invalid %%%c value", (code == 'S' ? 'S' : 
'x'));
   else
{
- int reg = REGNO (x);
+ int reg = REGNO (x) + (code == 'S' ? 1 : 0);
  int vsx_reg = (FP_REGNO_P (reg)
 ? reg - 32
 : reg - FIRST_ALTIVEC_REGNO + 32);
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 47a87d6ceec..53ec957cb23 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -3386,8 +3386,9 @@ A VSX register (VSR), @code{vs0}@dots{}@code{vs63}.  This 
is either an
 FPR (@code{vs0}@dots{}@code{vs31} are @code{f0}@dots{}@code{f31}) or a VR
 (@code{vs32}@dots{}@code{vs63} are @code{v0}@dots{}@code{v31}).
 
-When using @code{wa}, you should use the @code{%x} output modifier, so that
-the correct register number is printed.  For example:
+When using @code{wa}, you should use either the @code{%x} or @code{%S}
+output modifier, so that the correct register number is printed.  For
+example:
 
 @smallexample
 asm ("xvadddp %x0,%x1,%x2"
diff --git a/gcc/testsuite/gcc.target/powerpc/pr112886.c 
b/gcc/testsuite/gcc.target/powerpc/pr112886.c
new file mode 100644
index 000..07196bdc220
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr112886.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* PR target/112886: Test that print_operand %S gives the correct register
+   number for VSX registers (i.e. if the register is an Altivec register, the
+   register number is 32..63 instead of 0..31).  */
+
+void
+test (__vector_pair *p, __vector_pair *q, __vector_pair *r)
+{
+  __asm__ ("xvadddp %x0,%x1,%x2\n\txvadddp %S0,%S1,%S2"
+  : "=v" (*p)
+  : "v" (*q), "v" (*r));
+}
+
+/* { dg-final { scan-assembler-times {\mxvadddp 
(3[2-9]|[45][0-9]|6[0-3]),(3[2-9]|[45][0-9]|6[0-3]),(3[2-9]|[45][0-9]|6[0-3])\M}
 2 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 1 } } */
-- 
2.43.0


-- 
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: meiss...@linux.ibm.com


Repost [PATCH 0/6] PowerPC Future patches

2024-01-05 Thread Michael Meissner
I posted these patches on October 18th, 2023, and I never received any feedback
on the changes.  What changes do I need to make with these patches to get them
into GCC 14?

This patch is very preliminary support for a potential new feature for the
PowerPC that extends the current power10 MMA architecture.  This feature may or
may not be present in any specific future PowerPC processor.

In the current MMA subsystem for Power10, there are 8 512-bit accumulator
registers.  These accumulators are each tied to sets of 4 FPR registers.  When
you issue a prime instruction, it makes sure the accumulator is a copy of the 4
FPR registers the accumulator is tied to.  When you issue a deprime
instruction, it makes sure that the accumulator data content is logically
copied to the matching FPR register.

In the potential dense math system, the accumulators are moved to separate
registers called dense math registers (DM registers or DMR).  The DMRs are then
extended to 1,024 bits and new instructions will be added to deal with all
1,024 bits of the DMRs.

If you take existing MMA code, it will work as long as you don't do anything
with accumulators, and you follow the rules in the ISA 3.1 documentation for
using the MMA subsystem.

These patches add support for the 512-bit accumulators within the dense math
system, and for allocation of the 1,024-bit DMRs.  At this time, no additional
built-in functions will be done to support any dense math features other than
doing data movement between the DMRs and the VSX registers.  Before we can look
at adding any new dense math support other than data movement, we need the GCC
compiler to be able to allocate and use these DMRs.

There are 6 patches in this patch set:

1) The first patch just adds -mcpu=future as an option to add new support.
This is similar to the -mcpu=future that we did before power10 was announced.

2) The second patch enables GCC to use the load and store vector pair
instructions to optimize memory copy operations in the compiler.  For power10,
we needed to just stay with normal vector load/stores for memory copy
operations.

3) The third patch enables 512-bit accumulators that are located within DMRs
instead of the FPRs.  This patch enables the register allocation, but it does
not move the existing MMA to use these registers.

4) The fourth patch switches the MMA subsystem to use 512-bit accumulators
within DMRs if you use -mcpu=future.

5) The fifth patch switches the names of the MMA instructions to use the dense
math equivalent name if -mcpu=future.

6) The sixth patch enables using the full 1,024-bit DMRs.  Right now, all you
can do with DMRs is move a VSX register to a DMR register, and to move a DMR
register to a VSX register.

In terms of changes, these patches now use the wD constraint for accumulators.
If you compile with -mcpu=power10, the wD constraint will match the equivalent
FPR register that overlaps with the accumulator.  If you compile with
-mcpu=future, the wD constraint will match the DMR register and not the FPR
register.

These patches also modify the print_operand %A output modifier to print out
DMR register numbers if -mcpu=future, and continue to print out the FPR
register number divided by 4 for -mcpu=power10.

In general, if you only use the built-in functions, things work between the two
systems.  If you use extended asm, you will likely need to modify the code.
Going forward, hopefully if you modify your code to use the wD constraint and
%A output modifier, you can write code that switches more easily between the
two systems.
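
For example, a sketch of extended asm written that way (hypothetical
code; xvi8ger4pp is spelled dmxvi8ger4pp on dense math systems, but per
patch 5 the assembler emits the same bits for either spelling):

__vector_quad acc;	/* zeroed/assembled via the MMA built-ins */
vector unsigned char a, b;
/* "wD" matches the FPR-overlapped accumulator on power10 and a DMR
   with -mcpu=future; %A0 prints the right accumulator number in
   either case.  */
__asm__ ("xvi8ger4pp %A0,%x1,%x2"
	 : "+wD" (acc)
	 : "wa" (a), "wa" (b));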

Again, these are preliminary patches for a potential future machine.  Things
will likely change in terms of implementation and usage over time.

Originally these patches were submitted in November 2022:
https://gcc.gnu.org/pipermail/gcc-patches/2022-November/605581.html

-- 
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: meiss...@linux.ibm.com


Repost [PATCH 1/6] Add -mcpu=future

2024-01-05 Thread Michael Meissner
This patch implements support for a potential future PowerPC cpu.  Features
added with -mcpu=future may or may not be added to new PowerPC processors.

This patch adds support for the -mcpu=future option.  If you use -mcpu=future,
the macro __ARCH_PWR_FUTURE__ is defined, and the assembler .machine directive
"future" is used.  Future patches in this series will add support for new
instructions that may be present in future PowerPC processors.

This particular patch does not add any new features.  It exists as groundwork
for future patches adding support for a possible future PowerPC processor.

This patch does not implement any differences in tuning when -mcpu=future is
used compared to -mcpu=power10.  If -mcpu=future is used, GCC will use power10
tuning.  If you explicitly use -mtune=future, you will get a warning that
-mtune=future is not supported, and default tuning will be set for power10.

The patches have been tested on both little and big endian systems.  Can I check
it into the master branch?

2024-01-05   Michael Meissner  

gcc/

* config/rs6000/rs6000-c.cc (rs6000_target_modify_macros): Define
__ARCH_PWR_FUTURE__ if -mcpu=future.
* config/rs6000/rs6000-cpus.def (ISA_FUTURE_MASKS): New macro.
(POWERPC_MASKS): Add -mcpu=future support.
* config/rs6000/rs6000-opts.h (enum processor_type): Add
PROCESSOR_FUTURE.
* config/rs6000/rs6000-tables.opt: Regenerate.
* config/rs6000/rs6000.cc (rs6000_cpu_index_lookup): New helper
function.
(rs6000_option_override_internal): Make -mcpu=future set
-mtune=power10.  If the user explicitly uses -mtune=future, give a
warning and reset the tuning to power10.
(rs6000_option_override_internal): Use power10 costs for future
machine.
(rs6000_machine_from_flags): Add support for -mcpu=future.
(rs6000_opt_masks): Likewise.
* config/rs6000/rs6000.h (ASM_CPU_SUPPORT): Likewise.
* config/rs6000/rs6000.md (cpu attribute): Likewise.
* config/rs6000/rs6000.opt (-mfuture): New undocumented debug switch.
* doc/invoke.texi (IBM RS/6000 and PowerPC Options): Document 
-mcpu=future.
---
 gcc/config/rs6000/rs6000-c.cc   |  2 +
 gcc/config/rs6000/rs6000-cpus.def   |  6 +++
 gcc/config/rs6000/rs6000-opts.h |  4 +-
 gcc/config/rs6000/rs6000-tables.opt |  3 ++
 gcc/config/rs6000/rs6000.cc | 58 -
 gcc/config/rs6000/rs6000.h  |  1 +
 gcc/config/rs6000/rs6000.md |  2 +-
 gcc/config/rs6000/rs6000.opt|  4 ++
 gcc/doc/invoke.texi |  2 +-
 9 files changed, 69 insertions(+), 13 deletions(-)

diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc
index ce0b14a8d37..f2fb5bef678 100644
--- a/gcc/config/rs6000/rs6000-c.cc
+++ b/gcc/config/rs6000/rs6000-c.cc
@@ -447,6 +447,8 @@ rs6000_target_modify_macros (bool define_p, HOST_WIDE_INT 
flags)
 rs6000_define_or_undefine_macro (define_p, "_ARCH_PWR9");
   if ((flags & OPTION_MASK_POWER10) != 0)
 rs6000_define_or_undefine_macro (define_p, "_ARCH_PWR10");
+  if ((flags & OPTION_MASK_FUTURE) != 0)
+rs6000_define_or_undefine_macro (define_p, "_ARCH_PWR_FUTURE");
   if ((flags & OPTION_MASK_SOFT_FLOAT) != 0)
 rs6000_define_or_undefine_macro (define_p, "_SOFT_FLOAT");
   if ((flags & OPTION_MASK_RECIP_PRECISION) != 0)
diff --git a/gcc/config/rs6000/rs6000-cpus.def 
b/gcc/config/rs6000/rs6000-cpus.def
index d28cc87eb2a..8754635f3d9 100644
--- a/gcc/config/rs6000/rs6000-cpus.def
+++ b/gcc/config/rs6000/rs6000-cpus.def
@@ -88,6 +88,10 @@
 | OPTION_MASK_POWER10  \
 | OTHER_POWER10_MASKS)
 
+/* Flags for a potential future processor that may or may not be delivered.  */
+#define ISA_FUTURE_MASKS   (ISA_3_1_MASKS_SERVER   \
+| OPTION_MASK_FUTURE)
+
 /* Flags that need to be turned off if -mno-power9-vector.  */
 #define OTHER_P9_VECTOR_MASKS  (OPTION_MASK_FLOAT128_HW\
 | OPTION_MASK_P9_MINMAX)
@@ -135,6 +139,7 @@
 | OPTION_MASK_LOAD_VECTOR_PAIR \
 | OPTION_MASK_POWER10  \
 | OPTION_MASK_P10_FUSION   \
+| OPTION_MASK_FUTURE   \
 | OPTION_MASK_HTM  \
 | OPTION_MASK_ISEL \
 | OPTION_MASK_MFCRF\
@@ -267,3 +272,4 @@ RS6000_CPU ("powerpc64", PROCESSOR_POWERPC64, 
OPTION_MASK_PPC_GFXOPT
 RS6000_CPU ("powerpc64le", PROCESSOR_POWER8, MASK_POWERPC64
| ISA_2_7_MASKS_SERVER | OPTION_MASK_HTM)
 RS6000_CPU ("rs64", PROCESSOR_RS64A, OPTION_MASK_PPC_GFXOPT | MASK_POWERPC64)
+RS6000_CP

Repost [PATCH 2/6] PowerPC: Make -mcpu=future enable -mblock-ops-vector-pair.

2024-01-05 Thread Michael Meissner
This patch re-enables generating load and store vector pair instructions when
doing certain memory copy operations when -mcpu=future is used.

During power10 development, it was determined that using store vector pair
instructions was problematic in a few cases, so we disabled generating load
and store vector pair instructions for memory operations by default.  This patch
re-enables generating these instructions if -mcpu=future is used.

The patches have been tested on both little and big endian systems.  Can I check
it into the master branch?

2024-01-05   Michael Meissner  

gcc/

* config/rs6000/rs6000-cpus.def (ISA_FUTURE_MASKS): Add
-mblock-ops-vector-pair.
(POWERPC_MASKS): Likewise.
---
 gcc/config/rs6000/rs6000-cpus.def | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gcc/config/rs6000/rs6000-cpus.def 
b/gcc/config/rs6000/rs6000-cpus.def
index 8754635f3d9..b6cd6d8cc84 100644
--- a/gcc/config/rs6000/rs6000-cpus.def
+++ b/gcc/config/rs6000/rs6000-cpus.def
@@ -90,6 +90,7 @@
 
 /* Flags for a potential future processor that may or may not be delivered.  */
 #define ISA_FUTURE_MASKS   (ISA_3_1_MASKS_SERVER   \
+| OPTION_MASK_BLOCK_OPS_VECTOR_PAIR\
 | OPTION_MASK_FUTURE)
 
 /* Flags that need to be turned off if -mno-power9-vector.  */
@@ -127,6 +128,7 @@
 
 /* Mask of all options to set the default isa flags based on -mcpu=.  */
 #define POWERPC_MASKS  (OPTION_MASK_ALTIVEC\
+| OPTION_MASK_BLOCK_OPS_VECTOR_PAIR\
 | OPTION_MASK_CMPB \
 | OPTION_MASK_CRYPTO   \
 | OPTION_MASK_DFP  \
-- 
2.43.0


-- 
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: meiss...@linux.ibm.com


Repost [PATCH 3/6] PowerPC: Add support for accumulators in DMR registers.

2024-01-05 Thread Michael Meissner
The MMA subsystem added the notion of accumulator registers as an optional
feature of ISA 3.1 (power10).  In ISA 3.1, these accumulators overlapped with
the traditional floating point registers 0..31, but logically the accumulator
registers were separate from the FPR registers.  In ISA 3.1, it was anticipated
that in future systems, the accumulator registers may not overlap with the FPR
registers.  This patch adds the support for dense math registers as separate
registers.

This particular patch does not change the MMA support to use the accumulators
within the dense math registers.  This patch just adds the basic support for
having separate DMRs.  The next patch will switch the MMA support to use the
accumulators if -mcpu=future is used.

For testing purposes, I added an undocumented option '-mdense-math' to enable
or disable the dense math support.

This patch adds a new constraint (wD).  If MMA is selected but dense math is
not selected (i.e. -mcpu=power10), the wD constraint will allow access to
accumulators that overlap with the VSX vector registers 0..31.  If both MMA and
dense math are selected (i.e. -mcpu=future), the wD constraint will only allow
dense math registers.

This patch modifies the existing %A output modifier.  If MMA is selected but
dense math is not selected, then %A output modifier converts the VSX register
number to the accumulator number, by dividing it by 4.  If both MMA and dense
math are selected, then %A will map the separate DMR registers into 0..7.

The intention is that user code using extended asm can be modified to run on
both MMA without dense math and MMA with dense math:

1)  If possible, don't use extended asm, but instead use the MMA built-in
functions;

2)  If you do need to write extended asm, change the d constraints
targeting accumulators to use wD instead;

3)  Only use the built-in zero, assemble and disassemble functions to
create and move data between vector quad types and dense math accumulators.
I.e. do not use the xxmfacc, xxmtacc, and xxsetaccz directly in the
extended asm code.  The reason is these instructions assume there is a
1-to-1 correspondence between 4 adjacent FPR registers and an
accumulator that overlaps with those registers.  With accumulators
now being separate registers, there no longer is a 1-to-1
correspondence.

It is possible that the mangling for DMRs and the GDB register numbers may
change in the future.

2024-01-05   Michael Meissner  

gcc/

* config/rs6000/constraints.md (wD constraint): New constraint.
* config/rs6000/mma.md (UNSPEC_DM_ASSEMBLE_ACC): New unspec.
(movxo): Convert into define_expand.
(movxo_vsx): Version of movxo where accumulators overlap with VSX vector
registers 0..31.
(movxo_dm): Verson of movxo that supports separate dense math
accumulators.
(mma_assemble_acc): Add dense math support to define_expand.
(mma_assemble_acc_vsx): Rename from mma_assemble_acc, and restrict it to
non dense math systems.
(mma_assemble_acc_dm): Dense math version of mma_assemble_acc.
(mma_disassemble_acc): Add dense math support to define_expand.
(mma_disassemble_acc_vsx): Rename from mma_disassemble_acc, and restrict
it to non dense math systems.
(mma_disassemble_acc_dm): Dense math version of mma_disassemble_acc.
* config/rs6000/predicates.md (dmr_operand): New predicate.
(accumulator_operand): Likewise.
* config/rs6000/rs6000-cpus.def (ISA_FUTURE_MASKS): Add -mdense-math.
(POWERPC_MASKS): Likewise.
* config/rs6000/rs6000.cc (enum rs6000_reg_type): Add DMR_REG_TYPE.
(enum rs6000_reload_reg_type): Add RELOAD_REG_DMR.
(LAST_RELOAD_REG_CLASS): Add support for DMR registers and the wD
constraint.
(reload_reg_map): Likewise.
(rs6000_reg_names): Likewise.
(alt_reg_names): Likewise.
(rs6000_hard_regno_nregs_internal): Likewise.
(rs6000_hard_regno_mode_ok_uncached): Likewise.
(rs6000_debug_reg_global): Likewise.
(rs6000_setup_reg_addr_masks): Likewise.
(rs6000_init_hard_regno_mode_ok): Likewise.
(rs6000_option_override_internal): Add checking for -mdense-math.
(rs6000_secondary_reload_memory): Add support for DMR registers.
(rs6000_secondary_reload_simple_move): Likewise.
(rs6000_preferred_reload_class): Likewise.
(rs6000_secondary_reload_class): Likewise.
(print_operand): Make %A handle both FPRs and DMRs.
(rs6000_dmr_register_move_cost): New helper function.
(rs6000_register_move_cost): Add support for DMR registers.
(rs6000_memory_move_cost): Likewise.
(rs6000_compute_pressure_classes): Likewise.
(rs6000_debugger_regno): Likewise.
(rs6000_opt_masks): Add -mdense-math.
(rs6000_split_multireg_move): Add support for DMR

Repost [PATCH 4/6] PowerPC: Make MMA insns support DMR registers.

2024-01-05 Thread Michael Meissner
This patch changes the MMA instructions to use either FPR registers
(-mcpu=power10) or DMRs (-mcpu=future).  In this patch, the existing MMA
instruction names are used.

A macro (__PPC_DMR__) is defined if the MMA instructions use the DMRs.
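
Code that needs to know where the accumulators live can test for it,
e.g.:

#ifdef __PPC_DMR__
  /* MMA accumulators are in separate dense math registers.  */
#else
  /* MMA accumulators overlap VSX registers 0..31.  */
#endif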

The patches have been tested on both little and big endian systems.  Can I check
it into the master branch?

2024-01-05   Michael Meissner  

gcc/

* config/rs6000/mma.md (mma_): New define_expand to handle
mma_ for dense math and non dense math.
(mma_ insn): Restrict to non dense math.
(mma_xxsetaccz): Convert to define_expand to handle non dense math and
dense math.
(mma_xxsetaccz_vsx): Rename from mma_xxsetaccz and restrict usage to non
dense math.
(mma_xxsetaccz_dm): Dense math version of mma_xxsetaccz.
(mma_): Add support for dense math.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
* config/rs6000/rs6000-c.cc (rs6000_target_modify_macros): Define
__PPC_DMR__ if we have dense math instructions.
* config/rs6000/rs6000.cc (print_operand): Make %A handle only DMRs if
dense math and only FPRs if not dense math.
(rs6000_split_multireg_move): Do not generate the xxmtacc instruction to
prime the DMR registers or the xxmfacc instruction to de-prime
instructions if we have dense math register support.
---
 gcc/config/rs6000/mma.md  | 247 +-
 gcc/config/rs6000/rs6000-c.cc |   3 +
 gcc/config/rs6000/rs6000.cc   |  35 ++---
 3 files changed, 176 insertions(+), 109 deletions(-)

diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
index bb898919ab5..525a85146ff 100644
--- a/gcc/config/rs6000/mma.md
+++ b/gcc/config/rs6000/mma.md
@@ -559,190 +559,249 @@ (define_insn "*mma_disassemble_acc_dm"
   "dmxxextfdmr256 %0,%1,2"
   [(set_attr "type" "mma")])
 
-(define_insn "mma_"
+;; MMA instructions that do not use their accumulators as an input, still must
+;; not allow their vector operands to overlap the registers used by the
+;; accumulator.  We enforce this by marking the output as early clobber.  If we
+;; have dense math, we don't need the whole prime/de-prime action, so just make
+;; these instructions be NOPs.
+
+(define_expand "mma_"
+  [(set (match_operand:XO 0 "register_operand")
+   (unspec:XO [(match_operand:XO 1 "register_operand")]
+  MMA_ACC))]
+  "TARGET_MMA"
+{
+  if (TARGET_DENSE_MATH)
+{
+  if (!rtx_equal_p (operands[0], operands[1]))
+   emit_move_insn (operands[0], operands[1]);
+  DONE;
+}
+
+  /* Generate the prime/de-prime code.  */
+})
+
+(define_insn "*mma_"
   [(set (match_operand:XO 0 "fpr_reg_operand" "=&d")
(unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0")]
MMA_ACC))]
-  "TARGET_MMA"
+  "TARGET_MMA && !TARGET_DENSE_MATH"
   " %A0"
   [(set_attr "type" "mma")])
 
 ;; We can't have integer constants in XOmode so we wrap this in an
-;; UNSPEC_VOLATILE.
+;; UNSPEC_VOLATILE for the non-dense math case.  For dense math, we don't need
+;; to disable optimization and we can do a normal UNSPEC.
 
-(define_insn "mma_xxsetaccz"
-  [(set (match_operand:XO 0 "fpr_reg_operand" "=d")
+(define_expand "mma_xxsetaccz"
+  [(set (match_operand:XO 0 "register_operand")
(unspec_volatile:XO [(const_int 0)]
UNSPECV_MMA_XXSETACCZ))]
   "TARGET_MMA"
+{
+  if (TARGET_DENSE_MATH)
+{
+  emit_insn (gen_mma_xxsetaccz_dm (operands[0]));
+  DONE;
+}
+})
+
+(define_insn "*mma_xxsetaccz_vsx"
+  [(set (match_operand:XO 0 "fpr_reg_operand" "=d")
+   (unspec_volatile:XO [(const_int 0)]
+   UNSPECV_MMA_XXSETACCZ))]
+  "TARGET_MMA && !TARGET_DENSE_MATH"
   "xxsetaccz %A0"
   [(set_attr "type" "mma")])
 
+
+(define_insn "mma_xxsetaccz_dm"
+  [(set (match_operand:XO 0 "dmr_operand" "=wD")
+   (unspec:XO [(const_int 0)]
+  UNSPECV_MMA_XXSETACCZ))]
+  "TARGET_DENSE_MATH"
+  "dmsetdmrz %0"
+  [(set_attr "type" "mma")])
+
 (define_insn "mma_"
-  [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d")
-   (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "v,?wa")
-   (match_operand:V16QI 2 "vsx_register_operand" "v,?wa")]
+  [(set (match_operand:XO 0 "accumulator_operand" "=wD,&d,&d")
+   (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "wa,v,?wa")
+   (match_operand:V16QI 2 "vsx_register_operand" "wa,v,?wa")]
MMA_VV))]
   "TARGET_MMA"
   " %A0,%x1,%x2"
-  [(set_attr "type" "mma")])
+  [(set_attr "type" "mma")
+   (set_attr "isa" "dm,not_dm,not_dm")])
 
 (define_insn "mma_"
-  [(set (match_operand:X

Repost [PATCH 5/6] PowerPC: Switch to dense math names for all MMA operations.

2024-01-05 Thread Michael Meissner
This patch changes the assembler instruction names for MMA instructions from
the original name used in power10 to the new name when used with the dense math
system.  I.e. xvf64gerpp becomes dmxvf64gerpp.  The assembler will emit the
same bits for either spelling.

The patches have been tested on both little and big endian systems.  Can I check
it into the master branch?

2024-01-05   Michael Meissner  

gcc/

* config/rs6000/mma.md (vvi4i4i8_dm): New int attribute.
(avvi4i4i8_dm): Likewise.
(vvi4i4i2_dm): Likewise.
(avvi4i4i2_dm): Likewise.
(vvi4i4_dm): Likewise.
(avvi4i4_dm): Likewise.
(pvi4i2_dm): Likewise.
(apvi4i2_dm): Likewise.
(vvi4i4i4_dm): Likewise.
(avvi4i4i4_dm): Likewise.
(mma_): Add support for running on DMF systems, generating the dense
math instruction and using the dense math accumulators.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.

gcc/testsuite/

* gcc.target/powerpc/dm-double-test.c: New test.
* lib/target-supports.exp (check_effective_target_ppc_dmr_ok): New
target test.
---
 gcc/config/rs6000/mma.md  |  98 +++--
 .../gcc.target/powerpc/dm-double-test.c   | 194 ++
 gcc/testsuite/lib/target-supports.exp |  19 ++
 3 files changed, 299 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/dm-double-test.c

diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
index 525a85146ff..f06e6bbb184 100644
--- a/gcc/config/rs6000/mma.md
+++ b/gcc/config/rs6000/mma.md
@@ -227,13 +227,22 @@ (define_int_attr apv  [(UNSPEC_MMA_XVF64GERPP 
"xvf64gerpp")
 
 (define_int_attr vvi4i4i8  [(UNSPEC_MMA_PMXVI4GER8 "pmxvi4ger8")])
 
+(define_int_attr vvi4i4i8_dm   [(UNSPEC_MMA_PMXVI4GER8 
"pmdmxvi4ger8")])
+
 (define_int_attr avvi4i4i8 [(UNSPEC_MMA_PMXVI4GER8PP   
"pmxvi4ger8pp")])
 
+(define_int_attr avvi4i4i8_dm  [(UNSPEC_MMA_PMXVI4GER8PP   
"pmdmxvi4ger8pp")])
+
 (define_int_attr vvi4i4i2  [(UNSPEC_MMA_PMXVI16GER2"pmxvi16ger2")
 (UNSPEC_MMA_PMXVI16GER2S   "pmxvi16ger2s")
 (UNSPEC_MMA_PMXVF16GER2"pmxvf16ger2")
 (UNSPEC_MMA_PMXVBF16GER2   
"pmxvbf16ger2")])
 
+(define_int_attr vvi4i4i2_dm   [(UNSPEC_MMA_PMXVI16GER2"pmdmxvi16ger2")
+(UNSPEC_MMA_PMXVI16GER2S   
"pmdmxvi16ger2s")
+(UNSPEC_MMA_PMXVF16GER2"pmdmxvf16ger2")
+(UNSPEC_MMA_PMXVBF16GER2   
"pmdmxvbf16ger2")])
+
 (define_int_attr avvi4i4i2 [(UNSPEC_MMA_PMXVI16GER2PP  "pmxvi16ger2pp")
 (UNSPEC_MMA_PMXVI16GER2SPP 
"pmxvi16ger2spp")
 (UNSPEC_MMA_PMXVF16GER2PP  "pmxvf16ger2pp")
@@ -245,25 +254,54 @@ (define_int_attr avvi4i4i2
[(UNSPEC_MMA_PMXVI16GER2PP  "pmxvi16ger2pp")
 (UNSPEC_MMA_PMXVBF16GER2NP 
"pmxvbf16ger2np")
 (UNSPEC_MMA_PMXVBF16GER2NN 
"pmxvbf16ger2nn")])
 
+(define_int_attr avvi4i4i2_dm  [(UNSPEC_MMA_PMXVI16GER2PP  
"pmdmxvi16ger2pp")
+(UNSPEC_MMA_PMXVI16GER2SPP 
"pmdmxvi16ger2spp")
+(UNSPEC_MMA_PMXVF16GER2PP  
"pmdmxvf16ger2pp")
+(UNSPEC_MMA_PMXVF16GER2PN  
"pmdmxvf16ger2pn")
+(UNSPEC_MMA_PMXVF16GER2NP  
"pmdmxvf16ger2np")
+(UNSPEC_MMA_PMXVF16GER2NN  
"pmdmxvf16ger2nn")
+(UNSPEC_MMA_PMXVBF16GER2PP 
"pmdmxvbf16ger2pp")
+(UNSPEC_MMA_PMXVBF16GER2PN 
"pmdmxvbf16ger2pn")
+(UNSPEC_MMA_PMXVBF16GER2NP 
"pmdmxvbf16ger2np")
+(UNSPEC_MMA_PMXVBF16GER2NN 
"pmdmxvbf16ger2nn")])
+
 (define_int_attr vvi4i4[(UNSPEC_MMA_PMXVF32GER 
"pmxvf32ger")])
 
+(define_int_attr vvi4i4_dm [(UNSPEC_MMA_PMXVF32GER 
"pmdmxvf32ger")])
+
 (define_int_attr avvi4i4   [(UNSPEC_MMA_PMXVF32GERPP   "pmxvf32gerpp")
 (UNSPEC_MMA_PMXVF32GERPN   "pmxvf32gerpn")
 (UNSPEC_MMA_PMXVF32GERNP   "pmxvf32gernp")
 (UNSPEC_MMA_PMXVF32GERNN   
"pmxvf32gernn")])
 
+(define_int_attr avvi4i4_dm[(UNSPEC_MMA_PMXVF32GERPP   
"pmdmxvf32gerpp")
+(UNSPEC_MMA_PMXVF32GERPN   
"pmdmxvf32gerpn")
+ 

Repost [PATCH 6/6] PowerPC: Add support for 1,024 bit DMR registers.

2024-01-05 Thread Michael Meissner
This patch is a preliminary patch to add the full 1,024 bit dense math registers
(DMRs) for -mcpu=future.  The MMA 512-bit accumulators map onto the top of the
DMR register.

This patch only adds the new 1,024 bit register support.  It does not add
support for any instructions that need 1,024 bit registers instead of 512 bit
registers.

I used the new mode 'TDOmode' to be the opaque mode used for 1,024 bit
registers.  The 'wD' constraint added in previous patches is used for these
registers.  I added support to do load and store of DMRs via the VSX registers,
since there are no load/store dense math instructions.  I added the new keyword
'__dmr' to create 1,024 bit types that can be loaded into DMRs.  At present, I
don't have aliases for __dmr512 and __dmr1024 that we've discussed internally.
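
A minimal sketch of what can be done with the new keyword so far (data
movement only; the function name is illustrative):

void
copy_dmr (__dmr *p, __dmr *q)
{
  *p = *q;	/* loads/stores go via VSX registers; dmmr for reg-reg */
}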

The patches have been tested on both little and big endian systems.  Can I check
it into the master branch?

2024-01-05   Michael Meissner  

gcc/

* config/rs6000/mma.md (UNSPEC_DM_INSERT512_UPPER): New unspec.
(UNSPEC_DM_INSERT512_LOWER): Likewise.
(UNSPEC_DM_EXTRACT512): Likewise.
(UNSPEC_DMR_RELOAD_FROM_MEMORY): Likewise.
(UNSPEC_DMR_RELOAD_TO_MEMORY): Likewise.
(movtdo): New define_expand and define_insn_and_split to implement 1,024
bit DMR registers.
(movtdo_insert512_upper): New insn.
(movtdo_insert512_lower): Likewise.
(movtdo_extract512): Likewise.
(reload_dmr_from_memory): Likewise.
(reload_dmr_to_memory): Likewise.
* config/rs6000/rs6000-builtin.cc (rs6000_type_string): Add DMR
support.
(rs6000_init_builtins): Add support for __dmr keyword.
* config/rs6000/rs6000-call.cc (rs6000_return_in_memory): Add support
for TDOmode.
(rs6000_function_arg): Likewise.
* config/rs6000/rs6000-modes.def (TDOmode): New mode.
* config/rs6000/rs6000.cc (rs6000_hard_regno_nregs_internal): Add
support for TDOmode.
(rs6000_hard_regno_mode_ok_uncached): Likewise.
(rs6000_hard_regno_mode_ok): Likewise.
(rs6000_modes_tieable_p): Likewise.
(rs6000_debug_reg_global): Likewise.
(rs6000_setup_reg_addr_masks): Likewise.
(rs6000_init_hard_regno_mode_ok): Add support for TDOmode.  Setup reload
hooks for DMR mode.
(reg_offset_addressing_ok_p): Add support for TDOmode.
(rs6000_emit_move): Likewise.
(rs6000_secondary_reload_simple_move): Likewise.
(rs6000_secondary_reload_class): Likewise.
(rs6000_mangle_type): Add mangling for __dmr type.
(rs6000_dmr_register_move_cost): Add support for TDOmode.
(rs6000_split_multireg_move): Likewise.
(rs6000_invalid_conversion): Likewise.
* config/rs6000/rs6000.h (VECTOR_ALIGNMENT_P): Add TDOmode.
(enum rs6000_builtin_type_index): Add DMR type nodes.
(dmr_type_node): Likewise.
(ptr_dmr_type_node): Likewise.

gcc/testsuite/

* gcc.target/powerpc/dm-1024bit.c: New test.
---
 gcc/config/rs6000/mma.md  | 152 ++
 gcc/config/rs6000/rs6000-builtin.cc   |  13 ++
 gcc/config/rs6000/rs6000-call.cc  |  13 +-
 gcc/config/rs6000/rs6000-modes.def|   4 +
 gcc/config/rs6000/rs6000.cc   | 135 
 gcc/config/rs6000/rs6000.h|   7 +-
 gcc/testsuite/gcc.target/powerpc/dm-1024bit.c |  63 
 7 files changed, 351 insertions(+), 36 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/dm-1024bit.c

diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
index f06e6bbb184..37de9030903 100644
--- a/gcc/config/rs6000/mma.md
+++ b/gcc/config/rs6000/mma.md
@@ -92,6 +92,11 @@ (define_c_enum "unspec"
UNSPEC_MMA_XXMFACC
UNSPEC_MMA_XXMTACC
UNSPEC_DM_ASSEMBLE_ACC
+   UNSPEC_DM_INSERT512_UPPER
+   UNSPEC_DM_INSERT512_LOWER
+   UNSPEC_DM_EXTRACT512
+   UNSPEC_DMR_RELOAD_FROM_MEMORY
+   UNSPEC_DMR_RELOAD_TO_MEMORY
   ])
 
 (define_c_enum "unspecv"
@@ -879,3 +884,150 @@ (define_insn "mma_"
   [(set_attr "type" "mma")
(set_attr "prefixed" "yes")
(set_attr "isa" "dm,not_dm,not_dm")])
+
+
+;; TDOmode (i.e. __dmr).
+(define_expand "movtdo"
+  [(set (match_operand:TDO 0 "nonimmediate_operand")
+   (match_operand:TDO 1 "input_operand"))]
+  "TARGET_DENSE_MATH"
+{
+  rs6000_emit_move (operands[0], operands[1], TDOmode);
+  DONE;
+})
+
+(define_insn_and_split "*movtdo"
+  [(set (match_operand:TDO 0 "nonimmediate_operand" "=wa,m,wa,wD,wD,wa")
+   (match_operand:TDO 1 "input_operand" "m,wa,wa,wa,wD,wD"))]
+  "TARGET_DENSE_MATH
+   && (gpc_reg_operand (operands[0], TDOmode)
+   || gpc_reg_operand (operands[1], TDOmode))"
+  "@
+   #
+   #
+   #
+   #
+   dmmr %0,%1
+   #"
+  "&& reload_completed
+   && (!dmr_operand (operands[0], TDOmode) || !dmr_operand (operands[1], 
TDOmode))"
+  [(const_int 0)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+
+  i

Re: Re: [committed] RISC-V: Add crypto vector builtin function.

2024-01-05 Thread 钟居哲
Thanks Jeff.

Yeah, I agree we are not doing anything terribly wrong, but Palmer requested a
revert of the vector-crypto patch,
so I reverted it (actually, I asked Li Pan to revert it).

Actually, Wang Feng has fixed the issue:
https://gcc.gnu.org/pipermail/gcc-patches/2024-January/641903.html 
It's just a pretty simple typo that caused the ICE.

Soon, vector-crypto will be committed.

The Eswin guys are working on various vector extensions (vector crypto, BF16
vector, etc.).
And I have told them only vector-crypto can be accepted in the GCC-14 release,
and to defer BF16 vector to GCC-15.

So, I believe we won't have any more new features until the GCC-14 release.

Thanks.


juzhe.zh...@rivai.ai
 
From: Jeff Law
Date: 2024-01-05 23:50
To: Palmer Dabbelt; juzhe.zhong
CC: gcc-patches; Kito Cheng; Kito.cheng
Subject: Re: [committed] RISC-V: Add crypto vector builtin function.
 
 
On 1/4/24 20:24, Palmer Dabbelt wrote:
> On Thu, 04 Jan 2024 19:17:21 PST (-0800), juzhe.zh...@rivai.ai wrote:
>> Hi, Wang Feng.
>>
>> Your patch has some ICEs:
>> FAIL: gcc.target/riscv/rvv/base/zvbc-intrinsic.c (internal compiler 
>> error: RTL check: expected code 'const_int', have 'reg' in 
>> vlmax_avl_type_p, at config/riscv/riscv-v.cc:4930)
>> FAIL: gcc.target/riscv/rvv/base/zvbc-intrinsic.c (test for excess errors)
>> FAIL: gcc.target/riscv/rvv/base/zvbc_vx_constraint-1.c (internal 
>> compiler error: RTL check: expected code 'const_int', have 'reg' in 
>> vlmax_avl_type_p, at config/riscv/riscv-v.cc:4930)
>> FAIL: gcc.target/riscv/rvv/base/zvbc_vx_constraint-1.c (test for 
>> excess errors)
>> FAIL: gcc.target/riscv/rvv/base/zvbc_vx_constraint-2.c (internal 
>> compiler error: RTL check: expected code 'const_int', have 'reg' in 
>> vlmax_avl_type_p, at config/riscv/riscv-v.cc:4930)
>> FAIL: gcc.target/riscv/rvv/base/zvbc_vx_constraint-2.c (test for 
>> excess errors)
> 
> So let's just revert it, it doesn't even look like it was reviewed. 
> We've set a really bad precedent here where we're just merging a bunch 
> of unreviewed code and sorting out the regressions in trunk, that's not 
> the right way to do things.
> 
>>
>> I suspect you didn't enable rtl check in the regression:
>>
>> ../../configure --enable-gcc-checking=rtl.
>> Plz enable rtl check in the regression tests.
We haven't ever required folks to test with RTL checking enabled due to 
its compile-time cost.  So I don't think Feng did anything wrong here.
 
IIRC, Jakub's standard practice over in the x86 world is to do a 
bootstrap and regression test with RTL checking enabled in the spring as 
we get closer to the release to weed out these kinds of things that can 
slip through.
 
Clearly there's a bug and we should fix it, but it's not a sign that 
anything has gone terribly wrong.
 
jeff
 


[COMMITTED] Regenerate libgomp/configure for copyright year update

2024-01-05 Thread Mark Wielaard
commit a945c346f57ba40fc80c14ac59be0d43624e559d updated
libgomp/plugin/configfrag.ac but didn't regenerate/update
libgomp/configure, which includes that configfrag.

libgomp/ChangeLog:

* configure: Regenerate.
---
 libgomp/configure | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libgomp/configure b/libgomp/configure
index c69a13cfe14..b3646c9936d 100755
--- a/libgomp/configure
+++ b/libgomp/configure
@@ -15159,7 +15159,7 @@ _ACEOF
 
 # Plugins for offload execution, configure.ac fragment.  -*- mode: autoconf -*-
 #
-# Copyright (C) 2014-2023 Free Software Foundation, Inc.
+# Copyright (C) 2014-2024 Free Software Foundation, Inc.
 #
 # Contributed by Mentor Embedded.
 #
-- 
2.39.3



Re: [committed] RISC-V: Clean up testsuite for multi-lib testing [NFC]

2024-01-05 Thread 钟居哲
Hi, kito.

This patch causes the following regression FAILs:

FAIL: gcc.target/riscv/rvv/autovec/partial/single_rgroup_run-3.c (test for 
excess errors)
FAIL: gcc.target/riscv/rvv/autovec/partial/single_rgroup_run-3.c (test for 
excess errors)
FAIL: gcc.target/riscv/rvv/autovec/partial/single_rgroup_run-3.c (test for 
excess errors)
FAIL: gcc.target/riscv/rvv/autovec/partial/single_rgroup_run-3.c (test for 
excess errors)
FAIL: gcc.target/riscv/rvv/autovec/partial/single_rgroup_run-3.c (test for 
excess errors)
FAIL: gcc.target/riscv/rvv/autovec/partial/single_rgroup_run-3.c (test for 
excess errors)
FAIL: gcc.target/riscv/rvv/autovec/partial/single_rgroup_run-3.c (test for 
excess errors)
FAIL: gcc.target/riscv/rvv/autovec/partial/single_rgroup_run-3.c (test for 
excess errors)
FAIL: gcc.target/riscv/rvv/autovec/partial/single_rgroup_run-3.c (test for 
excess errors)
FAIL: gcc.target/riscv/rvv/autovec/partial/single_rgroup_run-3.c (test for 
excess errors)

spawn -ignore SIGHUP 
/work/home/jzzhong/work/docker/riscv-gnu-toolchain/build/dev-rv64gcv-lp64d-medany-newlib-spike-release-m1-scalable/build-gcc-newlib-stage2/gcc/xgcc
 
-B/work/home/jzzhong/work/docker/riscv-gnu-toolchain/build/dev-rv64gcv-lp64d-medany-newlib-spike-release-m1-scalable/build-gcc-newlib-stage2/gcc/
 
/work/home/jzzhong/work/docker/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/single_rgroup_run-3.c
 -march=rv64gcv -mabi=lp64d -mcmodel=medany -fdiagnostics-plain-output 
-ftree-vectorize -O2 --param riscv-autovec-lmul=m1 --param 
riscv-autovec-preference=scalable -lm -o ./single_rgroup_run-3.exe^M
In file included from 
/work/home/jzzhong/work/docker/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/single_rgroup-3.c:4,^M
 from 
/work/home/jzzhong/work/docker/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/single_rgroup_run-3.c:4:^M
/work/home/jzzhong/work/docker/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/single_rgroup_run-3.c:
 In function 'main':^M
/work/home/jzzhong/work/docker/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/single_rgroup-3.h:108:9:
 error: implicit declaration of function 'assert' 
[-Wimplicit-function-declaration]^M
/work/home/jzzhong/work/docker/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/single_rgroup-3.h:174:3:
 note: in expansion of macro 'run_6'^M
/work/home/jzzhong/work/docker/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/single_rgroup_run-3.c:16:3:
 note: in expansion of macro 'TEST_ALL'^M
/work/home/jzzhong/work/docker/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/single_rgroup-3.h:108:9:
 note: 'assert' is defined in header ''; this is probably fixable by 
adding '#include '^M
/work/home/jzzhong/work/docker/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/single_rgroup-3.h:174:3:
 note: in expansion of macro 'run_6'^M
/work/home/jzzhong/work/docker/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/single_rgroup_run-3.c:16:3:
 note: in expansion of macro 'TEST_ALL'^M
compiler exited with status 1
FAIL: gcc.target/riscv/rvv/autovec/partial/single_rgroup_run-3.c (test for 
excess errors)
Excess errors:
/work/home/jzzhong/work/docker/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/single_rgroup-3.h:108:9:
 error: implicit declaration of function 'assert' 
[-Wimplicit-function-declaration]

UNRESOLVED: gcc.target/riscv/rvv/autovec/partial/single_rgroup_run-3.c 
compilation failed to produce executable


Could you fix it ?
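
Presumably the fix is to mirror the assert -> __builtin_abort rewrite in
single_rgroup-3.h as well, e.g. (a hypothetical sketch; the exact
expression in the header differs):

  if (actual != expected)
    __builtin_abort ();   /* was: assert (actual == expected); */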



juzhe.zh...@rivai.ai
 
From: Kito Cheng
Date: 2024-01-05 16:39
To: gcc-patches; kito.cheng; juzhe.zhong
CC: Kito Cheng
Subject: [committed] RISC-V: Clean up testsuite for multi-lib testing [NFC]
- Drop unnecessary including for stdlib.h and math.h
- Drop assert.h / assert, use __builtin_abort instead.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/binop/shift-scalar-template.h:
Use __builtin_abort instead of assert.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax-1.c: Drop math.h.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmin-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmin-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmin-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmin-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax_zvfh-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax_zvfh-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax_zvfh-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax_zvfh-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmin_zvfh-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmin_zvfh-2.c: Ditto.
* gcc.target

[Committed V2] RISC-V: Allow simplification non-vlmax with len = NUNITS reg to reg move

2024-01-05 Thread Juzhe-Zhong
V2: Address comments from Robin.

While working on fixing a bug, I noticed the following code has a redundant move:

#include "riscv_vector.h"
void
f (float x, float y, void *out)
{
  float f[4] = { x, x, x, y };
  vfloat32m1_t v = __riscv_vle32_v_f32m1 (f, 4);
  __riscv_vse32_v_f32m1 (out, v, 4);
}

Before this patch:

f:
vsetivli zero,4,e32,m1,ta,ma
addi sp,sp,-16
vfmv.v.f v1,fa0
vfslide1down.vf v1,v1,fa1
vmv.v.v v1,v1   > redundant move.
vse32.v v1,0(a0)
addi sp,sp,16
jr ra

The root cause is that the complicated vmv.v.v pattern isn't simplified
into a simple (set (reg) (reg)) reg-to-reg move pattern.

Currently, we support such simplification for VLMAX.

However, the case I found is non-VLMAX but with LEN = NUNITS, which should be
considered equivalent to VLMAX.

Add a simple fix for this situation.
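
After this patch, the redundant vmv.v.v should be gone, i.e. roughly:

f:
vsetivli zero,4,e32,m1,ta,ma
addi sp,sp,-16
vfmv.v.f v1,fa0
vfslide1down.vf v1,v1,fa1
vse32.v v1,0(a0)
addi sp,sp,16
jr ra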

Tested on both RV32/RV64 with no regressions.

gcc/ChangeLog:

* config/riscv/riscv-protos.h (whole_reg_to_reg_move_p): New function.
* config/riscv/riscv-v.cc (whole_reg_to_reg_move_p): Ditto.
* config/riscv/vector.md: Allow non-vlmax with len = NUNITS 
simplification.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/vf_avl-4.c: New test.

---
 gcc/config/riscv/riscv-protos.h   |  1 +
 gcc/config/riscv/riscv-v.cc   | 23 +++
 gcc/config/riscv/vector.md|  9 ++--
 .../gcc.target/riscv/rvv/base/vf_avl-4.c  | 13 +++
 4 files changed, 39 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 0f0337cfb38..00a5b645abe 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -687,6 +687,7 @@ bool imm_avl_p (machine_mode);
 bool can_be_broadcasted_p (rtx);
 bool gather_scatter_valid_offset_p (machine_mode);
 HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int);
+bool whole_reg_to_reg_move_p (rtx *, machine_mode, int);
 }
 
 /* We classify builtin types into two classes:
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index ec859645415..2491522191a 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -5117,4 +5117,27 @@ estimated_poly_value (poly_int64 val, unsigned int kind)
   return val.coeffs[0] + val.coeffs[1] * over_min_vlen / TARGET_MIN_VLEN;
 }
 
+/* Return true if this is a whole register-to-register move.  */
+bool
+whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index)
+{
+  /* An operation is a whole-register move if either
+ (1) Its vlmax operand equals VLMAX
+ (2) Its vl operand equals the number of units of its mode.  */
+  if (register_operand (ops[0], mode)
+  && register_operand (ops[3], mode)
+  && satisfies_constraint_vu (ops[2])
+  && satisfies_constraint_Wc1 (ops[1]))
+{
+  if (INTVAL (ops[avl_type_index]) == VLMAX)
+   return true;
+  /* AVL propagation PASS will transform FIXED-VLMAX with NUNITS < 32
+into NON-VLMAX with LEN = NUNITS.  */
+  else if (CONST_INT_P (ops[4])
+  && known_eq (INTVAL (ops[4]), GET_MODE_NUNITS (mode)))
+   return true;
+}
+  return false;
+}
+
 } // namespace riscv_vector
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 3d2c1c3ce8f..be5beb5ab64 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -1724,10 +1724,7 @@
vse.v\t%3,%0%p1
vmv.v.v\t%0,%3
vmv.v.v\t%0,%3"
-  "&& register_operand (operands[0], mode)
-   && register_operand (operands[3], mode)
-   && satisfies_constraint_vu (operands[2])
-   && INTVAL (operands[7]) == riscv_vector::VLMAX"
+  "&& riscv_vector::whole_reg_to_reg_move_p (operands, mode, 7)"
   [(set (match_dup 0) (match_dup 3))]
   ""
   [(set_attr "type" "vlde,vlde,vlde,vste,vimov,vimov")
@@ -1776,9 +1773,7 @@
vmmv.m\t%0,%3
vmclr.m\t%0
vmset.m\t%0"
-  "&& register_operand (operands[0], mode)
-   && register_operand (operands[3], mode)
-   && INTVAL (operands[5]) == riscv_vector::VLMAX"
+  "&& riscv_vector::whole_reg_to_reg_move_p (operands, mode, 5)"
   [(set (match_dup 0) (match_dup 3))]
   ""
   [(set_attr "type" "vldm,vstm,vmalu,vmalu,vmalu")
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c
new file mode 100644
index 000..1b4bfd96481
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d --param 
riscv-autovec-preference=fixed-vlmax" } */
+
+#include "riscv_vector.h"
+void
+f (float x, float y, void *out)
+{
+  float f[4] = { x, x, x, y };
+  vfloat32m1_t v = __riscv_vle32_v_f32m1 (f, 4);
+  __riscv_vse32_v_f32m1 (out, v, 4);
+}
+
+/* { dg-final { scan-assembler-not {vmv} } } */
-- 

Re: Re: [PATCH] RISC-V: Teach liveness computation loop invariant shift amount[Dynamic LMUL]

2024-01-05 Thread 钟居哲
Thanks Robin.

is_gimple_constant makes more sense.  Committed with your comments addressed.



juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2024-01-05 17:54
To: Juzhe-Zhong; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; jeffreyalaw
Subject: Re: [PATCH] RISC-V: Teach liveness computation loop invariant shift 
amount[Dynamic LMUL]
> 1). We not only have vashl_optab,vashr_optab,vlshr_optab which vectorize 
> shift with vector shift amount,
> that is, vectorization of 'a[i] >> x[i]', the shift amount is loop variant.
> 2). But also, we have ashl_optab, ashr_optab, lshr_optab which can vectorize 
> shift with scalar shift amount,
> that is, vectorization of 'a[i] >> x', the shift amount is loop invariant.
> 
 
> +static bool
> +loop_invariant_op_p (class loop *loop,
> +  tree op)
> +{
> +  if (is_gimple_min_invariant (op))
> +return true;
> +  if (SSA_NAME_IS_DEFAULT_DEF (op)
> +  || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (op))))
> +return true;
> +  return gimple_uid (SSA_NAME_DEF_STMT (op)) & 1;
> +}
> +
 
Looks like this is straight from tree-ssa-loop-ch.  Do we need
is_gimple_min_invariant (is_gimple_constant could be sufficient?)
and DEFAULT_DEF for our case?  The rhs of a shift should never contain
a default def?
 
I'm not entirely happy about the "loop invariant" heuristic/proxy
of the shift amount being vectorizable.  That seems like something
that could bite us in the future in case we do slp-like vectorization
on loop-invariant (but varying) data.
 
As it helps for now and is not a correctness issue I'd still tend to
go forward with it.
 
Regards
Robin
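
For illustration, here is a minimal contrast of the two shapes being
discussed (a sketch, not from the patch; the function names are made up):

void
variant_amount (int *a, int *s, int n)
{
  for (int i = 0; i < n; i++)
    a[i] >>= s[i];   /* amount varies per element: needs a vector of amounts */
}

void
invariant_amount (int *a, int x, int n)
{
  for (int i = 0; i < n; i++)
    a[i] >>= x;      /* amount is loop invariant: a scalar vsra.vx suffices,
                        so no vector register group is needed for it */
}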
 


[Committed V2] RISC-V: Teach liveness computation loop invariant shift amount

2024-01-05 Thread Juzhe-Zhong
1). We not only have vashl_optab, vashr_optab and vlshr_optab, which vectorize
shifts with a vector shift amount,
that is, vectorization of 'a[i] >> x[i]', where the shift amount is loop variant.
2). But also we have ashl_optab, ashr_optab and lshr_optab, which can vectorize
shifts with a scalar shift amount,
that is, vectorization of 'a[i] >> x', where the shift amount is loop invariant.

For the 2) case, we don't need to allocate a vector register group for the
shift amount.

So consider the following case:

void
f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int x,
   int n)
{
  for (int i = 0; i < n; i++)
{
  int tmp = b[i] >> x;
  int tmp2 = tmp * b[i];
  c[i] = tmp2 * b[i];
  d[i] = tmp * tmp2 * b[i] >> x;
}
}

Before this patch we chose LMUL = 4; after this patch we can choose LMUL
= 8:

f:
ble a5,zero,.L5
.L3:
vsetvli a0,a5,e32,m8,ta,ma
slli a6,a0,2
vle32.v v16,0(a1)
vsra.vx v24,v16,a4
vmul.vv v8,v24,v16
vmul.vv v0,v8,v16
vse32.v v0,0(a2)
vmul.vv v8,v8,v24
vmul.vv v8,v8,v16
vsra.vx v8,v8,a4
vse32.v v8,0(a3)
add a1,a1,a6
add a2,a2,a6
add a3,a3,a6
sub a5,a5,a0
bne a5,zero,.L3
.L5:
ret

Tested on both RV32/RV64 with no regression.  OK for trunk?

Note that we will apply the same heuristic to vadd.vx, etc., once the
late-combine pass from Richard Sandiford is committed (since we need the
late-combine pass to do the vv->vx transformation for vadd).
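
A sketch of the analogous vadd case that note refers to (illustrative only,
assuming the same invariant-operand heuristic would apply; the name is made
up):

void
g (int *a, int x, int n)
{
  for (int i = 0; i < n; i++)
    a[i] += x;   /* late combine can rewrite vadd.vv as vadd.vx, so the
                    invariant x needs no vector register group either */
}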

gcc/ChangeLog:

* config/riscv/riscv-vector-costs.cc (loop_invariant_op_p): New 
function.
(variable_vectorized_p): Teach loop invariant.
(has_unexpected_spills_p): Ditto.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c: New test.
* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c: New test.

---
 gcc/config/riscv/riscv-vector-costs.cc| 31 +++--
 .../costmodel/riscv/rvv/dynamic-lmul4-12.c| 40 
 .../costmodel/riscv/rvv/dynamic-lmul8-14.c| 64 +++
 3 files changed, 131 insertions(+), 4 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c
 create mode 100644 
gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c

diff --git a/gcc/config/riscv/riscv-vector-costs.cc 
b/gcc/config/riscv/riscv-vector-costs.cc
index ec8156fbaf8..3bae581d6fd 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -230,9 +230,24 @@ get_biggest_mode (machine_mode mode1, machine_mode mode2)
   return mode1_size >= mode2_size ? mode1 : mode2;
 }
 
+/* Return true if OP is invariant.  */
+
+static bool
+loop_invariant_op_p (class loop *loop,
+tree op)
+{
+  if (is_gimple_constant (op))
+return true;
+  if (SSA_NAME_IS_DEFAULT_DEF (op)
+  || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (op))))
+return true;
+  return gimple_uid (SSA_NAME_DEF_STMT (op)) & 1;
+}
+
 /* Return true if the variable should be counted into liveness.  */
 static bool
-variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p)
+variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, tree var,
+  bool lhs_p)
 {
   if (!var)
 return false;
@@ -275,6 +290,10 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, 
bool lhs_p)
 || !tree_fits_shwi_p (var)
 || !IN_RANGE (tree_to_shwi (var), -16, 15)
 || gimple_assign_rhs1 (stmt) != var;
+   case LSHIFT_EXPR:
+   case RSHIFT_EXPR:
+ return gimple_assign_rhs2 (stmt) != var
+|| !loop_invariant_op_p (loop, var);
default:
  break;
}
@@ -312,10 +331,12 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, 
bool lhs_p)
The live range of SSA 2 is [0, 4] in bb 3.  */
 static machine_mode
 compute_local_live_ranges (
+  loop_vec_info loop_vinfo,
   const hash_map> &program_points_per_bb,
   hash_map> &live_ranges_per_bb)
 {
   machine_mode biggest_mode = QImode;
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   if (!program_points_per_bb.is_empty ())
 {
   auto_vec visited_vars;
@@ -339,7 +360,8 @@ compute_local_live_ranges (
  unsigned int point = program_point.point;
  gimple *stmt = program_point.stmt;
  tree lhs = gimple_get_lhs (stmt);
- if (variable_vectorized_p (program_point.stmt_info, lhs, true))
+ if (variable_vectorized_p (loop, program_point.stmt_info, lhs,
+true))
{
  biggest_mode = get_biggest_mode (biggest_mode,
   TYPE_MODE (TREE_TYPE (lhs)));
@@ -356,7 +378,7 @@ compute_local_live_ranges (
  for (i = 0; i < gimple_num_args (stmt); i++)
{

Re: [PATCH v2 6/8] libstdc++: Optimize std::is_pointer compilation performance

2024-01-05 Thread Ken Matsui
On Thu, Jan 4, 2024 at 2:16 PM Patrick Palka  wrote:
>
> On Thu, 4 Jan 2024, Patrick Palka wrote:
>
> > On Sat, 23 Dec 2023, Ken Matsui wrote:
> >
> > > This patch optimizes the compilation performance of std::is_pointer
> > > by dispatching to the new __is_pointer built-in trait.
> > >
> > > libstdc++-v3/ChangeLog:
> > >
> > > * include/bits/cpp_type_traits.h (__is_pointer): Use
> > > __is_pointer built-in trait.  Optimize its implementation.
> > > * include/std/type_traits (is_pointer): Likewise.
> > > (is_pointer_v): Likewise.
> > >
> > > Co-authored-by: Jonathan Wakely 
> > > Signed-off-by: Ken Matsui 
> > > ---
> > >  libstdc++-v3/include/bits/cpp_type_traits.h | 29 ++
> > >  libstdc++-v3/include/std/type_traits| 44 +
> > >  2 files changed, 65 insertions(+), 8 deletions(-)
> > >
> > > diff --git a/libstdc++-v3/include/bits/cpp_type_traits.h 
> > > b/libstdc++-v3/include/bits/cpp_type_traits.h
> > > index 4312f32a4e0..c348df97f72 100644
> > > --- a/libstdc++-v3/include/bits/cpp_type_traits.h
> > > +++ b/libstdc++-v3/include/bits/cpp_type_traits.h
> > > @@ -363,6 +363,13 @@ __INT_N(__GLIBCXX_TYPE_INT_N_3)
> > >//
> > >// Pointer types
> > >//
> > > +#if _GLIBCXX_USE_BUILTIN_TRAIT(__is_pointer)
> > > +  template<typename _Tp, bool _IsPtr = __is_pointer(_Tp)>
> > > +struct __is_pointer : __truth_type<_IsPtr>
> > > +{
> > > +  enum { __value = _IsPtr };
> > > +};
> > > +#else
> > >    template<typename _Tp>
> > >  struct __is_pointer
> > >  {
> > > @@ -377,6 +384,28 @@ __INT_N(__GLIBCXX_TYPE_INT_N_3)
> > >typedef __true_type __type;
> > >  };
> > >
> > > +  template<typename _Tp>
> > > +struct __is_pointer<_Tp* const>
> > > +{
> > > +  enum { __value = 1 };
> > > +  typedef __true_type __type;
> > > +};
> > > +
> > > +  template<typename _Tp>
> > > +struct __is_pointer<_Tp* volatile>
> > > +{
> > > +  enum { __value = 1 };
> > > +  typedef __true_type __type;
> > > +};
> > > +#endif
> > > +
> > > +  template<typename _Tp>
> > > +struct __is_pointer<_Tp* const volatile>
> > > +{
> > > +  enum { __value = 1 };
> > > +  typedef __true_type __type;
> > > +};
> > > +
> > >//
> > >// An arithmetic type is an integer type or a floating point type
> > >//
> > > diff --git a/libstdc++-v3/include/std/type_traits 
> > > b/libstdc++-v3/include/std/type_traits
> > > index 30b0778e58a..d53911b2fa0 100644
> > > --- a/libstdc++-v3/include/std/type_traits
> > > +++ b/libstdc++-v3/include/std/type_traits
> > > @@ -542,19 +542,33 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> > >  : public true_type { };
> > >  #endif
> > >
> > > -  template<typename _Tp>
> > > -struct __is_pointer_helper
> > > +  /// is_pointer
> > > +#if _GLIBCXX_USE_BUILTIN_TRAIT(__is_pointer)
> > > +  template<typename _Tp>
> > > +struct is_pointer
> > > +: public __bool_constant<__is_pointer(_Tp)>
> > > +{ };
> > > +#else
> > > +  template<typename _Tp>
> > > +struct is_pointer
> > >  : public false_type { };
> > >
> > >    template<typename _Tp>
> > > -struct __is_pointer_helper<_Tp*>
> > > +struct is_pointer<_Tp*>
> > >  : public true_type { };
> > >
> > > -  /// is_pointer
> > >    template<typename _Tp>
> > > -struct is_pointer
> > > -: public __is_pointer_helper<__remove_cv_t<_Tp>>::type
> > > -{ };
> > > +struct is_pointer<_Tp* const>
> > > +: public true_type { };
> > > +
> > > +  template<typename _Tp>
> > > +struct is_pointer<_Tp* volatile>
> > > +: public true_type { };
> > > +
> > > +  template<typename _Tp>
> > > +struct is_pointer<_Tp* const volatile>
> > > +: public true_type { };
> > > +#endif
> > >
> > >/// is_lvalue_reference
> > >template
> > > @@ -3252,8 +3266,22 @@ template 
> > >inline constexpr bool is_array_v<_Tp[_Num]> = true;
> > >  #endif
> > >
> > > +#if _GLIBCXX_USE_BUILTIN_TRAIT(__is_pointer)
> > > +template <typename _Tp>
> > > +  inline constexpr bool is_pointer_v = __is_pointer(_Tp);
> > > +#else
> > >  template <typename _Tp>
> > > -  inline constexpr bool is_pointer_v = is_pointer<_Tp>::value;
> > > +  inline constexpr bool is_pointer_v = false;
> > > +template <typename _Tp>
> > > +  inline constexpr bool is_pointer_v<_Tp*> = true;
> > > +template <typename _Tp>
> > > +  inline constexpr bool is_pointer_v<_Tp* const> = true;
> > > +template <typename _Tp>
> > > +  inline constexpr bool is_pointer_v<_Tp* volatile> = true;
> > > +template <typename _Tp>
> > > +  inline constexpr bool is_pointer_v<_Tp* const volatile> = true;
> >
> > Is this fallback implementation faster than the current implementation?
> > LGTM if so.
>

Yes.  Here are the benchmarks: 1 is current impl vs. built-in, and 2
is new impl vs. built-in.  There is no explicit benchmark that
compares the current impl and the new impl, but I think this shows the
new impl is faster.

1. https://gcc.gnu.org/pipermail/gcc-patches/2023-July/624227.html

Time: -62.1344%
Peak Memory Usage: -52.4281%
Total Memory Usage: -53.5889%

2. https://gcc.gnu.org/pipermail/gcc-patches/2023-July/624326.html

Time: -2.79488%
Peak Memory Usage: -2.39379%
Total Memory Usage: -3.39559%

> By the way, a related o
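
For readers weighing the two fallbacks: the measured speedup plausibly comes
from matching cv-qualified pointers directly instead of instantiating a
remove-cv helper for every query.  A self-contained sketch of both strategies
(simplified names, not the actual libstdc++ code):

// Old strategy: each query instantiates remove_cv_<T> plus the helper.
template<class T> struct remove_cv_                    { using type = T; };
template<class T> struct remove_cv_<const T>           { using type = T; };
template<class T> struct remove_cv_<volatile T>        { using type = T; };
template<class T> struct remove_cv_<const volatile T>  { using type = T; };

template<class T> struct is_ptr_helper     { static constexpr bool value = false; };
template<class T> struct is_ptr_helper<T*> { static constexpr bool value = true; };

template<class T>
struct old_is_pointer : is_ptr_helper<typename remove_cv_<T>::type> { };

// New strategy: each cv-qualified pointer form is matched directly,
// so no extra class template instantiations are needed.
template<class T> struct new_is_pointer                    { static constexpr bool value = false; };
template<class T> struct new_is_pointer<T*>                { static constexpr bool value = true; };
template<class T> struct new_is_pointer<T* const>          { static constexpr bool value = true; };
template<class T> struct new_is_pointer<T* volatile>       { static constexpr bool value = true; };
template<class T> struct new_is_pointer<T* const volatile> { static constexpr bool value = true; };

static_assert(old_is_pointer<int* const>::value, "old: cv pointer detected");
static_assert(new_is_pointer<int* const>::value, "new: cv pointer detected");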

Re: [PATCH v2 6/8] libstdc++: Optimize std::is_pointer compilation performance

2024-01-05 Thread Ken Matsui
On Thu, Jan 4, 2024 at 2:13 PM Jonathan Wakely  wrote:
>
> On Sat, 23 Dec 2023 at 22:07, Ken Matsui  wrote:
> >
> > This patch optimizes the compilation performance of std::is_pointer
> > by dispatching to the new __is_pointer built-in trait.
> >
> > libstdc++-v3/ChangeLog:
> >
> > * include/bits/cpp_type_traits.h (__is_pointer): Use
> > __is_pointer built-in trait.  Optimize its implementation.
> > * include/std/type_traits (is_pointer): Likewise.
> > (is_pointer_v): Likewise.
> >
> > Co-authored-by: Jonathan Wakely 
> > Signed-off-by: Ken Matsui 
> > ---
> >  libstdc++-v3/include/bits/cpp_type_traits.h | 29 ++
> >  libstdc++-v3/include/std/type_traits| 44 +
> >  2 files changed, 65 insertions(+), 8 deletions(-)
> >
> > diff --git a/libstdc++-v3/include/bits/cpp_type_traits.h 
> > b/libstdc++-v3/include/bits/cpp_type_traits.h
> > index 4312f32a4e0..c348df97f72 100644
> > --- a/libstdc++-v3/include/bits/cpp_type_traits.h
> > +++ b/libstdc++-v3/include/bits/cpp_type_traits.h
> > @@ -363,6 +363,13 @@ __INT_N(__GLIBCXX_TYPE_INT_N_3)
> >//
> >// Pointer types
> >//
> > +#if _GLIBCXX_USE_BUILTIN_TRAIT(__is_pointer)
> > +  template<typename _Tp, bool _IsPtr = __is_pointer(_Tp)>
> > +struct __is_pointer : __truth_type<_IsPtr>
> > +{
> > +  enum { __value = _IsPtr };
> > +};
> > +#else
> >    template<typename _Tp>
> >  struct __is_pointer
> >  {
> > @@ -377,6 +384,28 @@ __INT_N(__GLIBCXX_TYPE_INT_N_3)
> >typedef __true_type __type;
> >  };
> >
> > +  template<typename _Tp>
> > +struct __is_pointer<_Tp* const>
> > +{
> > +  enum { __value = 1 };
> > +  typedef __true_type __type;
> > +};
> > +
> > +  template<typename _Tp>
> > +struct __is_pointer<_Tp* volatile>
> > +{
> > +  enum { __value = 1 };
> > +  typedef __true_type __type;
> > +};
> > +#endif
> > +
> > +  template<typename _Tp>
> > +struct __is_pointer<_Tp* const volatile>
> > +{
> > +  enum { __value = 1 };
> > +  typedef __true_type __type;
> > +};
>
> Why is this partial specialization outside the #else group?
>

This is totally my fault.  Thank you for pointing this out!

>
> > +
> >//
> >// An arithmetic type is an integer type or a floating point type
> >//
> > diff --git a/libstdc++-v3/include/std/type_traits 
> > b/libstdc++-v3/include/std/type_traits
> > index 30b0778e58a..d53911b2fa0 100644
> > --- a/libstdc++-v3/include/std/type_traits
> > +++ b/libstdc++-v3/include/std/type_traits
> > @@ -542,19 +542,33 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> >  : public true_type { };
> >  #endif
> >
> > -  template<typename _Tp>
> > -struct __is_pointer_helper
> > +  /// is_pointer
> > +#if _GLIBCXX_USE_BUILTIN_TRAIT(__is_pointer)
> > +  template<typename _Tp>
> > +struct is_pointer
> > +: public __bool_constant<__is_pointer(_Tp)>
> > +{ };
> > +#else
> > +  template<typename _Tp>
> > +struct is_pointer
> >  : public false_type { };
> >
> >    template<typename _Tp>
> > -struct __is_pointer_helper<_Tp*>
> > +struct is_pointer<_Tp*>
> >  : public true_type { };
> >
> > -  /// is_pointer
> >    template<typename _Tp>
> > -struct is_pointer
> > -: public __is_pointer_helper<__remove_cv_t<_Tp>>::type
> > -{ };
> > +struct is_pointer<_Tp* const>
> > +: public true_type { };
> > +
> > +  template<typename _Tp>
> > +struct is_pointer<_Tp* volatile>
> > +: public true_type { };
> > +
> > +  template<typename _Tp>
> > +struct is_pointer<_Tp* const volatile>
> > +: public true_type { };
> > +#endif
> >
> >/// is_lvalue_reference
> >template
> > @@ -3252,8 +3266,22 @@ template 
> >inline constexpr bool is_array_v<_Tp[_Num]> = true;
> >  #endif
> >
> > +#if _GLIBCXX_USE_BUILTIN_TRAIT(__is_pointer)
> > +template <typename _Tp>
> > +  inline constexpr bool is_pointer_v = __is_pointer(_Tp);
> > +#else
> >  template <typename _Tp>
> > -  inline constexpr bool is_pointer_v = is_pointer<_Tp>::value;
> > +  inline constexpr bool is_pointer_v = false;
> > +template <typename _Tp>
> > +  inline constexpr bool is_pointer_v<_Tp*> = true;
> > +template <typename _Tp>
> > +  inline constexpr bool is_pointer_v<_Tp* const> = true;
> > +template <typename _Tp>
> > +  inline constexpr bool is_pointer_v<_Tp* volatile> = true;
> > +template <typename _Tp>
> > +  inline constexpr bool is_pointer_v<_Tp* const volatile> = true;
> > +#endif
> > +
> >  template 
> >inline constexpr bool is_lvalue_reference_v = false;
> >  template 
> > --
> > 2.43.0
> >
>


[PATCH v3 0/8] Optimize more type traits

2024-01-05 Thread Ken Matsui
Changes in v3:

- Rebased on top of master.
- Fixed __is_pointer in cpp_type_traits.h.

Changes in v2:

- Removed testsuite_tr1.h includes from the testcases.

---

This patch series implements __is_const, __is_volatile, __is_pointer,
and __is_unbounded_array built-in traits, which were isolated from my
previous patch series "Optimize type traits compilation performance"
because they appeared to cause performance regressions.  I confirmed that
this patch series does not cause any performance regression.  The main
reasons for the earlier regressions were insufficiently exhaustive
benchmarks and unstable benchmark results.  Here are the new benchmark
results:

is_const: 
https://github.com/ken-matsui/gcc-bench/blob/main/is_const.md#sat-dec-23-090605-am-pst-2023

time: -4.36603%, peak memory: -0.300891%, total memory: -0.247934%

is_volatile_v: 
https://github.com/ken-matsui/gcc-bench/blob/main/is_volatile_v.md#sat-dec-23-091518-am-pst-2023

time: -4.06816%, peak memory: -0.609298%, total memory: -0.659134%

is_pointer: 
https://github.com/ken-matsui/gcc-bench/blob/main/is_pointer.md#sat-dec-23-124903-pm-pst-2023

time: -2.47124%, peak memory: -2.98207%, total memory: -4.0811%

is_unbounded_array_v: 
https://github.com/ken-matsui/gcc-bench/blob/main/is_unbounded_array_v.md#sat-dec-23-010046-pm-pst-2023

time: -1.50025%, peak memory: -1.07386%, total memory: -2.32394%

Ken Matsui (8):
  c++: Implement __is_const built-in trait
  libstdc++: Optimize std::is_const compilation performance
  c++: Implement __is_volatile built-in trait
  libstdc++: Optimize std::is_volatile compilation performance
  c++: Implement __is_pointer built-in trait
  libstdc++: Optimize std::is_pointer compilation performance
  c++: Implement __is_unbounded_array built-in trait
  libstdc++: Optimize std::is_unbounded_array compilation performance

 gcc/cp/constraint.cc  | 12 +++
 gcc/cp/cp-trait.def   |  4 +
 gcc/cp/semantics.cc   | 16 
 gcc/testsuite/g++.dg/ext/has-builtin-1.C  | 12 +++
 gcc/testsuite/g++.dg/ext/is_const.C   | 20 +
 gcc/testsuite/g++.dg/ext/is_pointer.C | 51 +
 gcc/testsuite/g++.dg/ext/is_unbounded_array.C | 37 ++
 gcc/testsuite/g++.dg/ext/is_volatile.C| 20 +
 libstdc++-v3/include/bits/cpp_type_traits.h   | 31 +++-
 libstdc++-v3/include/std/type_traits  | 73 +--
 10 files changed, 267 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/ext/is_const.C
 create mode 100644 gcc/testsuite/g++.dg/ext/is_pointer.C
 create mode 100644 gcc/testsuite/g++.dg/ext/is_unbounded_array.C
 create mode 100644 gcc/testsuite/g++.dg/ext/is_volatile.C

-- 
2.43.0



[PATCH v3 3/8] c++: Implement __is_volatile built-in trait

2024-01-05 Thread Ken Matsui
This patch implements a built-in trait for std::is_volatile.

gcc/cp/ChangeLog:

* cp-trait.def: Define __is_volatile.
* constraint.cc (diagnose_trait_expr): Handle CPTK_IS_VOLATILE.
* semantics.cc (trait_expr_value): Likewise.
(finish_trait_expr): Likewise.

gcc/testsuite/ChangeLog:

* g++.dg/ext/has-builtin-1.C: Test existence of __is_volatile.
* g++.dg/ext/is_volatile.C: New test.

Signed-off-by: Ken Matsui 
---
 gcc/cp/constraint.cc |  3 +++
 gcc/cp/cp-trait.def  |  1 +
 gcc/cp/semantics.cc  |  4 
 gcc/testsuite/g++.dg/ext/has-builtin-1.C |  3 +++
 gcc/testsuite/g++.dg/ext/is_volatile.C   | 20 
 5 files changed, 31 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/ext/is_volatile.C

diff --git a/gcc/cp/constraint.cc b/gcc/cp/constraint.cc
index 669803b586c..6f13546b9b2 100644
--- a/gcc/cp/constraint.cc
+++ b/gcc/cp/constraint.cc
@@ -3828,6 +3828,9 @@ diagnose_trait_expr (tree expr, tree args)
 case CPTK_IS_UNION:
   inform (loc, "  %qT is not a union", t1);
   break;
+case CPTK_IS_VOLATILE:
+  inform (loc, "  %qT is not a volatile type", t1);
+  break;
 case CPTK_REF_CONSTRUCTS_FROM_TEMPORARY:
   inform (loc, "  %qT is not a reference that binds to a temporary "
  "object of type %qT (direct-initialization)", t1, t2);
diff --git a/gcc/cp/cp-trait.def b/gcc/cp/cp-trait.def
index 36faed9c0b3..e9347453829 100644
--- a/gcc/cp/cp-trait.def
+++ b/gcc/cp/cp-trait.def
@@ -92,6 +92,7 @@ DEFTRAIT_EXPR (IS_TRIVIALLY_ASSIGNABLE, 
"__is_trivially_assignable", 2)
 DEFTRAIT_EXPR (IS_TRIVIALLY_CONSTRUCTIBLE, "__is_trivially_constructible", -1)
 DEFTRAIT_EXPR (IS_TRIVIALLY_COPYABLE, "__is_trivially_copyable", 1)
 DEFTRAIT_EXPR (IS_UNION, "__is_union", 1)
+DEFTRAIT_EXPR (IS_VOLATILE, "__is_volatile", 1)
 DEFTRAIT_EXPR (REF_CONSTRUCTS_FROM_TEMPORARY, 
"__reference_constructs_from_temporary", 2)
 DEFTRAIT_EXPR (REF_CONVERTS_FROM_TEMPORARY, 
"__reference_converts_from_temporary", 2)
 DEFTRAIT_TYPE (REMOVE_CV, "__remove_cv", 1)
diff --git a/gcc/cp/semantics.cc b/gcc/cp/semantics.cc
index 1a6f08c37ec..0f304cdc642 100644
--- a/gcc/cp/semantics.cc
+++ b/gcc/cp/semantics.cc
@@ -12501,6 +12501,9 @@ trait_expr_value (cp_trait_kind kind, tree type1, tree 
type2)
 case CPTK_IS_UNION:
   return type_code1 == UNION_TYPE;
 
+case CPTK_IS_VOLATILE:
+  return CP_TYPE_VOLATILE_P (type1);
+
 case CPTK_REF_CONSTRUCTS_FROM_TEMPORARY:
   return ref_xes_from_temporary (type1, type2, /*direct_init=*/true);
 
@@ -12671,6 +12674,7 @@ finish_trait_expr (location_t loc, cp_trait_kind kind, 
tree type1, tree type2)
 case CPTK_IS_SAME:
 case CPTK_IS_SCOPED_ENUM:
 case CPTK_IS_UNION:
+case CPTK_IS_VOLATILE:
   break;
 
 case CPTK_IS_LAYOUT_COMPATIBLE:
diff --git a/gcc/testsuite/g++.dg/ext/has-builtin-1.C 
b/gcc/testsuite/g++.dg/ext/has-builtin-1.C
index e3640faeb96..b2e2f2f694d 100644
--- a/gcc/testsuite/g++.dg/ext/has-builtin-1.C
+++ b/gcc/testsuite/g++.dg/ext/has-builtin-1.C
@@ -158,6 +158,9 @@
 #if !__has_builtin (__is_union)
 # error "__has_builtin (__is_union) failed"
 #endif
+#if !__has_builtin (__is_volatile)
+# error "__has_builtin (__is_volatile) failed"
+#endif
 #if !__has_builtin (__reference_constructs_from_temporary)
 # error "__has_builtin (__reference_constructs_from_temporary) failed"
 #endif
diff --git a/gcc/testsuite/g++.dg/ext/is_volatile.C 
b/gcc/testsuite/g++.dg/ext/is_volatile.C
new file mode 100644
index 000..80a1cfc880d
--- /dev/null
+++ b/gcc/testsuite/g++.dg/ext/is_volatile.C
@@ -0,0 +1,20 @@
+// { dg-do compile { target c++11 } }
+
+#define SA(X) static_assert((X),#X)
+
+class ClassType { };
+using cClassType = const ClassType;
+using vClassType = volatile ClassType;
+using cvClassType = const volatile ClassType;
+
+// Positive tests.
+SA(__is_volatile(volatile int));
+SA(__is_volatile(const volatile int));
+SA(__is_volatile(vClassType));
+SA(__is_volatile(cvClassType));
+
+// Negative tests.
+SA(!__is_volatile(int));
+SA(!__is_volatile(const int));
+SA(!__is_volatile(ClassType));
+SA(!__is_volatile(cClassType));
-- 
2.43.0



[PATCH v3 2/8] libstdc++: Optimize std::is_const compilation performance

2024-01-05 Thread Ken Matsui
This patch optimizes the compilation performance of std::is_const
by dispatching to the new __is_const built-in trait.

libstdc++-v3/ChangeLog:

* include/std/type_traits (is_const): Use __is_const built-in
trait.
(is_const_v): Likewise.

Signed-off-by: Ken Matsui 
---
 libstdc++-v3/include/std/type_traits | 12 
 1 file changed, 12 insertions(+)

diff --git a/libstdc++-v3/include/std/type_traits 
b/libstdc++-v3/include/std/type_traits
index 3b1b419..e64ed1de891 100644
--- a/libstdc++-v3/include/std/type_traits
+++ b/libstdc++-v3/include/std/type_traits
@@ -835,6 +835,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   // Type properties.
 
   /// is_const
+#if _GLIBCXX_USE_BUILTIN_TRAIT(__is_const)
+  template<typename _Tp>
+struct is_const
+: public __bool_constant<__is_const(_Tp)>
+{ };
+#else
   template<typename _Tp>
 struct is_const
 : public false_type { };
@@ -842,6 +848,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   template<typename _Tp>
 struct is_const<_Tp const>
 : public true_type { };
+#endif
 
   /// is_volatile
   template
@@ -3315,10 +3322,15 @@ template 
   inline constexpr bool is_member_pointer_v = is_member_pointer<_Tp>::value;
 #endif
 
+#if _GLIBCXX_USE_BUILTIN_TRAIT(__is_const)
+template <typename _Tp>
+  inline constexpr bool is_const_v = __is_const(_Tp);
+#else
template <typename _Tp>
  inline constexpr bool is_const_v = false;
template <typename _Tp>
  inline constexpr bool is_const_v<const _Tp> = true;
+#endif
 
 #if _GLIBCXX_USE_BUILTIN_TRAIT(__is_function)
 template 
-- 
2.43.0



[PATCH v3 4/8] libstdc++: Optimize std::is_volatile compilation performance

2024-01-05 Thread Ken Matsui
This patch optimizes the compilation performance of std::is_volatile
by dispatching to the new __is_volatile built-in trait.

libstdc++-v3/ChangeLog:

* include/std/type_traits (is_volatile): Use __is_volatile
built-in trait.
(is_volatile_v): Likewise.

Signed-off-by: Ken Matsui 
---
 libstdc++-v3/include/std/type_traits | 12 
 1 file changed, 12 insertions(+)

diff --git a/libstdc++-v3/include/std/type_traits 
b/libstdc++-v3/include/std/type_traits
index e64ed1de891..2bfc31b141d 100644
--- a/libstdc++-v3/include/std/type_traits
+++ b/libstdc++-v3/include/std/type_traits
@@ -851,6 +851,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #endif
 
   /// is_volatile
+#if _GLIBCXX_USE_BUILTIN_TRAIT(__is_volatile)
+  template<typename _Tp>
+struct is_volatile
+: public __bool_constant<__is_volatile(_Tp)>
+{ };
+#else
   template<typename _Tp>
 struct is_volatile
 : public false_type { };
@@ -858,6 +864,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   template<typename _Tp>
 struct is_volatile<_Tp volatile>
 : public true_type { };
+#endif
 
   /// is_trivial
   template
@@ -3344,10 +3351,15 @@ template 
   inline constexpr bool is_function_v<_Tp&&> = false;
 #endif
 
+#if _GLIBCXX_USE_BUILTIN_TRAIT(__is_volatile)
+template <typename _Tp>
+  inline constexpr bool is_volatile_v = __is_volatile(_Tp);
+#else
template <typename _Tp>
  inline constexpr bool is_volatile_v = false;
template <typename _Tp>
  inline constexpr bool is_volatile_v<_Tp volatile> = true;
+#endif
 
 template 
   inline constexpr bool is_trivial_v = __is_trivial(_Tp);
-- 
2.43.0



[PATCH v3 7/8] c++: Implement __is_unbounded_array built-in trait

2024-01-05 Thread Ken Matsui
This patch implements a built-in trait for std::is_unbounded_array.

gcc/cp/ChangeLog:

* cp-trait.def: Define __is_unbounded_array.
* constraint.cc (diagnose_trait_expr): Handle
CPTK_IS_UNBOUNDED_ARRAY.
* semantics.cc (trait_expr_value): Likewise.
(finish_trait_expr): Likewise.

gcc/testsuite/ChangeLog:

* g++.dg/ext/has-builtin-1.C: Test existence of
__is_unbounded_array.
* g++.dg/ext/is_unbounded_array.C: New test.

Signed-off-by: Ken Matsui 
---
 gcc/cp/constraint.cc  |  3 ++
 gcc/cp/cp-trait.def   |  1 +
 gcc/cp/semantics.cc   |  4 ++
 gcc/testsuite/g++.dg/ext/has-builtin-1.C  |  3 ++
 gcc/testsuite/g++.dg/ext/is_unbounded_array.C | 37 +++
 5 files changed, 48 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/ext/is_unbounded_array.C

diff --git a/gcc/cp/constraint.cc b/gcc/cp/constraint.cc
index 3d7c0509f6b..9454739a76f 100644
--- a/gcc/cp/constraint.cc
+++ b/gcc/cp/constraint.cc
@@ -3828,6 +3828,9 @@ diagnose_trait_expr (tree expr, tree args)
 case CPTK_IS_TRIVIALLY_COPYABLE:
   inform (loc, "  %qT is not trivially copyable", t1);
   break;
+case CPTK_IS_UNBOUNDED_ARRAY:
+  inform (loc, "  %qT is not an unbounded array", t1);
+  break;
 case CPTK_IS_UNION:
   inform (loc, "  %qT is not a union", t1);
   break;
diff --git a/gcc/cp/cp-trait.def b/gcc/cp/cp-trait.def
index 18e2d0f3480..05514a51c21 100644
--- a/gcc/cp/cp-trait.def
+++ b/gcc/cp/cp-trait.def
@@ -92,6 +92,7 @@ DEFTRAIT_EXPR (IS_TRIVIAL, "__is_trivial", 1)
 DEFTRAIT_EXPR (IS_TRIVIALLY_ASSIGNABLE, "__is_trivially_assignable", 2)
 DEFTRAIT_EXPR (IS_TRIVIALLY_CONSTRUCTIBLE, "__is_trivially_constructible", -1)
 DEFTRAIT_EXPR (IS_TRIVIALLY_COPYABLE, "__is_trivially_copyable", 1)
+DEFTRAIT_EXPR (IS_UNBOUNDED_ARRAY, "__is_unbounded_array", 1)
 DEFTRAIT_EXPR (IS_UNION, "__is_union", 1)
 DEFTRAIT_EXPR (IS_VOLATILE, "__is_volatile", 1)
 DEFTRAIT_EXPR (REF_CONSTRUCTS_FROM_TEMPORARY, 
"__reference_constructs_from_temporary", 2)
diff --git a/gcc/cp/semantics.cc b/gcc/cp/semantics.cc
index 4b75ff744d4..a2ab945e50a 100644
--- a/gcc/cp/semantics.cc
+++ b/gcc/cp/semantics.cc
@@ -12501,6 +12501,9 @@ trait_expr_value (cp_trait_kind kind, tree type1, tree 
type2)
 case CPTK_IS_TRIVIALLY_COPYABLE:
   return trivially_copyable_p (type1);
 
+case CPTK_IS_UNBOUNDED_ARRAY:
+  return array_of_unknown_bound_p (type1);
+
 case CPTK_IS_UNION:
   return type_code1 == UNION_TYPE;
 
@@ -12677,6 +12680,7 @@ finish_trait_expr (location_t loc, cp_trait_kind kind, 
tree type1, tree type2)
 case CPTK_IS_REFERENCE:
 case CPTK_IS_SAME:
 case CPTK_IS_SCOPED_ENUM:
+case CPTK_IS_UNBOUNDED_ARRAY:
 case CPTK_IS_UNION:
 case CPTK_IS_VOLATILE:
   break;
diff --git a/gcc/testsuite/g++.dg/ext/has-builtin-1.C 
b/gcc/testsuite/g++.dg/ext/has-builtin-1.C
index 96b7a89e4f1..b1430e9bd8b 100644
--- a/gcc/testsuite/g++.dg/ext/has-builtin-1.C
+++ b/gcc/testsuite/g++.dg/ext/has-builtin-1.C
@@ -158,6 +158,9 @@
 #if !__has_builtin (__is_trivially_copyable)
 # error "__has_builtin (__is_trivially_copyable) failed"
 #endif
+#if !__has_builtin (__is_unbounded_array)
+# error "__has_builtin (__is_unbounded_array) failed"
+#endif
 #if !__has_builtin (__is_union)
 # error "__has_builtin (__is_union) failed"
 #endif
diff --git a/gcc/testsuite/g++.dg/ext/is_unbounded_array.C 
b/gcc/testsuite/g++.dg/ext/is_unbounded_array.C
new file mode 100644
index 000..283a74e1a0a
--- /dev/null
+++ b/gcc/testsuite/g++.dg/ext/is_unbounded_array.C
@@ -0,0 +1,37 @@
+// { dg-do compile { target c++11 } }
+
+#define SA(X) static_assert((X),#X)
+
+#define SA_TEST_CATEGORY(TRAIT, TYPE, EXPECT)  \
+  SA(TRAIT(TYPE) == EXPECT);   \
+  SA(TRAIT(const TYPE) == EXPECT); \
+  SA(TRAIT(volatile TYPE) == EXPECT);  \
+  SA(TRAIT(const volatile TYPE) == EXPECT)
+
+class ClassType { };
+class IncompleteClass;
+union IncompleteUnion;
+
+SA_TEST_CATEGORY(__is_unbounded_array, int[2], false);
+SA_TEST_CATEGORY(__is_unbounded_array, int[], true);
+SA_TEST_CATEGORY(__is_unbounded_array, int[2][3], false);
+SA_TEST_CATEGORY(__is_unbounded_array, int[][3], true);
+SA_TEST_CATEGORY(__is_unbounded_array, float*[2], false);
+SA_TEST_CATEGORY(__is_unbounded_array, float*[], true);
+SA_TEST_CATEGORY(__is_unbounded_array, float*[2][3], false);
+SA_TEST_CATEGORY(__is_unbounded_array, float*[][3], true);
+SA_TEST_CATEGORY(__is_unbounded_array, ClassType[2], false);
+SA_TEST_CATEGORY(__is_unbounded_array, ClassType[], true);
+SA_TEST_CATEGORY(__is_unbounded_array, ClassType[2][3], false);
+SA_TEST_CATEGORY(__is_unbounded_array, ClassType[][3], true);
+SA_TEST_CATEGORY(__is_unbounded_array, IncompleteClass[2][3], false);
+SA_TEST_CATEGORY(__is_unbounded_array, IncompleteClass[][3], true);
+SA_TEST_CATEGORY(__is_unbounded_array, in

[PATCH v3 5/8] c++: Implement __is_pointer built-in trait

2024-01-05 Thread Ken Matsui
This patch implements a built-in trait for std::is_pointer.

gcc/cp/ChangeLog:

* cp-trait.def: Define __is_pointer.
* constraint.cc (diagnose_trait_expr): Handle CPTK_IS_POINTER.
* semantics.cc (trait_expr_value): Likewise.
(finish_trait_expr): Likewise.

gcc/testsuite/ChangeLog:

* g++.dg/ext/has-builtin-1.C: Test existence of __is_pointer.
* g++.dg/ext/is_pointer.C: New test.

Signed-off-by: Ken Matsui 
---
 gcc/cp/constraint.cc |  3 ++
 gcc/cp/cp-trait.def  |  1 +
 gcc/cp/semantics.cc  |  4 ++
 gcc/testsuite/g++.dg/ext/has-builtin-1.C |  3 ++
 gcc/testsuite/g++.dg/ext/is_pointer.C| 51 
 5 files changed, 62 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/ext/is_pointer.C

diff --git a/gcc/cp/constraint.cc b/gcc/cp/constraint.cc
index 6f13546b9b2..3d7c0509f6b 100644
--- a/gcc/cp/constraint.cc
+++ b/gcc/cp/constraint.cc
@@ -3795,6 +3795,9 @@ diagnose_trait_expr (tree expr, tree args)
 case CPTK_IS_POD:
   inform (loc, "  %qT is not a POD type", t1);
   break;
+case CPTK_IS_POINTER:
+  inform (loc, "  %qT is not a pointer", t1);
+  break;
 case CPTK_IS_POLYMORPHIC:
   inform (loc, "  %qT is not a polymorphic type", t1);
   break;
diff --git a/gcc/cp/cp-trait.def b/gcc/cp/cp-trait.def
index e9347453829..18e2d0f3480 100644
--- a/gcc/cp/cp-trait.def
+++ b/gcc/cp/cp-trait.def
@@ -82,6 +82,7 @@ DEFTRAIT_EXPR (IS_NOTHROW_CONVERTIBLE, 
"__is_nothrow_convertible", 2)
 DEFTRAIT_EXPR (IS_OBJECT, "__is_object", 1)
 DEFTRAIT_EXPR (IS_POINTER_INTERCONVERTIBLE_BASE_OF, 
"__is_pointer_interconvertible_base_of", 2)
 DEFTRAIT_EXPR (IS_POD, "__is_pod", 1)
+DEFTRAIT_EXPR (IS_POINTER, "__is_pointer", 1)
 DEFTRAIT_EXPR (IS_POLYMORPHIC, "__is_polymorphic", 1)
 DEFTRAIT_EXPR (IS_REFERENCE, "__is_reference", 1)
 DEFTRAIT_EXPR (IS_SAME, "__is_same", 2)
diff --git a/gcc/cp/semantics.cc b/gcc/cp/semantics.cc
index 0f304cdc642..4b75ff744d4 100644
--- a/gcc/cp/semantics.cc
+++ b/gcc/cp/semantics.cc
@@ -12471,6 +12471,9 @@ trait_expr_value (cp_trait_kind kind, tree type1, tree 
type2)
 case CPTK_IS_POD:
   return pod_type_p (type1);
 
+case CPTK_IS_POINTER:
+  return TYPE_PTR_P (type1);
+
 case CPTK_IS_POLYMORPHIC:
   return CLASS_TYPE_P (type1) && TYPE_POLYMORPHIC_P (type1);
 
@@ -12670,6 +12673,7 @@ finish_trait_expr (location_t loc, cp_trait_kind kind, 
tree type1, tree type2)
 case CPTK_IS_MEMBER_OBJECT_POINTER:
 case CPTK_IS_MEMBER_POINTER:
 case CPTK_IS_OBJECT:
+case CPTK_IS_POINTER:
 case CPTK_IS_REFERENCE:
 case CPTK_IS_SAME:
 case CPTK_IS_SCOPED_ENUM:
diff --git a/gcc/testsuite/g++.dg/ext/has-builtin-1.C 
b/gcc/testsuite/g++.dg/ext/has-builtin-1.C
index b2e2f2f694d..96b7a89e4f1 100644
--- a/gcc/testsuite/g++.dg/ext/has-builtin-1.C
+++ b/gcc/testsuite/g++.dg/ext/has-builtin-1.C
@@ -125,6 +125,9 @@
 #if !__has_builtin (__is_pod)
 # error "__has_builtin (__is_pod) failed"
 #endif
+#if !__has_builtin (__is_pointer)
+# error "__has_builtin (__is_pointer) failed"
+#endif
 #if !__has_builtin (__is_polymorphic)
 # error "__has_builtin (__is_polymorphic) failed"
 #endif
diff --git a/gcc/testsuite/g++.dg/ext/is_pointer.C 
b/gcc/testsuite/g++.dg/ext/is_pointer.C
new file mode 100644
index 000..d6e39565950
--- /dev/null
+++ b/gcc/testsuite/g++.dg/ext/is_pointer.C
@@ -0,0 +1,51 @@
+// { dg-do compile { target c++11 } }
+
+#define SA(X) static_assert((X),#X)
+
+SA(!__is_pointer(int));
+SA(__is_pointer(int*));
+SA(__is_pointer(int**));
+
+SA(__is_pointer(const int*));
+SA(__is_pointer(const int**));
+SA(__is_pointer(int* const));
+SA(__is_pointer(int** const));
+SA(__is_pointer(int* const* const));
+
+SA(__is_pointer(volatile int*));
+SA(__is_pointer(volatile int**));
+SA(__is_pointer(int* volatile));
+SA(__is_pointer(int** volatile));
+SA(__is_pointer(int* volatile* volatile));
+
+SA(__is_pointer(const volatile int*));
+SA(__is_pointer(const volatile int**));
+SA(__is_pointer(const int* volatile));
+SA(__is_pointer(volatile int* const));
+SA(__is_pointer(int* const volatile));
+SA(__is_pointer(const int** volatile));
+SA(__is_pointer(volatile int** const));
+SA(__is_pointer(int** const volatile));
+SA(__is_pointer(int* const* const volatile));
+SA(__is_pointer(int* volatile* const volatile));
+SA(__is_pointer(int* const volatile* const volatile));
+
+SA(!__is_pointer(int&));
+SA(!__is_pointer(const int&));
+SA(!__is_pointer(volatile int&));
+SA(!__is_pointer(const volatile int&));
+
+SA(!__is_pointer(int&&));
+SA(!__is_pointer(const int&&));
+SA(!__is_pointer(volatile int&&));
+SA(!__is_pointer(const volatile int&&));
+
+SA(!__is_pointer(int[3]));
+SA(!__is_pointer(const int[3]));
+SA(!__is_pointer(volatile int[3]));
+SA(!__is_pointer(const volatile int[3]));
+
+SA(!__is_pointer(int(int)));
+SA(__is_pointer(int(*const)(int)));
+SA(__is_pointer(int(*volatile)(int)));
+SA(__is_pointer(int(*const volatile)(int)

[PATCH v3 6/8] libstdc++: Optimize std::is_pointer compilation performance

2024-01-05 Thread Ken Matsui
This patch optimizes the compilation performance of std::is_pointer
by dispatching to the new __is_pointer built-in trait.

libstdc++-v3/ChangeLog:

* include/bits/cpp_type_traits.h (__is_pointer): Use
__is_pointer built-in trait.  Optimize its implementation.
* include/std/type_traits (is_pointer): Likewise.
(is_pointer_v): Likewise.

Co-authored-by: Jonathan Wakely 
Signed-off-by: Ken Matsui 
---
 libstdc++-v3/include/bits/cpp_type_traits.h | 31 ++-
 libstdc++-v3/include/std/type_traits| 44 +
 2 files changed, 66 insertions(+), 9 deletions(-)

diff --git a/libstdc++-v3/include/bits/cpp_type_traits.h 
b/libstdc++-v3/include/bits/cpp_type_traits.h
index 59f1a1875eb..210a9ea00da 100644
--- a/libstdc++-v3/include/bits/cpp_type_traits.h
+++ b/libstdc++-v3/include/bits/cpp_type_traits.h
@@ -363,6 +363,13 @@ __INT_N(__GLIBCXX_TYPE_INT_N_3)
   //
   // Pointer types
   //
+#if _GLIBCXX_USE_BUILTIN_TRAIT(__is_pointer)
+  template<typename _Tp, bool _IsPtr = __is_pointer(_Tp)>
+struct __is_pointer : __truth_type<_IsPtr>
+{
+  enum { __value = _IsPtr };
+};
+#else
   template<typename _Tp>
 struct __is_pointer
 {
@@ -377,6 +384,28 @@ __INT_N(__GLIBCXX_TYPE_INT_N_3)
   typedef __true_type __type;
 };
 
+  template<typename _Tp>
+struct __is_pointer<_Tp* const>
+{
+  enum { __value = 1 };
+  typedef __true_type __type;
+};
+
+  template<typename _Tp>
+struct __is_pointer<_Tp* volatile>
+{
+  enum { __value = 1 };
+  typedef __true_type __type;
+};
+
+  template<typename _Tp>
+struct __is_pointer<_Tp* const volatile>
+{
+  enum { __value = 1 };
+  typedef __true_type __type;
+};
+#endif
+
   //
   // An arithmetic type is an integer type or a floating point type
   //
@@ -387,7 +416,7 @@ __INT_N(__GLIBCXX_TYPE_INT_N_3)
 
   //
   // A scalar type is an arithmetic type or a pointer type
-  // 
+  //
   template<typename _Tp>
 struct __is_scalar
 : public __traitor<__is_arithmetic<_Tp>, __is_pointer<_Tp> >
diff --git a/libstdc++-v3/include/std/type_traits 
b/libstdc++-v3/include/std/type_traits
index 2bfc31b141d..018bf20ba1d 100644
--- a/libstdc++-v3/include/std/type_traits
+++ b/libstdc++-v3/include/std/type_traits
@@ -542,19 +542,33 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 : public true_type { };
 #endif
 
-  template<typename _Tp>
-struct __is_pointer_helper
+  /// is_pointer
+#if _GLIBCXX_USE_BUILTIN_TRAIT(__is_pointer)
+  template<typename _Tp>
+struct is_pointer
+: public __bool_constant<__is_pointer(_Tp)>
+{ };
+#else
+  template<typename _Tp>
+struct is_pointer
 : public false_type { };
 
   template<typename _Tp>
-struct __is_pointer_helper<_Tp*>
+struct is_pointer<_Tp*>
 : public true_type { };
 
-  /// is_pointer
   template<typename _Tp>
-struct is_pointer
-: public __is_pointer_helper<__remove_cv_t<_Tp>>::type
-{ };
+struct is_pointer<_Tp* const>
+: public true_type { };
+
+  template<typename _Tp>
+struct is_pointer<_Tp* volatile>
+: public true_type { };
+
+  template<typename _Tp>
+struct is_pointer<_Tp* const volatile>
+: public true_type { };
+#endif
 
   /// is_lvalue_reference
   template
@@ -3252,8 +3266,22 @@ template 
   inline constexpr bool is_array_v<_Tp[_Num]> = true;
 #endif
 
+#if _GLIBCXX_USE_BUILTIN_TRAIT(__is_pointer)
+template <typename _Tp>
+  inline constexpr bool is_pointer_v = __is_pointer(_Tp);
+#else
template <typename _Tp>
-  inline constexpr bool is_pointer_v = is_pointer<_Tp>::value;
+  inline constexpr bool is_pointer_v = false;
+template <typename _Tp>
+  inline constexpr bool is_pointer_v<_Tp*> = true;
+template <typename _Tp>
+  inline constexpr bool is_pointer_v<_Tp* const> = true;
+template <typename _Tp>
+  inline constexpr bool is_pointer_v<_Tp* volatile> = true;
+template <typename _Tp>
+  inline constexpr bool is_pointer_v<_Tp* const volatile> = true;
+#endif
+
 template <typename _Tp>
   inline constexpr bool is_lvalue_reference_v = false;
 template <typename _Tp>
-- 
2.43.0



[PATCH v3 1/8] c++: Implement __is_const built-in trait

2024-01-05 Thread Ken Matsui
This patch implements a built-in trait for std::is_const.

gcc/cp/ChangeLog:

* cp-trait.def: Define __is_const.
* constraint.cc (diagnose_trait_expr): Handle CPTK_IS_CONST.
* semantics.cc (trait_expr_value): Likewise.
(finish_trait_expr): Likewise.

gcc/testsuite/ChangeLog:

* g++.dg/ext/has-builtin-1.C: Test existence of __is_const.
* g++.dg/ext/is_const.C: New test.

Signed-off-by: Ken Matsui 
---
 gcc/cp/constraint.cc |  3 +++
 gcc/cp/cp-trait.def  |  1 +
 gcc/cp/semantics.cc  |  4 
 gcc/testsuite/g++.dg/ext/has-builtin-1.C |  3 +++
 gcc/testsuite/g++.dg/ext/is_const.C  | 20 
 5 files changed, 31 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/ext/is_const.C

diff --git a/gcc/cp/constraint.cc b/gcc/cp/constraint.cc
index fef68cf7ab2..669803b586c 100644
--- a/gcc/cp/constraint.cc
+++ b/gcc/cp/constraint.cc
@@ -3734,6 +3734,9 @@ diagnose_trait_expr (tree expr, tree args)
 case CPTK_IS_CLASS:
   inform (loc, "  %qT is not a class", t1);
   break;
+case CPTK_IS_CONST:
+  inform (loc, "  %qT is not a const type", t1);
+  break;
 case CPTK_IS_CONSTRUCTIBLE:
   if (!t2)
 inform (loc, "  %qT is not default constructible", t1);
diff --git a/gcc/cp/cp-trait.def b/gcc/cp/cp-trait.def
index 394f006f20f..36faed9c0b3 100644
--- a/gcc/cp/cp-trait.def
+++ b/gcc/cp/cp-trait.def
@@ -64,6 +64,7 @@ DEFTRAIT_EXPR (IS_ASSIGNABLE, "__is_assignable", 2)
 DEFTRAIT_EXPR (IS_BASE_OF, "__is_base_of", 2)
 DEFTRAIT_EXPR (IS_BOUNDED_ARRAY, "__is_bounded_array", 1)
 DEFTRAIT_EXPR (IS_CLASS, "__is_class", 1)
+DEFTRAIT_EXPR (IS_CONST, "__is_const", 1)
 DEFTRAIT_EXPR (IS_CONSTRUCTIBLE, "__is_constructible", -1)
 DEFTRAIT_EXPR (IS_CONVERTIBLE, "__is_convertible", 2)
 DEFTRAIT_EXPR (IS_EMPTY, "__is_empty", 1)
diff --git a/gcc/cp/semantics.cc b/gcc/cp/semantics.cc
index 082fe2db4f2..1a6f08c37ec 100644
--- a/gcc/cp/semantics.cc
+++ b/gcc/cp/semantics.cc
@@ -12415,6 +12415,9 @@ trait_expr_value (cp_trait_kind kind, tree type1, tree 
type2)
 case CPTK_IS_CLASS:
   return NON_UNION_CLASS_TYPE_P (type1);
 
+case CPTK_IS_CONST:
+  return CP_TYPE_CONST_P (type1);
+
 case CPTK_IS_CONSTRUCTIBLE:
   return is_xible (INIT_EXPR, type1, type2);
 
@@ -12657,6 +12660,7 @@ finish_trait_expr (location_t loc, cp_trait_kind kind, 
tree type1, tree type2)
 case CPTK_IS_ARRAY:
 case CPTK_IS_BOUNDED_ARRAY:
 case CPTK_IS_CLASS:
+case CPTK_IS_CONST:
 case CPTK_IS_ENUM:
 case CPTK_IS_FUNCTION:
 case CPTK_IS_MEMBER_FUNCTION_POINTER:
diff --git a/gcc/testsuite/g++.dg/ext/has-builtin-1.C 
b/gcc/testsuite/g++.dg/ext/has-builtin-1.C
index 02b4b4d745d..e3640faeb96 100644
--- a/gcc/testsuite/g++.dg/ext/has-builtin-1.C
+++ b/gcc/testsuite/g++.dg/ext/has-builtin-1.C
@@ -71,6 +71,9 @@
 #if !__has_builtin (__is_class)
 # error "__has_builtin (__is_class) failed"
 #endif
+#if !__has_builtin (__is_const)
+# error "__has_builtin (__is_const) failed"
+#endif
 #if !__has_builtin (__is_constructible)
 # error "__has_builtin (__is_constructible) failed"
 #endif
diff --git a/gcc/testsuite/g++.dg/ext/is_const.C 
b/gcc/testsuite/g++.dg/ext/is_const.C
new file mode 100644
index 000..8a0e8df72a9
--- /dev/null
+++ b/gcc/testsuite/g++.dg/ext/is_const.C
@@ -0,0 +1,20 @@
+// { dg-do compile { target c++11 } }
+
+#define SA(X) static_assert((X),#X)
+
+class ClassType { };
+using cClassType = const ClassType;
+using vClassType = volatile ClassType;
+using cvClassType = const volatile ClassType;
+
+// Positive tests.
+SA(__is_const(const int));
+SA(__is_const(const volatile int));
+SA(__is_const(cClassType));
+SA(__is_const(cvClassType));
+
+// Negative tests.
+SA(!__is_const(int));
+SA(!__is_const(volatile int));
+SA(!__is_const(ClassType));
+SA(!__is_const(vClassType));
-- 
2.43.0



[PATCH v3 8/8] libstdc++: Optimize std::is_unbounded_array compilation performance

2024-01-05 Thread Ken Matsui
This patch optimizes the compilation performance of
std::is_unbounded_array by dispatching to the new
__is_unbounded_array built-in trait.

libstdc++-v3/ChangeLog:

* include/std/type_traits (is_unbounded_array_v): Use
__is_unbounded_array built-in trait.

Signed-off-by: Ken Matsui 
---
 libstdc++-v3/include/std/type_traits | 5 +
 1 file changed, 5 insertions(+)

diff --git a/libstdc++-v3/include/std/type_traits 
b/libstdc++-v3/include/std/type_traits
index 018bf20ba1d..36344a014c3 100644
--- a/libstdc++-v3/include/std/type_traits
+++ b/libstdc++-v3/include/std/type_traits
@@ -3675,11 +3675,16 @@ template
   /// True for a type that is an array of unknown bound.
   /// @ingroup variable_templates
   /// @since C++20
+# if _GLIBCXX_USE_BUILTIN_TRAIT(__is_unbounded_array)
+  template<typename _Tp>
+inline constexpr bool is_unbounded_array_v = __is_unbounded_array(_Tp);
+# else
   template<typename _Tp>
 inline constexpr bool is_unbounded_array_v = false;
 
   template<typename _Tp>
 inline constexpr bool is_unbounded_array_v<_Tp[]> = true;
+# endif
 
   /// True for a type that is an array of known bound.
   /// @since C++20
-- 
2.43.0



[Committed] RISC-V: Update MAX_SEW for available vsevl info[VSETVL PASS]

2024-01-05 Thread Juzhe-Zhong
This patch fixes a bug in the VSETVL pass in the following situation:

Ignore curr info since prev info available with it:
  prev_info: VALID (insn 8, bb 2)
Demand fields: demand_ratio_and_ge_sew demand_avl
SEW=16, VLMUL=mf4, RATIO=64, MAX_SEW=64
TAIL_POLICY=agnostic, MASK_POLICY=agnostic
AVL=(const_int 1 [0x1])
VL=(nil)
  curr_info: VALID (insn 12, bb 2)
Demand fields: demand_ge_sew demand_non_zero_avl
SEW=16, VLMUL=m1, RATIO=16, MAX_SEW=32
TAIL_POLICY=agnostic, MASK_POLICY=agnostic
AVL=(const_int 1 [0x1])
VL=(nil)

We should update prev_info MAX_SEW from 64 into 32.

Before this patch:
foo:
vsetivli zero,1,e64,m1,ta,ma
vle64.v v1,0(a1)
vmv.s.x v3,a0
vfmv.s.f v2,fa0
vadd.vv v1,v1,v1
ret

After this patch:
foo:
vsetivli zero,1,e16,mf4,ta,ma
vle64.v v1,0(a1)
vmv.s.x v3,a0
vfmv.s.f v2,fa0
vsetvli zero,zero,e64,m1,ta,ma
vadd.vv v1,v1,v1
ret

Tested on both RV32 and RV64 with no regression.  Committed.

PR target/113248

gcc/ChangeLog:

* config/riscv/riscv-vsetvl.cc (pre_vsetvl::fuse_local_vsetvl_info):

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/vsetvl/pr113248.c: New test.

---
 gcc/config/riscv/riscv-vsetvl.cc| 17 +
 .../gcc.target/riscv/rvv/vsetvl/pr113248.c  | 15 +++
 2 files changed, 32 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113248.c

diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index 3a2ea9ad44a..7d748edc0ef 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -2876,6 +2876,23 @@ pre_vsetvl::fuse_local_vsetvl_info ()
  curr_info.dump (dump_file, "");
  fprintf (dump_file, "\n");
}
+ /* Even though prev_info is available with curr_info,
+we need to update the MAX_SEW of prev_info since
+we don't check MAX_SEW in available_p check.
+
+prev_info:
+Demand fields: demand_ratio_and_ge_sew demand_avl
+SEW=16, VLMUL=mf4, RATIO=64, MAX_SEW=64
+
+curr_info:
+Demand fields: demand_ge_sew demand_non_zero_avl
+SEW=16, VLMUL=m1, RATIO=16, MAX_SEW=32
+
+In the example above, prev_info is available with
+curr_info, we need to update prev_info MAX_SEW from
+64 into 32.  */
+ prev_info.set_max_sew (
+   MIN (prev_info.get_max_sew (), curr_info.get_max_sew ()));
  if (!curr_info.vl_used_by_non_rvv_insn_p ()
  && vsetvl_insn_p (curr_info.get_insn ()->rtl ()))
m_delete_list.safe_push (curr_info);
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113248.c 
b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113248.c
new file mode 100644
index 000..b3b506177df
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113248.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-mtune=generic-ooo --param=riscv-autovec-preference=scalable 
-march=rv32gc_zve64f_zvfh -mabi=ilp32d -O3" } */
+
+#include "riscv_vector.h"
+
+void foo(_Float16 y, int64_t *i64p)
+{
+  vint64m1_t vx =__riscv_vle64_v_i64m1 (i64p, 1);
+  vx = __riscv_vadd_vv_i64m1 (vx, vx, 1);
+  vfloat16m1_t vy =__riscv_vfmv_s_f_f16m1 (y, 1);
+  asm volatile ("# use %0 %1" : : "vr"(vx), "vr" (vy));
+}
+
+/* { dg-final { scan-assembler-times 
{vsetivli\s+zero,\s*1,\s*e16,\s*mf4,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times 
{vsetvli\s+zero,\s*zero,\s*e64,\s*m1,\s*t[au],\s*m[au]} 1 } } */
-- 
2.36.3



Re: [pushed][PATCH] LoongArch: Improve lasx_xvpermi_q_ insn pattern

2024-01-05 Thread chenglulu

Pushed to r14-6968.

On 2024/1/5 at 3:37 PM, Jiahao Xu wrote:

For the xvpermi.q instruction, unused bits in operands[3] need to be set to 0
to avoid causing undefined behavior on LA464.

gcc/ChangeLog:

* config/loongarch/lasx.md: Set the unused bits in operand[3] to 0.

gcc/testsuite/ChangeLog:

* gcc.target/loongarch/vector/lasx/lasx-xvpremi.c: Removed.
* gcc.target/loongarch/vector/lasx/lasx-xvpermi_q.c: New test.
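
For context, here is a scalar model of the lane selection this immediate
controls.  The semantics are inferred from the 0x33 mask and the test
vectors below (imm[1:0] selects the result's low 128-bit lane, imm[5:4]
the high lane, from the four lanes of {xd, xj}), so treat this as an
assumption rather than a quote from the manual:

#include <stdint.h>

typedef struct { uint64_t d[4]; } v256;   /* d[0] is least significant */

/* Hypothetical scalar model of xvpermi.q xd, xj, ui8: lanes 0-1 come from
   xj, lanes 2-3 from xd; the immediate bits outside 0x33 are the "unused
   bits" that must be zeroed.  */
static v256
xvpermi_q_model (v256 xd, v256 xj, unsigned imm)
{
  const uint64_t *lane[4] = { &xj.d[0], &xj.d[2], &xd.d[0], &xd.d[2] };
  unsigned lo = imm & 3, hi = (imm >> 4) & 3;   /* the 0x33 mask */
  v256 r;
  r.d[0] = lane[lo][0]; r.d[1] = lane[lo][1];
  r.d[2] = lane[hi][0]; r.d[3] = lane[hi][1];
  return r;
}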

diff --git a/gcc/config/loongarch/lasx.md b/gcc/config/loongarch/lasx.md
index 027021b45d5..d7329f29f5f 100644
--- a/gcc/config/loongarch/lasx.md
+++ b/gcc/config/loongarch/lasx.md
@@ -635,6 +635,8 @@
 (set_attr "mode" "<MODE>")])
  
  ;; xvpermi.q

+;; Unused bits in operands[3] need be set to 0 to avoid
+;; causing undefined behavior on LA464.
  (define_insn "lasx_xvpermi_q_"
[(set (match_operand:LASX 0 "register_operand" "=f")
(unspec:LASX
@@ -643,7 +645,12 @@
   (match_operand 3 "const_uimm8_operand")]
  UNSPEC_LASX_XVPERMI_Q))]
"ISA_HAS_LASX"
-  "xvpermi.q\t%u0,%u2,%3"
+{
+  int mask = 0x33;
+  mask &= INTVAL (operands[3]);
+  operands[3] = GEN_INT (mask);
+  return "xvpermi.q\t%u0,%u2,%3";
+}
[(set_attr "type" "simd_splat")
 (set_attr "mode" "<MODE>")])
  
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-xvpermi_q.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-xvpermi_q.c

new file mode 100644
index 000..dbc29d2fb22
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-xvpermi_q.c
@@ -0,0 +1,64 @@
+/* { dg-options "-mlasx -w -fno-strict-aliasing" } */
+#include "../simd_correctness_check.h"
+#include <lasxintrin.h>
+
+int
+main ()
+{
+  __m256i __m256i_op0, __m256i_op1, __m256i_op2, __m256i_out, __m256i_result;
+  __m256 __m256_op0, __m256_op1, __m256_op2, __m256_out, __m256_result;
+  __m256d __m256d_op0, __m256d_op1, __m256d_op2, __m256d_out, __m256d_result;
+
+  int int_op0, int_op1, int_op2, int_out, int_result, i = 1, fail;
+  long int long_op0, long_op1, long_op2, lont_out, lont_result;
+  long int long_int_out, long_int_result;
+  unsigned int unsigned_int_out, unsigned_int_result;
+  unsigned long int unsigned_long_int_out, unsigned_long_int_result;
+
+  *((unsigned long*)& __m256i_op0[3]) = 0x7fe37fe3001d001d;
+  *((unsigned long*)& __m256i_op0[2]) = 0x7fff7fff7fff;
+  *((unsigned long*)& __m256i_op0[1]) = 0x7fe37fe3001d001d;
+  *((unsigned long*)& __m256i_op0[0]) = 0x7fff7fff7fff;
+  *((unsigned long*)& __m256i_op1[3]) = 0x7575757575757575;
+  *((unsigned long*)& __m256i_op1[2]) = 0x7575757575757575;
+  *((unsigned long*)& __m256i_op1[1]) = 0x7575757575757575;
+  *((unsigned long*)& __m256i_op1[0]) = 0x7575757575757575;
+  *((unsigned long*)& __m256i_result[3]) = 0x7fe37fe3001d001d;
+  *((unsigned long*)& __m256i_result[2]) = 0x7fff7fff7fff;
+  *((unsigned long*)& __m256i_result[1]) = 0x7fe37fe3001d001d;
+  *((unsigned long*)& __m256i_result[0]) = 0x7fff7fff7fff;
+  __m256i_out = __lasx_xvpermi_q (__m256i_op0, __m256i_op1, 0x2a);
+  ASSERTEQ_64 (__LINE__, __m256i_result, __m256i_out);
+
+  *((unsigned long*)& __m256i_op0[3]) = 0x;
+  *((unsigned long*)& __m256i_op0[2]) = 0x0019001c;
+  *((unsigned long*)& __m256i_op0[1]) = 0x;
+  *((unsigned long*)& __m256i_op0[0]) = 0x0019001c;
+  *((unsigned long*)& __m256i_op1[3]) = 0x;
+  *((unsigned long*)& __m256i_op1[2]) = 0x01fe;
+  *((unsigned long*)& __m256i_op1[1]) = 0x;
+  *((unsigned long*)& __m256i_op1[0]) = 0x01fe;
+  *((unsigned long*)& __m256i_result[3]) = 0x;
+  *((unsigned long*)& __m256i_result[2]) = 0x0019001c;
+  *((unsigned long*)& __m256i_result[1]) = 0x;
+  *((unsigned long*)& __m256i_result[0]) = 0x01fe;
+  __m256i_out = __lasx_xvpermi_q (__m256i_op0, __m256i_op1, 0xb9);
+  ASSERTEQ_64 (__LINE__, __m256i_result, __m256i_out);
+
+  *((unsigned long*)& __m256i_op0[3]) = 0x00ff00ff00ff00ff;
+  *((unsigned long*)& __m256i_op0[2]) = 0x00ff00ff00ff00ff;
+  *((unsigned long*)& __m256i_op0[1]) = 0x00ff00ff00ff00ff;
+  *((unsigned long*)& __m256i_op0[0]) = 0x00ff00ff00ff00ff;
+  *((unsigned long*)& __m256i_op1[3]) = 0x;
+  *((unsigned long*)& __m256i_op1[2]) = 0x;
+  *((unsigned long*)& __m256i_op1[1]) = 0x;
+  *((unsigned long*)& __m256i_op1[0]) = 0x;
+  *((unsigned long*)& __m256i_result[3]) = 0x;
+  *((unsigned long*)& __m256i_result[2]) = 0x;
+  *((unsigned long*)& __m256i_result[1]) = 0x00ff00ff00ff00ff;
+  *((unsigned long*)& __m256i_result[0]) = 0x00ff00ff00ff00ff;
+  __m256i_out = __lasx_xvpermi_q (__m256i_op0, __m256i_op1, 0xca);
+  ASSERTEQ_64 (__LINE__, __m256i_result, __m256i_out);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-xvpremi.c 
b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-xvpremi.c
deleted file mode 100644
index e9